[PATCH v3 2/6] kvm: Use APIC_DEFAULT_PHYS_BASE macro as the apic access page address.
We have APIC_DEFAULT_PHYS_BASE defined as 0xfee00000, which is also the address of apic access page. So use this macro. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/kvm/svm.c | 3 ++- arch/x86/kvm/vmx.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ec8366c..576b525 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1257,7 +1257,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 801332e..0e1117c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3982,13 +3982,13 @@ static int alloc_apic_access_page(struct kvm *kvm) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; + kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; - page = gfn_to_page(kvm, 0xfee00); + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) { r = -EFAULT; goto out; @@ -4460,7 +4460,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&vmx->vcpu)) apic_base_msr.data |= MSR_IA32_APICBASE_BSP; apic_base_msr.host_initiated = true; -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[PATCH v3 4/6] kvm: Make init_rmode_identity_map() return 0 on success.
In init_rmode_identity_map(), there two variables indicating the return value, r and ret, and it return 0 on error, 1 on success. The function is only called by vmx_create_vcpu(), and r is redundant. This patch removes the redundant variable r, and make init_rmode_identity_map() return 0 on success, -errno on failure. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/kvm/vmx.c | 25 +++-- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8bf47d..6ab4f87 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3922,45 +3922,42 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { - int i, idx, r, ret = 0; + int i, idx, ret = 0; pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) - return 1; + return 0; /* Protect kvm-arch.ept_identity_pagetable_done. */ mutex_lock(kvm-slots_lock); - if (likely(kvm-arch.ept_identity_pagetable_done)) { - ret = 1; + if (likely(kvm-arch.ept_identity_pagetable_done)) goto out2; - } identity_map_pfn = kvm-arch.ept_identity_map_addr PAGE_SHIFT; - r = alloc_identity_pagetable(kvm); - if (r) + ret = alloc_identity_pagetable(kvm); + if (ret) goto out2; idx = srcu_read_lock(kvm-srcu); - r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); - if (r 0) + ret = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); + if (ret) goto out; /* Set up identity-mapping pagetable for EPT in real mode */ for (i = 0; i PT32_ENT_PER_PAGE; i++) { tmp = (i 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); - r = kvm_write_guest_page(kvm, identity_map_pfn, + ret = kvm_write_guest_page(kvm, identity_map_pfn, tmp, i * sizeof(tmp), sizeof(tmp)); - if (r 0) + if (ret) goto out; } kvm-arch.ept_identity_pagetable_done = true; - ret = 1; + out: srcu_read_unlock(kvm-srcu, idx); - out2: mutex_unlock(kvm-slots_lock); return ret; @@ -7584,7 +7581,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 
kvm-arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; err = -ENOMEM; - if (!init_rmode_identity_map(kvm)) + if (init_rmode_identity_map(kvm)) goto free_vmcs; } -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 3/6] kvm: Remove ept_identity_pagetable from struct kvm_arch.
kvm_arch-ept_identity_pagetable holds the ept identity pagetable page. But it is never used to refer to the page at all. In vcpu initialization, it indicates two things: 1. indicates if ept page is allocated 2. indicates if a memory slot for identity page is initialized Actually, kvm_arch-ept_identity_pagetable_done is enough to tell if the ept identity pagetable is initialized. So we can remove ept_identity_pagetable. NOTE: In the original code, ept identity pagetable page is pinned in memroy. As a result, it cannot be migrated/hot-removed. After this patch, since kvm_arch-ept_identity_pagetable is removed, ept identity pagetable page is no longer pinned in memory. And it can be migrated/hot-removed. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx.c | 50 - arch/x86/kvm/x86.c | 2 -- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4931415..62f973e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -578,7 +578,6 @@ struct kvm_arch { gpa_t wall_clock; - struct page *ept_identity_pagetable; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0e1117c..b8bf47d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -741,6 +741,7 @@ static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static bool vmx_mpx_supported(void); +static int alloc_identity_pagetable(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3921,21 +3922,27 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { - int i, idx, r, ret; + int i, idx, r, ret = 0; pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) return 1; - if 
(unlikely(!kvm-arch.ept_identity_pagetable)) { - printk(KERN_ERR EPT: identity-mapping pagetable - haven't been allocated!\n); - return 0; + + /* Protect kvm-arch.ept_identity_pagetable_done. */ + mutex_lock(kvm-slots_lock); + + if (likely(kvm-arch.ept_identity_pagetable_done)) { + ret = 1; + goto out2; } - if (likely(kvm-arch.ept_identity_pagetable_done)) - return 1; - ret = 0; + identity_map_pfn = kvm-arch.ept_identity_map_addr PAGE_SHIFT; + + r = alloc_identity_pagetable(kvm); + if (r) + goto out2; + idx = srcu_read_lock(kvm-srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r 0) @@ -3953,6 +3960,9 @@ static int init_rmode_identity_map(struct kvm *kvm) ret = 1; out: srcu_read_unlock(kvm-srcu, idx); + +out2: + mutex_unlock(kvm-slots_lock); return ret; } @@ -4002,31 +4012,23 @@ out: static int alloc_identity_pagetable(struct kvm *kvm) { - struct page *page; + /* +* In init_rmode_identity_map(), kvm-arch.ept_identity_pagetable_done +* is checked before calling this function and set to true after the +* calling. The access to kvm-arch.ept_identity_pagetable_done should +* be protected by kvm-slots_lock. 
+*/ + struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - mutex_lock(kvm-slots_lock); - if (kvm-arch.ept_identity_pagetable) - goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = kvm-arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, kvm_userspace_mem); - if (r) - goto out; - page = gfn_to_page(kvm, kvm-arch.ept_identity_map_addr PAGE_SHIFT); - if (is_error_page(page)) { - r = -EFAULT; - goto out; - } - - kvm-arch.ept_identity_pagetable = page; -out: - mutex_unlock(kvm-slots_lock); return r; } @@ -7582,8 +7584,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) kvm-arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; err = -ENOMEM; - if (alloc_identity_pagetable(kvm) != 0) - goto free_vmcs; if (!init_rmode_identity_map(kvm)) goto free_vmcs; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f32a025..ffbe557 100644 --- a/arch/x86/kvm/x86.c +++
[PATCH v3 0/6] kvm, mem-hotplug: Do not pin ept identity pagetable and apic access page.
ept identity pagetable and apic access page in kvm are pinned in memory. As a result, they cannot be migrated/hot-removed. But actually they don't need to be pinned in memory. [For ept identity page] Just do not pin it. When it is migrated, guest will be able to find the new page in the next ept violation. [For apic access page] The hpa of apic access page is stored in VMCS APIC_ACCESS_ADDR pointer. When apic access page is migrated, we update VMCS APIC_ACCESS_ADDR pointer for each vcpu in addition. NOTE: Patch 1~5 are tested with -cpu xxx,-x2apic option, and they work well. Patch 6 is not tested yet, not sure if it is right. Change log v2 - v3: 1. Remove original [PATCH 3/6] since ept_identity_pagetable has been removed in new [PATCH 3/6]. 2. In [PATCH 3/6], fix the problem that kvm-slots_lock does not protect kvm-arch.ept_identity_pagetable_done checking. 3. In [PATCH 3/6], drop gfn_to_page() since ept_identity_pagetable has been removed. 4. Add new [PATCH 4/6], remove redundant variable in init_rmode_identity_map(), and make it return 0 on success. 5. In [PATCH 5/6], drop put_page(kvm-arch.apic_access_page) from x86.c . 6. In [PATCH 5/6], update kvm-arch.apic_access_page in vcpu_reload_apic_access_page(). 7. Add new [PATCH 6/6], reload apic access page in L2-L1 exit. Change log v1 - v2: 1. Add [PATCH 4/5] to remove unnecessary kvm_arch-ept_identity_pagetable. 2. In [PATCH 5/5], only introduce KVM_REQ_APIC_PAGE_RELOAD request. 3. In [PATCH 5/5], add set_apic_access_page_addr() for svm. Tang Chen (6): kvm: Add gfn_to_page_no_pin() to translate gfn to page without pinning. kvm: Use APIC_DEFAULT_PHYS_BASE macro as the apic access page address. kvm: Remove ept_identity_pagetable from struct kvm_arch. kvm: Make init_rmode_identity_map() return 0 on success. kvm, mem-hotplug: Do not pin apic access page in memory. kvm, mem-hotplug: Reload L1's apic access page if it is migrated when L2 is running. 
arch/x86/include/asm/kvm_host.h | 3 +- arch/x86/kvm/svm.c | 15 +- arch/x86/kvm/vmx.c | 108 +++- arch/x86/kvm/x86.c | 22 ++-- include/linux/kvm_host.h| 3 ++ virt/kvm/kvm_main.c | 29 ++- 6 files changed, 139 insertions(+), 41 deletions(-) -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 5/6] kvm, mem-hotplug: Do not pin apic access page in memory.
apic access page is pinned in memory. As a result, it cannot be migrated/hot-removed. Actually, it is not necessary to be pinned. The hpa of apic access page is stored in VMCS APIC_ACCESS_ADDR pointer. When the page is migrated, kvm_mmu_notifier_invalidate_page() will invalidate the corresponding ept entry. This patch introduces a new vcpu request named KVM_REQ_APIC_PAGE_RELOAD, and makes this request to all the vcpus at this time, and force all the vcpus exit guest, and re-enter guest till they updates the VMCS APIC_ACCESS_ADDR pointer to the new apic access page address, and updates kvm-arch.apic_access_page to the new page. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 8 +++- arch/x86/kvm/x86.c | 17 +++-- include/linux/kvm_host.h| 2 ++ virt/kvm/kvm_main.c | 12 6 files changed, 43 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 62f973e..9ce6bfd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -737,6 +737,7 @@ struct kvm_x86_ops { void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); + void (*set_apic_access_page_addr)(struct kvm *kvm, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 576b525..dc76f29 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3612,6 +3612,11 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } +static void svm_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa) +{ + return; +} + static int svm_vm_has_apicv(struct kvm *kvm) { return 0; @@ -4365,6 +4370,7 @@ static struct 
kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, + .set_apic_access_page_addr = svm_set_apic_access_page_addr, .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6ab4f87..c123c1d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3995,7 +3995,7 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE PAGE_SHIFT); + page = gfn_to_page_no_pin(kvm, APIC_DEFAULT_PHYS_BASE PAGE_SHIFT); if (is_error_page(page)) { r = -EFAULT; goto out; @@ -7072,6 +7072,11 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) vmx_set_msr_bitmap(vcpu); } +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa) +{ + vmcs_write64(APIC_ACCESS_ADDR, hpa); +} + static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) { u16 status; @@ -8841,6 +8846,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .vm_has_apicv = vmx_vm_has_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ffbe557..7541a66 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5929,6 +5929,19 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_apic_update_tmr(vcpu, tmr); } +static void vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) +{ + /* +* apic access page could be migrated. When the page is being migrated, +* GUP will wait till the migrate entry is replaced with the new pte +* entry pointing to the new page. 
+*/ + vcpu-kvm-arch.apic_access_page = gfn_to_page_no_pin(vcpu-kvm, + APIC_DEFAULT_PHYS_BASE PAGE_SHIFT); + kvm_x86_ops-set_apic_access_page_addr(vcpu-kvm, + page_to_phys(vcpu-kvm-arch.apic_access_page)); +} + /* * Returns 1 to let __vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the @@ -5989,6 +6002,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); if
[PATCH v3 1/6] kvm: Add gfn_to_page_no_pin() to translate gfn to page without pinning.
gfn_to_page() will finally call hva_to_pfn() to get the pfn, and pin the page in memory by calling GUP functions. This function unpins the page. Will be used by the followed patches. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 17 - 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ec4e3bd..7c58d9d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -541,6 +541,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, int nr_pages); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +struct page *gfn_to_page_no_pin(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4b6c01b..6091849 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1371,9 +1371,24 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) return kvm_pfn_to_page(pfn); } - EXPORT_SYMBOL_GPL(gfn_to_page); +struct page *gfn_to_page_no_pin(struct kvm *kvm, gfn_t gfn) +{ + struct page *page = gfn_to_page(kvm, gfn); + + /* +* gfn_to_page() will finally call hva_to_pfn() to get the pfn, and pin +* the page in memory by calling GUP functions. This function unpins +* the page. +*/ + if (!is_error_page(page)) + put_page(page); + + return page; +} +EXPORT_SYMBOL_GPL(gfn_to_page_no_pin); + void kvm_release_page_clean(struct page *page) { WARN_ON(is_error_page(page)); -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 6/6] kvm, mem-hotplug: Reload L1's apic access page if it is migrated when L2 is running.
This patch only handle L1 and L2 vm share one apic access page situation. When L1 vm is running, if the shared apic access page is migrated, mmu_notifier will request all vcpus to exit to L0, and reload apic access page physical address for all the vcpus' vmcs (which is done by patch 5/6). And when it enters L2 vm, L2's vmcs will be updated in prepare_vmcs02() called by nested_vm_run(). So we need to do nothing. When L2 vm is running, if the shared apic access page is migrated, mmu_notifier will request all vcpus to exit to L0, and reload apic access page physical address for all L2 vmcs. And this patch requests apic access page reload in L2-L1 vmexit. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 37 + arch/x86/kvm/x86.c | 3 +++ 4 files changed, 47 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9ce6bfd..613ee7f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -738,6 +738,7 @@ struct kvm_x86_ops { void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm *kvm, hpa_t hpa); + void (*set_nested_apic_page_migrated)(struct kvm_vcpu *vcpu, bool set); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index dc76f29..87273ef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3617,6 +3617,11 @@ static void svm_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa) return; } +static void svm_set_nested_apic_page_migrated(struct kvm_vcpu *vcpu, bool set) +{ + return; +} + static int svm_vm_has_apicv(struct kvm *kvm) { return 0; @@ -4371,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = { 
.update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, .set_apic_access_page_addr = svm_set_apic_access_page_addr, + .set_nested_apic_page_migrated = svm_set_nested_apic_page_migrated, .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c123c1d..9231afe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -379,6 +379,16 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + /* +* L1's apic access page can be migrated. When L1 and L2 are sharing +* the apic access page, after the page is migrated when L2 is running, +* we have to reload it to L1 vmcs before we enter L1. +* +* When the shared apic access page is migrated in L1 mode, we don't +* need to do anything else because we reload apic access page each +* time when entering L2 in prepare_vmcs02(). +*/ + bool apic_access_page_migrated; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; @@ -7077,6 +7087,12 @@ static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa) vmcs_write64(APIC_ACCESS_ADDR, hpa); } +static void vmx_set_nested_apic_page_migrated(struct kvm_vcpu *vcpu, bool set) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx-nested.apic_access_page_migrated = set; +} + static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) { u16 status; @@ -8727,6 +8743,26 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, } /* +* When shared (L1 L2) apic access page is migrated during L2 is +* running, mmu_notifier will force to reload the page's hpa for L2 +* vmcs. Need to reload it for L1 before entering L1. +*/ + if (vmx-nested.apic_access_page_migrated) { + /* +* Do not call kvm_reload_apic_access_page() because we are now +* in L2. We should not call make_all_cpus_request() to exit to +* L0, otherwise we will reload for L2 vmcs again. 
+*/ + int i; + + for (i = 0; i atomic_read(vcpu-kvm-online_vcpus); i++) + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, +vcpu-kvm-vcpus[i]); + + vmx-nested.apic_access_page_migrated = false; + } + + /* * Exiting from L2 to L1, we're now back to L1 which thinks it just * finished a VMLAUNCH or VMRESUME instruction, so we need to set the * success or
Re: [PATCH] vhost: Add polling mode
Jason Wang jasow...@redhat.com wrote on 23/07/2014 08:26:36 AM: From: Jason Wang jasow...@redhat.com To: Razya Ladelsky/Haifa/IBM@IBMIL, kvm@vger.kernel.org, Michael S. Tsirkin m...@redhat.com, Cc: abel.gor...@gmail.com, Joel Nider/Haifa/IBM@IBMIL, Yossi Kuperman1/Haifa/IBM@IBMIL, Eran Raichstein/Haifa/IBM@IBMIL, Alex Glikson/Haifa/IBM@IBMIL Date: 23/07/2014 08:26 AM Subject: Re: [PATCH] vhost: Add polling mode On 07/21/2014 09:23 PM, Razya Ladelsky wrote: Hello All, When vhost is waiting for buffers from the guest driver (e.g., more packets to send in vhost-net's transmit queue), it normally goes to sleep and waits for the guest to kick it. This kick involves a PIO in the guest, and therefore an exit (and possibly userspace involvement in translating this PIO exit into a file descriptor event), all of which hurts performance. If the system is under-utilized (has cpu time to spare), vhost can continuously poll the virtqueues for new buffers, and avoid asking the guest to kick us. This patch adds an optional polling mode to vhost, that can be enabled via a kernel module parameter, poll_start_rate. When polling is active for a virtqueue, the guest is asked to disable notification (kicks), and the worker thread continuously checks for new buffers. When it does discover new buffers, it simulates a kick by invoking the underlying backend driver (such as vhost-net), which thinks it got a real kick from the guest, and acts accordingly. If the underlying driver asks not to be kicked, we disable polling on this virtqueue. We start polling on a virtqueue when we notice it has work to do. Polling on this virtqueue is later disabled after 3 seconds of polling turning up no new work, as in this case we are better off returning to the exit-based notification mechanism. The default timeout of 3 seconds can be changed with the poll_stop_idle kernel module parameter. 
This polling approach makes lot of sense for new HW with posted-interrupts for which we have exitless host-to-guest notifications. But even with support for posted interrupts, guest-to-host communication still causes exits. Polling adds the missing part. When systems are overloaded, there won?t be enough cpu time for the various vhost threads to poll their guests' devices. For these scenarios, we plan to add support for vhost threads that can be shared by multiple devices, even of multiple vms. Our ultimate goal is to implement the I/O acceleration features described in: KVM Forum 2013: Efficient and Scalable Virtio (by Abel Gordon) https://www.youtube.com/watch?v=9EyweibHfEs and https://www.mail-archive.com/kvm@vger.kernel.org/msg98179.html Comments are welcome, Thank you, Razya Thanks for the work. Do you have perf numbers for this? Hi Jason, Thanks for reviewing. I ran some experiments with TCP stream netperf and filebench (having 2 threads performing random reads) benchmarks on an IBM System x3650 M4. All runs loaded the guests in a way that they were (cpu) saturated. The system had two cores per guest, as to allow for both the vcpu and the vhost thread to run concurrently for maximum throughput (but I didn't pin the threads to specific cores) I get: Netperf, 1 vm: The polling patch improved throughput by ~33%. Number of exits/sec decreased 6x. The same improvement was shown when I tested with 3 vms running netperf. filebench, 1 vm: ops/sec improved by 13% with the polling patch. Number of exits was reduced by 31%. The same experiment with 3 vms running filebench showed similar numbers. And looks like the patch only poll for virtqueue. In the future, may worth to add callbacks for vhost_net to poll socket. Then it could be used with rx busy polling in host which may speedup the rx also. Did you mean polling the network device to avoid interrupts? 
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c90f437..678d766 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -24,9 +24,17 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/jiffies.h #include linux/module.h #include vhost.h +static int poll_start_rate = 0; +module_param(poll_start_rate, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(poll_start_rate, Start continuous polling of virtqueue when rate of events is at least this number per jiffy. If 0, never start polling.); + +static int poll_stop_idle = 3*HZ; /* 3 seconds */ +module_param(poll_stop_idle, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(poll_stop_idle, Stop continuous polling of virtqueue after this many jiffies of no work.); I'm not sure using jiffy is good enough since user need know HZ value. May worth to look at sk_busy_loop() which use sched_clock() and us. Ok, Will look into it, thanks. +/* Enable or disable virtqueue polling
RE: [PATCH v5 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
-Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc- ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Wednesday, July 23, 2014 12:21 AM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 21.07.14 11:59, mihai.cara...@freescale.com wrote: -Original Message- From: Linuxppc-dev [mailto:linuxppc-dev- bounces+mihai.caraman=freescale@lists.ozlabs.org] On Behalf Of mihai.cara...@freescale.com Sent: Friday, July 18, 2014 12:06 PM To: Alexander Graf; kvm-...@vger.kernel.org Cc: linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org Subject: RE: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, July 17, 2014 5:21 PM To: Caraman Mihai Claudiu-B02008; kvm-...@vger.kernel.org Cc: kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 17.07.14 13:22, Mihai Caraman wrote: On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re- execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- ... 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..7f9c634 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user- space */ }; +enum instruction_type { +INST_GENERIC, +INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, + enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -234,6 +242,23 @@ struct kvmppc_ops { extern struct kvmppc_ops *kvmppc_hv_ops; extern struct kvmppc_ops *kvmppc_pr_ops; +static inline int kvmppc_get_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst) +{ +int ret = EMULATE_DONE; + +/* Load the instruction manually if it failed to do so in the + * exit path */ +if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) +ret = kvmppc_load_last_inst(vcpu, type, vcpu- arch.last_inst); + + +*inst = (ret == EMULATE_DONE kvmppc_need_byteswap(vcpu)) ? +swab32(vcpu-arch.last_inst) : vcpu-arch.last_inst; This makes even less sense than the previous version. Either you treat inst as definitely overwritten or as preserves previous data on failure. Both v4 and v5 versions treat inst as definitely overwritten. So either you unconditionally swap like you did before If we make abstraction of its symmetry, KVM_INST_FETCH_FAILED is operated in host endianness, so it doesn't need byte swap. 
I agree with your reasoning if last_inst is initialized and compared with data in guest endianess, which is not the case yet for KVM_INST_FETCH_FAILED. Alex, are you relying on the fact that KVM_INST_FETCH_FAILED value is symmetrical? With a non symmetrical value like 0xDEADBEEF, and considering a little- endian guest on a big-endian host, we need to fix kvm logic to initialize and compare last_inst with 0xEFBEADDE swaped value. Your suggestion to unconditionally swap makes sense only with the above fix, otherwise inst may end up with 0xEFBEADDE swaped value with is wrong. Only for *inst which we would treat as undefined after the function returned EMULATE_AGAIN. Right. With this do you acknowledge that v5
Re: [PATCH v5 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
Am 23.07.2014 um 10:24 schrieb mihai.cara...@freescale.com mihai.cara...@freescale.com: -Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc- ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Wednesday, July 23, 2014 12:21 AM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 21.07.14 11:59, mihai.cara...@freescale.com wrote: -Original Message- From: Linuxppc-dev [mailto:linuxppc-dev- bounces+mihai.caraman=freescale@lists.ozlabs.org] On Behalf Of mihai.cara...@freescale.com Sent: Friday, July 18, 2014 12:06 PM To: Alexander Graf; kvm-...@vger.kernel.org Cc: linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org Subject: RE: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, July 17, 2014 5:21 PM To: Caraman Mihai Claudiu-B02008; kvm-...@vger.kernel.org Cc: kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 17.07.14 13:22, Mihai Caraman wrote: On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re- execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- ... 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..7f9c634 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user- space */ }; +enum instruction_type { +INST_GENERIC, +INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, + enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -234,6 +242,23 @@ struct kvmppc_ops { extern struct kvmppc_ops *kvmppc_hv_ops; extern struct kvmppc_ops *kvmppc_pr_ops; +static inline int kvmppc_get_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst) +{ +int ret = EMULATE_DONE; + +/* Load the instruction manually if it failed to do so in the + * exit path */ +if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) +ret = kvmppc_load_last_inst(vcpu, type, vcpu- arch.last_inst); + + +*inst = (ret == EMULATE_DONE kvmppc_need_byteswap(vcpu)) ? +swab32(vcpu-arch.last_inst) : vcpu-arch.last_inst; This makes even less sense than the previous version. Either you treat inst as definitely overwritten or as preserves previous data on failure. Both v4 and v5 versions treat inst as definitely overwritten. So either you unconditionally swap like you did before If we make abstraction of its symmetry, KVM_INST_FETCH_FAILED is operated in host endianness, so it doesn't need byte swap. 
I agree with your reasoning if last_inst is initialized and compared with data in guest endianness, which is not the case yet for KVM_INST_FETCH_FAILED. Alex, are you relying on the fact that KVM_INST_FETCH_FAILED value is symmetrical? With a non symmetrical value like 0xDEADBEEF, and considering a little- endian guest on a big-endian host, we need to fix kvm logic to initialize and compare last_inst with 0xEFBEADDE swapped value. Your suggestion to unconditionally swap makes sense only with the above fix, otherwise inst may end up with 0xEFBEADDE swapped value which is wrong. Only for *inst which we would treat as undefined after the function returned EMULATE_AGAIN. Right. With this do you acknowledge that v5 (definitely overwritten approach) is ok? I think I'm starting to understand your logic of v5. You write fetch_failed into *inst
Re: [PATCH] vhost: Add polling mode
On 07/23/2014 04:12 PM, Razya Ladelsky wrote: Jason Wang jasow...@redhat.com wrote on 23/07/2014 08:26:36 AM: From: Jason Wang jasow...@redhat.com To: Razya Ladelsky/Haifa/IBM@IBMIL, kvm@vger.kernel.org, Michael S. Tsirkin m...@redhat.com, Cc: abel.gor...@gmail.com, Joel Nider/Haifa/IBM@IBMIL, Yossi Kuperman1/Haifa/IBM@IBMIL, Eran Raichstein/Haifa/IBM@IBMIL, Alex Glikson/Haifa/IBM@IBMIL Date: 23/07/2014 08:26 AM Subject: Re: [PATCH] vhost: Add polling mode On 07/21/2014 09:23 PM, Razya Ladelsky wrote: Hello All, When vhost is waiting for buffers from the guest driver (e.g., more packets to send in vhost-net's transmit queue), it normally goes to sleep and waits for the guest to kick it. This kick involves a PIO in the guest, and therefore an exit (and possibly userspace involvement in translating this PIO exit into a file descriptor event), all of which hurts performance. If the system is under-utilized (has cpu time to spare), vhost can continuously poll the virtqueues for new buffers, and avoid asking the guest to kick us. This patch adds an optional polling mode to vhost, that can be enabled via a kernel module parameter, poll_start_rate. When polling is active for a virtqueue, the guest is asked to disable notification (kicks), and the worker thread continuously checks for new buffers. When it does discover new buffers, it simulates a kick by invoking the underlying backend driver (such as vhost-net), which thinks it got a real kick from the guest, and acts accordingly. If the underlying driver asks not to be kicked, we disable polling on this virtqueue. We start polling on a virtqueue when we notice it has work to do. Polling on this virtqueue is later disabled after 3 seconds of polling turning up no new work, as in this case we are better off returning to the exit-based notification mechanism. The default timeout of 3 seconds can be changed with the poll_stop_idle kernel module parameter. 
This polling approach makes lot of sense for new HW with posted-interrupts for which we have exitless host-to-guest notifications. But even with support for posted interrupts, guest-to-host communication still causes exits. Polling adds the missing part. When systems are overloaded, there won?t be enough cpu time for the various vhost threads to poll their guests' devices. For these scenarios, we plan to add support for vhost threads that can be shared by multiple devices, even of multiple vms. Our ultimate goal is to implement the I/O acceleration features described in: KVM Forum 2013: Efficient and Scalable Virtio (by Abel Gordon) https://www.youtube.com/watch?v=9EyweibHfEs and https://www.mail-archive.com/kvm@vger.kernel.org/msg98179.html Comments are welcome, Thank you, Razya Thanks for the work. Do you have perf numbers for this? Hi Jason, Thanks for reviewing. I ran some experiments with TCP stream netperf and filebench (having 2 threads performing random reads) benchmarks on an IBM System x3650 M4. All runs loaded the guests in a way that they were (cpu) saturated. The system had two cores per guest, as to allow for both the vcpu and the vhost thread to run concurrently for maximum throughput (but I didn't pin the threads to specific cores) I get: Netperf, 1 vm: The polling patch improved throughput by ~33%. Number of exits/sec decreased 6x. The same improvement was shown when I tested with 3 vms running netperf. filebench, 1 vm: ops/sec improved by 13% with the polling patch. Number of exits was reduced by 31%. The same experiment with 3 vms running filebench showed similar numbers. Looks good, may worth to add the result in the commit log. And looks like the patch only poll for virtqueue. In the future, may worth to add callbacks for vhost_net to poll socket. Then it could be used with rx busy polling in host which may speedup the rx also. Did you mean polling the network device to avoid interrupts? 
Yes, recent linux host support rx busy polling which can reduce the interrupts. If vhost can utilize this, it can also reduce the latency caused by vhost thread wakeups. And I'm also working on virtio-net busy polling in guest, if vhost can poll socket, it can also help in guest rx polling. diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c90f437..678d766 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -24,9 +24,17 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/jiffies.h #include linux/module.h #include vhost.h +static int poll_start_rate = 0; +module_param(poll_start_rate, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(poll_start_rate, Start continuous polling of virtqueue when rate of events is at least this number per jiffy. If 0, never start polling.); + +static int poll_stop_idle = 3*HZ; /* 3 seconds */ +module_param(poll_stop_idle, int, S_IRUGO|S_IWUSR);
Re: [PATCH] vhost: Add polling mode
On Wed, Jul 23, 2014 at 11:42 AM, Jason Wang jasow...@redhat.com wrote: On 07/23/2014 04:12 PM, Razya Ladelsky wrote: Jason Wang jasow...@redhat.com wrote on 23/07/2014 08:26:36 AM: From: Jason Wang jasow...@redhat.com To: Razya Ladelsky/Haifa/IBM@IBMIL, kvm@vger.kernel.org, Michael S. Tsirkin m...@redhat.com, Cc: abel.gor...@gmail.com, Joel Nider/Haifa/IBM@IBMIL, Yossi Kuperman1/Haifa/IBM@IBMIL, Eran Raichstein/Haifa/IBM@IBMIL, Alex Glikson/Haifa/IBM@IBMIL Date: 23/07/2014 08:26 AM Subject: Re: [PATCH] vhost: Add polling mode On 07/21/2014 09:23 PM, Razya Ladelsky wrote: Hello All, When vhost is waiting for buffers from the guest driver (e.g., more packets to send in vhost-net's transmit queue), it normally goes to sleep and waits for the guest to kick it. This kick involves a PIO in the guest, and therefore an exit (and possibly userspace involvement in translating this PIO exit into a file descriptor event), all of which hurts performance. If the system is under-utilized (has cpu time to spare), vhost can continuously poll the virtqueues for new buffers, and avoid asking the guest to kick us. This patch adds an optional polling mode to vhost, that can be enabled via a kernel module parameter, poll_start_rate. When polling is active for a virtqueue, the guest is asked to disable notification (kicks), and the worker thread continuously checks for new buffers. When it does discover new buffers, it simulates a kick by invoking the underlying backend driver (such as vhost-net), which thinks it got a real kick from the guest, and acts accordingly. If the underlying driver asks not to be kicked, we disable polling on this virtqueue. We start polling on a virtqueue when we notice it has work to do. Polling on this virtqueue is later disabled after 3 seconds of polling turning up no new work, as in this case we are better off returning to the exit-based notification mechanism. 
The default timeout of 3 seconds can be changed with the poll_stop_idle kernel module parameter. This polling approach makes lot of sense for new HW with posted-interrupts for which we have exitless host-to-guest notifications. But even with support for posted interrupts, guest-to-host communication still causes exits. Polling adds the missing part. When systems are overloaded, there won?t be enough cpu time for the various vhost threads to poll their guests' devices. For these scenarios, we plan to add support for vhost threads that can be shared by multiple devices, even of multiple vms. Our ultimate goal is to implement the I/O acceleration features described in: KVM Forum 2013: Efficient and Scalable Virtio (by Abel Gordon) https://www.youtube.com/watch?v=9EyweibHfEs and https://www.mail-archive.com/kvm@vger.kernel.org/msg98179.html Comments are welcome, Thank you, Razya Thanks for the work. Do you have perf numbers for this? Hi Jason, Thanks for reviewing. I ran some experiments with TCP stream netperf and filebench (having 2 threads performing random reads) benchmarks on an IBM System x3650 M4. All runs loaded the guests in a way that they were (cpu) saturated. The system had two cores per guest, as to allow for both the vcpu and the vhost thread to run concurrently for maximum throughput (but I didn't pin the threads to specific cores) I get: Netperf, 1 vm: The polling patch improved throughput by ~33%. Number of exits/sec decreased 6x. The same improvement was shown when I tested with 3 vms running netperf. filebench, 1 vm: ops/sec improved by 13% with the polling patch. Number of exits was reduced by 31%. The same experiment with 3 vms running filebench showed similar numbers. Looks good, may worth to add the result in the commit log. And looks like the patch only poll for virtqueue. In the future, may worth to add callbacks for vhost_net to poll socket. Then it could be used with rx busy polling in host which may speedup the rx also. 
Did you mean polling the network device to avoid interrupts? Yes, recent linux host support rx busy polling which can reduce the interrupts. If vhost can utilize this, it can also reduce the latency caused by vhost thread wakeups. And I'm also working on virtio-net busy polling in guest, if vhost can poll socket, it can also help in guest rx polling. Nice :) Note that you may want to check if if the processor support posted interrupts. I guess that if CPU supports posted interrupts then benefits of polling in the front-end (from performance perspective) may not worth the cpu cycles wasted in the guest. diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c90f437..678d766 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -24,9 +24,17 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/jiffies.h
RE: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail
Right. With this do you acknowledge that v5 (definitely overwritten approach) is ok? I think I'm starting to understand your logic of v5. You write fetch_failed into *inst unswapped if the fetch failed. v5 - don't swap when load fails :) I think that's ok, but I definitely do not like the code flow - it's too hard to understand at a glimpse. Just rewrite it to swab at local variable level, preferably with if()s and comments what this is about and have a single unconditional *inst = fetched_inst; at the end of the function. I will incorporate these change requests into v6. Thanks, -Mike -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: PPC: fix incorrect way saving SPRN_MMCR2
SPRN_SIER and SPRN_MMCR2 are doubly saved, particularly SPRN_MMCR2 is oversaved with an incorrect value which comes from SPRN_PMC5 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_hv_interrupts.S | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 8c86422..731be74 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -127,11 +127,6 @@ BEGIN_FTR_SECTION stw r10, HSTATE_PMC + 24(r13) stw r11, HSTATE_PMC + 28(r13) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) -BEGIN_FTR_SECTION - mfspr r9, SPRN_SIER - std r8, HSTATE_MMCR + 40(r13) - std r9, HSTATE_MMCR + 48(r13) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 31: /* -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 01/17] configure: add CONFIG_COLO to switch COLO support
./configure --enable-colo/--disable-colo to switch COLO support on/off. COLO support is off by default. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- configure | 14 ++ 1 file changed, 14 insertions(+) diff --git a/configure b/configure index f7685b5..4071943 100755 --- a/configure +++ b/configure @@ -258,6 +258,7 @@ xfs= vhost_net=no vhost_scsi=no kvm=no +colo=no rdma= gprof=no debug_tcg=no @@ -921,6 +922,10 @@ for opt do ;; --enable-kvm) kvm=yes ;; + --disable-colo) colo=no + ;; + --enable-colo) colo=yes + ;; --disable-tcg-interpreter) tcg_interpreter=no ;; --enable-tcg-interpreter) tcg_interpreter=yes @@ -1314,6 +1319,10 @@ Advanced options (experts only): --disable-slirp disable SLIRP userspace network connectivity --disable-kvmdisable KVM acceleration support --enable-kvm enable KVM acceleration support + --disable-colo disable COarse-grain LOck-stepping Virtual + Machines for Non-stop Service(default) + --enable-coloenable COarse-grain LOck-stepping Virtual + Machines for Non-stop Service --disable-rdma disable RDMA-based migration support --enable-rdmaenable RDMA-based migration support --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI) @@ -4215,6 +4224,7 @@ echo Linux AIO support $linux_aio echo ATTR/XATTR support $attr echo Install blobs $blobs echo KVM support $kvm +echo COLO support $colo echo RDMA support $rdma echo TCG interpreter $tcg_interpreter echo fdt support $fdt @@ -4751,6 +4761,10 @@ if have_backend ftrace; then fi echo CONFIG_TRACE_FILE=$trace_file $config_host_mak +if test $colo = yes; then + echo CONFIG_COLO=y $config_host_mak +fi + if test $rdma = yes ; then echo CONFIG_RDMA=y $config_host_mak fi -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 00/17] COarse-grain LOck-stepping(COLO) Virtual Machines for Non-stop Service
Virtual machine (VM) replication is a well known technique for providing application-agnostic software-implemented hardware fault tolerance non-stop service. COLO is a high availability solution. Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the same request from client, and generate response in parallel too. If the response packets from PVM and SVM are identical, they are released immediately. Otherwise, a VM checkpoint (on demand) is conducted. The idea is presented in Xen summit 2012, and 2013, and academia paper in SOCC 2013. It's also presented in KVM forum 2013: http://www.linux-kvm.org/wiki/images/1/1d/Kvm-forum-2013-COLO.pdf Please refer to above document for detailed information. Please also refer to previous posted RFC proposal: http://lists.nongnu.org/archive/html/qemu-devel/2014-06/msg05567.html The patchset is also hosted on github: https://github.com/macrosheep/qemu/tree/colo_v0.1 This patchset is RFC, implements the frame of colo, without failover and nic/disk replication. But it is ready for demo the COLO idea above QEMU-Kvm. Steps using this patchset to get an overview of COLO: 1. configure the source with --enable-colo option 2. compile 3. just like QEMU's normal migration, run 2 QEMU VM: - Primary VM - Secondary VM with -incoming tcp:[IP]:[PORT] option 4. on Primary VM's QEMU monitor, run following command: migrate_set_capability colo on migrate tcp:[IP]:[PORT] 5. done you will see two runing VMs, whenever you make changes to PVM, SVM will be synced to PVM's state. TODO list: 1. failover 2. nic replication 3. disk replication[COLO Disk manager] Any comments/feedbacks are warmly welcomed. 
Thanks, Yang Yang Hongyang (17): configure: add CONFIG_COLO to switch COLO support COLO: introduce an api colo_supported() to indicate COLO support COLO migration: add a migration capability 'colo' COLO info: use colo info to tell migration target colo is enabled COLO save: integrate COLO checkpointed save into qemu migration COLO restore: integrate COLO checkpointed restore into qemu restore COLO buffer: implement colo buffer as well as QEMUFileOps based on it COLO: disable qdev hotplug COLO ctl: implement API's that communicate with colo agent COLO ctl: introduce is_slave() and is_master() COLO ctl: implement colo checkpoint protocol COLO ctl: add a RunState RUN_STATE_COLO COLO ctl: implement colo save COLO ctl: implement colo restore COLO save: reuse migration bitmap under colo checkpoint COLO ram cache: implement colo ram cache on slaver HACK: trigger checkpoint every 500ms Makefile.objs | 2 + arch_init.c| 174 +- configure | 14 + include/exec/cpu-all.h | 1 + include/migration/migration-colo.h | 36 +++ include/migration/migration.h | 13 + include/qapi/qmp/qerror.h | 3 + migration-colo-comm.c | 78 + migration-colo.c | 643 + migration.c| 45 ++- qapi-schema.json | 9 +- stubs/Makefile.objs| 1 + stubs/migration-colo.c | 34 ++ vl.c | 12 + 14 files changed, 1044 insertions(+), 21 deletions(-) create mode 100644 include/migration/migration-colo.h create mode 100644 migration-colo-comm.c create mode 100644 migration-colo.c create mode 100644 stubs/migration-colo.c -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 05/17] COLO save: integrate COLO checkpointed save into qemu migration
Integrate COLO checkpointed save flow into qemu migration. Add a migrate state: MIG_STATE_COLO, enter this migrate state after the first live migration successfully finished. Create a colo thread to do the checkpointed save. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- include/migration/migration-colo.h | 4 include/migration/migration.h | 13 +++ migration-colo-comm.c | 2 +- migration-colo.c | 48 ++ migration.c| 36 stubs/migration-colo.c | 4 6 files changed, 91 insertions(+), 16 deletions(-) diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index e3735d8..24589c0 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -18,4 +18,8 @@ void colo_info_mig_init(void); bool colo_supported(void); +/* save */ +bool migrate_use_colo(void); +void colo_init_checkpointer(MigrationState *s); + #endif diff --git a/include/migration/migration.h b/include/migration/migration.h index 3cb5ba8..3e81a27 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -64,6 +64,19 @@ struct MigrationState int64_t dirty_sync_count; }; +enum { +MIG_STATE_ERROR = -1, +MIG_STATE_NONE, +MIG_STATE_SETUP, +MIG_STATE_CANCELLING, +MIG_STATE_CANCELLED, +MIG_STATE_ACTIVE, +MIG_STATE_COLO, +MIG_STATE_COMPLETED, +}; + +void migrate_set_state(MigrationState *s, int old_state, int new_state); + void process_incoming_migration(QEMUFile *f); void qemu_start_incoming_migration(const char *uri, Error **errp); diff --git a/migration-colo-comm.c b/migration-colo-comm.c index ccbc246..4504ceb 100644 --- a/migration-colo-comm.c +++ b/migration-colo-comm.c @@ -25,7 +25,7 @@ static bool colo_requested; /* save */ -static bool migrate_use_colo(void) +bool migrate_use_colo(void) { MigrationState *s = migrate_get_current(); return s-enabled_capabilities[MIGRATION_CAPABILITY_COLO]; diff --git a/migration-colo.c b/migration-colo.c index 1d3bef8..0cef8bd 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -8,9 +8,57 
@@ * the COPYING file in the top-level directory. */ +#include qemu/main-loop.h +#include qemu/thread.h #include migration/migration-colo.h +static QEMUBH *colo_bh; + bool colo_supported(void) { return true; } + +/* save */ + +static void *colo_thread(void *opaque) +{ +MigrationState *s = opaque; + +/*TODO: COLO checkpointed save loop*/ + +if (s-state != MIG_STATE_ERROR) { +migrate_set_state(s, MIG_STATE_COLO, MIG_STATE_COMPLETED); +} + +qemu_mutex_lock_iothread(); +qemu_bh_schedule(s-cleanup_bh); +qemu_mutex_unlock_iothread(); + +return NULL; +} + +static void colo_start_checkpointer(void *opaque) +{ +MigrationState *s = opaque; + +if (colo_bh) { +qemu_bh_delete(colo_bh); +colo_bh = NULL; +} + +qemu_mutex_unlock_iothread(); +qemu_thread_join(s-thread); +qemu_mutex_lock_iothread(); + +migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COLO); + +qemu_thread_create(s-thread, colo, colo_thread, s, + QEMU_THREAD_JOINABLE); +} + +void colo_init_checkpointer(MigrationState *s) +{ +colo_bh = qemu_bh_new(colo_start_checkpointer, s); +qemu_bh_schedule(colo_bh); +} diff --git a/migration.c b/migration.c index ca83310..b7f8e7e 100644 --- a/migration.c +++ b/migration.c @@ -27,16 +27,6 @@ #include trace.h #include migration/migration-colo.h -enum { -MIG_STATE_ERROR = -1, -MIG_STATE_NONE, -MIG_STATE_SETUP, -MIG_STATE_CANCELLING, -MIG_STATE_CANCELLED, -MIG_STATE_ACTIVE, -MIG_STATE_COMPLETED, -}; - #define MAX_THROTTLE (32 20) /* Migration speed throttling */ /* Amount of time to allocate to each chunk of bandwidth-throttled @@ -229,6 +219,11 @@ MigrationInfo *qmp_query_migrate(Error **errp) get_xbzrle_cache_stats(info); break; +case MIG_STATE_COLO: +info-has_status = true; +info-status = g_strdup(colo); +/* TODO: display COLO specific informations(checkpoint info etc.),*/ +break; case MIG_STATE_COMPLETED: get_xbzrle_cache_stats(info); @@ -272,7 +267,8 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, MigrationState *s = migrate_get_current(); 
MigrationCapabilityStatusList *cap; -if (s-state == MIG_STATE_ACTIVE || s-state == MIG_STATE_SETUP) { +if (s-state == MIG_STATE_ACTIVE || s-state == MIG_STATE_SETUP || +s-state == MIG_STATE_COLO) { error_set(errp, QERR_MIGRATION_ACTIVE); return; } @@ -289,7 +285,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, /* shared migration helpers */ -static void migrate_set_state(MigrationState *s, int old_state, int new_state) +void
[RFC PATCH 09/17] COLO ctl: implement API's that communicate with colo agent
We use COLO agent to compare the packets returned by Primary VM and Secondary VM, and decide whether to start a checkpoint according to some rules. It is a linux kernel module for host. COLO controller communicate with the agent through ioctl(). Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 115 +-- 1 file changed, 112 insertions(+), 3 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index f295e56..802f8b0 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -13,7 +13,16 @@ #include block/coroutine.h #include qemu/error-report.h #include hw/qdev-core.h +#include qemu/timer.h #include migration/migration-colo.h +#include sys/ioctl.h + +/* + * checkpoint timer: unit ms + * this is large because COLO checkpoint will mostly depend on + * COLO compare module. + */ +#define CHKPOINT_TIMER 1 static QEMUBH *colo_bh; @@ -22,6 +31,56 @@ bool colo_supported(void) return true; } +/* colo compare */ +#define COMP_IOC_MAGIC 'k' +#define COMP_IOCTWAIT _IO(COMP_IOC_MAGIC, 0) +#define COMP_IOCTFLUSH _IO(COMP_IOC_MAGIC, 1) +#define COMP_IOCTRESUME _IO(COMP_IOC_MAGIC, 2) + +#define COMPARE_DEV /dev/HA_compare +/* COLO compare module FD */ +static int comp_fd = -1; + +static int colo_compare_init(void) +{ +comp_fd = open(COMPARE_DEV, O_RDONLY); +if (comp_fd 0) { +return -1; +} + +return 0; +} + +static void colo_compare_destroy(void) +{ +if (comp_fd = 0) { +close(comp_fd); +comp_fd = -1; +} +} + +/* + * Communicate with COLO Agent through ioctl. 
+ * return: + * 0: start a checkpoint + * other: errno == ETIME or ERESTART, try again + *errno == other, error, quit colo save + */ +static int colo_compare(void) +{ +return ioctl(comp_fd, COMP_IOCTWAIT, 250); +} + +static __attribute__((unused)) int colo_compare_flush(void) +{ +return ioctl(comp_fd, COMP_IOCTFLUSH, 1); +} + +static __attribute__((unused)) int colo_compare_resume(void) +{ +return ioctl(comp_fd, COMP_IOCTRESUME, 1); +} + /* colo buffer */ #define COLO_BUFFER_BASE_SIZE (1000*1000*4ULL) @@ -131,15 +190,48 @@ static const QEMUFileOps colo_read_ops = { static void *colo_thread(void *opaque) { MigrationState *s = opaque; -int dev_hotplug = qdev_hotplug; +int dev_hotplug = qdev_hotplug, wait_cp = 0; +int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); +int64_t current_time; + +if (colo_compare_init() 0) { +error_report(Init colo compare error\n); +goto out; +} qdev_hotplug = 0; colo_buffer_init(); -/*TODO: COLO checkpointed save loop*/ +while (s-state == MIG_STATE_COLO) { +/* wait for a colo checkpoint */ +wait_cp = colo_compare(); +if (wait_cp) { +if (errno != ETIME errno != ERESTART) { +error_report(compare module failed(%s), strerror(errno)); +goto out; +} +/* + * no checkpoint is needed, wait for 1ms and then + * check if we need checkpoint + */ +current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); +if (current_time - start_time CHKPOINT_TIMER) { +usleep(1000); +continue; +} +} + +/* start a colo checkpoint */ + +/*TODO: COLO save */ +start_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); +} + +out: colo_buffer_destroy(); +colo_compare_destroy(); if (s-state != MIG_STATE_ERROR) { migrate_set_state(s, MIG_STATE_COLO, MIG_STATE_COMPLETED); @@ -183,6 +275,17 @@ void colo_init_checkpointer(MigrationState *s) static Coroutine *colo; +/* + * return: + * 0: start a checkpoint + * 1: some error happend, exit colo restore + */ +static int slave_wait_new_checkpoint(QEMUFile *f) +{ +/* TODO: wait checkpoint start command from master */ +return 1; +} + void 
colo_process_incoming_checkpoints(QEMUFile *f) { int dev_hotplug = qdev_hotplug; @@ -198,7 +301,13 @@ void colo_process_incoming_checkpoints(QEMUFile *f) colo_buffer_init(); -/* TODO: COLO checkpointed restore loop */ +while (true) { +if (slave_wait_new_checkpoint(f)) { +break; +} + +/* TODO: COLO restore */ +} colo_buffer_destroy(); colo = NULL; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 02/17] COLO: introduce an api colo_supported() to indicate COLO support
introduce an api colo_supported() to indicate COLO support, returns true if colo supported(configured with --enable-colo). Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- Makefile.objs | 1 + include/migration/migration-colo.h | 18 ++ migration-colo.c | 16 stubs/Makefile.objs| 1 + stubs/migration-colo.c | 16 5 files changed, 52 insertions(+) create mode 100644 include/migration/migration-colo.h create mode 100644 migration-colo.c create mode 100644 stubs/migration-colo.c diff --git a/Makefile.objs b/Makefile.objs index 1f76cea..cab5824 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -50,6 +50,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o common-obj-$(CONFIG_LINUX) += fsdev/ common-obj-y += migration.o migration-tcp.o +common-obj-$(CONFIG_COLO) += migration-colo.o common-obj-y += vmstate.o common-obj-y += qemu-file.o common-obj-$(CONFIG_RDMA) += migration-rdma.o diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h new file mode 100644 index 000..35b384c --- /dev/null +++ b/include/migration/migration-colo.h @@ -0,0 +1,18 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (C) 2014 FUJITSU LIMITED + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef QEMU_MIGRATION_COLO_H +#define QEMU_MIGRATION_COLO_H + +#include qemu-common.h + +bool colo_supported(void); + +#endif diff --git a/migration-colo.c b/migration-colo.c new file mode 100644 index 000..1d3bef8 --- /dev/null +++ b/migration-colo.c @@ -0,0 +1,16 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (C) 2014 FUJITSU LIMITED + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include migration/migration-colo.h + +bool colo_supported(void) +{ +return true; +} diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 528e161..6810c89 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -39,3 +39,4 @@ stub-obj-$(CONFIG_WIN32) += fd-register.o stub-obj-y += cpus.o stub-obj-y += kvm.o stub-obj-y += qmp_pc_dimm_device_list.o +stub-obj-y += migration-colo.o diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c new file mode 100644 index 000..b9ee6a0 --- /dev/null +++ b/stubs/migration-colo.c @@ -0,0 +1,16 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (C) 2014 FUJITSU LIMITED + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include migration/migration-colo.h + +bool colo_supported(void) +{ +return false; +} -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 07/17] COLO buffer: implement colo buffer as well as QEMUFileOps based on it
We need a buffer to store migration data. On save side: all saved data was write into colo buffer first, so that we can know the total size of the migration data. this can also separate the data transmission from colo control data, we use colo control data over socket fd to synchronous both side's stat. On restore side: all migration data was read into colo buffer first, then load data from the buffer: If network error happens while data transmission, the slaver can still functinal because the migration data are not yet loaded. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 112 +++ 1 file changed, 112 insertions(+) diff --git a/migration-colo.c b/migration-colo.c index d566b9d..b90d9b6 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -11,6 +11,7 @@ #include qemu/main-loop.h #include qemu/thread.h #include block/coroutine.h +#include qemu/error-report.h #include migration/migration-colo.h static QEMUBH *colo_bh; @@ -20,14 +21,122 @@ bool colo_supported(void) return true; } +/* colo buffer */ + +#define COLO_BUFFER_BASE_SIZE (1000*1000*4ULL) +#define COLO_BUFFER_MAX_SIZE (1000*1000*1000*10ULL) + +typedef struct colo_buffer { +uint8_t *data; +uint64_t used; +uint64_t freed; +uint64_t size; +} colo_buffer_t; + +static colo_buffer_t colo_buffer; + +static void colo_buffer_init(void) +{ +if (colo_buffer.size == 0) { +colo_buffer.data = g_malloc(COLO_BUFFER_BASE_SIZE); +colo_buffer.size = COLO_BUFFER_BASE_SIZE; +} +colo_buffer.used = 0; +colo_buffer.freed = 0; +} + +static void colo_buffer_destroy(void) +{ +if (colo_buffer.data) { +g_free(colo_buffer.data); +colo_buffer.data = NULL; +} +colo_buffer.used = 0; +colo_buffer.freed = 0; +colo_buffer.size = 0; +} + +static void colo_buffer_extend(uint64_t len) +{ +if (len colo_buffer.size - colo_buffer.used) { +len = len + colo_buffer.used - colo_buffer.size; +len = ROUND_UP(len, COLO_BUFFER_BASE_SIZE) + COLO_BUFFER_BASE_SIZE; + +colo_buffer.size += len; +if (colo_buffer.size 
COLO_BUFFER_MAX_SIZE) { +error_report(colo_buffer overflow!\n); +exit(EXIT_FAILURE); +} +colo_buffer.data = g_realloc(colo_buffer.data, colo_buffer.size); +} +} + +static int colo_put_buffer(void *opaque, const uint8_t *buf, + int64_t pos, int size) +{ +colo_buffer_extend(size); +memcpy(colo_buffer.data + colo_buffer.used, buf, size); +colo_buffer.used += size; + +return size; +} + +static int colo_get_buffer_internal(uint8_t *buf, int size) +{ +if ((size + colo_buffer.freed) colo_buffer.used) { +size = colo_buffer.used - colo_buffer.freed; +} +memcpy(buf, colo_buffer.data + colo_buffer.freed, size); +colo_buffer.freed += size; + +return size; +} + +static int colo_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +{ +return colo_get_buffer_internal(buf, size); +} + +static int colo_close(void *opaque) +{ +colo_buffer_t *cb = opaque ; + +cb-used = 0; +cb-freed = 0; + +return 0; +} + +static int colo_get_fd(void *opaque) +{ +/* colo buffer, no fd */ +return -1; +} + +static const QEMUFileOps colo_write_ops = { +.put_buffer = colo_put_buffer, +.get_fd = colo_get_fd, +.close = colo_close, +}; + +static const QEMUFileOps colo_read_ops = { +.get_buffer = colo_get_buffer, +.get_fd = colo_get_fd, +.close = colo_close, +}; + /* save */ static void *colo_thread(void *opaque) { MigrationState *s = opaque; +colo_buffer_init(); + /*TODO: COLO checkpointed save loop*/ +colo_buffer_destroy(); + if (s-state != MIG_STATE_ERROR) { migrate_set_state(s, MIG_STATE_COLO, MIG_STATE_COMPLETED); } @@ -77,8 +186,11 @@ void colo_process_incoming_checkpoints(QEMUFile *f) colo = qemu_coroutine_self(); assert(colo != NULL); +colo_buffer_init(); + /* TODO: COLO checkpointed restore loop */ +colo_buffer_destroy(); colo = NULL; restore_exit_colo(); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 10/17] COLO ctl: introduce is_slave() and is_master()
is_slave is to determine whether the QEMU instance is a slave (migration target) at runtime. is_master is to determine whether the QEMU instance is a master (migration starter) at runtime. These 2 APIs will be used later. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/migration-colo.c b/migration-colo.c index 802f8b0..2699e77 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -187,6 +187,12 @@ static const QEMUFileOps colo_read_ops = { /* save */ +static __attribute__((unused)) bool is_master(void) +{ +MigrationState *s = migrate_get_current(); +return (s->state == MIG_STATE_COLO); +} + static void *colo_thread(void *opaque) { MigrationState *s = opaque; @@ -275,6 +281,11 @@ void colo_init_checkpointer(MigrationState *s) static Coroutine *colo; +static __attribute__((unused)) bool is_slave(void) +{ +return colo != NULL; +} + /* * return: * 0: start a checkpoint -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 12/17] COLO ctl: add a RunState RUN_STATE_COLO
Guest will enter this state when paused to save/resore VM state under colo checkpoint. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- qapi-schema.json | 4 +++- vl.c | 8 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index 807f5a2..b42171c 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -145,12 +145,14 @@ # @watchdog: the watchdog action is configured to pause and has been triggered # # @guest-panicked: guest has been panicked as a result of guest OS panic +# +# @colo: guest is paused to save/restore VM state under colo checkpoint ## { 'enum': 'RunState', 'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused', 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm', 'running', 'save-vm', 'shutdown', 'suspended', 'watchdog', -'guest-panicked' ] } +'guest-panicked', 'colo' ] } ## # @StatusInfo: diff --git a/vl.c b/vl.c index 1a282d8..545155d 100644 --- a/vl.c +++ b/vl.c @@ -597,6 +597,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_INMIGRATE, RUN_STATE_PAUSED }, +{ RUN_STATE_INMIGRATE, RUN_STATE_COLO }, { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PAUSED }, { RUN_STATE_INTERNAL_ERROR, RUN_STATE_FINISH_MIGRATE }, @@ -606,6 +607,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PAUSED, RUN_STATE_RUNNING }, { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_PAUSED, RUN_STATE_COLO}, { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE }, @@ -616,9 +618,12 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE }, +{ RUN_STATE_FINISH_MIGRATE, RUN_STATE_COLO}, { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING }, +{ RUN_STATE_COLO, RUN_STATE_RUNNING }, + { RUN_STATE_RUNNING, RUN_STATE_DEBUG }, { RUN_STATE_RUNNING, RUN_STATE_INTERNAL_ERROR }, 
{ RUN_STATE_RUNNING, RUN_STATE_IO_ERROR }, @@ -629,6 +634,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN }, { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG }, { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED }, +{ RUN_STATE_RUNNING, RUN_STATE_COLO}, { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING }, @@ -639,9 +645,11 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SUSPENDED, RUN_STATE_COLO}, { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING }, { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_WATCHDOG, RUN_STATE_COLO}, { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE }, -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 16/17] COLO ram cache: implement colo ram cache on slaver
The ram cache was initially the same as PVM's memory. At checkpoint, we cache the dirty memory of PVM into ram cache (so that ram cache always the same as PVM's memory at every checkpoint), flush cached memory to SVM after we received all PVM dirty memory(only needed to flush memory that was both dirty on PVM and SVM since last checkpoint). Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- arch_init.c| 154 - include/exec/cpu-all.h | 1 + include/migration/migration-colo.h | 3 + migration-colo.c | 4 + 4 files changed, 159 insertions(+), 3 deletions(-) diff --git a/arch_init.c b/arch_init.c index c84e6c8..009bcb5 100644 --- a/arch_init.c +++ b/arch_init.c @@ -1013,6 +1013,7 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) return 0; } +static void *memory_region_get_ram_cache_ptr(MemoryRegion *mr, RAMBlock *block); static inline void *host_from_stream_offset(QEMUFile *f, ram_addr_t offset, int flags) @@ -1027,7 +1028,12 @@ static inline void *host_from_stream_offset(QEMUFile *f, return NULL; } -return memory_region_get_ram_ptr(block-mr) + offset; +if (is_slave()) { +migration_bitmap_set_dirty(block-mr-ram_addr + offset); +return memory_region_get_ram_cache_ptr(block-mr, block) + offset; +} else { +return memory_region_get_ram_ptr(block-mr) + offset; +} } len = qemu_get_byte(f); @@ -1035,8 +1041,15 @@ static inline void *host_from_stream_offset(QEMUFile *f, id[len] = 0; QTAILQ_FOREACH(block, ram_list.blocks, next) { -if (!strncmp(id, block-idstr, sizeof(id))) -return memory_region_get_ram_ptr(block-mr) + offset; +if (!strncmp(id, block-idstr, sizeof(id))) { +if (is_slave()) { +migration_bitmap_set_dirty(block-mr-ram_addr + offset); +return memory_region_get_ram_cache_ptr(block-mr, block) + + offset; +} else { +return memory_region_get_ram_ptr(block-mr) + offset; +} +} } error_report(Can't find block %s!, id); @@ -1054,11 +1067,13 @@ void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) } } +static void ram_flush_cache(void); static 
int ram_load(QEMUFile *f, void *opaque, int version_id) { ram_addr_t addr; int flags, ret = 0; static uint64_t seq_iter; +bool need_flush = false; seq_iter++; @@ -1121,6 +1136,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) break; } +need_flush = true; ch = qemu_get_byte(f); ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); } else if (flags RAM_SAVE_FLAG_PAGE) { @@ -1133,6 +1149,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) break; } +need_flush = true; qemu_get_buffer(f, host, TARGET_PAGE_SIZE); } else if (flags RAM_SAVE_FLAG_XBZRLE) { void *host = host_from_stream_offset(f, addr, flags); @@ -1148,6 +1165,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) ret = -EINVAL; break; } +need_flush = true; } else if (flags RAM_SAVE_FLAG_HOOK) { ram_control_load_hook(f, flags); } else if (flags RAM_SAVE_FLAG_EOS) { @@ -1161,11 +1179,141 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) ret = qemu_file_get_error(f); } +if (!ret is_slave() need_flush) { +ram_flush_cache(); +} + DPRINTF(Completed load of VM with exit code %d seq iteration % PRIu64 \n, ret, seq_iter); return ret; } +/* + * colo cache: this is for secondary VM, we cache the whole + * memory of the secondary VM. 
+ */ +void create_and_init_ram_cache(void) +{ +/* + * called after first migration + */ +RAMBlock *block; +int64_t ram_cache_pages = last_ram_offset() TARGET_PAGE_BITS; + +QTAILQ_FOREACH(block, ram_list.blocks, next) { +block-host_cache = g_malloc(block-length); +memcpy(block-host_cache, block-host, block-length); +} + +migration_bitmap = bitmap_new(ram_cache_pages); +migration_dirty_pages = 0; +memory_global_dirty_log_start(); +} + +void release_ram_cache(void) +{ +RAMBlock *block; + +if (migration_bitmap) { +memory_global_dirty_log_stop(); +g_free(migration_bitmap); +migration_bitmap = NULL; +} + +QTAILQ_FOREACH(block, ram_list.blocks, next) { +g_free(block-host_cache); +} +} + +static void *memory_region_get_ram_cache_ptr(MemoryRegion *mr, RAMBlock *block) +{ + if (mr-alias) { +
[RFC PATCH 15/17] COLO save: reuse migration bitmap under colo checkpoint
reuse migration bitmap under colo checkpoint, only send dirty pages per-checkpoint. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- arch_init.c| 20 +++- include/migration/migration-colo.h | 2 ++ migration-colo.c | 6 ++ stubs/migration-colo.c | 10 ++ 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/arch_init.c b/arch_init.c index 8ddaf35..c84e6c8 100644 --- a/arch_init.c +++ b/arch_init.c @@ -52,6 +52,7 @@ #include exec/ram_addr.h #include hw/acpi/acpi.h #include qemu/host-utils.h +#include migration/migration-colo.h #ifdef DEBUG_ARCH_INIT #define DPRINTF(fmt, ...) \ @@ -769,6 +770,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque) RAMBlock *block; int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */ +/* + * migration has already setup the bitmap, reuse it. + */ +if (is_master()) { +qemu_mutex_lock_ramlist(); +reset_ram_globals(); +goto out_setup; +} + mig_throttle_on = false; dirty_rate_high_cnt = 0; bitmap_sync_count = 0; @@ -828,6 +838,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) migration_bitmap_sync(); qemu_mutex_unlock_iothread(); +out_setup: qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE); QTAILQ_FOREACH(block, ram_list.blocks, next) { @@ -937,7 +948,14 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } ram_control_after_iterate(f, RAM_CONTROL_FINISH); -migration_end(); + +/* + * Since we need to reuse dirty bitmap in colo, + * don't cleanup the bitmap. 
+ */ +if (!migrate_use_colo() || migration_has_failed(migrate_get_current())) { +migration_end(); +} qemu_mutex_unlock_ramlist(); qemu_put_be64(f, RAM_SAVE_FLAG_EOS); diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index 861fa27..c286a60 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -21,10 +21,12 @@ bool colo_supported(void); /* save */ bool migrate_use_colo(void); void colo_init_checkpointer(MigrationState *s); +bool is_master(void); /* restore */ bool restore_use_colo(void); void restore_exit_colo(void); +bool is_slave(void); void colo_process_incoming_checkpoints(QEMUFile *f); diff --git a/migration-colo.c b/migration-colo.c index 8596845..13a6a57 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -222,8 +222,6 @@ static const QEMUFileOps colo_read_ops = { }; /* colo checkpoint control helper */ -static bool is_master(void); -static bool is_slave(void); static void ctl_error_handler(void *opaque, int err) { @@ -295,7 +293,7 @@ static int colo_ctl_get(QEMUFile *f, uint64_t require) /* save */ -static bool is_master(void) +bool is_master(void) { MigrationState *s = migrate_get_current(); return (s-state == MIG_STATE_COLO); @@ -499,7 +497,7 @@ void colo_init_checkpointer(MigrationState *s) static Coroutine *colo; -static bool is_slave(void) +bool is_slave(void) { return colo != NULL; } diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c index 55f0d37..ef65be6 100644 --- a/stubs/migration-colo.c +++ b/stubs/migration-colo.c @@ -22,3 +22,13 @@ void colo_init_checkpointer(MigrationState *s) void colo_process_incoming_checkpoints(QEMUFile *f) { } + +bool is_master(void) +{ +return false; +} + +bool is_slave(void) +{ +return false; +} -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 03/17] COLO migration: add a migration capability 'colo'
Add a migration capability 'colo'. If this capability is on, The migration will never end, and the VM will be continuously checkpointed. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- include/qapi/qmp/qerror.h | 3 +++ migration.c | 6 ++ qapi-schema.json | 5 - 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h index 902d1a7..226b805 100644 --- a/include/qapi/qmp/qerror.h +++ b/include/qapi/qmp/qerror.h @@ -166,4 +166,7 @@ void qerror_report_err(Error *err); #define QERR_SOCKET_CREATE_FAILED \ ERROR_CLASS_GENERIC_ERROR, Failed to create socket +#define QERR_COLO_UNSUPPORTED \ +ERROR_CLASS_GENERIC_ERROR, COLO is not currently supported, please rerun configure with --enable-colo option in order to support COLO feature + #endif /* QERROR_H */ diff --git a/migration.c b/migration.c index 8d675b3..ca83310 100644 --- a/migration.c +++ b/migration.c @@ -25,6 +25,7 @@ #include qemu/thread.h #include qmp-commands.h #include trace.h +#include migration/migration-colo.h enum { MIG_STATE_ERROR = -1, @@ -277,6 +278,11 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, } for (cap = params; cap; cap = cap-next) { +if (cap-value-capability == MIGRATION_CAPABILITY_COLO +cap-value-state !colo_supported()) { +error_set(errp, QERR_COLO_UNSUPPORTED); +continue; +} s-enabled_capabilities[cap-value-capability] = cap-value-state; } } diff --git a/qapi-schema.json b/qapi-schema.json index b11aad2..807f5a2 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -491,10 +491,13 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @colo: The migration will never end, and the VM will instead be continuously +#checkpointed. The feature is disabled by default. 
(since 2.1) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks'] } + 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', 'colo'] } ## # @MigrationCapabilityStatus -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 14/17] COLO ctl: implement colo restore
implement colo restore Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 43 +++ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index 03ac157..8596845 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -535,8 +535,9 @@ void colo_process_incoming_checkpoints(QEMUFile *f) { int fd = qemu_get_fd(f); int dev_hotplug = qdev_hotplug; -QEMUFile *ctl = NULL; +QEMUFile *ctl = NULL, *fb = NULL; int ret; +uint64_t total_size; if (!restore_use_colo()) { return; @@ -560,7 +561,8 @@ void colo_process_incoming_checkpoints(QEMUFile *f) goto out; } -/* TODO: in COLO mode, slave is runing, so start the vm */ +/* in COLO mode, slave is runing, so start the vm */ +vm_start(); while (true) { if (slave_wait_new_checkpoint(f)) { @@ -569,43 +571,68 @@ void colo_process_incoming_checkpoints(QEMUFile *f) /* start colo checkpoint */ -/* TODO: suspend guest */ +/* suspend guest */ +vm_stop_force_state(RUN_STATE_COLO); ret = colo_ctl_put(ctl, COLO_CHECKPOINT_SUSPENDED); if (ret) { goto out; } -/* TODO: open colo buffer for read */ +/* open colo buffer for read */ +fb = qemu_fopen_ops(colo_buffer, colo_read_ops); +if (!fb) { +error_report(can't open colo buffer\n); +goto out; +} ret = colo_ctl_get(f, COLO_CHECKPOINT_SEND); if (ret) { goto out; } -/* TODO: read migration data into colo buffer */ +/* read migration data into colo buffer */ + +/* read the vmstate total size first */ +ret = colo_ctl_get_value(f, total_size); +if (ret) { +goto out; +} +colo_buffer_extend(total_size); +qemu_get_buffer(f, colo_buffer.data, total_size); +colo_buffer.used = total_size; ret = colo_ctl_put(ctl, COLO_CHECKPOINT_RECEIVED); if (ret) { goto out; } -/* TODO: load vm state */ +/* load vm state */ +if (qemu_loadvm_state(fb) 0) { +error_report(COLO: loadvm failed\n); +goto out; +} ret = colo_ctl_put(ctl, COLO_CHECKPOINT_LOADED); if (ret) { goto out; } -/* TODO: resume guest */ +/* resume guest */ +vm_start(); -/* TODO: close colo 
buffer */ +qemu_fclose(fb); +fb = NULL; } out: colo_buffer_destroy(); colo = NULL; +if (fb) { +qemu_fclose(fb); +} + if (ctl) { qemu_fclose(ctl); } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 08/17] COLO: disable qdev hotplug
COLO do not support qdev hotplug migration, disable it. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 12 1 file changed, 12 insertions(+) diff --git a/migration-colo.c b/migration-colo.c index b90d9b6..f295e56 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -12,6 +12,7 @@ #include qemu/thread.h #include block/coroutine.h #include qemu/error-report.h +#include hw/qdev-core.h #include migration/migration-colo.h static QEMUBH *colo_bh; @@ -130,6 +131,9 @@ static const QEMUFileOps colo_read_ops = { static void *colo_thread(void *opaque) { MigrationState *s = opaque; +int dev_hotplug = qdev_hotplug; + +qdev_hotplug = 0; colo_buffer_init(); @@ -145,6 +149,8 @@ static void *colo_thread(void *opaque) qemu_bh_schedule(s-cleanup_bh); qemu_mutex_unlock_iothread(); +qdev_hotplug = dev_hotplug; + return NULL; } @@ -179,10 +185,14 @@ static Coroutine *colo; void colo_process_incoming_checkpoints(QEMUFile *f) { +int dev_hotplug = qdev_hotplug; + if (!restore_use_colo()) { return; } +qdev_hotplug = 0; + colo = qemu_coroutine_self(); assert(colo != NULL); @@ -194,5 +204,7 @@ void colo_process_incoming_checkpoints(QEMUFile *f) colo = NULL; restore_exit_colo(); +qdev_hotplug = dev_hotplug; + return; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 13/17] COLO ctl: implement colo save
implement colo save Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 44 ++-- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index a708872..03ac157 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -14,6 +14,7 @@ #include qemu/error-report.h #include hw/qdev-core.h #include qemu/timer.h +#include sysemu/sysemu.h #include migration/migration-colo.h #include sys/ioctl.h @@ -106,12 +107,12 @@ static int colo_compare(void) return ioctl(comp_fd, COMP_IOCTWAIT, 250); } -static __attribute__((unused)) int colo_compare_flush(void) +static int colo_compare_flush(void) { return ioctl(comp_fd, COMP_IOCTFLUSH, 1); } -static __attribute__((unused)) int colo_compare_resume(void) +static int colo_compare_resume(void) { return ioctl(comp_fd, COMP_IOCTRESUME, 1); } @@ -315,30 +316,61 @@ static int do_colo_transaction(MigrationState *s, QEMUFile *control, goto out; } -/* TODO: suspend and save vm state to colo buffer */ +/* suspend and save vm state to colo buffer */ + +qemu_mutex_lock_iothread(); +vm_stop_force_state(RUN_STATE_COLO); +qemu_mutex_unlock_iothread(); +/* Disable block migration */ +s-params.blk = 0; +s-params.shared = 0; +qemu_savevm_state_begin(trans, s-params); +qemu_savevm_state_complete(trans); + +qemu_fflush(trans); ret = colo_ctl_put(s-file, COLO_CHECKPOINT_SEND); if (ret) { goto out; } -/* TODO: send vmstate to slave */ +/* send vmstate to slave */ + +/* we send the total size of the vmstate first */ +ret = colo_ctl_put(s-file, colo_buffer.used); +if (ret) { +goto out; +} + +qemu_put_buffer_async(s-file, colo_buffer.data, colo_buffer.used); +ret = qemu_file_get_error(s-file); +if (ret 0) { +goto out; +} +qemu_fflush(s-file); ret = colo_ctl_get(control, COLO_CHECKPOINT_RECEIVED); if (ret) { goto out; } -/* TODO: Flush network etc. */ +/* Flush network etc. 
*/ +colo_compare_flush(); ret = colo_ctl_get(control, COLO_CHECKPOINT_LOADED); if (ret) { goto out; } -/* TODO: resume master */ +colo_compare_resume(); +ret = 0; out: +/* resume master */ +qemu_mutex_lock_iothread(); +vm_start(); +qemu_mutex_unlock_iothread(); + return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 17/17] HACK: trigger checkpoint every 500ms
Because COLO Agent is under development. We add this hack for test purpose. Trigger checkpoint every 500ms so that we can test the process of COLO save/restore. NOTE: This is only a hack, and will be removed at last. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 14 +- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index 52156e7..4be037e 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -23,7 +23,7 @@ * this is large because COLO checkpoint will mostly depend on * COLO compare module. */ -#define CHKPOINT_TIMER 1 +#define CHKPOINT_TIMER 500 enum { COLO_READY = 0x46, @@ -79,11 +79,6 @@ static int comp_fd = -1; static int colo_compare_init(void) { -comp_fd = open(COMPARE_DEV, O_RDONLY); -if (comp_fd 0) { -return -1; -} - return 0; } @@ -104,17 +99,18 @@ static void colo_compare_destroy(void) */ static int colo_compare(void) { -return ioctl(comp_fd, COMP_IOCTWAIT, 250); +errno = ERESTART; +return 1; } static int colo_compare_flush(void) { -return ioctl(comp_fd, COMP_IOCTFLUSH, 1); +return 0; } static int colo_compare_resume(void) { -return ioctl(comp_fd, COMP_IOCTRESUME, 1); +return 0; } /* colo buffer */ -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 11/17] COLO ctl: implement colo checkpoint protocol
implement colo checkpoint protocol. Checkpoint synchronzing points. Primary Secondary NEW @ Suspend SUSPENDED @ SuspendSave state SEND@ Send state Receive state RECEIVED@ Flush network Load state LOADED @ Resume Resume Start Comparing NOTE: 1) '@' who sends the message 2) Every sync-point is synchronized by two sides with only one handshake(single direction) for low-latency. If more strict synchronization is required, a opposite direction sync-point should be added. 3) Since sync-points are single direction, the remote side may go forward a lot when this side just receives the sync-point. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 268 +-- 1 file changed, 262 insertions(+), 6 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index 2699e77..a708872 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -24,6 +24,41 @@ */ #define CHKPOINT_TIMER 1 +enum { +COLO_READY = 0x46, + +/* + * Checkpoint synchronzing points. + * + * Primary Secondary + * NEW @ + * Suspend + * SUSPENDED @ + * SuspendSave state + * SEND@ + * Send state Receive state + * RECEIVED@ + * Flush network Load state + * LOADED @ + * Resume Resume + * + * Start Comparing + * NOTE: + * 1) '@' who sends the message + * 2) Every sync-point is synchronized by two sides with only + *one handshake(single direction) for low-latency. + *If more strict synchronization is required, a opposite direction + *sync-point should be added. + * 3) Since sync-points are single direction, the remote side may + *go forward a lot when this side just receives the sync-point. 
+ */ +COLO_CHECKPOINT_NEW, +COLO_CHECKPOINT_SUSPENDED, +COLO_CHECKPOINT_SEND, +COLO_CHECKPOINT_RECEIVED, +COLO_CHECKPOINT_LOADED, +}; + static QEMUBH *colo_bh; bool colo_supported(void) @@ -185,30 +220,161 @@ static const QEMUFileOps colo_read_ops = { .close = colo_close, }; +/* colo checkpoint control helper */ +static bool is_master(void); +static bool is_slave(void); + +static void ctl_error_handler(void *opaque, int err) +{ +if (is_slave()) { +/* TODO: determine whether we need to failover */ +/* FIXME: we will not failover currently, just kill slave */ +error_report(error: colo transmission failed!\n); +exit(1); +} else if (is_master()) { +/* Master still alive, do not failover */ +error_report(error: colo transmission failed!\n); +return; +} else { +error_report(COLO: Unexpected error happend!\n); +exit(EXIT_FAILURE); +} +} + +static int colo_ctl_put(QEMUFile *f, uint64_t request) +{ +int ret = 0; + +qemu_put_be64(f, request); +qemu_fflush(f); + +ret = qemu_file_get_error(f); +if (ret 0) { +ctl_error_handler(f, ret); +return 1; +} + +return ret; +} + +static int colo_ctl_get_value(QEMUFile *f, uint64_t *value) +{ +int ret = 0; +uint64_t temp; + +temp = qemu_get_be64(f); + +ret = qemu_file_get_error(f); +if (ret 0) { +ctl_error_handler(f, ret); +return 1; +} + +*value = temp; +return 0; +} + +static int colo_ctl_get(QEMUFile *f, uint64_t require) +{ +int ret; +uint64_t value; + +ret = colo_ctl_get_value(f, value); +if (ret) { +return ret; +} + +if (value != require) { +error_report(unexpected state received!\n); +exit(1); +} + +return ret; +} + /* save */ -static __attribute__((unused)) bool is_master(void) +static bool is_master(void) { MigrationState *s = migrate_get_current(); return (s-state == MIG_STATE_COLO); } +static int do_colo_transaction(MigrationState *s, QEMUFile *control, + QEMUFile *trans) +{ +int ret; + +ret = colo_ctl_put(s-file, COLO_CHECKPOINT_NEW); +if (ret) { +goto out; +} + +ret = colo_ctl_get(control, COLO_CHECKPOINT_SUSPENDED); +if 
(ret) { +goto out; +} + +/* TODO: suspend and save vm state to colo buffer */ + +ret = colo_ctl_put(s-file, COLO_CHECKPOINT_SEND); +if (ret) { +goto out; +} + +/*
[RFC PATCH 06/17] COLO restore: integrate COLO checkpointed restore into qemu restore
enter colo checkpointed restore loop after live migration. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- include/migration/migration-colo.h | 6 ++ migration-colo-comm.c | 10 ++ migration-colo.c | 22 ++ migration.c| 3 +++ stubs/migration-colo.c | 4 5 files changed, 45 insertions(+) diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index 24589c0..861fa27 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -22,4 +22,10 @@ bool colo_supported(void); bool migrate_use_colo(void); void colo_init_checkpointer(MigrationState *s); +/* restore */ +bool restore_use_colo(void); +void restore_exit_colo(void); + +void colo_process_incoming_checkpoints(QEMUFile *f); + #endif diff --git a/migration-colo-comm.c b/migration-colo-comm.c index 4504ceb..b12a57a 100644 --- a/migration-colo-comm.c +++ b/migration-colo-comm.c @@ -38,6 +38,16 @@ static void colo_info_save(QEMUFile *f, void *opaque) /* restore */ +bool restore_use_colo(void) +{ +return colo_requested; +} + +void restore_exit_colo(void) +{ +colo_requested = false; +} + static int colo_info_load(QEMUFile *f, void *opaque, int version_id) { int value = qemu_get_byte(f); diff --git a/migration-colo.c b/migration-colo.c index 0cef8bd..d566b9d 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -10,6 +10,7 @@ #include qemu/main-loop.h #include qemu/thread.h +#include block/coroutine.h #include migration/migration-colo.h static QEMUBH *colo_bh; @@ -62,3 +63,24 @@ void colo_init_checkpointer(MigrationState *s) colo_bh = qemu_bh_new(colo_start_checkpointer, s); qemu_bh_schedule(colo_bh); } + +/* restore */ + +static Coroutine *colo; + +void colo_process_incoming_checkpoints(QEMUFile *f) +{ +if (!restore_use_colo()) { +return; +} + +colo = qemu_coroutine_self(); +assert(colo != NULL); + +/* TODO: COLO checkpointed restore loop */ + +colo = NULL; +restore_exit_colo(); + +return; +} diff --git a/migration.c b/migration.c index b7f8e7e..190571d 100644 
--- a/migration.c +++ b/migration.c @@ -86,6 +86,9 @@ static void process_incoming_migration_co(void *opaque) int ret; ret = qemu_loadvm_state(f); +if (!ret) { +colo_process_incoming_checkpoints(f); +} qemu_fclose(f); free_xbzrle_decoded_buf(); if (ret 0) { diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c index 9013c40..55f0d37 100644 --- a/stubs/migration-colo.c +++ b/stubs/migration-colo.c @@ -18,3 +18,7 @@ bool colo_supported(void) void colo_init_checkpointer(MigrationState *s) { } + +void colo_process_incoming_checkpoints(QEMUFile *f) +{ +} -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 04/17] COLO info: use colo info to tell migration target colo is enabled
migrate colo info to migration target to tell the target colo is enabled. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- Makefile.objs | 1 + include/migration/migration-colo.h | 3 ++ migration-colo-comm.c | 68 ++ vl.c | 4 +++ 4 files changed, 76 insertions(+) create mode 100644 migration-colo-comm.c diff --git a/Makefile.objs b/Makefile.objs index cab5824..1836a68 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -50,6 +50,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o common-obj-$(CONFIG_LINUX) += fsdev/ common-obj-y += migration.o migration-tcp.o +common-obj-y += migration-colo-comm.o common-obj-$(CONFIG_COLO) += migration-colo.o common-obj-y += vmstate.o common-obj-y += qemu-file.o diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index 35b384c..e3735d8 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -12,6 +12,9 @@ #define QEMU_MIGRATION_COLO_H #include qemu-common.h +#include migration/migration.h + +void colo_info_mig_init(void); bool colo_supported(void); diff --git a/migration-colo-comm.c b/migration-colo-comm.c new file mode 100644 index 000..ccbc246 --- /dev/null +++ b/migration-colo-comm.c @@ -0,0 +1,68 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (C) 2014 FUJITSU LIMITED + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + */ + +#include migration/migration-colo.h + +#define DEBUG_COLO + +#ifdef DEBUG_COLO +#define DPRINTF(fmt, ...) \ +do { fprintf(stdout, COLO: fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) 
\ +do { } while (0) +#endif + +static bool colo_requested; + +/* save */ + +static bool migrate_use_colo(void) +{ +MigrationState *s = migrate_get_current(); +return s-enabled_capabilities[MIGRATION_CAPABILITY_COLO]; +} + +static void colo_info_save(QEMUFile *f, void *opaque) +{ +qemu_put_byte(f, migrate_use_colo()); +} + +/* restore */ + +static int colo_info_load(QEMUFile *f, void *opaque, int version_id) +{ +int value = qemu_get_byte(f); + +if (value !colo_supported()) { +fprintf(stderr, COLO is not supported\n); +return -EINVAL; +} + +if (value !colo_requested) { +DPRINTF(COLO requested!\n); +} + +colo_requested = value; + +return 0; +} + +static SaveVMHandlers savevm_colo_info_handlers = { +.save_state = colo_info_save, +.load_state = colo_info_load, +}; + +void colo_info_mig_init(void) +{ +register_savevm_live(NULL, colo info, -1, 1, + savevm_colo_info_handlers, NULL); +} diff --git a/vl.c b/vl.c index fe451aa..1a282d8 100644 --- a/vl.c +++ b/vl.c @@ -89,6 +89,7 @@ int main(int argc, char **argv) #include sysemu/dma.h #include audio/audio.h #include migration/migration.h +#include migration/migration-colo.h #include sysemu/kvm.h #include qapi/qmp/qjson.h #include qemu/option.h @@ -4339,6 +4340,9 @@ int main(int argc, char **argv, char **envp) blk_mig_init(); ram_mig_init(); +if (colo_supported()) { +colo_info_mig_init(); +} /* open the virtual block devices */ if (snapshot) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC PATCH 03/17] COLO migration: add a migration capability 'colo'
On 07/23/2014 08:25 AM, Yang Hongyang wrote: Add a migration capability 'colo'. If this capability is on, The migration will never end, and the VM will be continuously checkpointed. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- include/qapi/qmp/qerror.h | 3 +++ migration.c | 6 ++ qapi-schema.json | 5 - 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h index 902d1a7..226b805 100644 --- a/include/qapi/qmp/qerror.h +++ b/include/qapi/qmp/qerror.h @@ -166,4 +166,7 @@ void qerror_report_err(Error *err); #define QERR_SOCKET_CREATE_FAILED \ ERROR_CLASS_GENERIC_ERROR, Failed to create socket +#define QERR_COLO_UNSUPPORTED \ +ERROR_CLASS_GENERIC_ERROR, COLO is not currently supported, please rerun configure with --enable-colo option in order to support COLO feature Unless you plan on using this message in more than one place, we prefer that you don't add new #defines here. Instead, just use error_setg with the message inline. +++ b/qapi-schema.json @@ -491,10 +491,13 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @colo: The migration will never end, and the VM will instead be continuously +#checkpointed. The feature is disabled by default. (since 2.1) You missed 2.1. This has to be since 2.2. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] [RFC PATCH 00/17] COarse-grain LOck-stepping(COLO) Virtual Machines for Non-stop Service
On 07/23/2014 08:25 AM, Yang Hongyang wrote: Virtual machine (VM) replication is a well known technique for providing application-agnostic software-implemented hardware fault tolerance non-stop service. COLO is a high availability solution. Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the same request from client, and generate response in parallel too. If the response packets from PVM and SVM are identical, they are released immediately. Otherwise, a VM checkpoint (on demand) is conducted. The idea is presented in Xen summit 2012, and 2013, and academia paper in SOCC 2013. It's also presented in KVM forum 2013: http://www.linux-kvm.org/wiki/images/1/1d/Kvm-forum-2013-COLO.pdf Please refer to above document for detailed information. Please also refer to previous posted RFC proposal: http://lists.nongnu.org/archive/html/qemu-devel/2014-06/msg05567.html The patchset is also hosted on github: https://github.com/macrosheep/qemu/tree/colo_v0.1 This patchset is RFC, implements the frame of colo, without failover and nic/disk replication. But it is ready for demo the COLO idea above QEMU-Kvm. Steps using this patchset to get an overview of COLO: 1. configure the source with --enable-colo option Code that has to be opt-in tends to bitrot, because people don't configure their build-bots to opt in. What sort of penalties does opting in cause to the code if colo is not used? I'd much rather make the default to compile colo unless configured --disable-colo. Are there any pre-req libraries required for it to work? That would be the only reason to make the default of on or off conditional, rather than defaulting to on. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] [RFC PATCH 02/17] COLO: introduce an api colo_supported() to indicate COLO support
On 07/23/2014 08:25 AM, Yang Hongyang wrote: introduce an api colo_supported() to indicate COLO support, returns true if colo supported(configured with --enable-colo). Space before () in English sentences: s/supported(configured/supported (configured/ As I mentioned in the cover letter, defaulting to off is probably a bad idea; I'd rather default to on or even make it unconditional if it doesn't negatively affect the code base when not used. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- Makefile.objs | 1 + include/migration/migration-colo.h | 18 ++ migration-colo.c | 16 stubs/Makefile.objs| 1 + stubs/migration-colo.c | 16 5 files changed, 52 insertions(+) create mode 100644 include/migration/migration-colo.h create mode 100644 migration-colo.c create mode 100644 stubs/migration-colo.c -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] [RFC PATCH 12/17] COLO ctl: add a RunState RUN_STATE_COLO
On 07/23/2014 08:25 AM, Yang Hongyang wrote: Guest will enter this state when paused to save/resore VM state s/resore/restore/ under colo checkpoint. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- qapi-schema.json | 4 +++- vl.c | 8 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index 807f5a2..b42171c 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -145,12 +145,14 @@ # @watchdog: the watchdog action is configured to pause and has been triggered # # @guest-panicked: guest has been panicked as a result of guest OS panic +# +# @colo: guest is paused to save/restore VM state under colo checkpoint Missing a '(since 2.2)' designation. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
[PATCH v6 0/5] Read guest last instruction from kvmppc_get_last_inst()
Read guest last instruction from kvmppc_get_last_inst() allowing the function to fail in order to emulate again. On bookehv architecture search for the physical address and kmap it, instead of using Load External PID (lwepx) instruction. This fixes an infinite loop caused by lwepx's data TLB miss exception handled in the host and the TODO for execute-but-not-read entries and TLB eviction. Mihai Caraman (5): KVM: PPC: e500mc: Revert add load inst fixup KVM: PPC: Book3e: Add TLBSEL/TSIZE defines for MAS0/1 KVM: PPC: Book3s: Remove kvmppc_read_inst() function KVM: PPC: Alow kvmppc_get_last_inst() to fail KVM: PPC: Bookehv: Get vcpu's last instruction for emulation arch/powerpc/include/asm/kvm_book3s.h| 26 --- arch/powerpc/include/asm/kvm_booke.h | 5 -- arch/powerpc/include/asm/kvm_ppc.h | 31 + arch/powerpc/include/asm/mmu-book3e.h| 9 ++- arch/powerpc/kvm/book3s.c| 17 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 17 ++--- arch/powerpc/kvm/book3s_paired_singles.c | 38 +++ arch/powerpc/kvm/book3s_pr.c | 114 --- arch/powerpc/kvm/booke.c | 47 + arch/powerpc/kvm/bookehv_interrupts.S| 55 ++- arch/powerpc/kvm/e500_mmu_host.c | 98 ++ arch/powerpc/kvm/emulate.c | 18 +++-- arch/powerpc/kvm/powerpc.c | 11 ++- 13 files changed, 314 insertions(+), 172 deletions(-) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 1/5] KVM: PPC: e500mc: Revert add load inst fixup
The commit 1d628af7 add load inst fixup made an attempt to handle failures generated by reading the guest current instruction. The fixup code that was added works by chance, hiding the real issue. Load external pid (lwepx) instruction, used by KVM to read guest instructions, is executed in a substituted guest translation context (EPLC[EGS] = 1). In consequence lwepx's TLB error and data storage interrupts need to be handled by KVM, even though these interrupts are generated from host context (MSR[GS] = 0) where lwepx is executed. Currently, KVM hooks only interrupts generated from guest context (MSR[GS] = 1), doing minimal checks on the fast path to avoid host performance degradation. As a result, the host kernel handles lwepx faults searching the faulting guest data address (loaded in DEAR) in its own Logical Partition ID (LPID) 0 context. In case a host translation is found the execution returns to the lwepx instruction instead of the fixup, the host ending up in an infinite loop. Revert the commit add load inst fixup. lwepx issue will be addressed in a subsequent patch without needing fixup code. 
Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6-v2: - no change arch/powerpc/kvm/bookehv_interrupts.S | 26 +- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index a1712b8..6ff4480 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -29,7 +29,6 @@ #include asm/asm-compat.h #include asm/asm-offsets.h #include asm/bitsperlong.h -#include asm/thread_info.h #ifdef CONFIG_64BIT #include asm/exception-64e.h @@ -164,32 +163,9 @@ PPC_STL r30, VCPU_GPR(R30)(r4) PPC_STL r31, VCPU_GPR(R31)(r4) mtspr SPRN_EPLC, r8 - - /* disable preemption, so we are sure we hit the fixup handler */ - CURRENT_THREAD_INFO(r8, r1) - li r7, 1 - stw r7, TI_PREEMPT(r8) - isync - - /* -* In case the read goes wrong, we catch it and write an invalid value -* in LAST_INST instead. -*/ -1: lwepx r9, 0, r5 -2: -.section .fixup, ax -3: li r9, KVM_INST_FETCH_FAILED - b 2b -.previous -.section __ex_table,a - PPC_LONG_ALIGN - PPC_LONG 1b,3b -.previous - + lwepx r9, 0, r5 mtspr SPRN_EPLC, r3 - li r7, 0 - stw r7, TI_PREEMPT(r8) stw r9, VCPU_LAST_INST(r4) .endif -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 2/5] KVM: PPC: Book3e: Add TLBSEL/TSIZE defines for MAS0/1
Add mising defines MAS0_GET_TLBSEL() and MAS1_GET_TSIZE() for Book3E. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6-v2: - no change arch/powerpc/include/asm/mmu-book3e.h | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 8d24f78..cd4f04a 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -40,9 +40,11 @@ /* MAS registers bit definitions */ -#define MAS0_TLBSEL_MASK0x3000 -#define MAS0_TLBSEL_SHIFT 28 -#define MAS0_TLBSEL(x) (((x) MAS0_TLBSEL_SHIFT) MAS0_TLBSEL_MASK) +#define MAS0_TLBSEL_MASK 0x3000 +#define MAS0_TLBSEL_SHIFT 28 +#define MAS0_TLBSEL(x) (((x) MAS0_TLBSEL_SHIFT) MAS0_TLBSEL_MASK) +#define MAS0_GET_TLBSEL(mas0) (((mas0) MAS0_TLBSEL_MASK) \ + MAS0_TLBSEL_SHIFT) #define MAS0_ESEL_MASK 0x0FFF #define MAS0_ESEL_SHIFT16 #define MAS0_ESEL(x) (((x) MAS0_ESEL_SHIFT) MAS0_ESEL_MASK) @@ -60,6 +62,7 @@ #define MAS1_TSIZE_MASK0x0f80 #define MAS1_TSIZE_SHIFT 7 #define MAS1_TSIZE(x) (((x) MAS1_TSIZE_SHIFT) MAS1_TSIZE_MASK) +#define MAS1_GET_TSIZE(mas1) (((mas1) MAS1_TSIZE_MASK) MAS1_TSIZE_SHIFT) #define MAS2_EPN (~0xFFFUL) #define MAS2_X00x0040 -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 5/5] KVM: PPC: Bookehv: Get vcpu's last instruction for emulation
On book3e, KVM uses load external pid (lwepx) dedicated instruction to read guest last instruction on the exit path. lwepx exceptions (DTLB_MISS, DSI and LRAT), generated by loading a guest address, need to be handled by KVM. These exceptions are generated in a substituted guest translation context (EPLC[EGS] = 1) from host context (MSR[GS] = 0). Currently, KVM hooks only interrupts generated from guest context (MSR[GS] = 1), doing minimal checks on the fast path to avoid host performance degradation. lwepx exceptions originate from host state (MSR[GS] = 0) which implies additional checks in DO_KVM macro (beside the current MSR[GS] = 1) by looking at the Exception Syndrome Register (ESR[EPID]) and the External PID Load Context Register (EPLC[EGS]). Doing this on each Data TLB miss exception is obviously too intrusive for the host. Read guest last instruction from kvmppc_load_last_inst() by searching for the physical address and kmap it. This addresses the TODO for TLB eviction and execute-but-not-read entries, and allows us to get rid of lwepx until we are able to handle failures. A simple stress benchmark shows a 1% sys performance degradation compared with previous approach (lwepx without failure handling): time for i in `seq 1 1`; do /bin/echo /dev/null; done real0m 8.85s user0m 4.34s sys 0m 4.48s vs real0m 8.84s user0m 4.36s sys 0m 4.44s A solution to use lwepx and to handle its exceptions in KVM would be to temporarily hijack the interrupt vector from host. This imposes additional synchronizations for cores like FSL e6500 that share host IVOR registers between hardware threads. This optimized solution can be later developed on top of this patch. 
Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - no change v5: - return ENULATE_AGAIN in case of failure v4: - add switch and new function when getting last inst earlier - use enum instead of prev semnatic - get rid of mas0, optimize mas7_mas3 - give more context in visible messages - check storage attributes mismatch on MMUv2 - get rid of pfn_valid check v3: - reworked patch description - use unaltered kmap addr for kunmap - get last instruction before beeing preempted v2: - reworked patch description - used pr_* functions - addressed cosmetic feedback arch/powerpc/kvm/booke.c | 44 + arch/powerpc/kvm/bookehv_interrupts.S | 37 -- arch/powerpc/kvm/e500_mmu_host.c | 92 +++ 3 files changed, 145 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 34a42b9..843077b 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -869,6 +869,28 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, } } +static int kvmppc_resume_inst_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + enum emulation_result emulated, u32 last_inst) +{ + switch (emulated) { + case EMULATE_AGAIN: + return RESUME_GUEST; + + case EMULATE_FAIL: + pr_debug(%s: load instruction from guest address %lx failed\n, + __func__, vcpu-arch.pc); + /* For debugging, encode the failing instruction and +* report it to userspace. 
*/ + run-hw.hardware_exit_reason = ~0ULL 32; + run-hw.hardware_exit_reason |= last_inst; + kvmppc_core_queue_program(vcpu, ESR_PIL); + return RESUME_HOST; + + default: + BUG(); + } +} + /** * kvmppc_handle_exit * @@ -880,6 +902,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, int r = RESUME_HOST; int s; int idx; + u32 last_inst = KVM_INST_FETCH_FAILED; + enum emulation_result emulated = EMULATE_DONE; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); @@ -887,6 +911,20 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* restart interrupts if they were meant for the host */ kvmppc_restart_interrupt(vcpu, exit_nr); + /* +* get last instruction before beeing preempted +* TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR ESR_DATA +*/ + switch (exit_nr) { + case BOOKE_INTERRUPT_DATA_STORAGE: + case BOOKE_INTERRUPT_DTLB_MISS: + case BOOKE_INTERRUPT_HV_PRIV: + emulated = kvmppc_get_last_inst(vcpu, false, last_inst); + break; + default: + break; + } + local_irq_enable(); trace_kvm_exit(exit_nr, vcpu); @@ -895,6 +933,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run-exit_reason = KVM_EXIT_UNKNOWN; run-ready_for_interrupt_injection = 1; + if (emulated != EMULATE_DONE) { + r =
[PATCH v6 3/5] KVM: PPC: Book3s: Remove kvmppc_read_inst() function
In the context of replacing kvmppc_ld() function calls with a version of kvmppc_get_last_inst() which allow to fail, Alex Graf suggested this: If we get EMULATE_AGAIN, we just have to make sure we go back into the guest. No need to inject an ISI into the guest - it'll do that all by itself. With an error returning kvmppc_get_last_inst we can just use completely get rid of kvmppc_read_inst() and only use kvmppc_get_last_inst() instead. As a intermediate step get rid of kvmppc_read_inst() and only use kvmppc_ld() instead. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - add proper comments for VSX interrupt handling v5: - make paired single emulation the unusual v4: - new patch arch/powerpc/kvm/book3s_pr.c | 85 ++-- 1 file changed, 34 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index e40765f..e76aec3 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -710,42 +710,6 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac) #endif } -static int kvmppc_read_inst(struct kvm_vcpu *vcpu) -{ - ulong srr0 = kvmppc_get_pc(vcpu); - u32 last_inst = kvmppc_get_last_inst(vcpu); - int ret; - - ret = kvmppc_ld(vcpu, srr0, sizeof(u32), last_inst, false); - if (ret == -ENOENT) { - ulong msr = kvmppc_get_msr(vcpu); - - msr = kvmppc_set_field(msr, 33, 33, 1); - msr = kvmppc_set_field(msr, 34, 36, 0); - msr = kvmppc_set_field(msr, 42, 47, 0); - kvmppc_set_msr_fast(vcpu, msr); - kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); - return EMULATE_AGAIN; - } - - return EMULATE_DONE; -} - -static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr) -{ - - /* Need to do paired single emulation? 
*/ - if (!(vcpu-arch.hflags BOOK3S_HFLAG_PAIRED_SINGLE)) - return EMULATE_DONE; - - /* Read out the instruction */ - if (kvmppc_read_inst(vcpu) == EMULATE_DONE) - /* Need to emulate */ - return EMULATE_FAIL; - - return EMULATE_AGAIN; -} - /* Handle external providers (FPU, Altivec, VSX) */ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, ulong msr) @@ -1149,31 +1113,49 @@ program_interrupt: case BOOK3S_INTERRUPT_VSX: { int ext_msr = 0; + int emul; + ulong pc; + u32 last_inst; + + if (vcpu-arch.hflags BOOK3S_HFLAG_PAIRED_SINGLE) { + /* Do paired single instruction emulation */ + pc = kvmppc_get_pc(vcpu); + last_inst = kvmppc_get_last_inst(vcpu); + emul = kvmppc_ld(vcpu, pc, sizeof(u32), last_inst, +false); + if (emul == EMULATE_DONE) + goto program_interrupt; + else + r = RESUME_GUEST; - switch (exit_nr) { - case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP; break; - case BOOK3S_INTERRUPT_ALTIVEC:ext_msr = MSR_VEC; break; - case BOOK3S_INTERRUPT_VSX:ext_msr = MSR_VSX; break; + break; } - switch (kvmppc_check_ext(vcpu, exit_nr)) { - case EMULATE_DONE: - /* everything ok - let's enable the ext */ - r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); + /* Enable external provider */ + switch (exit_nr) { + case BOOK3S_INTERRUPT_FP_UNAVAIL: + ext_msr = MSR_FP; break; - case EMULATE_FAIL: - /* we need to emulate this instruction */ - goto program_interrupt; + + case BOOK3S_INTERRUPT_ALTIVEC: + ext_msr = MSR_VEC; break; - default: - /* nothing to worry about - go again */ + + case BOOK3S_INTERRUPT_VSX: + ext_msr = MSR_VSX; break; } + + r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); break; } case BOOK3S_INTERRUPT_ALIGNMENT: - if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { - u32 last_inst = kvmppc_get_last_inst(vcpu); + { + ulong pc = kvmppc_get_pc(vcpu); + u32 last_inst = kvmppc_get_last_inst(vcpu); + int emul = kvmppc_ld(vcpu, pc, sizeof(u32), last_inst, false); + + if (emul == EMULATE_DONE) { u32 dsisr;
[PATCH v6 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re-execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - rewrite kvmppc_get_last_inst() swap code to be understood at a glimpse :) - use inst in kvmppc_load_last_inst - these changes compile on book3s, please validate the functionality and do the necessary changes! v5: - don't swap when load fail - convert the return value space of kvmppc_ld() v4: - common declaration and enum for kvmppc_load_last_inst() - remove kvmppc_read_inst() in a preceding patch v3: - rework patch description - add common definition for kvmppc_get_last_inst() - check return values in book3s code v2: - integrated kvmppc_get_last_inst() in book3s code and checked build - addressed cosmetic feedback arch/powerpc/include/asm/kvm_book3s.h| 26 -- arch/powerpc/include/asm/kvm_booke.h | 5 arch/powerpc/include/asm/kvm_ppc.h | 31 ++ arch/powerpc/kvm/book3s.c| 17 arch/powerpc/kvm/book3s_64_mmu_hv.c | 17 arch/powerpc/kvm/book3s_paired_singles.c | 38 +-- arch/powerpc/kvm/book3s_pr.c | 45 +++- arch/powerpc/kvm/booke.c | 3 +++ arch/powerpc/kvm/e500_mmu_host.c | 6 + arch/powerpc/kvm/emulate.c | 18 - arch/powerpc/kvm/powerpc.c | 11 ++-- 11 files changed, 140 insertions(+), 77 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 20fb6f2..a86ca65 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -276,32 +276,6 @@ static 
inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu) return (kvmppc_get_msr(vcpu) MSR_LE) != (MSR_KERNEL MSR_LE); } -static inline u32 kvmppc_get_last_inst_internal(struct kvm_vcpu *vcpu, ulong pc) -{ - /* Load the instruction manually if it failed to do so in the -* exit path */ - if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) - kvmppc_ld(vcpu, pc, sizeof(u32), vcpu-arch.last_inst, false); - - return kvmppc_need_byteswap(vcpu) ? swab32(vcpu-arch.last_inst) : - vcpu-arch.last_inst; -} - -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu)); -} - -/* - * Like kvmppc_get_last_inst(), but for fetching a sc instruction. - * Because the sc instruction sets SRR0 to point to the following - * instruction, we have to fetch from pc - 4. - */ -static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu) -{ - return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu) - 4); -} - static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) { return vcpu-arch.fault_dar; diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index c7aed61..cbb1990 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -69,11 +69,6 @@ static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu) return false; } -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - return vcpu-arch.last_inst; -} - static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) { vcpu-arch.ctr = val; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..2da5f547 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user-space */ }; +enum instruction_type { + INST_GENERIC, + INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run,
Re: [Qemu-devel] [RFC PATCH 07/17] COLO buffer: implement colo buffer as well as QEMUFileOps based on it
On 07/23/2014 08:25 AM, Yang Hongyang wrote: We need a buffer to store migration data. On save side: all saved data was write into colo buffer first, so that we can know s/was write/is written/ the total size of the migration data. this can also separate the data transmission from colo control data, we use colo control data over socket fd to synchronous both side's stat. On restore side: all migration data was read into colo buffer first, then load data from the buffer: If network error happens while data transmission, s/while/during/ the slaver can still functinal because the migration data are not yet s/slaver/slave/ s/functinal/function/ s/are/is/ loaded. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 112 +++ 1 file changed, 112 insertions(+) +/* colo buffer */ + +#define COLO_BUFFER_BASE_SIZE (1000*1000*4ULL) +#define COLO_BUFFER_MAX_SIZE (1000*1000*1000*10ULL) Spaces around binary operators. + +typedef struct colo_buffer { For consistency with the rest of the code base, name this ColoBuffer, not colo_buffer. +uint8_t *data; +uint64_t used; +uint64_t freed; +uint64_t size; +} colo_buffer_t; HACKING says to NOT name types with a trailing _t. Just name the typedef ColoBuffer. +static void colo_buffer_destroy(void) +{ +if (colo_buffer.data) { +g_free(colo_buffer.data); +colo_buffer.data = NULL; g_free(NULL) behaves sanely, just make these two lines unconditional. +static void colo_buffer_extend(uint64_t len) +{ +if (len colo_buffer.size - colo_buffer.used) { +len = len + colo_buffer.used - colo_buffer.size; +len = ROUND_UP(len, COLO_BUFFER_BASE_SIZE) + COLO_BUFFER_BASE_SIZE; + +colo_buffer.size += len; +if (colo_buffer.size COLO_BUFFER_MAX_SIZE) { +error_report(colo_buffer overflow!\n); No trailing \n in error_report(). -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] [RFC PATCH 00/17] COarse-grain LOck-stepping(COLO) Virtual Machines for Non-stop Service
On 07/23/2014 11:44 PM, Eric Blake wrote: On 07/23/2014 08:25 AM, Yang Hongyang wrote: Virtual machine (VM) replication is a well known technique for providing application-agnostic software-implemented hardware fault tolerance non-stop service. COLO is a high availability solution. Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the same request from client, and generate response in parallel too. If the response packets from PVM and SVM are identical, they are released immediately. Otherwise, a VM checkpoint (on demand) is conducted. The idea is presented in Xen summit 2012, and 2013, and academia paper in SOCC 2013. It's also presented in KVM forum 2013: http://www.linux-kvm.org/wiki/images/1/1d/Kvm-forum-2013-COLO.pdf Please refer to above document for detailed information. Please also refer to previous posted RFC proposal: http://lists.nongnu.org/archive/html/qemu-devel/2014-06/msg05567.html The patchset is also hosted on github: https://github.com/macrosheep/qemu/tree/colo_v0.1 This patchset is RFC, implements the frame of colo, without failover and nic/disk replication. But it is ready for demo the COLO idea above QEMU-Kvm. Steps using this patchset to get an overview of COLO: 1. configure the source with --enable-colo option Code that has to be opt-in tends to bitrot, because people don't configure their build-bots to opt in. What sort of penalties does opting in cause to the code if colo is not used? I'd much rather make the default to compile colo unless configured --disable-colo. Are there any pre-req libraries required for it to work? That would be the only reason to make the default of on or off conditional, rather than defaulting to on. Thanks for all your comments on this patchset, will address them. For this one, it will not affect the rest of the code if COLO is compiled but not used, and it does not require pre-req libraries for now, so we can make COLO support default to on next time. -- Thanks, Yang. 
-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 0/5] random,x86,kvm: Rework arch RNG seeds and get some from kvm
This introduces and uses a very simple synchronous mechanism to get /dev/urandom-style bits appropriate for initial KVM PV guest RNG seeding. It also re-works the way that architectural random data is fed into random.c's pools. I added a new arch hook called arch_get_rng_seed. The default implementation is more or less the same as the current code, except that random_get_entropy is now called unconditionally. x86 gets a custom arch_get_rng_seed. It will use KVM_GET_RNG_SEED if available, and, if it does anything, it will log the number of bits collected from each available architectural source. If more paravirt seed sources show up, it will be a natural place to add them. I sent the corresponding kvm-unit-tests and qemu changes separately. Changes from v4: - Got rid of the RDRAND behavior change. If this series is accepted, I may resend it separately, but I think it's an unrelated issue. - Fix up the changelog entries -- I misunderstood how the old code worked. - Avoid lots of failed attempts to use KVM_GET_RNG_SEED if it's not available. Changes from v3: - Other than KASLR, the guest pieces are completely rewritten. Patches 2-4 have essentially nothing in common with v2. Changes from v2: - Bisection fix (patch 2 had a misplaced brace). The final state is identical to that of v2. - Improve the 0/5 description a little bit. 
Changes from v1: - Split patches 2 and 3 - Log all arch sources in init_std_data - Fix the 32-bit kaslr build Andy Lutomirski (5): x86,kvm: Add MSR_KVM_GET_RNG_SEED and a matching feature bit random: Add and use arch_get_rng_seed x86,random: Add an x86 implementation of arch_get_rng_seed x86,random,kvm: Use KVM_GET_RNG_SEED in arch_get_rng_seed x86,kaslr: Use MSR_KVM_GET_RNG_SEED for KASLR if available Documentation/virtual/kvm/cpuid.txt | 3 ++ arch/x86/Kconfig | 4 ++ arch/x86/boot/compressed/aslr.c | 27 + arch/x86/include/asm/archrandom.h| 6 +++ arch/x86/include/asm/kvm_guest.h | 9 + arch/x86/include/asm/processor.h | 21 -- arch/x86/include/uapi/asm/kvm_para.h | 2 + arch/x86/kernel/Makefile | 2 + arch/x86/kernel/archrandom.c | 74 arch/x86/kernel/kvm.c| 10 + arch/x86/kvm/cpuid.c | 3 +- arch/x86/kvm/x86.c | 4 ++ drivers/char/random.c| 14 +-- include/linux/random.h | 40 +++ 14 files changed, 212 insertions(+), 7 deletions(-) create mode 100644 arch/x86/kernel/archrandom.c -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 2/5] random: Add and use arch_get_rng_seed
Currently, init_std_data contains its own logic for using arch random sources. This replaces that logic with a generic function arch_get_rng_seed that allows arch code to supply its own logic. The default implementation tries arch_get_random_seed_long and arch_get_random_long individually. The only functional change here is that random_get_entropy() is used unconditionally instead of being used only when the arch sources fail. This may add a tiny amount of security. Signed-off-by: Andy Lutomirski l...@amacapital.net --- drivers/char/random.c | 14 +++--- include/linux/random.h | 40 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 0a7ac0a..be7a94e 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1236,6 +1236,10 @@ void get_random_bytes_arch(void *buf, int nbytes) } EXPORT_SYMBOL(get_random_bytes_arch); +static void seed_entropy_store(void *ctx, u32 data) +{ + mix_pool_bytes((struct entropy_store *)ctx, data, sizeof(data), NULL); +} /* * init_std_data - initialize pool with system data @@ -1251,15 +1255,19 @@ static void init_std_data(struct entropy_store *r) int i; ktime_t now = ktime_get_real(); unsigned long rv; + char log_prefix[128]; r-last_pulled = jiffies; mix_pool_bytes(r, now, sizeof(now), NULL); for (i = r-poolinfo-poolbytes; i 0; i -= sizeof(rv)) { - if (!arch_get_random_seed_long(rv) - !arch_get_random_long(rv)) - rv = random_get_entropy(); + rv = random_get_entropy(); mix_pool_bytes(r, rv, sizeof(rv), NULL); } + + sprintf(log_prefix, random: seeded %s pool, r-name); + arch_get_rng_seed(r, seed_entropy_store, 8 * r-poolinfo-poolbytes, + log_prefix); + mix_pool_bytes(r, utsname(), sizeof(*(utsname())), NULL); } diff --git a/include/linux/random.h b/include/linux/random.h index 57fbbff..81a6145 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -106,6 +106,46 @@ static inline int arch_has_random_seed(void) } #endif +#ifndef __HAVE_ARCH_GET_RNG_SEED + +/** + * 
arch_get_rng_seed() - get architectural rng seed data + * @ctx: context for the seed function + * @seed: function to call for each u32 obtained + * @bits_per_source: number of bits from each source to try to use + * @log_prefix: beginning of log output (may be NULL) + * + * Synchronously load some architectural entropy or other best-effort + * random seed data. An arch-specific implementation should be no worse + * than this generic implementation. If the arch code does something + * interesting, it may log something of the form log_prefix with + * 8 bits of stuff. + * + * No arch-specific implementation should be any worse than the generic + * implementation. + */ +static inline void arch_get_rng_seed(void *ctx, +void (*seed)(void *ctx, u32 data), +int bits_per_source, +const char *log_prefix) +{ + int i; + + for (i = 0; i bits_per_source; i += 8 * sizeof(long)) { + unsigned long rv; + + if (arch_get_random_seed_long(rv) || + arch_get_random_long(rv)) { + seed(ctx, (u32)rv); +#if BITS_PER_LONG 32 + seed(ctx, (u32)(rv 32)); +#endif + } + } +} + +#endif /* __HAVE_ARCH_GET_RNG_SEED */ + /* Pseudo random number generator from numerical recipes. */ static inline u32 next_pseudo_random32(u32 seed) { -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 5/5] x86,kaslr: Use MSR_KVM_GET_RNG_SEED for KASLR if available
It's considerably better than any of the alternatives on KVM. Rather than reinventing all of the cpu feature query code, this fixes native_cpuid to work in PIC objects. I haven't combined it with boot/cpuflags.c's cpuid implementation: including asm/processor.h from boot/cpuflags.c results in a flood of unrelated errors, and fixing it might be messy. Reviewed-by: Kees Cook keesc...@chromium.org Signed-off-by: Andy Lutomirski l...@amacapital.net --- arch/x86/boot/compressed/aslr.c | 27 +++ arch/x86/include/asm/processor.h | 21 ++--- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index fc6091a..8583f0e 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -5,6 +5,8 @@ #include asm/archrandom.h #include asm/e820.h +#include uapi/asm/kvm_para.h + #include generated/compile.h #include linux/module.h #include linux/uts.h @@ -15,6 +17,22 @@ static const char build_str[] = UTS_RELEASE ( LINUX_COMPILE_BY @ LINUX_COMPILE_HOST ) ( LINUX_COMPILER ) UTS_VERSION; +static bool kvm_para_has_feature(unsigned int feature) +{ + u32 kvm_base; + u32 features; + + if (!has_cpuflag(X86_FEATURE_HYPERVISOR)) + return false; + + kvm_base = hypervisor_cpuid_base(KVMKVMKVM\0\0\0, KVM_CPUID_FEATURES); + if (!kvm_base) + return false; + + features = cpuid_eax(kvm_base | KVM_CPUID_FEATURES); + return features (1UL feature); +} + #define I8254_PORT_CONTROL 0x43 #define I8254_PORT_COUNTER00x40 #define I8254_CMD_READBACK 0xC0 @@ -81,6 +99,15 @@ static unsigned long get_random_long(void) } } + if (kvm_para_has_feature(KVM_FEATURE_GET_RNG_SEED)) { + u64 seed; + + debug_putstr( MSR_KVM_GET_RNG_SEED); + rdmsrl(MSR_KVM_GET_RNG_SEED, seed); + random ^= (unsigned long)seed; + use_i8254 = false; + } + if (has_cpuflag(X86_FEATURE_TSC)) { debug_putstr( RDTSC); rdtscll(raw); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a4ea023..6096f3c 100644 --- 
a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -189,10 +189,25 @@ static inline int have_cpuid_p(void) static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { - /* ecx is often an input as well as an output. */ - asm volatile(cpuid + /* +* This function can be used from the boot code, so it needs +* to avoid using EBX in constraints in PIC mode. +* +* ecx is often an input as well as an output. +*/ + asm volatile(.ifnc %%ebx,%1 ; .ifnc %%rbx,%1 \n\t +movl %%ebx,%1\n\t +.endif ; .endif \n\t +cpuid \n\t +.ifnc %%ebx,%1 ; .ifnc %%rbx,%1 \n\t +xchgl %%ebx,%1\n\t +.endif ; .endif : =a (*eax), - =b (*ebx), +#if defined(__i386__) defined(__PIC__) + =r (*ebx), /* gcc won't let us use ebx */ +#else + =b (*ebx), /* ebx is okay */ +#endif =c (*ecx), =d (*edx) : 0 (*eax), 2 (*ecx) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 3/5] x86,random: Add an x86 implementation of arch_get_rng_seed
This does the same thing as the generic implementation, except that it logs how many bits of each type it collected. I want to know whether the initial seeding is working and, if so, whether the RNG is fast enough. (I know that hpa assures me that the hardware RNG is more than fast enough, but I'd still like a direct way to verify this.) Arguably, arch_get_random_seed could be removed now: I'm having some trouble imagining a sensible non-architecture-specific use of it that wouldn't be better served by arch_get_rng_seed. Signed-off-by: Andy Lutomirski l...@amacapital.net --- arch/x86/include/asm/archrandom.h | 6 + arch/x86/kernel/Makefile | 2 ++ arch/x86/kernel/archrandom.c | 51 +++ 3 files changed, 59 insertions(+) create mode 100644 arch/x86/kernel/archrandom.c diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 69f1366..88f9c5a 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -117,6 +117,12 @@ GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); #define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND) #define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED) +#define __HAVE_ARCH_GET_RNG_SEED +extern void arch_get_rng_seed(void *ctx, + void (*seed)(void *ctx, u32 data), + int bits_per_source, + const char *log_prefix); + #else static inline int rdrand_long(unsigned long *v) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 047f9ff..0718bae 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -92,6 +92,8 @@ obj-$(CONFIG_PARAVIRT)+= paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o +obj-$(CONFIG_ARCH_RANDOM) += archrandom.o + obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o diff --git a/arch/x86/kernel/archrandom.c b/arch/x86/kernel/archrandom.c new file mode 100644 index 000..47d13b0 --- 
/dev/null +++ b/arch/x86/kernel/archrandom.c @@ -0,0 +1,51 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2014 Andy Lutomirski + * Authors: Andy Lutomirski l...@amacapital.net + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include asm/archrandom.h + +void arch_get_rng_seed(void *ctx, + void (*seed)(void *ctx, u32 data), + int bits_per_source, + const char *log_prefix) +{ + int i; + int rdseed_bits = 0, rdrand_bits = 0; + char buf[128] = ; + char *msgptr = buf; + + for (i = 0; i bits_per_source; i += 8 * sizeof(long)) { + unsigned long rv; + + if (arch_get_random_seed_long(rv)) + rdseed_bits += 8 * sizeof(rv); + else if (arch_get_random_long(rv)) + rdrand_bits += 8 * sizeof(rv); + else + continue; /* Don't waste time mixing. */ + + seed(ctx, (u32)rv); +#if BITS_PER_LONG 32 + seed(ctx, (u32)(rv 32)); +#endif + } + + if (rdseed_bits) + msgptr += sprintf(msgptr, , %d bits from RDSEED, rdseed_bits); + if (rdrand_bits) + msgptr += sprintf(msgptr, , %d bits from RDRAND, rdrand_bits); + if (buf[0]) + pr_info(%s with %s\n, log_prefix, buf + 2); +} -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 4/5] x86,random,kvm: Use KVM_GET_RNG_SEED in arch_get_rng_seed
This is a straightforward implementation: for each bit of internal RNG state, request one bit from KVM_GET_RNG_SEED. This is done even if RDSEED/RDRAND worked, since KVM_GET_RNG_SEED is likely to provide cryptographically secure output even if the CPU's RNG is weak or compromised. Signed-off-by: Andy Lutomirski l...@amacapital.net --- arch/x86/Kconfig | 4 arch/x86/include/asm/kvm_guest.h | 9 + arch/x86/kernel/archrandom.c | 25 - arch/x86/kernel/kvm.c| 10 ++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a8f749e..adfa09c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -593,6 +593,7 @@ config KVM_GUEST bool KVM Guest support (including kvmclock) depends on PARAVIRT select PARAVIRT_CLOCK + select ARCH_RANDOM default y ---help--- This option enables various optimizations for running under the KVM @@ -1507,6 +1508,9 @@ config ARCH_RANDOM If supported, this is a high bandwidth, cryptographically secure hardware random number generator. + This also enables paravirt RNGs such as KVM's if the relevant + PV guest support is enabled. 
+ config X86_SMAP def_bool y prompt Supervisor Mode Access Prevention if EXPERT diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h index a92b176..8c4dbd5 100644 --- a/arch/x86/include/asm/kvm_guest.h +++ b/arch/x86/include/asm/kvm_guest.h @@ -3,4 +3,13 @@ int kvm_setup_vsyscall_timeinfo(void); +#if defined(CONFIG_KVM_GUEST) defined(CONFIG_ARCH_RANDOM) +extern bool kvm_get_rng_seed(u64 *rv); +#else +static inline bool kvm_get_rng_seed(u64 *rv) +{ + return false; +} +#endif + #endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/kernel/archrandom.c b/arch/x86/kernel/archrandom.c index 47d13b0..8c8d021 100644 --- a/arch/x86/kernel/archrandom.c +++ b/arch/x86/kernel/archrandom.c @@ -15,6 +15,7 @@ */ #include asm/archrandom.h +#include asm/kvm_guest.h void arch_get_rng_seed(void *ctx, void (*seed)(void *ctx, u32 data), @@ -22,7 +23,7 @@ void arch_get_rng_seed(void *ctx, const char *log_prefix) { int i; - int rdseed_bits = 0, rdrand_bits = 0; + int rdseed_bits = 0, rdrand_bits = 0, kvm_bits = 0; char buf[128] = ; char *msgptr = buf; @@ -42,10 +43,32 @@ void arch_get_rng_seed(void *ctx, #endif } + /* +* Use KVM_GET_RNG_SEED regardless of whether the CPU RNG +* worked, since it incorporates entropy unavailable to the CPU, +* and we shouldn't trust the hardware RNG more than we need to. +* We request enough bits for the entire internal RNG state, +* because there's no good reason not to. +*/ + for (i = 0; i bits_per_source; i += 64) { + u64 rv; + + if (kvm_get_rng_seed(rv)) { + seed(ctx, (u32)rv); + seed(ctx, (u32)(rv 32)); + kvm_bits += 8 * sizeof(rv); + } else { + break; /* If it fails once, it will keep failing. 
*/ + } + } + if (rdseed_bits) msgptr += sprintf(msgptr, , %d bits from RDSEED, rdseed_bits); if (rdrand_bits) msgptr += sprintf(msgptr, , %d bits from RDRAND, rdrand_bits); + if (kvm_bits) + msgptr += sprintf(msgptr, , %d bits from KVM_GET_RNG_BITS, + kvm_bits); if (buf[0]) pr_info(%s with %s\n, log_prefix, buf + 2); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 3dd8e2c..bd8783a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -416,6 +416,16 @@ void kvm_disable_steal_time(void) wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } +bool kvm_get_rng_seed(u64 *v) +{ + /* +* Allow migration from a hypervisor with the GET_RNG_SEED +* feature to a hypervisor without it. +*/ + return (kvm_para_has_feature(KVM_FEATURE_GET_RNG_SEED) + rdmsrl_safe(MSR_KVM_GET_RNG_SEED, v) == 0); +} + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 1/5] x86,kvm: Add MSR_KVM_GET_RNG_SEED and a matching feature bit
This adds a simple interface to allow a guest to request 64 bits of host nonblocking entropy. This is independent of virtio-rng for a couple of reasons: - It's intended to be usable during early boot, when a trivial synchronous interface is needed. - virtio-rng gives blocking entropy, and making guest boot wait for the host's /dev/random will cause problems. MSR_KVM_GET_RNG_SEED is intended to provide 64 bits of best-effort cryptographically secure data for use as a seed. It provides no guarantee that the result contains any actual entropy. Signed-off-by: Andy Lutomirski l...@amacapital.net --- Documentation/virtual/kvm/cpuid.txt | 3 +++ arch/x86/include/uapi/asm/kvm_para.h | 2 ++ arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 4 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 3c65feb..0ab043b 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -54,6 +54,9 @@ KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit || || before enabling paravirtualized || || spinlock support. -- +KVM_FEATURE_GET_RNG_SEED || 8 || host provides rng seed data via + || || MSR_KVM_GET_RNG_SEED. +-- KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 94dc8ca..e2eaf93 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -24,6 +24,7 @@ #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 #define KVM_FEATURE_PV_UNHALT 7 +#define KVM_FEATURE_GET_RNG_SEED 8 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. 
@@ -40,6 +41,7 @@ #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define MSR_KVM_STEAL_TIME 0x4b564d03 #define MSR_KVM_PV_EOI_EN 0x4b564d04 +#define MSR_KVM_GET_RNG_SEED 0x4b564d05 struct kvm_steal_time { __u64 steal; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 38a0afe..40d6763 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -479,7 +479,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 KVM_FEATURE_ASYNC_PF) | (1 KVM_FEATURE_PV_EOI) | (1 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | -(1 KVM_FEATURE_PV_UNHALT); +(1 KVM_FEATURE_PV_UNHALT) | +(1 KVM_FEATURE_GET_RNG_SEED); if (sched_info_on()) entry-eax |= (1 KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f644933..4e81853 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -48,6 +48,7 @@ #include linux/pci.h #include linux/timekeeper_internal.h #include linux/pvclock_gtod.h +#include linux/random.h #include trace/events/kvm.h #define CREATE_TRACE_POINTS @@ -2480,6 +2481,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_PV_EOI_EN: data = vcpu-arch.pv_eoi.msr_val; break; + case MSR_KVM_GET_RNG_SEED: + get_random_bytes(data, sizeof(data)); + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vhost: Add polling mode
On 07/23/2014 04:48 PM, Abel Gordon wrote: On Wed, Jul 23, 2014 at 11:42 AM, Jason Wang jasow...@redhat.com wrote: On 07/23/2014 04:12 PM, Razya Ladelsky wrote: Jason Wang jasow...@redhat.com wrote on 23/07/2014 08:26:36 AM: From: Jason Wang jasow...@redhat.com To: Razya Ladelsky/Haifa/IBM@IBMIL, kvm@vger.kernel.org, Michael S. Tsirkin m...@redhat.com, Cc: abel.gor...@gmail.com, Joel Nider/Haifa/IBM@IBMIL, Yossi Kuperman1/Haifa/IBM@IBMIL, Eran Raichstein/Haifa/IBM@IBMIL, Alex Glikson/Haifa/IBM@IBMIL Date: 23/07/2014 08:26 AM Subject: Re: [PATCH] vhost: Add polling mode On 07/21/2014 09:23 PM, Razya Ladelsky wrote: Hello All, When vhost is waiting for buffers from the guest driver (e.g., more packets to send in vhost-net's transmit queue), it normally goes to sleep and waits for the guest to kick it. This kick involves a PIO in the guest, and therefore an exit (and possibly userspace involvement in translating this PIO exit into a file descriptor event), all of which hurts performance. If the system is under-utilized (has cpu time to spare), vhost can continuously poll the virtqueues for new buffers, and avoid asking the guest to kick us. This patch adds an optional polling mode to vhost, that can be enabled via a kernel module parameter, poll_start_rate. When polling is active for a virtqueue, the guest is asked to disable notification (kicks), and the worker thread continuously checks for new buffers. When it does discover new buffers, it simulates a kick by invoking the underlying backend driver (such as vhost-net), which thinks it got a real kick from the guest, and acts accordingly. If the underlying driver asks not to be kicked, we disable polling on this virtqueue. We start polling on a virtqueue when we notice it has work to do. Polling on this virtqueue is later disabled after 3 seconds of polling turning up no new work, as in this case we are better off returning to the exit-based notification mechanism. 
The default timeout of 3 seconds can be changed with the poll_stop_idle kernel module parameter. This polling approach makes lot of sense for new HW with posted-interrupts for which we have exitless host-to-guest notifications. But even with support for posted interrupts, guest-to-host communication still causes exits. Polling adds the missing part. When systems are overloaded, there won?t be enough cpu time for the various vhost threads to poll their guests' devices. For these scenarios, we plan to add support for vhost threads that can be shared by multiple devices, even of multiple vms. Our ultimate goal is to implement the I/O acceleration features described in: KVM Forum 2013: Efficient and Scalable Virtio (by Abel Gordon) https://www.youtube.com/watch?v=9EyweibHfEs and https://www.mail-archive.com/kvm@vger.kernel.org/msg98179.html Comments are welcome, Thank you, Razya Thanks for the work. Do you have perf numbers for this? Hi Jason, Thanks for reviewing. I ran some experiments with TCP stream netperf and filebench (having 2 threads performing random reads) benchmarks on an IBM System x3650 M4. All runs loaded the guests in a way that they were (cpu) saturated. The system had two cores per guest, as to allow for both the vcpu and the vhost thread to run concurrently for maximum throughput (but I didn't pin the threads to specific cores) I get: Netperf, 1 vm: The polling patch improved throughput by ~33%. Number of exits/sec decreased 6x. The same improvement was shown when I tested with 3 vms running netperf. filebench, 1 vm: ops/sec improved by 13% with the polling patch. Number of exits was reduced by 31%. The same experiment with 3 vms running filebench showed similar numbers. Looks good, may worth to add the result in the commit log. And looks like the patch only poll for virtqueue. In the future, may worth to add callbacks for vhost_net to poll socket. Then it could be used with rx busy polling in host which may speedup the rx also. 
Did you mean polling the network device to avoid interrupts? Yes, recent linux host support rx busy polling which can reduce the interrupts. If vhost can utilize this, it can also reduce the latency caused by vhost thread wakeups. And I'm also working on virtio-net busy polling in guest, if vhost can poll socket, it can also help in guest rx polling. Nice :) Note that you may want to check if if the processor support posted interrupts. I guess that if CPU supports posted interrupts then benefits of polling in the front-end (from performance perspective) may not worth the cpu cycles wasted in the guest. Yes it's worth to check. But I think busy polling in guest may still help since it may
RE: [PATCH v5 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
-Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc- ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Wednesday, July 23, 2014 12:21 AM To: Caraman Mihai Claudiu-B02008 Cc: kvm-ppc@vger.kernel.org; linuxppc-...@lists.ozlabs.org; k...@vger.kernel.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 21.07.14 11:59, mihai.cara...@freescale.com wrote: -Original Message- From: Linuxppc-dev [mailto:linuxppc-dev- bounces+mihai.caraman=freescale@lists.ozlabs.org] On Behalf Of mihai.cara...@freescale.com Sent: Friday, July 18, 2014 12:06 PM To: Alexander Graf; kvm-ppc@vger.kernel.org Cc: linuxppc-...@lists.ozlabs.org; k...@vger.kernel.org Subject: RE: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, July 17, 2014 5:21 PM To: Caraman Mihai Claudiu-B02008; kvm-ppc@vger.kernel.org Cc: k...@vger.kernel.org; linuxppc-...@lists.ozlabs.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 17.07.14 13:22, Mihai Caraman wrote: On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re- execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- ... 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..7f9c634 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user- space */ }; +enum instruction_type { +INST_GENERIC, +INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, + enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -234,6 +242,23 @@ struct kvmppc_ops { extern struct kvmppc_ops *kvmppc_hv_ops; extern struct kvmppc_ops *kvmppc_pr_ops; +static inline int kvmppc_get_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst) +{ +int ret = EMULATE_DONE; + +/* Load the instruction manually if it failed to do so in the + * exit path */ +if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) +ret = kvmppc_load_last_inst(vcpu, type, vcpu- arch.last_inst); + + +*inst = (ret == EMULATE_DONE kvmppc_need_byteswap(vcpu)) ? +swab32(vcpu-arch.last_inst) : vcpu-arch.last_inst; This makes even less sense than the previous version. Either you treat inst as definitely overwritten or as preserves previous data on failure. Both v4 and v5 versions treat inst as definitely overwritten. So either you unconditionally swap like you did before If we make abstraction of its symmetry, KVM_INST_FETCH_FAILED is operated in host endianness, so it doesn't need byte swap. 
I agree with your reasoning if last_inst is initialized and compared with data in guest endianess, which is not the case yet for KVM_INST_FETCH_FAILED. Alex, are you relying on the fact that KVM_INST_FETCH_FAILED value is symmetrical? With a non symmetrical value like 0xDEADBEEF, and considering a little- endian guest on a big-endian host, we need to fix kvm logic to initialize and compare last_inst with 0xEFBEADDE swaped value. Your suggestion to unconditionally swap makes sense only with the above fix, otherwise inst may end up with 0xEFBEADDE swaped value with is wrong. Only for *inst which we would treat as undefined after the function returned EMULATE_AGAIN. Right. With this do you acknowledge that v5
RE: [PATCH v5 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
Right. With this do you acknowledge that v5 (definitely overwritten approach) is ok? I think I'm starting to understand your logic of v5. You write fetch_failed into *inst unswapped if the fetch failed. v5 - don't swap when load fails :) I think that's ok, but I definitely do not like the code flow - it's too hard to understand at a glimpse. Just rewrite it to swab at local variable level, preferably with if()s and comments what this is about and have a single unconditional *inst = fetched_inst; at the end of the function. I will incorporate these change requests into v6. Thanks, -Mike -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 0/5] Read guest last instruction from kvmppc_get_last_inst()
Read guest last instruction from kvmppc_get_last_inst() allowing the function to fail in order to emulate again. On bookehv architecture search for the physical address and kmap it, instead of using Load External PID (lwepx) instruction. This fixes an infinite loop caused by lwepx's data TLB miss exception handled in the host and the TODO for execute-but-not-read entries and TLB eviction. Mihai Caraman (5): KVM: PPC: e500mc: Revert add load inst fixup KVM: PPC: Book3e: Add TLBSEL/TSIZE defines for MAS0/1 KVM: PPC: Book3s: Remove kvmppc_read_inst() function KVM: PPC: Alow kvmppc_get_last_inst() to fail KVM: PPC: Bookehv: Get vcpu's last instruction for emulation arch/powerpc/include/asm/kvm_book3s.h| 26 --- arch/powerpc/include/asm/kvm_booke.h | 5 -- arch/powerpc/include/asm/kvm_ppc.h | 31 + arch/powerpc/include/asm/mmu-book3e.h| 9 ++- arch/powerpc/kvm/book3s.c| 17 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 17 ++--- arch/powerpc/kvm/book3s_paired_singles.c | 38 +++ arch/powerpc/kvm/book3s_pr.c | 114 --- arch/powerpc/kvm/booke.c | 47 + arch/powerpc/kvm/bookehv_interrupts.S| 55 ++- arch/powerpc/kvm/e500_mmu_host.c | 98 ++ arch/powerpc/kvm/emulate.c | 18 +++-- arch/powerpc/kvm/powerpc.c | 11 ++- 13 files changed, 314 insertions(+), 172 deletions(-) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re-execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - rewrite kvmppc_get_last_inst() swap code to be understood at a glimpse :) - use inst in kvmppc_load_last_inst - these changes compile on book3s, please validate the functionality and do the necessary changes! v5: - don't swap when load fail - convert the return value space of kvmppc_ld() v4: - common declaration and enum for kvmppc_load_last_inst() - remove kvmppc_read_inst() in a preceding patch v3: - rework patch description - add common definition for kvmppc_get_last_inst() - check return values in book3s code v2: - integrated kvmppc_get_last_inst() in book3s code and checked build - addressed cosmetic feedback arch/powerpc/include/asm/kvm_book3s.h| 26 -- arch/powerpc/include/asm/kvm_booke.h | 5 arch/powerpc/include/asm/kvm_ppc.h | 31 ++ arch/powerpc/kvm/book3s.c| 17 arch/powerpc/kvm/book3s_64_mmu_hv.c | 17 arch/powerpc/kvm/book3s_paired_singles.c | 38 +-- arch/powerpc/kvm/book3s_pr.c | 45 +++- arch/powerpc/kvm/booke.c | 3 +++ arch/powerpc/kvm/e500_mmu_host.c | 6 + arch/powerpc/kvm/emulate.c | 18 - arch/powerpc/kvm/powerpc.c | 11 ++-- 11 files changed, 140 insertions(+), 77 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 20fb6f2..a86ca65 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -276,32 +276,6 @@ static 
inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu) return (kvmppc_get_msr(vcpu) MSR_LE) != (MSR_KERNEL MSR_LE); } -static inline u32 kvmppc_get_last_inst_internal(struct kvm_vcpu *vcpu, ulong pc) -{ - /* Load the instruction manually if it failed to do so in the -* exit path */ - if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) - kvmppc_ld(vcpu, pc, sizeof(u32), vcpu-arch.last_inst, false); - - return kvmppc_need_byteswap(vcpu) ? swab32(vcpu-arch.last_inst) : - vcpu-arch.last_inst; -} - -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu)); -} - -/* - * Like kvmppc_get_last_inst(), but for fetching a sc instruction. - * Because the sc instruction sets SRR0 to point to the following - * instruction, we have to fetch from pc - 4. - */ -static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu) -{ - return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu) - 4); -} - static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) { return vcpu-arch.fault_dar; diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index c7aed61..cbb1990 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -69,11 +69,6 @@ static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu) return false; } -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - return vcpu-arch.last_inst; -} - static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) { vcpu-arch.ctr = val; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..2da5f547 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user-space */ }; +enum instruction_type { + INST_GENERIC, + INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run,
[PATCH v6 5/5] KVM: PPC: Bookehv: Get vcpu's last instruction for emulation
On book3e, KVM uses load external pid (lwepx) dedicated instruction to read guest last instruction on the exit path. lwepx exceptions (DTLB_MISS, DSI and LRAT), generated by loading a guest address, need to be handled by KVM. These exceptions are generated in a substituted guest translation context (EPLC[EGS] = 1) from host context (MSR[GS] = 0). Currently, KVM hooks only interrupts generated from guest context (MSR[GS] = 1), doing minimal checks on the fast path to avoid host performance degradation. lwepx exceptions originate from host state (MSR[GS] = 0) which implies additional checks in DO_KVM macro (beside the current MSR[GS] = 1) by looking at the Exception Syndrome Register (ESR[EPID]) and the External PID Load Context Register (EPLC[EGS]). Doing this on each Data TLB miss exception is obviously too intrusive for the host. Read guest last instruction from kvmppc_load_last_inst() by searching for the physical address and kmapping it. This addresses the TODO for TLB eviction and execute-but-not-read entries, and allows us to get rid of lwepx until we are able to handle failures. A simple stress benchmark shows a 1% sys performance degradation compared with previous approach (lwepx without failure handling): time for i in `seq 1 1`; do /bin/echo /dev/null; done real0m 8.85s user0m 4.34s sys 0m 4.48s vs real0m 8.84s user0m 4.36s sys 0m 4.44s A solution to use lwepx and to handle its exceptions in KVM would be to temporarily hijack the interrupt vector from host. This imposes additional synchronizations for cores like FSL e6500 that share host IVOR registers between hardware threads. This optimized solution can be later developed on top of this patch.
Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - no change v5: - return EMULATE_AGAIN in case of failure v4: - add switch and new function when getting last inst earlier - use enum instead of prev semantic - get rid of mas0, optimize mas7_mas3 - give more context in visible messages - check storage attributes mismatch on MMUv2 - get rid of pfn_valid check v3: - reworked patch description - use unaltered kmap addr for kunmap - get last instruction before being preempted v2: - reworked patch description - used pr_* functions - addressed cosmetic feedback arch/powerpc/kvm/booke.c | 44 + arch/powerpc/kvm/bookehv_interrupts.S | 37 -- arch/powerpc/kvm/e500_mmu_host.c | 92 +++ 3 files changed, 145 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 34a42b9..843077b 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -869,6 +869,28 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, } } +static int kvmppc_resume_inst_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + enum emulation_result emulated, u32 last_inst) +{ + switch (emulated) { + case EMULATE_AGAIN: + return RESUME_GUEST; + + case EMULATE_FAIL: + pr_debug(%s: load instruction from guest address %lx failed\n, + __func__, vcpu-arch.pc); + /* For debugging, encode the failing instruction and +* report it to userspace. 
*/ + run-hw.hardware_exit_reason = ~0ULL 32; + run-hw.hardware_exit_reason |= last_inst; + kvmppc_core_queue_program(vcpu, ESR_PIL); + return RESUME_HOST; + + default: + BUG(); + } +} + /** * kvmppc_handle_exit * @@ -880,6 +902,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, int r = RESUME_HOST; int s; int idx; + u32 last_inst = KVM_INST_FETCH_FAILED; + enum emulation_result emulated = EMULATE_DONE; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); @@ -887,6 +911,20 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* restart interrupts if they were meant for the host */ kvmppc_restart_interrupt(vcpu, exit_nr); + /* +* get last instruction before beeing preempted +* TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR ESR_DATA +*/ + switch (exit_nr) { + case BOOKE_INTERRUPT_DATA_STORAGE: + case BOOKE_INTERRUPT_DTLB_MISS: + case BOOKE_INTERRUPT_HV_PRIV: + emulated = kvmppc_get_last_inst(vcpu, false, last_inst); + break; + default: + break; + } + local_irq_enable(); trace_kvm_exit(exit_nr, vcpu); @@ -895,6 +933,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run-exit_reason = KVM_EXIT_UNKNOWN; run-ready_for_interrupt_injection = 1; + if (emulated != EMULATE_DONE) { + r =