[PATCH v2 4/4] nested vmx: use a list to store the launched vmcs12 for L1 VMM

2012-11-22 Thread Dongxiao Xu
The launch state is not a member in the VMCS area, use a separate
variable (list) to store it instead.

Signed-off-by: Dongxiao Xu 
---
 arch/x86/kvm/vmx.c |   86 +---
 1 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 20de88b..3be9265 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -177,8 +177,7 @@ struct __packed vmcs12 {
u32 revision_id;
u32 abort;
 
-   u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
-   u32 padding[7]; /* room for future expansion */
+   u32 padding[8]; /* room for future expansion */
 
u64 io_bitmap_a;
u64 io_bitmap_b;
@@ -339,6 +338,11 @@ struct vmcs02_list {
struct loaded_vmcs vmcs02;
 };
 
+struct vmcs12_list {
+   unsigned long vmcs12_pa;
+   struct list_head node;
+};
+
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -364,6 +368,8 @@ struct nested_vmx {
 * we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
+   /* vmcs12_pool contains the launched vmcs12. */
+   struct list_head vmcs12_pool;
 };
 
 struct vcpu_vmx {
@@ -619,6 +625,58 @@ static void nested_release_page_clean(struct page *page)
kvm_release_page_clean(page);
 }
 
+static int vmcs12_launched(struct list_head *vmcs12_pool,
+  unsigned long vmcs12_pa)
+{
+   struct vmcs12_list *iter;
+   struct list_head *pos;
+   int launched = 0;
+
+   list_for_each(pos, vmcs12_pool) {
+   iter = list_entry(pos, struct vmcs12_list, node);
+   if (vmcs12_pa == iter->vmcs12_pa) {
+   launched = 1;
+   break;
+   }
+   }
+
+   return launched;
+}
+
+static int set_vmcs12_launched(struct list_head *vmcs12_pool,
+  unsigned long vmcs12_pa)
+{
+   struct vmcs12_list *vmcs12;
+
+   if (vmcs12_launched(vmcs12_pool, vmcs12_pa))
+   return 0;
+
+   vmcs12 = kzalloc(sizeof(struct vmcs12_list), GFP_KERNEL);
+   if (!vmcs12)
+   return -ENOMEM;
+
+   vmcs12->vmcs12_pa = vmcs12_pa;
+   list_add(&vmcs12->node, vmcs12_pool);
+
+   return 0;
+}
+
+static void clear_vmcs12_launched(struct list_head *vmcs12_pool,
+  unsigned long vmcs12_pa)
+{
+   struct vmcs12_list *iter;
+   struct list_head *pos;
+
+   list_for_each(pos, vmcs12_pool) {
+   iter = list_entry(pos, struct vmcs12_list, node);
+   if (vmcs12_pa == iter->vmcs12_pa) {
+   list_del(&iter->node);
+   kfree(iter);
+   break;
+   }
+   }
+}
+
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -5116,6 +5174,18 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx 
*vmx)
 }
 
 /*
+ * Free the vmcs12 list.
+ */
+static void nested_free_vmcs12_list(struct vcpu_vmx *vmx)
+{
+   struct vmcs12_list *item, *n;
+   list_for_each_entry_safe(item, n, &vmx->nested.vmcs12_pool, node) {
+   list_del(&item->node);
+   kfree(item);
+   }
+}
+
+/*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
@@ -5212,6 +5282,7 @@ static void free_nested(struct vcpu_vmx *vmx)
}
 
nested_free_all_saved_vmcss(vmx);
+   nested_free_vmcs12_list(vmx);
 }
 
 /* Emulate the VMXOFF instruction */
@@ -5364,7 +5435,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return 1;
}
vmcs12 = kmap(page);
-   vmcs12->launch_state = 0;
+   clear_vmcs12_launched(&vmx->nested.vmcs12_pool, __pa(vmcs12));
kunmap(page);
nested_release_page(page);
 
@@ -6460,6 +6531,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, 
unsigned int id)
 
vmx->nested.current_vmptr = -1ull;
vmx->nested.current_vmcs12 = NULL;
+   INIT_LIST_HEAD(&vmx->nested.vmcs12_pool);
 
return &vmx->vcpu;
 
@@ -6839,6 +6911,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool 
launch)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int cpu;
struct loaded_vmcs *vmcs02;
+   int is_launched;
 
if (!nested_vmx_check_permission(vcpu) ||
!nested_vmx_check_vmcs12(vcpu))
@@ -6857,7 +6930,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool 
launch)
 * for misconfigurations which will anyway be caught by the processor
 * when using the merged vmcs02.
 */
-   if (vmcs12->launch_state == launch) {
+   is_launched =
+   vmcs12_launched(&vmx->n

[PATCH v2 1/4] nested vmx: clean up for vmcs12 read and write

2012-11-22 Thread Dongxiao Xu
abstract vmcs12_read and vmcs12_write functions to do the vmcs12
read/write operations.

Signed-off-by: Dongxiao Xu 
---
 arch/x86/kvm/vmx.c |   85 +---
 1 files changed, 41 insertions(+), 44 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f858159..2f8344f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -583,10 +583,15 @@ static const unsigned short vmcs_field_to_offset_table[] 
= {
 };
 static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
 
-static inline short vmcs_field_to_offset(unsigned long field)
+static inline bool vmcs_field_valid(unsigned long field)
 {
if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
-   return -1;
+   return 0;
+   return 1;
+}
+
+static inline short vmcs_field_to_offset(unsigned long field)
+{
return vmcs_field_to_offset_table[field];
 }
 
@@ -5407,32 +5412,45 @@ static inline int vmcs_field_readonly(unsigned long 
field)
  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
  * 64-bit fields are to be returned).
  */
-static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
-   unsigned long field, u64 *ret)
+static inline u64 vmcs12_read(struct kvm_vcpu *vcpu, unsigned long field)
 {
-   short offset = vmcs_field_to_offset(field);
-   char *p;
+   char *p = ((char *)(get_vmcs12(vcpu))) + vmcs_field_to_offset(field);
 
-   if (offset < 0)
-   return 0;
+   switch (vmcs_field_type(field)) {
+   case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+   return *((natural_width *)p);
+   case VMCS_FIELD_TYPE_U16:
+   return *((u16 *)p);
+   case VMCS_FIELD_TYPE_U32:
+   return *((u32 *)p);
+   case VMCS_FIELD_TYPE_U64:
+   return *((u64 *)p);
+   default:
+   return 0; /* can never happen. */
+   }
+}
 
-   p = ((char *)(get_vmcs12(vcpu))) + offset;
+static inline void vmcs12_write(struct kvm_vcpu *vcpu,
+   unsigned long field,
+   u64 value)
+{
+   char *p = ((char *)(get_vmcs12(vcpu))) + vmcs_field_to_offset(field);
 
switch (vmcs_field_type(field)) {
case VMCS_FIELD_TYPE_NATURAL_WIDTH:
-   *ret = *((natural_width *)p);
-   return 1;
+   *(natural_width *)p = value;
+   break;
case VMCS_FIELD_TYPE_U16:
-   *ret = *((u16 *)p);
-   return 1;
+   *(u16 *)p = value;
+   break;
case VMCS_FIELD_TYPE_U32:
-   *ret = *((u32 *)p);
-   return 1;
+   *(u32 *)p = value;
+   break;
case VMCS_FIELD_TYPE_U64:
-   *ret = *((u64 *)p);
-   return 1;
+   *(u64 *)p = value;
+   break;
default:
-   return 0; /* can never happen. */
+   break; /* can never happen. */
}
 }
 
@@ -5465,12 +5483,13 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 
/* Decode instruction info and find the field to read */
field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
-   /* Read the field, zero-extended to a u64 field_value */
-   if (!vmcs12_read_any(vcpu, field, &field_value)) {
+   if (!vmcs_field_valid(field)) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
skip_emulated_instruction(vcpu);
return 1;
}
+   /* Read the field, zero-extended to a u64 field_value */
+   field_value = vmcs12_read(vcpu, field);
/*
 * Now copy part of this value to register or memory, as requested.
 * Note that the number of bits actually copied is 32 or 64 depending
@@ -5500,8 +5519,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
gva_t gva;
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-   char *p;
-   short offset;
/* The value to write might be 32 or 64 bits, depending on L1's long
 * mode, and eventually we need to write that into a field of several
 * possible lengths. The code below first zero-extends the value to 64
@@ -5537,33 +5554,13 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
skip_emulated_instruction(vcpu);
return 1;
}
-
-   offset = vmcs_field_to_offset(field);
-   if (offset < 0) {
+   if (!vmcs_field_valid(field)) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
skip_emulated_instruction(vcpu);
return 1;
}
-   p = ((char *) get_vmcs12(vcpu)) + offset;
 
-   switch (vmcs_field_type(field)) {
-   case VMCS_FIELD_TYPE_U16:
- 

[PATCH v2 3/4] nested vmx: use vmcs12_read/write() to operate VMCS fields

2012-11-22 Thread Dongxiao Xu
When referencing vmcs12 fields, the current approach is to use
"struct.field" style. This commit replaces the current approach
by calling the vmcs12_read() and vmcs12_write() functions.

Signed-off-by: Dongxiao Xu 
---
 arch/x86/kvm/vmx.c |  591 
 1 files changed, 317 insertions(+), 274 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 639cad0..20de88b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -629,6 +629,11 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
 
+static inline u64 vmcs12_read(struct kvm_vcpu *vcpu, unsigned long field);
+static inline void vmcs12_write(struct kvm_vcpu *vcpu,
+   unsigned long field,
+   u64 value);
+
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 /*
@@ -891,19 +896,19 @@ static inline bool report_flexpriority(void)
 
 static inline bool nested_cpu_has(struct kvm_vcpu *vcpu, u32 bit)
 {
-   return get_vmcs12(vcpu)->cpu_based_vm_exec_control & bit;
+   return vmcs12_read(vcpu, CPU_BASED_VM_EXEC_CONTROL) & bit;
 }
 
 static inline bool nested_cpu_has2(struct kvm_vcpu *vcpu, u32 bit)
 {
-   return (get_vmcs12(vcpu)->cpu_based_vm_exec_control &
+   return (vmcs12_read(vcpu, CPU_BASED_VM_EXEC_CONTROL) &
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
-   (get_vmcs12(vcpu)->secondary_vm_exec_control & bit);
+   (vmcs12_read(vcpu, SECONDARY_VM_EXEC_CONTROL) & bit);
 }
 
 static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
 {
-   return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+   return vmcs12_read(vcpu, PIN_BASED_VM_EXEC_CONTROL) &
PIN_BASED_VIRTUAL_NMIS;
 }
 
@@ -915,7 +920,6 @@ static inline bool is_exception(u32 intr_info)
 
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
-   struct vmcs12 *vmcs12,
u32 reason, unsigned long qualification);
 
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
@@ -1220,7 +1224,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 * specified above if L1 did not want them.
 */
if (is_guest_mode(vcpu))
-   eb |= get_vmcs12(vcpu)->exception_bitmap;
+   eb |= vmcs12_read(vcpu, EXCEPTION_BITMAP);
 
vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -1582,7 +1586,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
if (is_guest_mode(vcpu))
vcpu->arch.cr0_guest_owned_bits &=
-   ~get_vmcs12(vcpu)->cr0_guest_host_mask;
+   ~vmcs12_read(vcpu, CR0_GUEST_HOST_MASK);
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 }
 
@@ -1593,15 +1597,19 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu 
*vcpu);
  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
  * its hypervisor (cr0_read_shadow).
  */
-static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+static inline unsigned long nested_read_cr0(struct kvm_vcpu *vcpu)
 {
-   return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
-   (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+   return (vmcs12_read(vcpu, GUEST_CR0) &
+   ~vmcs12_read(vcpu, CR0_GUEST_HOST_MASK)) |
+   (vmcs12_read(vcpu, CR0_READ_SHADOW) &
+   vmcs12_read(vcpu, CR0_GUEST_HOST_MASK));
 }
-static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+static inline unsigned long nested_read_cr4(struct kvm_vcpu *vcpu)
 {
-   return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
-   (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+   return (vmcs12_read(vcpu, GUEST_CR4) &
+   ~vmcs12_read(vcpu, CR4_GUEST_HOST_MASK)) |
+   (vmcs12_read(vcpu, CR4_READ_SHADOW) &
+   vmcs12_read(vcpu, CR4_GUEST_HOST_MASK));
 }
 
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
@@ -1623,10 +1631,10 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 * up-to-date here because we just decached cr0.TS (and we'll
 * only update vmcs12->guest_cr0 on nested exit).
 */
-   struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-   vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
+   u64 guest_cr0 = (vmcs12_read(vcpu, GUEST_CR0) & ~X86_CR0_TS) |
(vcpu->arch.cr0 & X86_CR0_TS);
-   vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+   vmcs12_write(vcpu, GUEST_CR0, guest_cr0);
+ 

[PATCH v2 2/4] nested vmx: clean up for nested_cpu_has_xxx functions

2012-11-22 Thread Dongxiao Xu
This is a preparation for the later change, which uses vmcs12_read()
and vmcs12_write() to replace the way vmcs12 fields are accessed.

Since the above functions use 'vcpu' as a parameter, we also use
'vcpu' as the parameter in the nested_cpu_has_xxx functions.

Signed-off-by: Dongxiao Xu 
---
 arch/x86/kvm/vmx.c |   57 +--
 1 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2f8344f..639cad0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -889,22 +889,22 @@ static inline bool report_flexpriority(void)
return flexpriority_enabled;
 }
 
-static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+static inline bool nested_cpu_has(struct kvm_vcpu *vcpu, u32 bit)
 {
-   return vmcs12->cpu_based_vm_exec_control & bit;
+   return get_vmcs12(vcpu)->cpu_based_vm_exec_control & bit;
 }
 
-static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+static inline bool nested_cpu_has2(struct kvm_vcpu *vcpu, u32 bit)
 {
-   return (vmcs12->cpu_based_vm_exec_control &
+   return (get_vmcs12(vcpu)->cpu_based_vm_exec_control &
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
-   (vmcs12->secondary_vm_exec_control & bit);
+   (get_vmcs12(vcpu)->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-   struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
 {
-   return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+   return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+   PIN_BASED_VIRTUAL_NMIS;
 }
 
 static inline bool is_exception(u32 intr_info)
@@ -1888,7 +1888,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, 
u64 offset)
/* recalculate vmcs02.TSC_OFFSET: */
vmcs12 = get_vmcs12(vcpu);
vmcs_write64(TSC_OFFSET, offset +
-   (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
+   (nested_cpu_has(vcpu, CPU_BASED_USE_TSC_OFFSETING) ?
 vmcs12->tsc_offset : 0));
} else {
vmcs_write64(TSC_OFFSET, offset);
@@ -5712,7 +5712,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu 
*vcpu,
u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
gpa_t bitmap;
 
-   if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
+   if (!nested_cpu_has(vcpu, CPU_BASED_USE_MSR_BITMAPS))
return 1;
 
/*
@@ -5768,7 +5768,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu 
*vcpu,
(vmcs12->cr3_target_count >= 4 &&
vmcs12->cr3_target_value3 == val))
return 0;
-   if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+   if (nested_cpu_has(vcpu, CPU_BASED_CR3_LOAD_EXITING))
return 1;
break;
case 4:
@@ -5777,7 +5777,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu 
*vcpu,
return 1;
break;
case 8:
-   if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+   if (nested_cpu_has(vcpu, CPU_BASED_CR8_LOAD_EXITING))
return 1;
break;
}
@@ -5865,15 +5865,15 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu 
*vcpu)
case EXIT_REASON_CPUID:
return 1;
case EXIT_REASON_HLT:
-   return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+   return nested_cpu_has(vcpu, CPU_BASED_HLT_EXITING);
case EXIT_REASON_INVD:
return 1;
case EXIT_REASON_INVLPG:
-   return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+   return nested_cpu_has(vcpu, CPU_BASED_INVLPG_EXITING);
case EXIT_REASON_RDPMC:
-   return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+   return nested_cpu_has(vcpu, CPU_BASED_RDPMC_EXITING);
case EXIT_REASON_RDTSC:
-   return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+   return nested_cpu_has(vcpu, CPU_BASED_RDTSC_EXITING);
case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
@@ -5887,7 +5887,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_CR_ACCESS:
return nested_vmx_exit_handled_cr(vcpu, vmcs12);
case EXIT_REASON_DR_ACCESS:
-   return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+   return nested_cpu_has(vcpu, CPU_BA

Re: [PATCH v2 8/8] kvm tools: add support for ARMv7 processors

2012-11-22 Thread Will Deacon
Hi Peter,

On Thu, Nov 22, 2012 at 04:13:17PM +, Peter Maydell wrote:
> On 22 November 2012 15:58, Will Deacon  wrote:
> > +++ b/tools/kvm/arm/aarch32/smp-pen.S
> > @@ -0,0 +1,30 @@
> > +#include "kvm/kvm-arch.h"
> > +
> > +#include "arm-common/gic.h"
> > +
> > +   .arm
> > +
> > +   .globl  smp_pen_start
> > +   .globl  smp_jump_addr
> > +   .globl  smp_pen_end
> > +
> > +   .align
> > +smp_pen_start:
> > +   @ Ensure that the CPU interface is enabled for the wfi wakeup
> > +   ldr r0, =ARM_GIC_CPUI_BASE
> > +   mov r1, #GIC_CPUI_CTLR_EN
> > +   str r1, [r0]
> > +
> > +   @ Now wait for the primary to poke us
> > +   adr r0, smp_jump_addr
> > +   dsb
> > +   wfi
> > +   ldr r1, [r0]
> > +   mov pc, r1
> > +
> > +   .ltorg
> > +
> > +   .align
> > +smp_jump_addr:
> > +   .long   0xdeadc0de
> > +smp_pen_end:
> 
> You've left the gate ajar on your pen -- this won't cope with
> spurious WFI wakeups (the architecture allows WFI to return
> at any time, down to the trivial case of "implemented as NOP").
> Needs a 'branch back to WFI if not yet poked' (or you could
> make the initial value stored at smp_jump_addr be the address
> of the wfi :-))

Thanks for pointing this out, somehow I missed it despite updating the ARMv8
code. Will fix for v3.

Will
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/8] rbtree: include linux/compiler.h for definition of __always_inline

2012-11-22 Thread Sasha Levin
On 11/22/2012 10:58 AM, Will Deacon wrote:
> Commit 29fc7c5a4f516d388fb6e1f6d24bfb04b8093e54 upstream.
> 
> rb_erase_augmented() is a static function annotated with
> __always_inline.  This causes a compile failure when attempting to use
> the rbtree implementation as a library (e.g.  kvm tool):
> 
>   rbtree_augmented.h:125:24: error: expected `=', `,', `;', `asm' or 
> `__attribute__' before `void'

On a side note, our rbtree-interval is broken at the moment due to kernel side
changing the implementation and (IMO) breaking augmented rbtrees, followed
by several patches in our own code that tried to fix the breakage but haven't
identified the problem correctly - leading to more subtle breakage.

If you see things broken with mmio, that might be the reason.

I have a fix for that which goes to both kernel and our code, but I'm
waiting to verify that the kernel side is indeed broken.


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 02/18] KVM/MIPS32: Arch specific KVM data structures.

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 arch/mips/include/asm/kvm.h  |  55 
 arch/mips/include/asm/kvm_host.h | 669 +++
 2 files changed, 724 insertions(+)
 create mode 100644 arch/mips/include/asm/kvm.h
 create mode 100644 arch/mips/include/asm/kvm_host.h

diff --git a/arch/mips/include/asm/kvm.h b/arch/mips/include/asm/kvm.h
new file mode 100644
index 000..85789ea
--- /dev/null
+++ b/arch/mips/include/asm/kvm.h
@@ -0,0 +1,55 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#ifndef __LINUX_KVM_MIPS_H
+#define __LINUX_KVM_MIPS_H
+
+#include 
+
+#define __KVM_MIPS
+
+#define N_MIPS_COPROC_REGS  32
+#define N_MIPS_COPROC_SEL  8
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+   __u32 gprs[32];
+   __u32 hi;
+   __u32 lo;
+   __u32 pc;
+
+   __u32 cp0reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL];
+};
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+};
+
+struct kvm_mips_interrupt {
+   /* in */
+   __u32 cpu;
+   __u32 irq;
+};
+
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
+#endif /* __LINUX_KVM_MIPS_H */
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
new file mode 100644
index 000..181fa58
--- /dev/null
+++ b/arch/mips/include/asm/kvm_host.h
@@ -0,0 +1,669 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#ifndef __MIPS_KVM_HOST_H__
+#define __MIPS_KVM_HOST_H__
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+#define KVM_MAX_VCPUS  1
+#define KVM_MEMORY_SLOTS   8
+/* memory slots that does not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS  0
+
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
+/* Don't support huge pages */
+#define KVM_HPAGE_GFN_SHIFT(x) 0
+
+/* We don't currently support large pages. */
+#define KVM_NR_PAGE_SIZES  1
+#define KVM_PAGES_PER_HPAGE(x) 1
+
+
+
+/* Special address that contains the comm page, used for reducing # of traps */
+#define KVM_GUEST_COMMPAGE_ADDR 0x0
+
+#define KVM_GUEST_KERNEL_MODE(vcpu)
((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \
+   
((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0))
+
+#define KVM_GUEST_KUSEG 0xUL
+#define KVM_GUEST_KSEG0 0x4000UL
+#define KVM_GUEST_KSEG230x6000UL
+#define KVM_GUEST_KSEGX(a)  ((_ACAST32_(a)) & 0x6000)
+#define KVM_GUEST_CPHYSADDR(a)  ((_ACAST32_(a)) & 0x1fff)
+
+#define KVM_GUEST_CKSEG0ADDR(a)(KVM_GUEST_CPHYSADDR(a) | 
KVM_GUEST_KSEG0)
+#define KVM_GUEST_CKSEG1ADDR(a)(KVM_GUEST_CPHYSADDR(a) | 
KVM_GUEST_KSEG1)
+#define KVM_GUEST_CKSEG23ADDR(a)   (KVM_GUEST_CPHYSADDR(a) | 
KVM_GUEST_KSEG23)
+
+/*
+ * Map an address to a certain kernel segment
+ */
+#define KVM_GUEST_KSEG0ADDR(a) (KVM_GUEST_CPHYSADDR(a) | 
KVM_GUEST_KSEG0)
+#define KVM_GUEST_KSEG1ADDR(a) (KVM_GUEST_CPHYSADDR(a) | 
KVM_GUEST_KSEG1)
+#define KVM_GUEST_KSEG23ADDR(a)(KVM_GUEST_CPHYSADDR(a) | 
KVM_GUEST_KSEG23)
+
+#define KVM_INVALID_PAGE0xdeadbeef
+#define KVM_INVALID_INST0xdeadbeef
+#define KVM_INVALID_ADDR0xdeadbeef
+
+#define KVM_MALTA_GUEST_RTC_ADDR0xb870UL
+
+#ifndef __unused
+#define __unused __attribute__((unused))
+#endif
+
+#define GUEST_TICKS_PER_JIFFY (4000/HZ)
+#define MS_TO_NS(x) (x * 1E6L)
+
+#define CAUSEB_DC   27
+#define CAUSEF_DC   (_ULCAST_(1)   << 27)
+
+struct kvm;
+struct kvm_run;
+struct kvm_vcpu;
+struct kvm_interrupt;
+
+extern atomic_t kvm_mips_instance;
+extern pfn_t(*kvm_mips_gfn_to_pfn) (struct kvm *kvm, gfn_t gfn);
+extern void (*kvm_mips_release_pfn_clean) (pfn_t pfn);
+extern bool(*kvm_mips_is_error_pfn) (pfn_t pfn);
+
+struct kvm_vm_stat {
+   u32 remote_tlb_flush;
+};
+
+struct kvm_vcpu_stat {
+   u32 wait_exits;
+   u32 cache_exits;
+   u32 signal_exits;
+   u32 int_exits;
+   u32 cop_unusable_exits;
+   u32 tlbmod_exits;
+   u32 tlbmiss_ld_exits;
+   u32 tlbmiss_st_exits;
+   u32 addrerr_st_exits;
+   u32 addrerr_ld_exits;
+   u32 syscall_exits;
+   u32 resvd_inst_exits;
+   u32 break_inst_exits;
+   u32 flush_dcache_exits;
+   u32 halt_wak

[PATCH v2 00/18] KVM for MIPS32 Processors

2012-11-22 Thread Sanjay Lal
The following patchset implements KVM support for MIPS32R2 processors,
using Trap & Emulate, with basic runtime binary translation to improve
performance.  The goal has been to keep the Guest kernel changes to a
minimum.

The patch is against Linux 3.7-rc6.  This is Version 2 of the patch set.

There is a companion patchset for QEMU that adds KVM support for the 
MIPS target.

KVM/MIPS should support MIPS32-R2 processors and beyond.
It has been tested on the following platforms:
 - Malta Board with FPGA based 34K (Little Endian).
 - Sigma Designs TangoX board with a 24K based 8654 SoC (Little Endian).
 - Malta Board with 74K @ 1GHz (Little Endian).
 - OVPSim MIPS simulator from Imperas emulating a Malta board with 
   24Kc and 1074Kc cores (Little Endian).

Both Guest kernel and Guest Userspace execute in UM. The Guest address space is
as follows:
Guest User address space:   0x -> 0x4000
Guest Kernel Unmapped:  0x4000 -> 0x6000
Guest Kernel Mapped:0x6000 -> 0x8000

As a result, Guest Usermode virtual memory is limited to 1GB.

Release Notes

(1) 16K Page Size:
   Both Host Kernel and Guest Kernel should have the same page size, 
   currently at least 16K.  Note that due to cache aliasing issues, 
   4K page sizes are NOT supported.

(2) No HugeTLB/Large Page Support:
   Both the host kernel and Guest kernel should have the page size 
   set to at least 16K.
   This will be implemented in a future release.

(3) SMP Guests do not work
   Linux-3.7-rc2 based SMP guest hangs due to the following code sequence 
   in the generated TLB handlers:
LL/TLBP/SC
   Since the TLBP instruction causes a trap the reservation gets cleared
   when we ERET back to the guest. This causes the guest to hang in an 
   infinite loop.
   As a workaround, make sure that CONFIG_SMP is disabled for Guest kernels.
   This will be fixed in a future release.

(4) FPU support:
   Currently KVM/MIPS emulates a 24K CPU without a FPU.
   This will be fixed in a future release

--
Sanjay Lal (18):
  KVM/MIPS32: Infrastructure/build files.
  KVM/MIPS32: Arch specific KVM data structures.
  KVM/MIPS32: Entry point for trampolining to the guest and trap
handlers.
  KVM/MIPS32: MIPS arch specific APIs for KVM
  KVM/MIPS32: KVM Guest kernel support.
  KVM/MIPS32: Privileged instruction/target branch emulation.
  KVM/MIPS32: MMU/TLB operations for the Guest.
  KVM/MIPS32: Release notes and KVM module Makefile
  KVM/MIPS32: COP0 accesses profiling.
  KVM/MIPS32: Guest interrupt delivery.
  KVM/MIPS32: Routines to handle specific traps/exceptions while
executing the guest.
  MIPS: Export routines needed by the KVM module.
  MIPS: If KVM is enabled then use the KVM specific routine to flush
the TLBs on a ASID wrap.
  MIPS: ASM offsets for VCPU arch specific fields.
  MIPS: Pull in MIPS fix: fix endless loop when processing signals for
kernel tasks.
  MIPS: Export symbols used by KVM/MIPS module
  KVM/MIPS32: Do not call vcpu_load when injecting interrupts.
  KVM/MIPS32: Binary patching of select privileged instructions.

 arch/mips/Kbuild|4 +
 arch/mips/Kconfig   |   18 +
 arch/mips/configs/malta_kvm_defconfig   | 2268 +++
 arch/mips/configs/malta_kvm_guest_defconfig | 2237 ++
 arch/mips/include/asm/kvm.h |   55 +
 arch/mips/include/asm/kvm_host.h|  669 
 arch/mips/include/asm/mach-generic/spaces.h |9 +-
 arch/mips/include/asm/mmu_context.h |6 +
 arch/mips/include/asm/processor.h   |5 +
 arch/mips/include/asm/uaccess.h |   11 +-
 arch/mips/kernel/asm-offsets.c  |   66 +
 arch/mips/kernel/binfmt_elfo32.c|4 +
 arch/mips/kernel/cevt-r4k.c |4 +
 arch/mips/kernel/entry.S|7 +-
 arch/mips/kernel/smp.c  |1 +
 arch/mips/kernel/traps.c|7 +-
 arch/mips/kvm/00README.txt  |   31 +
 arch/mips/kvm/Kconfig   |   60 +
 arch/mips/kvm/Makefile  |   17 +
 arch/mips/kvm/kvm_cb.c  |   14 +
 arch/mips/kvm/kvm_locore.S  |  651 
 arch/mips/kvm/kvm_mips.c|  965 
 arch/mips/kvm/kvm_mips_comm.h   |   23 +
 arch/mips/kvm/kvm_mips_commpage.c   |   37 +
 arch/mips/kvm/kvm_mips_dyntrans.c   |  149 ++
 arch/mips/kvm/kvm_mips_emul.c   | 1840 ++
 arch/mips/kvm/kvm_mips_int.c|  243 +++
 arch/mips/kvm/kvm_mips_int.h|   49 +
 arch/mips/kvm/kvm_mips_opcode.h |   24 +
 arch/mips/kvm/kvm_mips_stats.c  |   81 +
 arch/mips/kvm/kvm_tlb.c |  932 +++
 arch/mips/kvm/kvm_trap_emul.c   |  482 ++
 arch/mips/kvm/trace.h   |

RE: [PATCH 4/4] nested vmx: use a list to store the launched vmcs12 for L1 VMM

2012-11-22 Thread Xu, Dongxiao
> -Original Message-
> From: Gleb Natapov [mailto:g...@redhat.com]
> Sent: Wednesday, November 21, 2012 10:15 PM
> To: Xu, Dongxiao
> Cc: kvm@vger.kernel.org; mtosa...@redhat.com
> Subject: Re: [PATCH 4/4] nested vmx: use a list to store the launched vmcs12
> for L1 VMM
> 
> On Wed, Nov 21, 2012 at 05:04:37PM +0800, Dongxiao Xu wrote:
> > The launch state is not a member in the VMCS area, use a separate
> > variable (list) to store it instead.
> >
> Why? Guest shouldn't be aware of the format of VMCS area.

Yes, I agree. Guest VMM/L1 VMM shouldn't be aware of the VMCS format.

For Root VMM/L0 VMM, it needs to track the launch state of the vmcs12, in order 
to correctly emulate the VMLAUNCH and VMRESUME instructions.
Originally we store the launch state in the VMCS area, however in fact, there 
is no "launch state" field in VMCS. This patch is to move it out and use a 
separate list to store it.

Thanks,
Dongxiao


> 
> > Signed-off-by: Dongxiao Xu 
> > ---
> >  arch/x86/kvm/vmx.c |   86
> +---
> >  1 files changed, 81 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index
> > 6687fb6..d03ab4e 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -177,8 +177,7 @@ struct __packed vmcs12 {
> > u32 revision_id;
> > u32 abort;
> >
> > -   u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
> > -   u32 padding[7]; /* room for future expansion */
> > +   u32 padding[8]; /* room for future expansion */
> >
> > u64 io_bitmap_a;
> > u64 io_bitmap_b;
> > @@ -339,6 +338,11 @@ struct vmcs02_list {
> > struct loaded_vmcs vmcs02;
> >  };
> >
> > +struct vmcs12_list {
> > +   unsigned long vmcs12_pa;
> > +   struct list_head node;
> > +};
> > +
> >  /*
> >   * The nested_vmx structure is part of vcpu_vmx, and holds information we
> need
> >   * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
> > @@ -364,6 +368,8 @@ struct nested_vmx {
> >  * we must keep them pinned while L2 runs.
> >  */
> > struct page *apic_access_page;
> > +   /* vmcs12_pool contains the launched vmcs12. */
> > +   struct list_head vmcs12_pool;
> >  };
> >
> >  struct vcpu_vmx {
> > @@ -614,6 +620,58 @@ static void nested_release_page_clean(struct page
> *page)
> > kvm_release_page_clean(page);
> >  }
> >
> > +static int vmcs12_launched(struct list_head *vmcs12_pool,
> > +  unsigned long vmcs12_pa)
> > +{
> > +   struct vmcs12_list *iter;
> > +   struct list_head *pos;
> > +   int launched = 0;
> > +
> > +   list_for_each(pos, vmcs12_pool) {
> > +   iter = list_entry(pos, struct vmcs12_list, node);
> > +   if (vmcs12_pa == iter->vmcs12_pa) {
> > +   launched = 1;
> > +   break;
> > +   }
> > +   }
> > +
> > +   return launched;
> > +}
> > +
> > +static int set_vmcs12_launched(struct list_head *vmcs12_pool,
> > +  unsigned long vmcs12_pa)
> > +{
> > +   struct vmcs12_list *vmcs12;
> > +
> > +   if (vmcs12_launched(vmcs12_pool, vmcs12_pa))
> > +   return 0;
> > +
> > +   vmcs12 = kzalloc(sizeof(struct vmcs12_list), GFP_KERNEL);
> > +   if (!vmcs12)
> > +   return -ENOMEM;
> > +
> > +   vmcs12->vmcs12_pa = vmcs12_pa;
> > +   list_add(&vmcs12->node, vmcs12_pool);
> > +
> > +   return 0;
> > +}
> > +
> > +static void clear_vmcs12_launched(struct list_head *vmcs12_pool,
> > +  unsigned long vmcs12_pa)
> > +{
> > +   struct vmcs12_list *iter;
> > +   struct list_head *pos;
> > +
> > +   list_for_each(pos, vmcs12_pool) {
> > +   iter = list_entry(pos, struct vmcs12_list, node);
> > +   if (vmcs12_pa == iter->vmcs12_pa) {
> > +   list_del(&iter->node);
> > +   kfree(iter);
> > +   break;
> > +   }
> > +   }
> > +}
> > +
> >  static u64 construct_eptp(unsigned long root_hpa);  static void
> > kvm_cpu_vmxon(u64 addr);  static void kvm_cpu_vmxoff(void); @@ -5111,6
> > +5169,18 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx
> > *vmx)  }
> >
> >  /*
> > + * Free the vmcs12 list.
> > + */
> > +static void nested_free_vmcs12_list(struct vcpu_vmx *vmx) {
> > +   struct vmcs12_list *item, *n;
> > +   list_for_each_entry_safe(item, n, &vmx->nested.vmcs12_pool, node) {
> > +   list_del(&item->node);
> > +   kfree(item);
> > +   }
> > +}
> > +
> > +/*
> >   * Emulate the VMXON instruction.
> >   * Currently, we just remember that VMX is active, and do not save or even
> >   * inspect the argument to VMXON (the so-called "VMXON pointer")
> > because we @@ -5207,6 +5277,7 @@ static void free_nested(struct
> vcpu_vmx *vmx)
> > }
> >
> > nested_free_all_saved_vmcss(vmx);
> > +   nested_free_vmcs12_list(vmx);
> >  }
> >
> >  /* Emulate the VMXOFF instruction */
> > @@ -5359,7 +5430,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
> > return 1;
>

hva_to_pfn and memory leak

2012-11-22 Thread Koshelev Vladimir
Hello, guys!

I have written paravirtual interface for GPA->HPA translation inside guest. I 
build GPA->HPA translation table in guest virtual memory. 
To do this, I need to translate userspace virtual address from  memslot to host 
physical address. I use hva_to_pfn for it. It works fine, but linux doesn't 
free the guest
memory after guest power off. After testing I have found that the cause of the memory 
leak is the hva_to_pfn call. You can find my code at http://pastebin.com/0zBV2aPN. 
Do I translate hva to hpa in the right way? 

I do this patch for RHEL Linux Kernel 2.6.32-279.5.2.el6 because I use CentOS 
6.3.

Thanks!

Vladimir.



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 18/18] KVM/MIPS32: Binary patching of select privileged instructions.

2012-11-22 Thread Sanjay Lal
Currently, the following instructions are translated:
- CACHE (indexed)
- CACHE (va based): translated to a synci, overkill on D-CACHE operations, but 
still much faster than a trap.
- mfc0/mtc0: the virtual COP0 registers for the guest are implemented as 2-D 
array
  [COP#][SEL] and this is mapped into the guest kernel address space @ VA 0x0.
  mfc0/mtc0 operations are transformed to load/stores.

Signed-off-by: Sanjay Lal 
---
 arch/mips/kvm/kvm_mips_comm.h |  23 ++
 arch/mips/kvm/kvm_mips_commpage.c |  37 ++
 arch/mips/kvm/kvm_mips_dyntrans.c | 149 ++
 3 files changed, 209 insertions(+)
 create mode 100644 arch/mips/kvm/kvm_mips_comm.h
 create mode 100644 arch/mips/kvm/kvm_mips_commpage.c
 create mode 100644 arch/mips/kvm/kvm_mips_dyntrans.c

diff --git a/arch/mips/kvm/kvm_mips_comm.h b/arch/mips/kvm/kvm_mips_comm.h
new file mode 100644
index 000..7e903ec
--- /dev/null
+++ b/arch/mips/kvm/kvm_mips_comm.h
@@ -0,0 +1,23 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* KVM/MIPS: commpage: mapped into guest kernel space 
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#ifndef __KVM_MIPS_COMMPAGE_H__
+#define __KVM_MIPS_COMMPAGE_H__
+
+struct kvm_mips_commpage {
+   struct mips_coproc cop0;/* COP0 state is mapped into Guest 
kernel via commpage */
+};
+
+#define KVM_MIPS_COMM_EIDI_OFFSET   0x0
+
+extern void kvm_mips_commpage_init(struct kvm_vcpu *vcpu);
+
+#endif /* __KVM_MIPS_COMMPAGE_H__ */
diff --git a/arch/mips/kvm/kvm_mips_commpage.c 
b/arch/mips/kvm/kvm_mips_commpage.c
new file mode 100644
index 000..3873b1e
--- /dev/null
+++ b/arch/mips/kvm/kvm_mips_commpage.c
@@ -0,0 +1,37 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* commpage, currently used for Virtual COP0 registers.
+* Mapped into the guest kernel @ 0x0.
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "kvm_mips_comm.h"
+
+void kvm_mips_commpage_init(struct kvm_vcpu *vcpu)
+{
+   struct kvm_mips_commpage *page = vcpu->arch.kseg0_commpage;
+   memset(page, 0, sizeof(struct kvm_mips_commpage));
+
+   /* Specific init values for fields */
+   vcpu->arch.cop0 = &page->cop0;
+   memset(vcpu->arch.cop0, 0, sizeof(struct mips_coproc));
+
+   return;
+}
diff --git a/arch/mips/kvm/kvm_mips_dyntrans.c 
b/arch/mips/kvm/kvm_mips_dyntrans.c
new file mode 100644
index 000..c657b37
--- /dev/null
+++ b/arch/mips/kvm/kvm_mips_dyntrans.c
@@ -0,0 +1,149 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* KVM/MIPS: Binary Patching for privileged instructions, reduces traps.
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "kvm_mips_comm.h"
+
+#define SYNCI_TEMPLATE  0x041f
+#define SYNCI_BASE(x)   (((x) >> 21) & 0x1f)
+#define SYNCI_OFFSET((x) & 0x)
+
+#define LW_TEMPLATE 0x8c00
+#define CLEAR_TEMPLATE  0x0020
+#define SW_TEMPLATE 0xac00
+
+int
+kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
+  struct kvm_vcpu *vcpu)
+{
+   int result = 0;
+   ulong kseg0_opc;
+   uint32_t synci_inst = 0x0;
+
+   /* Replace the CACHE instruction, with a NOP */
+   kseg0_opc =
+   CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
+  (vcpu, (ulong) opc));
+   memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
+   mips32_SyncICache(kseg0_opc, 32);
+
+   return result;
+}
+
+/*
+ *  Address based CACHE instructions are transformed into synci(s). A little 
heavy
+ * for just D-cache invalidates, but avoids an expensive trap
+ */
+int
+kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+   struct kvm_vcpu *vcpu)
+{
+   int result = 0;
+   ulong kseg0_opc;
+   uint32_t synci_inst = SYNCI_TEMPLATE, base, offset;
+
+   base = (inst >> 21) & 0x1f;
+   offset = inst & 0x;
+   synci_inst |= (base << 21);
+   synci_inst |= offset;
+
+   kseg0_opc =
+   CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
+  (vcpu, (ulong) opc));
+   memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
+   mips32_SyncICache(kseg0_opc, 32);
+
+   return result;
+}
+
+int
+kvm_mips_trans_mfc0(uint32_t

Re: [PATCH v2 5/6] x86: Enable ack interrupt on vmexit

2012-11-22 Thread Gleb Natapov
On Wed, Nov 21, 2012 at 04:09:38PM +0800, Yang Zhang wrote:
> Ack interrupt on vmexit is required by Posted Interrupt. With it,
> when external interrupt caused vmexit, the cpu will acknowledge the
> interrupt controller and save the interrupt's vector in vmcs.
> 
> There are several approaches to enable it. This patch uses a simply
> way: re-generate an interrupt via self ipi.
> 
> Signed-off-by: Yang Zhang 
> ---
>  arch/x86/kvm/vmx.c |   11 ++-
>  1 files changed, 10 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 7949d21..f6ef090 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2525,7 +2525,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
> *vmcs_conf)
>  #ifdef CONFIG_X86_64
>   min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
>  #endif
> - opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
> + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
> + VM_EXIT_ACK_INTR_ON_EXIT;
Always? Do it only if posted interrupts are actually available
and going to be used.

>   if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
>   &_vmexit_control) < 0)
>   return -EIO;
> @@ -4457,6 +4458,14 @@ static int handle_exception(struct kvm_vcpu *vcpu)
>  
>  static int handle_external_interrupt(struct kvm_vcpu *vcpu)
>  {
> + unsigned int vector;
> +
> + vector = vmcs_read32(VM_EXIT_INTR_INFO);
> + vector &= INTR_INFO_VECTOR_MASK;
Valid bit is guaranteed to be set here?

> +
> + apic_eoi();
This is way to late. handle_external_interrupt() is called longs after
preemption and local irqs are enabled. vcpu process may be scheduled out
and apic_eoi() will not be called for a long time leaving interrupt
stuck in ISR and blocking other interrupts.

> + apic->send_IPI_self(vector);
For level interrupt this is not needed, no?

> +
>   ++vcpu->stat.irq_exits;
>   return 1;
>  }
> -- 
> 1.7.1

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] x86, kvm: Remove incorrect redundant assembly constraint

2012-11-22 Thread H. Peter Anvin
From: "H. Peter Anvin" 

In __emulate_1op_rax_rdx, we use "+a" and "+d" which are input/output
constraints, and *then* use "a" and "d" as input constraints.  This is
incorrect, but happens to work on some versions of gcc.

However, it breaks gcc with -O0 and icc, and may break on future
versions of gcc.

Reported-and-tested-by: Melanie Blower 
Signed-off-by: H. Peter Anvin 
Link: 
http://lkml.kernel.org/r/b3584e72cfebed439a3eca9bce67a4ef1b17a...@fmsmsx107.amr.corp.intel.com
---
 arch/x86/kvm/emulate.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 39171cb..bba39bf 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -426,8 +426,7 @@ static void invalidate_registers(struct x86_emulate_ctxt 
*ctxt)
_ASM_EXTABLE(1b, 3b)\
: "=m" ((ctxt)->eflags), "=&r" (_tmp),  \
  "+a" (*rax), "+d" (*rdx), "+qm"(_ex)  \
-   : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \
- "a" (*rax), "d" (*rdx));  \
+   : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val));\
} while (0)
 
 /* instruction has only one source operand, destination is implicit (e.g. mul, 
div, imul, idiv) */
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2] Enabling IA32_TSC_ADJUST for Qemu KVM guest VMs

2012-11-22 Thread Will Auld
CPUID.7.0.EBX[1]=1 indicates IA32_TSC_ADJUST MSR 0x3b is supported

Basic design is to emulate the MSR by allowing reads and writes to the
hypervisor vcpu specific locations to store the value of the emulated MSRs.
In this way the IA32_TSC_ADJUST value will be included in all reads to
the TSC MSR whether through rdmsr or rdtsc.

As this is a new MSR that the guest may access and modify its value needs
to be migrated along with the other MSRs. The changes here are specifically
for recognizing when IA32_TSC_ADJUST is enabled in CPUID and code added
for migrating its value.

Signed-off-by: Will Auld 
---
 target-i386/cpu.h |  2 ++
 target-i386/kvm.c | 15 +++
 target-i386/machine.c | 21 +
 3 files changed, 38 insertions(+)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index aabf993..13d4152 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -284,6 +284,7 @@
 #define MSR_IA32_APICBASE_BSP   (1<<8)
 #define MSR_IA32_APICBASE_ENABLE(1<<11)
 #define MSR_IA32_APICBASE_BASE  (0xf<<12)
+#define MSR_TSC_ADJUST 0x003b
 #define MSR_IA32_TSCDEADLINE0x6e0
 
 #define MSR_MTRRcap0xfe
@@ -701,6 +702,7 @@ typedef struct CPUX86State {
 uint64_t async_pf_en_msr;
 
 uint64_t tsc;
+uint64_t tsc_adjust;
 uint64_t tsc_deadline;
 
 uint64_t mcg_status;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 696b14a..e974c42 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -61,6 +61,7 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 
 static bool has_msr_star;
 static bool has_msr_hsave_pa;
+static bool has_msr_tsc_adjust;
 static bool has_msr_tsc_deadline;
 static bool has_msr_async_pf_en;
 static bool has_msr_misc_enable;
@@ -641,6 +642,10 @@ static int kvm_get_supported_msrs(KVMState *s)
 has_msr_hsave_pa = true;
 continue;
 }
+if (kvm_msr_list->indices[i] == MSR_TSC_ADJUST) {
+has_msr_tsc_adjust = true;
+continue;
+}
 if (kvm_msr_list->indices[i] == MSR_IA32_TSCDEADLINE) {
 has_msr_tsc_deadline = true;
 continue;
@@ -978,6 +983,10 @@ static int kvm_put_msrs(CPUX86State *env, int level)
 if (has_msr_hsave_pa) {
 kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
 }
+if (has_msr_tsc_adjust) {
+kvm_msr_entry_set(&msrs[n++], 
+   MSR_TSC_ADJUST, env->tsc_adjust);
+}
 if (has_msr_tsc_deadline) {
 kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSCDEADLINE, env->tsc_deadline);
 }
@@ -1234,6 +1243,9 @@ static int kvm_get_msrs(CPUX86State *env)
 if (has_msr_hsave_pa) {
 msrs[n++].index = MSR_VM_HSAVE_PA;
 }
+if (has_msr_tsc_adjust) {
+msrs[n++].index = MSR_TSC_ADJUST;
+}
 if (has_msr_tsc_deadline) {
 msrs[n++].index = MSR_IA32_TSCDEADLINE;
 }
@@ -1308,6 +1320,9 @@ static int kvm_get_msrs(CPUX86State *env)
 case MSR_IA32_TSC:
 env->tsc = msrs[i].data;
 break;
+case MSR_TSC_ADJUST:
+env->tsc_adjust = msrs[i].data;
+break;
 case MSR_IA32_TSCDEADLINE:
 env->tsc_deadline = msrs[i].data;
 break;
diff --git a/target-i386/machine.c b/target-i386/machine.c
index a8be058..95bda9b 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -310,6 +310,24 @@ static const VMStateDescription vmstate_fpop_ip_dp = {
 }
 };
 
+static bool tsc_adjust_needed(void *opaque)
+{
+CPUX86State *cpu = opaque;
+
+return cpu->tsc_adjust != 0;
+}
+
+static const VMStateDescription vmstate_msr_tsc_adjust = {
+.name = "cpu/msr_tsc_adjust",
+.version_id = 1,
+.minimum_version_id = 1,
+.minimum_version_id_old = 1,
+.fields  = (VMStateField []) {
+VMSTATE_UINT64(tsc_adjust, CPUX86State),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static bool tscdeadline_needed(void *opaque)
 {
 CPUX86State *env = opaque;
@@ -457,6 +475,9 @@ static const VMStateDescription vmstate_cpu = {
 .vmsd = &vmstate_fpop_ip_dp,
 .needed = fpop_ip_dp_needed,
 }, {
+.vmsd = &vmstate_msr_tsc_adjust,
+.needed = tsc_adjust_needed,
+}, {
 .vmsd = &vmstate_msr_tscdeadline,
 .needed = tscdeadline_needed,
 }, {
-- 
1.8.0.rc0



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Performance issue

2012-11-22 Thread George-Cristian Bîrzan
I'm trying to understand a performance problem (50% degradation in the
VM) that I'm experiencing some systems with qemu-kvm. Running Fedora
with 3.5.3-1.fc17.x86_64 or 3.6.6-1.fc17.x86_64, qemu 1.0.1 or 1.2.1
on AMD Opteron 6176 and 6174, and all of them behave identically.

A Windows guest is receiving a UDP MPEG stream that is being processed
by TSReader. The stream comes in at about 73Mbps, but the VM cannot
process more than 43Mbps. It's not a networking issue, the packets
reach the guest and with iperf we can easily do 80Mbps. Also, with
iperf, it can receive the packets from the streamer (even though it
doesn't detect things properly, but it was just a way to see ).

However, on an identical host (a 6174 CPU, even), a Windows install
has absolutely no problem processing the same stream.

This is the command we're using to start qemu-kvm:

/usr/bin/qemu-kvm -name b691546e-79f8-49c6-a293-81067503a6ad -S -M
pc-1.2 -cpu host -enable-kvm -m 16384 -smp
16,sockets=1,cores=16,threads=1 -uuid
b691546e-79f8-49c6-a293-81067503a6ad -no-user-config -nodefaults
-chardev 
socket,id=charmonitor,path=/var/lib/libvirt/qemu/b691546e-79f8-49c6-a293-81067503a6ad.monitor,server,nowait
-mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc
-no-shutdown -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2
-drive 
file=/var/lib/libvirt/images/dis-magnetics-2-223101/d8b233c6-8424-4de9-ae3c-7c9a60288514,if=none,id=drive-virtio-disk0,format=qcow2,cache=writeback,aio=native
-device 
virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
-netdev tap,fd=29,id=hostnet0,vhost=on,vhostfd=31 -device
virtio-net-pci,netdev=hostnet0,id=net0,mac=22:2e:fb:a2:36:be,bus=pci.0,addr=0x3
-netdev tap,fd=32,id=hostnet1,vhost=on,vhostfd=33 -device
virtio-net-pci,netdev=hostnet1,id=net1,mac=22:94:44:5a:cb:24,bus=pci.0,addr=0x4
-vnc 127.0.0.1:4,password -vga cirrus -device
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x6

As a sidenote, the TSReader application only uses one thread for
decoding the stream, one for network IO. While using more threads
would solve the problem.

I've tried smaller guest, with 5 cores, pinned all of them to CPUs 6
to 11 (all in a NUMA node), each to an individual CPU, I've tried
enabling huge pages/TLB thingy... and that's about it. I'm completely
stuck.

Is this 50% hit something that's considered 'okay', or am I doing
something wrong? And if the latter, what/how can I debug it?

--
George-Cristian Bîrzan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: pci_enable_msix() fails with ENOMEM/EINVAL

2012-11-22 Thread Alex Williamson
On Wed, 2012-11-21 at 16:19 +0200, Alex Lyakas wrote:
> Hi,
> I was advised to turn off irqbalance and reproduced this issue, but
> the failure is in a different place now. Now request_threaded_irq()
> fails with EBUSY.
> According to the code, this can only happen on the path:
> request_threaded_irq() -> __setup_irq()
> Now in setup irq, the only place where EBUSY can show up for us is here:
> ...
>   raw_spin_lock_irqsave(&desc->lock, flags);
>   old_ptr = &desc->action;
>   old = *old_ptr;
>   if (old) {
>   /*
>* Can't share interrupts unless both agree to and are
>* the same type (level, edge, polarity). So both flag
>* fields must have IRQF_SHARED set and the bits which
>* set the trigger type must match. Also all must
>* agree on ONESHOT.
>*/
>   if (!((old->flags & new->flags) & IRQF_SHARED) ||
>   ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
>   ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
>   old_name = old->name;
>   goto mismatch;
>   }
> 
>   /* All handlers must agree on per-cpuness */
>   if ((old->flags & IRQF_PERCPU) !=
>   (new->flags & IRQF_PERCPU))
>   goto mismatch;
> 
> KVM calls request_threaded_irq() with flags==0, so can it be that
> different KVM processes request the same IRQ?

Shouldn't be possible, irqs are allocated from a bitmap protected by a
mutex, see __irq_alloc_descs

>  How different KVM
> processes spawned simultaneously agree between them on IRQ numbers?

They don't, MSI/X vectors are not currently share-able.  Can you show
that you're actually getting duplicate irq vectors?  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] KVM: PPC: Book3S HV: Handle guest-caused machine checks on POWER7 without panicking

2012-11-22 Thread Paul Mackerras
Currently, if a machine check interrupt happens while we are in the
guest, we exit the guest and call the host's machine check handler,
which tends to cause the host to panic.  Some machine checks can be
triggered by the guest; for example, if the guest creates two entries
in the SLB that map the same effective address, and then accesses that
effective address, the CPU will take a machine check interrupt.

To handle this better, we firstly don't call the host's machine check
handler for machine checks that happen inside the guest.  Instead we
call a new function, kvmppc_realmode_machine_check(), while still in
real mode before exiting the guest.  On POWER7, it handles the cases
that the guest can trigger, either by flushing and reloading the SLB,
or by flushing the TLB, and then it delivers the machine check interrupt
to the guest.  (On PPC970 we currently just exit the guest and leave it
up to kvmppc_handle_exit() to handle the condition, which it doesn't,
so the situation is no better -- but no worse -- on PPC970 now.)

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/kvm/Makefile   |1 +
 arch/powerpc/kvm/book3s_hv_ras.c|  115 +++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   61 +---
 3 files changed, 153 insertions(+), 24 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_ras.c

diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index cd89658..1e473d4 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -73,6 +73,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
book3s_hv_rmhandlers.o \
book3s_hv_rm_mmu.o \
book3s_64_vio_hv.o \
+   book3s_hv_ras.o \
book3s_hv_builtin.o
 
 kvm-book3s_64-module-objs := \
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
new file mode 100644
index 000..2646d07
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -0,0 +1,115 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2012 Paul Mackerras, IBM Corp. 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/* SRR1 bits for machine check on POWER7 */
+#define SRR1_MC_LDSTERR(1ul << (63-42))
+#define SRR1_MC_IFETCH_SH  (63-45)
+#define SRR1_MC_IFETCH_MASK0x7
+#define SRR1_MC_IFETCH_SLBPAR  2   /* SLB parity error */
+#define SRR1_MC_IFETCH_SLBMULTI3   /* SLB multi-hit */
+#define SRR1_MC_IFETCH_SLBPARMULTI 4   /* SLB parity + multi-hit */
+#define SRR1_MC_IFETCH_TLBMULTI5   /* I-TLB multi-hit */
+
+/* DSISR bits for machine check on POWER7 */
+#define DSISR_MC_DERAT_MULTI   0x800   /* D-ERAT multi-hit */
+#define DSISR_MC_TLB_MULTI 0x400   /* D-TLB multi-hit */
+#define DSISR_MC_SLB_PARITY0x100   /* SLB parity error */
+#define DSISR_MC_SLB_MULTI 0x080   /* SLB multi-hit */
+#define DSISR_MC_SLB_PARMULTI  0x040   /* SLB parity + multi-hit */
+
+/* POWER7 SLB flush and reload */
+static void reload_slb(struct kvm_vcpu *vcpu)
+{
+   struct slb_shadow *slb;
+   unsigned long i, n;
+
+   /* First clear out SLB */
+   asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+   /* Do they have an SLB shadow buffer registered? */
+   slb = vcpu->arch.slb_shadow.pinned_addr;
+   if (!slb)
+   return;
+
+   /* Sanity check */
+   n = slb->persistent;
+   if (n > SLB_MIN_SIZE)
+   n = SLB_MIN_SIZE;
+   if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end)
+   return;
+
+   /* Load up the SLB from that */
+   for (i = 0; i < n; ++i) {
+   unsigned long rb = slb->save_area[i].esid;
+   unsigned long rs = slb->save_area[i].vsid;
+
+   rb = (rb & ~0xFFFul) | i;   /* insert entry number */
+   asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+   }
+}
+
+/* POWER7 TLB flush */
+static void flush_tlb_power7(struct kvm_vcpu *vcpu)
+{
+   unsigned long i, rb;
+
+   rb = 0x800; /* IS field = 0b10, flush congruence class */
+   for (i = 0; i < 128; ++i) {
+   asm volatile("tlbiel %0" : : "r" (rb));
+   rb += 0x1000;
+   }
+}
+
+/*
+ * On POWER7, see if we can handle a machine check that occurred inside
+ * the guest in real mode, without switching to the host partition.
+ *
+ * Returns: 0 => exit guest, 1 => deliver machine check to guest
+ */
+static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
+{
+   unsigned long srr1 = vcpu->arch.shregs.msr;
+
+   if (srr1 & SRR1_MC_LDSTERR) {
+   /* error on load/store */
+   unsigned long dsisr = vcpu->arch.shregs.dsisr;
+
+   if (dsisr & (DSISR_MC_SLB_PARMULTI | DS

[PATCH 4/5] KVM: PPC: Book3S HV: Don't give the guest RW access to RO pages

2012-11-22 Thread Paul Mackerras
Currently, if the guest does an H_PROTECT hcall requesting that the
permissions on a HPT entry be changed to allow writing, we make the
requested change even if the page is marked read-only in the host
Linux page tables.  This is a problem since it would for instance
allow a guest to modify a page that KSM has decided can be shared
between multiple guests.

To fix this, if the new permissions for the page allow writing, we need
to look up the memslot for the page, work out the host virtual address,
and look up the Linux page tables to get the PTE for the page.  If that
PTE is read-only, we reduce the HPTE permissions to read-only.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |   22 ++
 1 file changed, 22 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 7e1f7e2..19c93ba 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -629,6 +629,28 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long 
flags,
asm volatile("tlbiel %0" : : "r" (rb));
asm volatile("ptesync" : : : "memory");
}
+   /*
+* If the host has this page as readonly but the guest
+* wants to make it read/write, reduce the permissions.
+* Checking the host permissions involves finding the
+* memslot and then the Linux PTE for the page.
+*/
+   if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
+   unsigned long psize, gfn, hva;
+   struct kvm_memory_slot *memslot;
+   pgd_t *pgdir = vcpu->arch.pgdir;
+   pte_t pte;
+
+   psize = hpte_page_size(v, r);
+   gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
+   memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+   if (memslot) {
+   hva = __gfn_to_hva_memslot(memslot, gfn);
+   pte = lookup_linux_pte(pgdir, hva, 1, &psize);
+   if (pte_present(pte) && !pte_write(pte))
+   r = hpte_make_readonly(r);
+   }
+   }
}
hpte[1] = r;
eieio();
-- 
1.7.10.rc3.219.g53414

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/5] KVM: PPC: Book3S HV: Reset reverse-map chains when resetting the HPT

2012-11-22 Thread Paul Mackerras
With HV-style KVM, we maintain reverse-mapping lists that enable us to
find all the HPT (hashed page table) entries that reference each guest
physical page, with the heads of the lists in the memslot->arch.rmap
arrays.  When we reset the HPT (i.e. when we reboot the VM), we clear
out all the HPT entries but we were not clearing out the reverse
mapping lists.  The result is that as we create new HPT entries, the
lists get corrupted, which can easily lead to loops, resulting in the
host kernel hanging when it tries to traverse those lists.

This fixes the problem by zeroing out all the reverse mapping lists
when we zero out the HPT.  This incidentally means that we are also
zeroing our record of the referenced and changed bits (not the bits
in the Linux PTEs, used by the Linux MM subsystem, but the bits used
by the KVM_GET_DIRTY_LOG ioctl, and those used by kvm_age_hva() and
kvm_test_age_hva()).

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   24 
 1 file changed, 24 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 0aa4073..1029e22 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -46,6 +46,7 @@
 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh,
unsigned long ptel, unsigned long *pte_idx_ret);
+static void kvmppc_rmap_reset(struct kvm *kvm);
 
 long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 {
@@ -144,6 +145,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 
*htab_orderp)
/* Set the entire HPT to 0, i.e. invalid HPTEs */
memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
/*
+* Reset all the reverse-mapping chains for all memslots
+*/
+   kvmppc_rmap_reset(kvm);
+   /*
 * Set the whole last_vcpu array to an invalid vcpu number.
 * This ensures that each vcpu will flush its TLB on next entry.
 */
@@ -772,6 +777,25 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
goto out_put;
 }
 
+static void kvmppc_rmap_reset(struct kvm *kvm)
+{
+   struct kvm_memslots *slots;
+   struct kvm_memory_slot *memslot;
+   int srcu_idx;
+
+   srcu_idx = srcu_read_lock(&kvm->srcu);
+   slots = kvm->memslots;
+   kvm_for_each_memslot(memslot, slots) {
+   /*
+* This assumes it is acceptable to lose reference and
+* change bits across a reset.
+*/
+   memset(memslot->arch.rmap, 0,
+  memslot->npages * sizeof(*memslot->arch.rmap));
+   }
+   srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
 static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long start,
unsigned long end,
-- 
1.7.10.rc3.219.g53414

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vhost-blk: Add vhost-blk support v5

2012-11-22 Thread Asias He
On 11/21/2012 07:57 PM, Michael S. Tsirkin wrote:
> On Wed, Nov 21, 2012 at 12:24:55PM +0800, Asias He wrote:
>> On 11/20/2012 09:37 PM, Michael S. Tsirkin wrote:
>>> On Tue, Nov 20, 2012 at 02:39:40PM +0800, Asias He wrote:
 On 11/20/2012 04:26 AM, Michael S. Tsirkin wrote:
> On Mon, Nov 19, 2012 at 04:53:42PM +0800, Asias He wrote:
>> vhost-blk is an in-kernel virtio-blk device accelerator.
>>
>> Due to lack of proper in-kernel AIO interface, this version converts
>> guest's I/O request to bio and use submit_bio() to submit I/O directly.
>> So this version only supports raw block devices as guest's disk image,
>> e.g. /dev/sda, /dev/ram0. We can add file based image support to
>> vhost-blk once we have in-kernel AIO interface. There are some work in
>> progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:
>>
>>http://marc.info/?l=linux-fsdevel&m=133312234313122
>>
>> Performance evaluation:
>> -
>> 1) LKVM
>> Fio with libaio ioengine on Fusion IO device using kvm tool
>> IOPS(k)Before   After   Improvement
>> seq-read   107  121 +13.0%
>> seq-write  130  179 +37.6%
>> rnd-read   102  122 +19.6%
>> rnd-write  125  159 +27.0%
>>
>> 2) QEMU
>> Fio with libaio ioengine on Fusion IO device using QEMU
>> IOPS(k)Before   After   Improvement
>> seq-read   76   123 +61.8%
>> seq-write  139  173 +24.4%
>> rnd-read   73   120 +64.3%
>> rnd-write  75   156 +108.0%
>
> Could you compare with dataplane qemu as well please?


 Well, I will try to collect it.

>
>>
>> Userspace bits:
>> -
>> 1) LKVM
>> The latest vhost-blk userspace bits for kvm tool can be found here:
>> g...@github.com:asias/linux-kvm.git blk.vhost-blk
>>
>> 2) QEMU
>> The latest vhost-blk userspace prototype for QEMU can be found here:
>> g...@github.com:asias/qemu.git blk.vhost-blk
>>
>> Changes in v5:
>> - Do not assume the buffer layout
>> - Fix wakeup race
>>
>> Changes in v4:
>> - Mark req->status as userspace pointer
>> - Use __copy_to_user() instead of copy_to_user() in 
>> vhost_blk_set_status()
>> - Add if (need_resched()) schedule() in blk thread
>> - Kill vhost_blk_stop_vq() and move it into vhost_blk_stop()
>> - Use vq_err() instead of pr_warn()
>> - Fail on unsupported request
>> - Add flush in vhost_blk_set_features()
>>
>> Changes in v3:
>> - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph!
>> - Check file passed by user is a raw block device file
>>
>> Signed-off-by: Asias He 
>
> Since there are files shared by this and vhost net
> it's easiest for me to merge this all through the
> vhost tree.
>
> Jens, could you ack this and the bio usage in this driver
> please?
>
>> ---
>>  drivers/vhost/Kconfig |   1 +
>>  drivers/vhost/Kconfig.blk |  10 +
>>  drivers/vhost/Makefile|   2 +
>>  drivers/vhost/blk.c   | 697 
>> ++
>>  drivers/vhost/blk.h   |   8 +
>>  5 files changed, 718 insertions(+)
>>  create mode 100644 drivers/vhost/Kconfig.blk
>>  create mode 100644 drivers/vhost/blk.c
>>  create mode 100644 drivers/vhost/blk.h
>>
>> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
>> index 202bba6..acd8038 100644
>> --- a/drivers/vhost/Kconfig
>> +++ b/drivers/vhost/Kconfig
>> @@ -11,4 +11,5 @@ config VHOST_NET
>>  
>>  if STAGING
>>  source "drivers/vhost/Kconfig.tcm"
>> +source "drivers/vhost/Kconfig.blk"
>>  endif
>> diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk
>> new file mode 100644
>> index 000..ff8ab76
>> --- /dev/null
>> +++ b/drivers/vhost/Kconfig.blk
>> @@ -0,0 +1,10 @@
>> +config VHOST_BLK
>> +tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
>> +depends on BLOCK &&  EXPERIMENTAL && m
>> +---help---
>> +  This kernel module can be loaded in host kernel to accelerate
>> +  guest block with virtio_blk. Not to be confused with 
>> virtio_blk
>> +  module itself which needs to be loaded in guest kernel.
>> +
>> +  To compile this driver as a module, choose M here: the module 
>> will
>> +  be called vhost_blk.
>> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
>> index a27b053..1a8a4a5 100644
>> --- a/drivers/vhost/Makefile
>> +++ b/drivers/vhost/Makefile
>> @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
>>  vhost_net-y := vhost.o net.o
>>  
>>  obj-$(CONFIG_TCM

[PATCH] Added x86/tsc_adjust.c to test the ia32_tsc_adjust functionality.

2012-11-22 Thread Will Auld
Signed-off-by: Will Auld 
---
 config-x86-common.mak |  5 -
 x86/tsc_adjust.c  | 43 +++
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 x86/tsc_adjust.c

diff --git a/config-x86-common.mak b/config-x86-common.mak
index c76cd11..47a9056 100644
--- a/config-x86-common.mak
+++ b/config-x86-common.mak
@@ -34,7 +34,8 @@ tests-common = $(TEST_DIR)/vmexit.flat $(TEST_DIR)/tsc.flat \
$(TEST_DIR)/realmode.flat $(TEST_DIR)/msr.flat \
$(TEST_DIR)/hypercall.flat $(TEST_DIR)/sieve.flat \
$(TEST_DIR)/kvmclock_test.flat  $(TEST_DIR)/eventinj.flat \
-   $(TEST_DIR)/s3.flat $(TEST_DIR)/pmu.flat 
$(TEST_DIR)/asyncpf.flat
+   $(TEST_DIR)/s3.flat $(TEST_DIR)/pmu.flat \
+  $(TEST_DIR)/tsc_adjust.flat $(TEST_DIR)/asyncpf.flat
 
 ifdef API
 tests-common += api/api-sample
@@ -64,6 +65,8 @@ $(TEST_DIR)/port80.elf: $(cstart.o) $(TEST_DIR)/port80.o
 
 $(TEST_DIR)/tsc.elf: $(cstart.o) $(TEST_DIR)/tsc.o
 
+$(TEST_DIR)/tsc_adjust.elf: $(cstart.o) $(TEST_DIR)/tsc_adjust.o
+
 $(TEST_DIR)/apic.elf: $(cstart.o) $(TEST_DIR)/apic.o
 
 $(TEST_DIR)/realmode.elf: $(TEST_DIR)/realmode.o
diff --git a/x86/tsc_adjust.c b/x86/tsc_adjust.c
new file mode 100644
index 000..bcb8982
--- /dev/null
+++ b/x86/tsc_adjust.c
@@ -0,0 +1,43 @@
+#include "libcflat.h"
+#include "processor.h"
+
+#define IA32_TSC_ADJUST 0x3b
+
+int main()
+{
+   u64 t1, t2, t3, t4, t5;
+   u64 lat;
+
+   t3 = 0x0;
+
+   t1 = rdtsc();
+   wrmsr(IA32_TSC_ADJUST, t3);
+   t2 = rdtsc();
+   lat = t2 - t1;
+   printf("rdtsc/wrmsr/rdtsc latency %lld\n", lat);
+   printf("Initial rdtsc: %lld\n", t2);
+
+   t1 = rdmsr(IA32_TSC_ADJUST);
+   printf("Initial rdmsr IA32_TSC_ADJUST: %lld\n", t1);
+   
+   t5 = 1000ull;
+   wrtsc(t5);
+   t1 = rdmsr(IA32_TSC_ADJUST);
+   printf("wrtsc %lld, rdmsr IA32_TSC_ADJUST: %lld\n", t5, t1);
+
+   wrmsr(IA32_TSC_ADJUST, t3);
+   t2 = rdtsc();
+   t1 = rdmsr(IA32_TSC_ADJUST);
+   printf( "wrmsr IA32_TSC_ADJUST %lld, rdmsr IA32_TSC_ADJUST: %lld, 
rdtsc: %lld\n", t3, t1, t2);
+   
+   t3 = 0x;
+   t4 = rdtsc();
+   wrmsr(IA32_TSC_ADJUST, t3);
+   t2 = rdtsc();
+   t1 = rdmsr(IA32_TSC_ADJUST);
+   printf( "wrmsr IA32_TSC_ADJUST %lld, rdmsr IA32_TSC_ADJUST: %lld, 
rdtsc: %lld\n", t3, t1, t2);
+   lat = t2 - t4;
+   printf("rdtsc/wrmsr/rdtsc latency %lld\n", lat);
+   
+   return 0;
+}
-- 
1.8.0.rc0



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/2] x86/kexec: VMCLEAR vmcss on all cpus if necessary

2012-11-22 Thread Gleb Natapov
On Thu, Nov 22, 2012 at 08:59:15AM +0800, Zhang Yanfei wrote:
> 于 2012年11月21日 18:33, Gleb Natapov 写道:
> > On Wed, Nov 21, 2012 at 10:23:12AM +0800, Zhang Yanfei wrote:
> >> This patch adds an atomic notifier list named crash_notifier_list.
> >> When loading kvm-intel module, a notifier will be registered in
> >> the list to enable vmcss loaded on all cpus to be VMCLEAR'd if
> >> needed.
> >>
> >> Signed-off-by: Zhang Yanfei 
> >> ---
> >>  arch/x86/include/asm/kexec.h |2 ++
> >>  arch/x86/kernel/crash.c  |   25 +
> >>  2 files changed, 27 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
> >> index 317ff17..5e22b00 100644
> >> --- a/arch/x86/include/asm/kexec.h
> >> +++ b/arch/x86/include/asm/kexec.h
> >> @@ -163,6 +163,8 @@ struct kimage_arch {
> >>  };
> >>  #endif
> >>  
> >> +extern struct atomic_notifier_head crash_notifier_list;
> >> +
> >>  #endif /* __ASSEMBLY__ */
> >>  
> >>  #endif /* _ASM_X86_KEXEC_H */
> >> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> >> index 13ad899..0f3d5b4 100644
> >> --- a/arch/x86/kernel/crash.c
> >> +++ b/arch/x86/kernel/crash.c
> >> @@ -16,6 +16,8 @@
> >>  #include 
> >>  #include 
> >>  #include 
> >> +#include 
> >> +#include 
> >>  
> >>  #include 
> >>  #include 
> >> @@ -30,6 +32,19 @@
> >>  
> >>  int in_crash_kexec;
> >>  
> >> +/*
> >> + * The list is used to VMCLEAR vmcss loaded on all
> >> + * cpus. And when loading kvm_intel module, the
> >> + * vmclear function will be registered in the list.
> >> + */
> >> +ATOMIC_NOTIFIER_HEAD(crash_notifier_list);
> >> +EXPORT_SYMBOL_GPL(crash_notifier_list);
> >> +
> >> +static inline void cpu_emergency_vmclear_loaded_vmcss(void)
> >> +{
> >> +  atomic_notifier_call_chain(&crash_notifier_list, 0, NULL);
> >> +}
> >> +
> > The notifier list is not VMX specific. It may be used for other
> > purposes, so please use better name or just open code it.
> 
> OK, thanks. crash_notifier_list --> vmclear_notifier_list?
It is even worse. You are adding general infrastructure here, not
related to KVM at all. Choose neutral names, drop all comments about
VMCLEAR.

> the new v6 version has been sent, any comments are welcome.
> 
> > 
> > ACKs from kexec side are needed.
> > 
> >>  #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
> >>  
> >>  static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
> >> @@ -46,6 +61,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs 
> >> *regs)
> >>  #endif
> >>crash_save_cpu(regs, cpu);
> >>  
> >> +  /*
> >> +   * VMCLEAR vmcss loaded on all cpus if needed.
> >> +   */
> >> +  cpu_emergency_vmclear_loaded_vmcss();
> >> +
> >>/* Disable VMX or SVM if needed.
> >> *
> >> * We need to disable virtualization on all CPUs.
> >> @@ -88,6 +108,11 @@ void native_machine_crash_shutdown(struct pt_regs 
> >> *regs)
> >>  
> >>kdump_nmi_shootdown_cpus();
> >>  
> >> +  /*
> >> +   * VMCLEAR vmcss loaded on this cpu if needed.
> >> +   */
> >> +  cpu_emergency_vmclear_loaded_vmcss();
> >> +
> >>/* Booting kdump kernel with VMX or SVM enabled won't work,
> >> * because (among other limitations) we can't disable paging
> >> * with the virt flags.
> >> -- 
> >> 1.7.1
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >> the body of a message to majord...@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >> Please read the FAQ at  http://www.tux.org/lkml/
> > 
> > --
> > Gleb.
> > 
> 

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 2/2] KVM-INTEL: add a notifier and a bitmap to support VMCLEAR in kdump

2012-11-22 Thread Marcelo Tosatti
On Wed, Nov 21, 2012 at 11:27:19PM +0800, Zhang Yanfei wrote:
> The notifier will be registered in vmclear_notifier_list when loading
> kvm-intel module. And the bitmap indicates whether we should do
> VMCLEAR operation in kdump. The bits in the bitmap are set/unset
> according to different conditions.
> 
> Signed-off-by: Zhang Yanfei 
> ---
>  arch/x86/kvm/vmx.c |   77 
> +++-
>  1 files changed, 76 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 4ff0ab9..eea55b3 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -41,6 +41,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "trace.h"
>  
> @@ -963,6 +964,49 @@ static void vmcs_load(struct vmcs *vmcs)
>  vmcs, phys_addr);
>  }
>  
> +#ifdef CONFIG_KEXEC
> +/*
> + * This bitmap is used to indicate whether the vmclear
> + * operation is enabled on all cpus. All disabled by
> + * default.
> + */
> +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
> +
> +static inline void crash_enable_local_vmclear(int cpu)
> +{
> + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
> +}
> +
> +static inline void crash_disable_local_vmclear(int cpu)
> +{
> + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
> +}
> +
> +static inline int crash_local_vmclear_enabled(int cpu)
> +{
> + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
> +}
> +
> +static void vmclear_local_loaded_vmcss(void);
> +static int crash_vmclear_local_loaded_vmcss(struct notifier_block *this,
> + unsigned long val, void *ptr)
> +{
> + int cpu = raw_smp_processor_id();
> +
> + if (crash_local_vmclear_enabled(cpu))
> + vmclear_local_loaded_vmcss();
> +
> + return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block crash_vmclear_notifier = {
> + .notifier_call = crash_vmclear_local_loaded_vmcss,
> +};
> +#else
> +static inline void crash_enable_local_vmclear(int cpu) { }
> +static inline void crash_disable_local_vmclear(int cpu) { }
> +#endif /* CONFIG_KEXEC */
> +
>  static void __loaded_vmcs_clear(void *arg)
>  {
>   struct loaded_vmcs *loaded_vmcs = arg;
> @@ -972,8 +1016,10 @@ static void __loaded_vmcs_clear(void *arg)
>   return; /* vcpu migration can race with cpu offline */
>   if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
>   per_cpu(current_vmcs, cpu) = NULL;
> + crash_disable_local_vmclear(cpu);
>   list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
>   loaded_vmcs_init(loaded_vmcs);
> + crash_enable_local_vmclear(cpu);
>  }
>  
>  static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
> @@ -1491,8 +1537,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int 
> cpu)
>  
>   kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>   local_irq_disable();
> + crash_disable_local_vmclear(cpu);
>   list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
>&per_cpu(loaded_vmcss_on_cpu, cpu));
> + crash_enable_local_vmclear(cpu);
>   local_irq_enable();
>  
>   /*
> @@ -2302,6 +2350,18 @@ static int hardware_enable(void *garbage)
>   return -EBUSY;
>  
>   INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
> +
> + /*
> +  * Now we can enable the vmclear operation in kdump
> +  * since the loaded_vmcss_on_cpu list on this cpu
> +  * has been initialized.
> +  *
> +  * Though the cpu is not in VMX operation now, there
> +  * is no problem to enable the vmclear operation
> +  * for the loaded_vmcss_on_cpu list is empty!
> +  */
> + crash_enable_local_vmclear(cpu);
> +
>   rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
>  
>   test_bits = FEATURE_CONTROL_LOCKED;
> @@ -2335,7 +2395,6 @@ static void vmclear_local_loaded_vmcss(void)
>   __loaded_vmcs_clear(v);
>  }
>  
> -
>  /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
>   * tricks.
>   */
> @@ -2348,6 +2407,12 @@ static void hardware_disable(void *garbage)
>  {
>   if (vmm_exclusive) {
>   vmclear_local_loaded_vmcss();
> + /*
> +  * vmclear operation in kdump should be disabled here
> +  * because the cpu is going to exit VMX operation
> +  * and the loaded_vmcss_on_cpu list may not be empty!
> +  */
> + crash_disable_local_vmclear(raw_smp_processor_id());
>   kvm_cpu_vmxoff();

How come it's not empty? vmclear_local_loaded_vmcss cleared it, didn't it?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Ubuntu/Debian Installer + Virtio-SCSI -> Bad ram pointer

2012-11-22 Thread Peter Lieven


On 19.11.2012 18:20, Stefan Hajnoczi wrote:

On Thu, Nov 8, 2012 at 4:26 PM, Peter Lieven  wrote:

Has anyone any other idea what the cause could be or where to start?


Hi Peter,
I suggested posting the source tree you are building.  Since you have
applied patches yourself no one else is able to follow along with the
gdb output or reproduce the issue accurately.


Sorry for the late reply, I used qemu git at 
e24dc9feb0d68142d54dc3c097f57588836d1338
and libiscsi git at 3b3036b9dae55f0c3eef9d75db89c7b78f637a12.

The cmdline:
qemu-system-x86_64 -enable-kvm -m 1024 -drive 
if=virtio,file=iscsi://172.21.200.56/iqn.2001-05.com.equallogic:0-8a0906-62ff4e007-e4a3c8908af50839-test-3000g/0
 -cdrom ubuntu-12.04.1-server-amd64.iso -vnc :1

The vm crashes with:

Bad ram pointer 0x7fd220008000

after the user settings and timezone config when loading the module
libdmraid1.0.0.rc16-udeb

I hope this helps to reproduce.

Peter



Stefan



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: messed up with xml-files and configuration of a VM - SOLVED

2012-11-22 Thread Lentes, Bernd


> -Original Message-
> From: kvm-ow...@vger.kernel.org
> [mailto:kvm-ow...@vger.kernel.org] On Behalf Of Lentes, Bernd
> Sent: Tuesday, November 20, 2012 5:14 PM
> To: KVM-ML (kvm@vger.kernel.org)
> Subject: messed up with xml-files and configuration of a VM
>
> Hi,
>
> first, i'm new to kvm. I'm running KVM on a sles 11 sp2,
> kernel 3.0.13-0.27-default. My guest is an Ubuntu 12.0.4 LTS 64bit.
> The guest has attached a CDROM, using an iso-file from a
> CIFS-Share. I detached it with the virtual machine manager (0.9.0).
> I don't see the cd-rom anymore in the virtual machine
> manager. But when i try to start the vm, it complains about
> the missing iso-file.
> Why ? I detached it.
> When i like to have a look in the xml-files of the guest, i
> found three ! One in /var/lib/kvm/images, one in
> /etc/libvirt/qemu and one in /etc/kvm/vm.
> Which one should i use to configure the vm ? In the one in
> /etc/libvirt/qemu the cifs-share isn't mentioned any longer,
> in the other two it is still.
> Is it possible to configure the vm editing one of the XML-files ?
> Or shall i use virsh ? Using virsh, does the vm has to be
> stopped or can i edit the configuration for a running vm ?
> Why three xml-files ? Why is detaching with the virtual
> machine manager not working ?
>
> Thanks for any inspiration.
>
>
Hi,

i found it myself:
http://wiki.libvirt.org/page/FAQ#What_is_the_.27virsh_edit.27_command_and_how_do_I_use_it.3F
 suggests to change configuration using "virsh edit".

I solved the problem with the iso-file reconnecting again the iso-file and 
disconnecting it afterwards. Now the vm is starting properly.


Bernd

Helmholtz Zentrum München
Deutsches Forschungszentrum für Gesundheit und Umwelt (GmbH)
Ingolstädter Landstr. 1
85764 Neuherberg
www.helmholtz-muenchen.de
Aufsichtsratsvorsitzende: MinDir´in Bärbel Brumme-Bothe
Geschäftsführer: Prof. Dr. Günther Wess und Dr. Nikolaus Blum
Registergericht: Amtsgericht München HRB 6466
USt-IdNr: DE 129521671


Re: [PATCH 1/4] nested vmx: clean up for vmcs12 read and write

2012-11-22 Thread Gleb Natapov
On Thu, Nov 22, 2012 at 03:16:47AM +, Xu, Dongxiao wrote:
> 
> 
> > -Original Message-
> > From: Gleb Natapov [mailto:g...@redhat.com]
> > Sent: Wednesday, November 21, 2012 9:27 PM
> > To: Xu, Dongxiao
> > Cc: kvm@vger.kernel.org; mtosa...@redhat.com
> > Subject: Re: [PATCH 1/4] nested vmx: clean up for vmcs12 read and write
> > 
> > On Wed, Nov 21, 2012 at 05:04:34PM +0800, Dongxiao Xu wrote:
> > > abstract vmcs12_read and vmcs12_write functions to do the vmcs12
> > > read/write operations.
> > >
> > > Signed-off-by: Dongxiao Xu 
> > > ---
> > >  arch/x86/kvm/vmx.c |   86
> > +++-
> > >  1 files changed, 45 insertions(+), 41 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index
> > > f858159..d8670e4 100644
> > > --- a/arch/x86/kvm/vmx.c
> > > +++ b/arch/x86/kvm/vmx.c
> > > @@ -5407,32 +5407,67 @@ static inline int vmcs_field_readonly(unsigned
> > long field)
> > >   * some of the bits we return here (e.g., on 32-bit guests, only 32 bits 
> > > of
> > >   * 64-bit fields are to be returned).
> > >   */
> > > -static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
> > > - unsigned long field, u64 *ret)
> > > +static inline u64 vmcs12_read(struct kvm_vcpu *vcpu, unsigned long
> > > +field)
> > >  {
> > >   short offset = vmcs_field_to_offset(field);
> > >   char *p;
> > >
> > > - if (offset < 0)
> > > + if (offset < 0) {
> > > + nested_vmx_failValid(vcpu,
> > VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > > + skip_emulated_instruction(vcpu);
> > >   return 0;
> > > + }
> > >
> > >   p = ((char *)(get_vmcs12(vcpu))) + offset;
> > >
> > >   switch (vmcs_field_type(field)) {
> > >   case VMCS_FIELD_TYPE_NATURAL_WIDTH:
> > > - *ret = *((natural_width *)p);
> > > + return *((natural_width *)p);
> > > + case VMCS_FIELD_TYPE_U16:
> > > + return *((u16 *)p);
> > > + case VMCS_FIELD_TYPE_U32:
> > > + return *((u32 *)p);
> > > + case VMCS_FIELD_TYPE_U64:
> > > + return *((u64 *)p);
> > > + default:
> > > + nested_vmx_failValid(vcpu,
> > VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > > + skip_emulated_instruction(vcpu);
> > > + return 0; /* can never happen. */
> > > + }
> > > +}
> > > +
> > > +static inline int vmcs12_write(struct kvm_vcpu *vcpu,
> > > + unsigned long field,
> > > + u64 value)
> > > +{
> > > + short offset = vmcs_field_to_offset(field);
> > > + char *p;
> > > +
> > > + if (offset < 0) {
> > > + nested_vmx_failValid(vcpu,
> > VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > > + skip_emulated_instruction(vcpu);
> > > + return 0;
> > > + }
> > > +
> > > + p = ((char *)(get_vmcs12(vcpu))) + offset;
> > > +
> > > + switch (vmcs_field_type(field)) {
> > > + case VMCS_FIELD_TYPE_NATURAL_WIDTH:
> > > + *(natural_width *)p = value;
> > >   return 1;
> > >   case VMCS_FIELD_TYPE_U16:
> > > - *ret = *((u16 *)p);
> > > + *(u16 *)p = value;
> > >   return 1;
> > >   case VMCS_FIELD_TYPE_U32:
> > > - *ret = *((u32 *)p);
> > > + *(u32 *)p = value;
> > >   return 1;
> > >   case VMCS_FIELD_TYPE_U64:
> > > - *ret = *((u64 *)p);
> > > + *(u64 *)p = value;
> > >   return 1;
> > >   default:
> > > - return 0; /* can never happen. */
> > > + nested_vmx_failValid(vcpu,
> > VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > > + skip_emulated_instruction(vcpu);
> > > + return 0;
> > >   }
> > >  }
> > >
> > > @@ -5466,11 +5501,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
> > >   /* Decode instruction info and find the field to read */
> > >   field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
> > >   /* Read the field, zero-extended to a u64 field_value */
> > > - if (!vmcs12_read_any(vcpu, field, &field_value)) {
> > > - nested_vmx_failValid(vcpu,
> > VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > > - skip_emulated_instruction(vcpu);
> > > - return 1;
> > > - }
> > > + field_value = vmcs12_read(vcpu, field);
> > You do not handle failure here and always write back field_value even if
> > vmcs12_read() failed. Actually now it is impossible to detect a failure. 
> > Call to
> > nested_vmx_failValid() in vmcs12_read() will be overwritten by call to
> > nested_vmx_succeed() at the end of
> > handle_vmread() and skip_emulated_instruction() will be called twice.
> 
> Thanks Gleb and Orit to raise this issue.
> 
> What about moving the offset check outside the vmcs12_read() and 
> vmcs12_write() function, and put it directly in handle_vmread() and 
> handle_vmwrite()?
> I think we only need to do offset error check in handle_vmread() and 
> handle_vmwrite() since they are to emulate correct behavior for guest VMM. 
> For example, if guest VMM reads a field that is not valid or writes a field 
> that is read only, then in emulation code

[PATCH v2 14/18] MIPS: ASM offsets for VCPU arch specific fields.

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 arch/mips/kernel/asm-offsets.c | 66 ++
 1 file changed, 66 insertions(+)

diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 0c4bce4..66895de 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -17,6 +17,8 @@
 #include 
 #include 
 
+#include 
+
 void output_ptreg_defines(void)
 {
COMMENT("MIPS pt_regs offsets.");
@@ -329,3 +331,67 @@ void output_pbe_defines(void)
BLANK();
 }
 #endif
+
+void output_kvm_defines(void)
+{
+   COMMENT(" KVM/MIPS Specfic offsets. ");
+   DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch));
+   OFFSET(VCPU_RUN, kvm_vcpu, run);
+   OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch);
+
+   OFFSET(VCPU_HOST_EBASE, kvm_vcpu_arch, host_ebase);
+   OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase);
+
+   OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack);
+   OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp);
+
+   OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr);
+   OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause);
+   OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc);
+   OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi);
+
+   OFFSET(VCPU_GUEST_INST, kvm_vcpu_arch, guest_inst);
+
+   OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]);
+   OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]);
+   OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]);
+   OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]);
+   OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]);
+   OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]);
+   OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]);
+   OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]);
+   OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]);
+   OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]);
+   OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]);
+   OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]);
+   OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]);
+   OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]);
+   OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]);
+   OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]);
+   OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]);
+   OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]);
+   OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]);
+   OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]);
+   OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]);
+   OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]);
+   OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]);
+   OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]);
+   OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]);
+   OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]);
+   OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]);
+   OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]);
+   OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]);
+   OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]);
+   OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]);
+   OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]);
+   OFFSET(VCPU_LO, kvm_vcpu_arch, lo);
+   OFFSET(VCPU_HI, kvm_vcpu_arch, hi);
+   OFFSET(VCPU_PC, kvm_vcpu_arch, pc);
+   OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0);
+   OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid);
+   OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid);
+
+   OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]);
+   OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]);
+   BLANK();
+}
-- 
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 12/18] MIPS: Export routines needed by the KVM module.

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 arch/mips/mm/c-r4k.c   | 6 --
 arch/mips/mm/cache.c   | 1 +
 arch/mips/mm/tlb-r4k.c | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index 2b61462..1923063 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -136,7 +136,8 @@ static void __cpuinit 
r4k_blast_dcache_page_indexed_setup(void)
r4k_blast_dcache_page_indexed = blast_dcache64_page_indexed;
 }
 
-static void (* r4k_blast_dcache)(void);
+void (* r4k_blast_dcache)(void);
+EXPORT_SYMBOL(r4k_blast_dcache);
 
 static void __cpuinit r4k_blast_dcache_setup(void)
 {
@@ -264,7 +265,8 @@ static void __cpuinit 
r4k_blast_icache_page_indexed_setup(void)
r4k_blast_icache_page_indexed = blast_icache64_page_indexed;
 }
 
-static void (* r4k_blast_icache)(void);
+void (* r4k_blast_icache)(void);
+EXPORT_SYMBOL(r4k_blast_icache);
 
 static void __cpuinit r4k_blast_icache_setup(void)
 {
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index 07cec44..5aeb3eb 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -48,6 +48,7 @@ void (*flush_icache_all)(void);
 
 EXPORT_SYMBOL_GPL(local_flush_data_cache_page);
 EXPORT_SYMBOL(flush_data_cache_page);
+EXPORT_SYMBOL(flush_icache_all);
 
 #ifdef CONFIG_DMA_NONCOHERENT
 
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 4b9b935..fd30887 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -94,6 +95,7 @@ void local_flush_tlb_all(void)
FLUSH_ITLB;
EXIT_CRITICAL(flags);
 }
+EXPORT_SYMBOL(local_flush_tlb_all);
 
 /* All entries common to a mm share an asid.  To effectively flush
these entries, we just bump the asid. */
-- 
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2] Enabling IA32_TSC_ADJUST for Qemu KVM guest VMs

2012-11-22 Thread Will Auld
CPUID.7.0.EBX[1]=1 indicates IA32_TSC_ADJUST MSR 0x3b is supported

Basic design is to emulate the MSR by allowing reads and writes to the
hypervisor vcpu specific locations to store the value of the emulated MSRs.
In this way the IA32_TSC_ADJUST value will be included in all reads to
the TSC MSR whether through rdmsr or rdtsc.

As this is a new MSR that the guest may access and modify its value needs
to be migrated along with the other MSRs. The changes here are specifically
for recognizing when IA32_TSC_ADJUST is enabled in CPUID and code added
for migrating its value.

Signed-off-by: Will Auld 
---
 target-i386/cpu.h |  2 ++
 target-i386/kvm.c | 15 +++
 target-i386/machine.c | 21 +
 3 files changed, 38 insertions(+)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index aabf993..13d4152 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -284,6 +284,7 @@
 #define MSR_IA32_APICBASE_BSP   (1<<8)
 #define MSR_IA32_APICBASE_ENABLE(1<<11)
 #define MSR_IA32_APICBASE_BASE  (0xf<<12)
+#define MSR_TSC_ADJUST 0x003b
 #define MSR_IA32_TSCDEADLINE0x6e0
 
 #define MSR_MTRRcap0xfe
@@ -701,6 +702,7 @@ typedef struct CPUX86State {
 uint64_t async_pf_en_msr;
 
 uint64_t tsc;
+uint64_t tsc_adjust;
 uint64_t tsc_deadline;
 
 uint64_t mcg_status;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 696b14a..e974c42 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -61,6 +61,7 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 
 static bool has_msr_star;
 static bool has_msr_hsave_pa;
+static bool has_msr_tsc_adjust;
 static bool has_msr_tsc_deadline;
 static bool has_msr_async_pf_en;
 static bool has_msr_misc_enable;
@@ -641,6 +642,10 @@ static int kvm_get_supported_msrs(KVMState *s)
 has_msr_hsave_pa = true;
 continue;
 }
+if (kvm_msr_list->indices[i] == MSR_TSC_ADJUST) {
+has_msr_tsc_adjust = true;
+continue;
+}
 if (kvm_msr_list->indices[i] == MSR_IA32_TSCDEADLINE) {
 has_msr_tsc_deadline = true;
 continue;
@@ -978,6 +983,10 @@ static int kvm_put_msrs(CPUX86State *env, int level)
 if (has_msr_hsave_pa) {
 kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
 }
+if (has_msr_tsc_adjust) {
+kvm_msr_entry_set(&msrs[n++], 
+   MSR_TSC_ADJUST, env->tsc_adjust);
+}
 if (has_msr_tsc_deadline) {
 kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSCDEADLINE, env->tsc_deadline);
 }
@@ -1234,6 +1243,9 @@ static int kvm_get_msrs(CPUX86State *env)
 if (has_msr_hsave_pa) {
 msrs[n++].index = MSR_VM_HSAVE_PA;
 }
+if (has_msr_tsc_adjust) {
+msrs[n++].index = MSR_TSC_ADJUST;
+}
 if (has_msr_tsc_deadline) {
 msrs[n++].index = MSR_IA32_TSCDEADLINE;
 }
@@ -1308,6 +1320,9 @@ static int kvm_get_msrs(CPUX86State *env)
 case MSR_IA32_TSC:
 env->tsc = msrs[i].data;
 break;
+case MSR_TSC_ADJUST:
+env->tsc_adjust = msrs[i].data;
+break;
 case MSR_IA32_TSCDEADLINE:
 env->tsc_deadline = msrs[i].data;
 break;
diff --git a/target-i386/machine.c b/target-i386/machine.c
index a8be058..95bda9b 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -310,6 +310,24 @@ static const VMStateDescription vmstate_fpop_ip_dp = {
 }
 };
 
+static bool tsc_adjust_needed(void *opaque)
+{
+CPUX86State *cpu = opaque;
+
+return cpu->tsc_adjust != 0;
+}
+
+static const VMStateDescription vmstate_msr_tsc_adjust = {
+.name = "cpu/msr_tsc_adjust",
+.version_id = 1,
+.minimum_version_id = 1,
+.minimum_version_id_old = 1,
+.fields  = (VMStateField []) {
+VMSTATE_UINT64(tsc_adjust, CPUX86State),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static bool tscdeadline_needed(void *opaque)
 {
 CPUX86State *env = opaque;
@@ -457,6 +475,9 @@ static const VMStateDescription vmstate_cpu = {
 .vmsd = &vmstate_fpop_ip_dp,
 .needed = fpop_ip_dp_needed,
 }, {
+.vmsd = &vmstate_msr_tsc_adjust,
+.needed = tsc_adjust_needed,
+}, {
 .vmsd = &vmstate_msr_tscdeadline,
 .needed = tscdeadline_needed,
 }, {
-- 
1.8.0.rc0



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: pci_enable_msix() fails with ENOMEM/EINVAL

2012-11-22 Thread Alex Lyakas

Hi Alex,
thanks for your response.

I printed out the "vector" and "entry" values of dev->host_msix_entries[i] 
within assigned_device_enable_host_msix() before call to 
request_threaded_irq(). I see that they are all 0s:

kernel: [ 3332.610980] kvm-8095: KVM_ASSIGN_DEV_IRQ assigned_dev_id=924
kernel: [ 3332.610985] kvm-8095: assigned_device_enable_host_msix() 
assigned_dev_id=924 #0: [v=0 e=0]
kernel: [ 3332.610989] kvm-8095: assigned_device_enable_host_msix() 
assigned_dev_id=924 #1: [v=0 e=1]
kernel: [ 3332.610992] kvm-8095: assigned_device_enable_host_msix() 
assigned_dev_id=924 #2: [v=0 e=2]


So I don't really understand how they all ask for irq=0; I must be missing 
something. Is there any other explanation of request_threaded_irq() to 
return EBUSY? From the code I don't see that there is.


This issue is reproducible and is not going to go away by itself. Working 
around it is also problematic. We thought to check whether all IRQs are 
properly attached after QEMU sets the vm state to "running". However, vm 
state is set to "running" before IRQ attachments are performed; we debugged 
this and found out that they are done from a different thread, from a stack 
trace like this:

kvm_assign_irq()
assigned_dev_update_msix()
assigned_dev_pci_write_config()
pci_host_config_write_common()
pci_data_write()
pci_host_data_write()
memory_region_write_accessor()
access_with_adjusted_size()
memory_region_iorange_write()
ioport_writew_thunk()
ioport_write()
cpu_outw()
kvm_handle_io()
kvm_cpu_exec()
qemu_kvm_cpu_thread_fn()

So looks like this is performed on-demand (on first IO), so no reliable 
point to check that IRQs are attached properly. Another issue that in KVM 
code the return value of pci_host_config_write_common() is not checked, so 
there is no way to report a failure.


Is there any way you think you can help me debug this further?

Thanks,
Alex.


-Original Message- 
From: Alex Williamson

Sent: 22 November, 2012 12:25 AM
To: Alex Lyakas
Cc: kvm@vger.kernel.org
Subject: Re: pci_enable_msix() fails with ENOMEM/EINVAL

On Wed, 2012-11-21 at 16:19 +0200, Alex Lyakas wrote:

Hi,
I was advised to turn off irqbalance and reproduced this issue, but
the failure is in a different place now. Now request_threaded_irq()
fails with EBUSY.
According to the code, this can only happen on the path:
request_threaded_irq() -> __setup_irq()
Now in setup irq, the only place where EBUSY can show up for us is here:
...
raw_spin_lock_irqsave(&desc->lock, flags);
old_ptr = &desc->action;
old = *old_ptr;
if (old) {
/*
* Can't share interrupts unless both agree to and are
* the same type (level, edge, polarity). So both flag
* fields must have IRQF_SHARED set and the bits which
* set the trigger type must match. Also all must
* agree on ONESHOT.
*/
if (!((old->flags & new->flags) & IRQF_SHARED) ||
((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
((old->flags ^ new->flags) & IRQF_ONESHOT)) {
old_name = old->name;
goto mismatch;
}

/* All handlers must agree on per-cpuness */
if ((old->flags & IRQF_PERCPU) !=
(new->flags & IRQF_PERCPU))
goto mismatch;

KVM calls request_threaded_irq() with flags==0, so can it be that
different KVM processes request the same IRQ?


Shouldn't be possible, irqs are allocated from a bitmap protected by a
mutex, see __irq_alloc_descs


 How different KVM
processes spawned simultaneously agree between them on IRQ numbers?


They don't, MSI/X vectors are not currently share-able.  Can you show
that you're actually getting duplicate irq vectors?  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 8/8] kvm tools: add support for ARMv7 processors

2012-11-22 Thread Peter Maydell
On 22 November 2012 15:58, Will Deacon  wrote:
> +++ b/tools/kvm/arm/aarch32/smp-pen.S
> @@ -0,0 +1,30 @@
> +#include "kvm/kvm-arch.h"
> +
> +#include "arm-common/gic.h"
> +
> +   .arm
> +
> +   .globl  smp_pen_start
> +   .globl  smp_jump_addr
> +   .globl  smp_pen_end
> +
> +   .align
> +smp_pen_start:
> +   @ Ensure that the CPU interface is enabled for the wfi wakeup
> +   ldr r0, =ARM_GIC_CPUI_BASE
> +   mov r1, #GIC_CPUI_CTLR_EN
> +   str r1, [r0]
> +
> +   @ Now wait for the primary to poke us
> +   adr r0, smp_jump_addr
> +   dsb
> +   wfi
> +   ldr r1, [r0]
> +   mov pc, r1
> +
> +   .ltorg
> +
> +   .align
> +smp_jump_addr:
> +   .long   0xdeadc0de
> +smp_pen_end:

You've left the gate ajar on your pen -- this won't cope with
spurious WFI wakeups (the architecture allows WFI to return
at any time, down to the trivial case of "implemented as NOP").
Needs a 'branch back to WFI if not yet poked' (or you could
make the initial value stored at smp_jump_addr be the address
of the wfi :-))

-- PMM
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv4] virtio-spec: virtio network device RFS support

2012-11-22 Thread Michael S. Tsirkin
Add RFS support to virtio network device.
Add a new feature flag VIRTIO_NET_F_RFS for this feature, a new
configuration field max_virtqueue_pairs to detect supported number of
virtqueues as well as a new command VIRTIO_NET_CTRL_RFS to program
packet steering for unidirectional protocols.

Signed-off-by: Michael S. Tsirkin 

--

Changes from v3:
- rename multiqueue -> rfs this is what we support
- Be more explicit about what driver should do.
- Simplify layout making VQs functionality depend on feature.
- Remove unused commands, only leave in programming # of queues

Changes from v2:
Address Jason's comments on v2:
- Changed STEERING_HOST to STEERING_RX_FOLLOWS_TX:
  this is both clearer and easier to support.
  It does not look like we need a separate steering command
  since host can just watch tx packets as they go.
- Moved RX and TX steering sections near each other.
- Add motivation for other changes in v2

Changes from Jason's rfc:
- reserved vq 3: this makes all rx vqs even and tx vqs odd, which
  looks nicer to me.
- documented packet steering, added a generalized steering programming
  command. Current modes are single queue and host driven multiqueue,
  but I envision support for guest driven multiqueue in the future.
- make default vqs unused when in mq mode - this wastes some memory
  but makes it more efficient to switch between modes as
  we can avoid this causing packet reordering.

Rusty, could you please take a look and comment soon?
If this looks OK to everyone, we can proceed with finalizing the
implementation. Would be nice to try and put it in 3.8.

---

diff --git a/virtio-spec.lyx b/virtio-spec.lyx
index d2f0da9..c1fa3e4 100644
--- a/virtio-spec.lyx
+++ b/virtio-spec.lyx
@@ -59,6 +59,7 @@
 \author -608949062 "Rusty Russell,,," 
 \author -385801441 "Cornelia Huck" cornelia.h...@de.ibm.com
 \author 1531152142 "Paolo Bonzini,,," 
+\author 1986246365 "Michael S. Tsirkin" 
 \end_header
 
 \begin_body
@@ -4170,9 +4171,42 @@ ID 1
 \end_layout
 
 \begin_layout Description
-Virtqueues 0:receiveq.
- 1:transmitq.
- 2:controlq
+Virtqueues 0:receiveq
+\change_inserted 1986246365 1352742829
+0
+\change_unchanged
+.
+ 1:transmitq
+\change_inserted 1986246365 1352742832
+0
+\change_deleted 1986246365 1352742947
+.
+ 
+\change_inserted 1986246365 1352742952
+.
+ 
+ 2N
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1352743187
+N=0 if VIRTIO_NET_F_RFS is not negotiated, otherwise N is indicated by max_
+\emph on
+virtqueue_pairs control
+\emph default
+ field.
+ 
+\end_layout
+
+\end_inset
+
+: receivqN.
+ 2N+1: transmitqN.
+ 2N+
+\change_unchanged
+2:controlq
 \begin_inset Foot
 status open
 
@@ -4343,6 +4377,16 @@ VIRTIO_NET_F_CTRL_VLAN
 
 \begin_layout Description
 VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous packets.
+\change_inserted 1986246365 1352742767
+
+\end_layout
+
+\begin_layout Description
+
+\change_inserted 1986246365 1352742808
+VIRTIO_NET_F_RFS(2) Device supports Receive Flow Steering.
+\change_unchanged
+
 \end_layout
 
 \end_deeper
@@ -4355,11 +4399,44 @@ configuration
 \begin_inset space ~
 \end_inset
 
-layout Two configuration fields are currently defined.
+layout 
+\change_deleted 1986246365 1352743300
+Two
+\change_inserted 1986246365 1352743301
+Four
+\change_unchanged
+ configuration fields are currently defined.
  The mac address field always exists (though is only valid if VIRTIO_NET_F_MAC
  is set), and the status field only exists if VIRTIO_NET_F_STATUS is set.
  Two read-only bits are currently defined for the status field: 
VIRTIO_NET_S_LIN
 K_UP and VIRTIO_NET_S_ANNOUNCE.
+
+\change_inserted 1986246365 1353595219
+ The following read-only field, 
+\emph on
+max_virtqueue_pairs
+\emph default
+ only exists if VIRTIO_NET_F_RFS is set.
+ This field specifies the maximum number of each of transmit and receive
+ virtqueues (receiveq0..receiveq
+\emph on
+N
+\emph default
+ and transmitq0..transmitq
+\emph on
+N
+\emph default
+ respectively; 
+\emph on
+N
+\emph default
+=
+\emph on
+max_virtqueue_pairs
+\emph default
+) that can be configured once VIRTIO_NET_F_RFS is negotiated.
+
+\change_unchanged
  
 \begin_inset listings
 inline false
@@ -4410,7 +4487,24 @@ Device Initialization
 
 \begin_layout Enumerate
 The initialization routine should identify the receive and transmission
- virtqueues.
+ virtqueues
+\change_inserted 1986246365 1352744077
+, up to N+1 of each kind
+\change_unchanged
+.
+
+\change_inserted 1986246365 1352743942
+ If VIRTIO_NET_F_RFS feature bit is negotiated, 
+\emph on
+N=max_virtqueue_pairs
+\emph default
+, otherwise identify 
+\emph on
+N=0
+\emph default
+.
+\change_unchanged
+
 \end_layout
 
 \begin_layout Enumerate
@@ -4455,7 +4549,11 @@ status
 \end_layout
 
 \begin_layout Enumerate
-The receive virtqueue should be filled with receive buffers.
+The receive virtqueue
+\change_inserted 1986246365 1352743953
+s
+\change_unchanged
+ should be filled with receive buffers.
  This is des

Re: PROBLEM: compilation issue, inline assembly arch/x86/kvm/emulate.c fails at -O0

2012-11-22 Thread H. Peter Anvin
On 11/14/2012 11:45 AM, Blower, Melanie wrote:
> [1.] gcc -O0 assembly arch/x86/kvm/emulate.c gets compilation failure -- 
> incorrect register restrictions
> [2.] Full description of the problem/report:
> I'm trying to compile this file at -O0, but gcc chokes in register allocation 
> at the inline assembly.
> 
> In the ordinary Linux build, this file compiles with gcc at -O2, without 
> compilation errors.
> 
> At -O0, gcc chokes with this message: 
> gcc  -w -c ./emulateE.c   // (using preprocessed file)
> ./emulateE.c: In function `em_mul_ex':
> ./emulateE.c:1918:5: error: can't find a register in class `AREG' while 
> reloading `asm'
> ./emulateE.c:1918:5: error: `asm' operand has impossible constraints
> 
> Explanation:
> The file contains an inline asm of a kind:
> 
> __asm__ __volatile__ ( " . " :
> 
>  "=m" ((ctxt)->eflags), "=&r" (_tmp), "+a" (*rax), "+d" (*rdx), "+qm"(ex) :
>  "i" (11), "m" ((ctxt)->src . val), "a" (*rax), "d" (*rdx)); 
> 
> Note that "+a" in inputs already means that eax is the return value. An then 
> "a" is used as an output constraint too.
> 

Hi Melanie,

Can you test the attached patch?

-hpa


>From f8d8e2842ca05fd89788e35c087f02c6159b023a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" 
Date: Wed, 21 Nov 2012 13:29:20 -0800
Subject: [PATCH] x86, kvm: Remove incorrect redundant assembly constraint

In __emulate_1op_rax_rdx, we use "+a" and "+d" which are input/output
constraints, and *then* use "a" and "d" as input constraints.  This is
incorrect, but happens to work on some versions of gcc.

However, it breaks gcc with -O0 and icc, and may break on future
versions of gcc.

Reported-by: Melanie Blower 
Signed-off-by: H. Peter Anvin 
Link: http://lkml.kernel.org/r/b3584e72cfebed439a3eca9bce67a4ef1b17a...@fmsmsx107.amr.corp.intel.com
---
 arch/x86/kvm/emulate.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 39171cb..bba39bf 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -426,8 +426,7 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
 			_ASM_EXTABLE(1b, 3b)\
 			: "=m" ((ctxt)->eflags), "=&r" (_tmp),		\
 			  "+a" (*rax), "+d" (*rdx), "+qm"(_ex)		\
-			: "i" (EFLAGS_MASK), "m" ((ctxt)->src.val),	\
-			  "a" (*rax), "d" (*rdx));			\
+			: "i" (EFLAGS_MASK), "m" ((ctxt)->src.val));	\
 	} while (0)
 
 /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-- 
1.7.11.7



Re: hva_to_pfn and memory leak

2012-11-22 Thread Koshelev Vladimir


22.11.2012, 18:13, "Koshelev Vladimir" :
> Hello, guys!
>
> I have written paravirtual interface for GPA->HPA translation inside 
guest. I build GPA->HPA translation table in guest virtual memory.
> To do this, I need to translate userspace virtual address from  memslot to 
host physical address. I use hva_to_pfn for it. It works fine, but linux 
doesn't free the guest
> memory after guest power off. After testing I have found that cause of 
memory leak is hva_to_pfg call. You can find my code at 
http://pastebin.com/0zBV2aPN.
> Do I translate hva to hpa in the right way?
>
> I do this patch for RHEL Linux Kernel 2.6.32-279.5.2.el6 because I use 
CentOS 6.3.
>
> Thanks!
>
> Vladimir.

Fixed.

I have found that kvm_release_pfn_clean is necessary for this case. Before that 
I thought hva_to_pfn just translates(doesn't map) the HVA, but it is not true. 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 07/18] KVM/MIPS32: MMU/TLB operations for the Guest.

2012-11-22 Thread Sanjay Lal
- Note that this file is statically linked with the rest of the host kernel 
(KSEG0). This is because kernel modules are
loaded into mapped space on MIPS and we want to make sure that we don't get any 
host kernel TLB faults while
manipulating TLBs.
- Virtual Guest TLBs are implemented as 64 entry array regardless of the number 
of host TLB entries.
- Shadow TLBs map Guest virtual addresses to Host physical addresses.

- TLB miss handling details:
Guest KSEG0 TLBMISS (0x4000 – 0x6000): Transparent to the Guest.
Guest KSEG2/3 (0x6000 – 0x8000) & Guest UM TLBMISS (0x
– 0x4000)
Lookup in Guest/Virtual TLB
If an entry doesn't match
deliver appropriate TLBMISS LD/ST exception to the guest
If entry does exist in the Guest TLB and is NOT Valid
Deliver TLB invalid exception to the guest
If entry does exist in the Guest TLB and is VALID
Inject the TLB entry into the Shadow TLB

Signed-off-by: Sanjay Lal 
---
 arch/mips/kvm/kvm_tlb.c | 932 
 1 file changed, 932 insertions(+)
 create mode 100644 arch/mips/kvm/kvm_tlb.c

diff --git a/arch/mips/kvm/kvm_tlb.c b/arch/mips/kvm/kvm_tlb.c
new file mode 100644
index 000..2d24333
--- /dev/null
+++ b/arch/mips/kvm/kvm_tlb.c
@@ -0,0 +1,932 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* KVM/MIPS TLB handling, this file is part of the Linux host kernel so that
+* TLB handlers run from KSEG0
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#undef CONFIG_MIPS_MT
+#include 
+#define CONFIG_MIPS_MT
+
+#define KVM_GUEST_PC_TLB0
+#define KVM_GUEST_SP_TLB1
+
+#define PRIx64 "llx"
+
+/* Use VZ EntryHi.EHINV to invalidate TLB entries */
+#define UNIQUE_ENTRYHI(idx) (CKSEG0 + ((idx) << (PAGE_SHIFT + 1)))
+
+atomic_t kvm_mips_instance;
+EXPORT_SYMBOL(kvm_mips_instance);
+
+/* These function pointers are initialized once the KVM module is loaded */
+pfn_t(*kvm_mips_gfn_to_pfn) (struct kvm *kvm, gfn_t gfn);
+EXPORT_SYMBOL(kvm_mips_gfn_to_pfn);
+
+void (*kvm_mips_release_pfn_clean) (pfn_t pfn);
+EXPORT_SYMBOL(kvm_mips_release_pfn_clean);
+
+bool(*kvm_mips_is_error_pfn) (pfn_t pfn);
+EXPORT_SYMBOL(kvm_mips_is_error_pfn);
+
+uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+{
+   return vcpu->arch.guest_kernel_asid[smp_processor_id()] & ASID_MASK;
+}
+
+
+uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+{
+   return vcpu->arch.guest_user_asid[smp_processor_id()] & ASID_MASK;
+}
+
+inline uint32_t kvm_mips_get_commpage_asid (struct kvm_vcpu *vcpu)
+{
+   return vcpu->kvm->arch.commpage_tlb;
+}
+
+
+/*
+ * Structure defining an tlb entry data set.
+ */
+
+void kvm_mips_dump_host_tlbs(void)
+{
+   struct kvm_mips_tlb tlb;
+   int i;
+   ulong flags;
+   unsigned long old_entryhi;
+   unsigned long old_pagemask;
+
+   local_irq_save(flags);
+
+   old_entryhi = read_c0_entryhi();
+   old_pagemask = read_c0_pagemask();
+
+   printk("HOST TLBs:\n");
+   printk("ASID: %#lx\n", read_c0_entryhi() & ASID_MASK);
+
+   for (i = 0; i < current_cpu_data.tlbsize; i++) {
+   write_c0_index(i);
+   mtc0_tlbw_hazard();
+
+   tlb_read();
+   tlbw_use_hazard();
+
+   tlb.tlb_hi = read_c0_entryhi();
+   tlb.tlb_lo0 = read_c0_entrylo0();
+   tlb.tlb_lo1 = read_c0_entrylo1();
+   tlb.tlb_mask = read_c0_pagemask();
+
+   printk("TLB%c%3d Hi 0x%08lx ",
+  (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
+  i, tlb.tlb_hi);
+   printk("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
+  (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
+  (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
+  (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
+  (tlb.tlb_lo0 >> 3) & 7);
+   printk("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
+  (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
+  (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
+  (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
+  (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
+   }
+   write_c0_entryhi(old_entryhi);
+   write_c0_pagemask(old_pagemask);
+   mtc0_tlbw_hazard();
+   local_irq_restore(flags);
+}
+
+void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
+{
+   int i;
+   struct kvm_mips_tlb tlb;
+   struct mips_coproc *cop0 __unused = vcpu->arch.cop0;
+
+   printk("G

[PATCH v2 08/18] KVM/MIPS32: Release notes and KVM module Makefile

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 arch/mips/kvm/00README.txt | 31 +++
 arch/mips/kvm/Makefile | 17 +
 2 files changed, 48 insertions(+)
 create mode 100644 arch/mips/kvm/00README.txt
 create mode 100644 arch/mips/kvm/Makefile

diff --git a/arch/mips/kvm/00README.txt b/arch/mips/kvm/00README.txt
new file mode 100644
index 000..daaf280
--- /dev/null
+++ b/arch/mips/kvm/00README.txt
@@ -0,0 +1,31 @@
+KVM/MIPS Trap & Emulate Release Notes
+=
+
+(1) KVM/MIPS should support MIPS32R2 and beyond. It has been tested on the 
following platforms:
+Malta Board with FPGA based 34K
+Sigma Designs TangoX board with a 24K based 8654 SoC.
+Malta Board with 74K @ 1GHz
+
+(2) Both Guest kernel and Guest Userspace execute in UM.  
+Guest User address space:   0x -> 0x4000
+Guest Kernel Unmapped:  0x4000 -> 0x6000
+Guest Kernel Mapped:0x6000 -> 0x8000
+
+Guest Usermode virtual memory is limited to 1GB.
+
+(2) 16K Page Sizes: Both Host Kernel and Guest Kernel should have the same 
page size, currently at least 16K.
+Note that due to cache aliasing issues, 4K page sizes are NOT supported.
+
+(3) No HugeTLB Support
+Both the host kernel and Guest kernel should have the page size set to 16K.
+This will be implemented in a future release.
+
+(4) KVM/MIPS does not have support for SMP Guests
+Linux-3.7-rc2 based SMP guest hangs due to the following code sequence in 
the generated TLB handlers:
+   LL/TLBP/SC.  Since the TLBP instruction causes a trap the reservation 
gets cleared
+   when we ERET back to the guest. This causes the guest to hang in an 
infinite loop.
+   This will be fixed in a future release.
+
+(5) Use Host FPU
+Currently KVM/MIPS emulates a 24K CPU without a FPU.
+This will be fixed in a future release
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
new file mode 100644
index 000..f3ed378
--- /dev/null
+++ b/arch/mips/kvm/Makefile
@@ -0,0 +1,17 @@
+# Makefile for KVM support for MIPS
+#
+
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
+
+kvm-objs := $(common-objs) kvm_mips.o kvm_mips_emul.o kvm_locore.o 
kvm_mips_int.o \
+kvm_mips_stats.o kvm_mips_commpage.o kvm_mips_dyntrans.o
+
+ifdef CONFIG_KVM_MIPS_VZ
+kvm-objs  += kvm_vz.o
+else
+kvm-objs  += kvm_trap_emul.o
+endif
+obj-$(CONFIG_KVM) += kvm.o
+obj-y += kvm_tlb.o kvm_cb.o
-- 
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 16/18] MIPS: Export symbols used by KVM/MIPS module

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 arch/mips/kernel/smp.c | 1 +
 mm/bootmem.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 9005bf9..60ea489 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -83,6 +83,7 @@ static inline void set_cpu_sibling_map(int cpu)
 }
 
 struct plat_smp_ops *mp_ops;
+EXPORT_SYMBOL(mp_ops);
 
 __cpuinit void register_smp_ops(struct plat_smp_ops *ops)
 {
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185..a424028 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -32,6 +32,7 @@ EXPORT_SYMBOL(contig_page_data);
 
 unsigned long max_low_pfn;
 unsigned long min_low_pfn;
+EXPORT_SYMBOL(min_low_pfn);
 unsigned long max_pfn;
 
 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
-- 
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: PROBLEM: compilation issue, inline assembly arch/x86/kvm/emulate.c fails at -O0

2012-11-22 Thread Blower, Melanie
Thank you so much, this patch solves the compilation errors that I was seeing 
in the Intel compiler, and with gcc -O0
BTW, my charter is to compile the kernel with the Intel compiler, and my 
testing stops short of verifying that the kernel build actually works.
Best regards, Melanie Blower

-Original Message-
From: H. Peter Anvin [mailto:h...@zytor.com] 
Sent: Wednesday, November 21, 2012 4:35 PM
To: Blower, Melanie
Cc: t...@linutronix.de; mi...@redhat.com; a...@redhat.com; x...@kernel.org; 
kvm@vger.kernel.org; linux-ker...@vger.kernel.org
Subject: Re: PROBLEM: compilation issue, inline assembly arch/x86/kvm/emulate.c 
fails at -O0

On 11/14/2012 11:45 AM, Blower, Melanie wrote:
> [1.] gcc -O0 assembly arch/x86/kvm/emulate.c gets compilation failure 
> -- incorrect register restrictions [2.] Full description of the 
> problem/report:
> I'm trying to compile this file at -O0, but gcc chokes in register allocation 
> at the inline assembly.
> 
> In the ordinary Linux build, this file compiles with gcc at -O2, without 
> compilation errors.
> 
> At -O0, gcc chokes with this message: 
> gcc  -w -c ./emulateE.c   // (using preprocessed file)
> ./emulateE.c: In function `em_mul_ex':
> ./emulateE.c:1918:5: error: can't find a register in class `AREG' while 
> reloading `asm'
> ./emulateE.c:1918:5: error: `asm' operand has impossible constraints
> 
> Explanation:
> The file contains an inline asm of a kind:
> 
> __asm__ __volatile__ ( " . " :
> 
>  "=m" ((ctxt)->eflags), "=&r" (_tmp), "+a" (*rax), "+d" (*rdx), "+qm"(ex) :
>  "i" (11), "m" ((ctxt)->src . val), "a" (*rax), "d" (*rdx));
> 
> Note that "+a" in inputs already means that eax is the return value. An then 
> "a" is used as an output constraint too.
> 

Hi Melanie,

Can you test the attached patch?

-hpa


N�r��yb�X��ǧv�^�)޺{.n�+h����ܨ}���Ơz�&j:+v���zZ+��+zf���h���~i���z��w���?�&�)ߢf

[PATCH 1/2] Add code to track call origin for msr assignment.

2012-11-22 Thread Will Auld
In order to track who initiated the call (host or guest) to modify an msr
value I have changed function call parameters along the call path. The
specific change is to add a struct pointer parameter that points to (index,
data, caller) information rather than having this information passed as
individual parameters.

The initial use for this capability is for updating the IA32_TSC_ADJUST
msr while setting the tsc value. It is anticipated that this capability
is useful other tasks.

Signed-off-by: Will Auld 
---
 arch/x86/include/asm/kvm_host.h | 12 +---
 arch/x86/kvm/svm.c  | 21 +++--
 arch/x86/kvm/vmx.c  | 24 +---
 arch/x86/kvm/x86.c  | 23 +--
 arch/x86/kvm/x86.h  |  2 +-
 5 files changed, 59 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09155d6..da34027 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -598,6 +598,12 @@ struct kvm_vcpu_stat {
 
 struct x86_instruction_info;
 
+struct msr_data {
+bool host_initiated;
+u32 index;
+u64 data;
+};
+
 struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void);  /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -621,7 +627,7 @@ struct kvm_x86_ops {
void (*set_guest_debug)(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
-   int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+   int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
void (*get_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -772,7 +778,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 struct x86_emulate_ctxt;
 
@@ -799,7 +805,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, 
int *l);
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index baead95..5ac11f0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1211,6 +1211,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, 
unsigned int id)
struct page *msrpm_pages;
struct page *hsave_page;
struct page *nested_msrpm_pages;
+   struct msr_data msr;
int err;
 
svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -1255,7 +1256,10 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, 
unsigned int id)
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm);
-   kvm_write_tsc(&svm->vcpu, 0);
+   msr.data = 0x0;
+   msr.index = MSR_IA32_TSC;
+   msr.host_initiated = true;
+   kvm_write_tsc(&svm->vcpu, &msr);
 
err = fx_init(&svm->vcpu);
if (err)
@@ -3147,13 +3151,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 
data)
return 0;
 }
 
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
struct vcpu_svm *svm = to_svm(vcpu);
 
+   u32 ecx = msr->index;
+   u64 data = msr->data;
switch (ecx) {
case MSR_IA32_TSC:
-   kvm_write_tsc(vcpu, data);
+   kvm_write_tsc(vcpu, msr);
break;
case MSR_STAR:
svm->vmcb->save.star = data;
@@ -3208,20 +3214,23 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned 
ecx, u64 data)
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", 
ecx, data);
break;
default:
-   return kvm_set_msr_common(vcpu, ecx, data);
+   return kvm_set_msr_common(vcpu, msr);
}
return 0;
 }
 
 static int wrmsr_interception(struct vcpu_svm *svm)
 {
+   struct msr_data msr;
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
-
+   msr.data = data;
+   msr.index = ecx;
+   msr.host_initiated = false;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
-   if (svm_set_msr(&s

Re: Interrupt controller updates

2012-11-22 Thread Benjamin Herrenschmidt
On Thu, 2012-11-22 at 11:25 +0100, Paolo Bonzini wrote:
> > Again, from memory, you were volunteered to do the initial x86
> change so
> > we could piggy back on it :-) Or do I remember wrong ?
> 
> Please suggest an API, then we can work out the x86 changes.  I can
> volunteer myself, but I wasn't in the BOF so I need something more
> concrete. 

Oh it's simple enough initially, just move the ioctl call from generic
kvm init to machine init. The problem is then to add an argument, since
that essentially means changing the ioctl number, but we need that for
all archs where the interrupt subsystem can be fundamentally different
based on the platform.

Basically, what was discussed in the BOF was that we split the init:

 * The existing ioctl moves to early machine init (before VCPUs) and
gets that argument to define the type of interrupt subsystem to use. It
causes powerpc to instanciate ICPs per VCPUs for example. On archs that
don't have a per-vcpu structure (equivalent of local APIC or ICP), all
it does is enable subsequent irq related ioctls to work (it's just an
"enable" flag).

 * A new ioctl is used to actually instanciate external interrupt
controllers (GIC on ARM, ICS for ppc/pseries, MPIC for ppc/mpic, ...).
This is used later by the PIC code itself when the former ioctl has
enabled "in kernel PIC"

 * A new ioctl is used for platforms that need to be able to adjust the
base address of a PIC (arm/GIC, ppc/mpic)

We have other things to look at (mostly along the MSI routing calls in
qemu that need to be changed to be PCI bridge hooks populated by the
platform) but that's the starting point.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/5] KVM: PPC: Fix various bugs and vulnerabilities in HV KVM

2012-11-22 Thread Paul Mackerras
This series of patches fixes various bugs that we have found recently.
The bugs fixed in patches 1, 3 and 4 are also vulnerabilities where
the guest could cause the host to crash or could access host memory
inappropriately.  The bug fixed in patch 2 could cause the host to
hang or crash after the guest reboots.  The bug fixed in patch 5 is a
simple thinko in the recently-added HPT reading code.

These patches are against Alex Graf's kvm-ppc-next branch.  They only
affect HV code.

The first two patches have been posted previously but got no comments.

Please apply - given the nature of these bugs I'd really like this
series to make it into the 3.8 merge window.

Paul.

 arch/powerpc/include/asm/kvm_host.h |5 +-
 arch/powerpc/kernel/asm-offsets.c   |4 +-
 arch/powerpc/kvm/Makefile   |1 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   29 ++--
 arch/powerpc/kvm/book3s_hv.c|9 ++-
 arch/powerpc/kvm/book3s_hv_ras.c|  115 ++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |   59 ++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  117 +--
 8 files changed, 271 insertions(+), 68 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/5] KVM: PPC: Book3S HV: Report correct HPT entry index when reading HPT

2012-11-22 Thread Paul Mackerras
This fixes a bug in the code which allows userspace to read out the
contents of the guest's hashed page table (HPT).  On the second and
subsequent passes through the HPT, when we are reporting only those
entries that have changed, we were incorrectly initializing the index
field of the header with the index of the first entry we skipped
rather than the first changed entry.  This fixes it.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 2d61e01..8cc18ab 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1279,7 +1279,6 @@ static ssize_t kvm_htab_read(struct file *file, char 
__user *buf,
while (nb + sizeof(hdr) + HPTE_SIZE < count) {
/* Initialize header */
hptr = (struct kvm_get_htab_header __user *)buf;
-   hdr.index = i;
hdr.n_valid = 0;
hdr.n_invalid = 0;
nw = nb;
@@ -1295,6 +1294,7 @@ static ssize_t kvm_htab_read(struct file *file, char 
__user *buf,
++revp;
}
}
+   hdr.index = i;
 
/* Grab a series of valid entries */
while (i < kvm->arch.hpt_npte &&
-- 
1.7.10.rc3.219.g53414

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5] KVM: PPC: Book3S HV: Improve handling of local vs. global TLB invalidations

2012-11-22 Thread Paul Mackerras
When we change or remove a HPT (hashed page table) entry, we can do
either a global TLB invalidation (tlbie) that works across the whole
machine, or a local invalidation (tlbiel) that only affects this core.
Currently we do local invalidations if the VM has only one vcpu or if
the guest requests it with the H_LOCAL flag, though the guest Linux
kernel currently doesn't ever use H_LOCAL.  Then, to cope with the
possibility that vcpus moving around to different physical cores might
expose stale TLB entries, there is some code in kvmppc_hv_entry to
flush the whole TLB of entries for this VM if either this vcpu is now
running on a different physical core from where it last ran, or if this
physical core last ran a different vcpu.

There are a number of problems on POWER7 with this as it stands:

- The TLB invalidation is done per thread, whereas it only needs to be
  done per core, since the TLB is shared between the threads.
- With the possibility of the host paging out guest pages, the use of
  H_LOCAL by an SMP guest is dangerous since the guest could possibly
  retain and use a stale TLB entry pointing to a page that had been
  removed from the guest.
- The TLB invalidations that we do when a vcpu moves from one physical
  core to another are unnecessary in the case of an SMP guest that isn't
  using H_LOCAL.
- The optimization of using local invalidations rather than global should
  apply to guests with one virtual core, not just one vcpu.

(None of this applies on PPC970, since there we always have to
invalidate the whole TLB when entering and leaving the guest, and we
can't support paging out guest memory.)

To fix these problems and simplify the code, we now maintain a simple
cpumask of which cpus need to flush the TLB on entry to the guest.
(This is indexed by cpu, though we only ever use the bits for thread
0 of each core.)  Whenever we do a local TLB invalidation, we set the
bits for every cpu except the bit for thread 0 of the core that we're
currently running on.  Whenever we enter a guest, we test and clear the
bit for our core, and flush the TLB if it was set.

On initial startup of the VM, and when resetting the HPT, we set all the
bits in the need_tlb_flush cpumask, since any core could potentially have
stale TLB entries from the previous VM to use the same LPID, or the
previous contents of the HPT.

Then, we maintain a count of the number of online virtual cores, and use
that when deciding whether to use a local invalidation rather than the
number of online vcpus.  The code to make that decision is extracted out
into a new function, global_invalidates().  For multi-core guests on
POWER7 (i.e. when we are using mmu notifiers), we now never do local
invalidations regardless of the H_LOCAL flag.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_host.h |5 +--
 arch/powerpc/kernel/asm-offsets.c   |4 +--
 arch/powerpc/kvm/book3s_64_mmu_hv.c |7 ++--
 arch/powerpc/kvm/book3s_hv.c|9 -
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |   37 +---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   56 ++-
 6 files changed, 73 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 58c7264..62fbd38 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -246,11 +246,12 @@ struct kvm_arch {
int using_mmu_notifiers;
u32 hpt_order;
atomic_t vcpus_running;
+   u32 online_vcores;
unsigned long hpt_npte;
unsigned long hpt_mask;
atomic_t hpte_mod_interest;
spinlock_t slot_phys_lock;
-   unsigned short last_vcpu[NR_CPUS];
+   cpumask_t need_tlb_flush;
struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
struct kvmppc_linear_info *hpt_li;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
@@ -275,6 +276,7 @@ struct kvmppc_vcore {
int nap_count;
int napping_threads;
u16 pcpu;
+   u16 last_cpu;
u8 vcore_state;
u8 in_guest;
struct list_head runnable_threads;
@@ -523,7 +525,6 @@ struct kvm_vcpu_arch {
u64 dec_jiffies;
u64 dec_expires;
unsigned long pending_exceptions;
-   u16 last_cpu;
u8 ceded;
u8 prodded;
u32 last_inst;
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 7523539..4e23ba2 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -441,8 +441,7 @@ int main(void)
DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
-   DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
-   DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+   DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flu

[PATCH V3 1/2] Add code to track call origin for msr assignment.

2012-11-22 Thread Will Auld
In order to track who initiated the call (host or guest) to modify an msr
value I have changed function call parameters along the call path. The
specific change is to add a struct pointer parameter that points to (index,
data, caller) information rather than having this information passed as
individual parameters.

The initial use for this capability is for updating the IA32_TSC_ADJUST
msr while setting the tsc value. It is anticipated that this capability
is useful other tasks.

Signed-off-by: Will Auld 
---
 arch/x86/include/asm/kvm_host.h | 12 +---
 arch/x86/kvm/svm.c  | 21 +++--
 arch/x86/kvm/vmx.c  | 24 +---
 arch/x86/kvm/x86.c  | 23 +--
 arch/x86/kvm/x86.h  |  2 +-
 5 files changed, 59 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09155d6..da34027 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -598,6 +598,12 @@ struct kvm_vcpu_stat {
 
 struct x86_instruction_info;
 
+struct msr_data {
+bool host_initiated;
+u32 index;
+u64 data;
+};
+
 struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void);  /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -621,7 +627,7 @@ struct kvm_x86_ops {
void (*set_guest_debug)(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
-   int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+   int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
void (*get_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -772,7 +778,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 struct x86_emulate_ctxt;
 
@@ -799,7 +805,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, 
int *l);
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index baead95..5ac11f0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1211,6 +1211,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, 
unsigned int id)
struct page *msrpm_pages;
struct page *hsave_page;
struct page *nested_msrpm_pages;
+   struct msr_data msr;
int err;
 
svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -1255,7 +1256,10 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, 
unsigned int id)
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm);
-   kvm_write_tsc(&svm->vcpu, 0);
+   msr.data = 0x0;
+   msr.index = MSR_IA32_TSC;
+   msr.host_initiated = true;
+   kvm_write_tsc(&svm->vcpu, &msr);
 
err = fx_init(&svm->vcpu);
if (err)
@@ -3147,13 +3151,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 
data)
return 0;
 }
 
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
struct vcpu_svm *svm = to_svm(vcpu);
 
+   u32 ecx = msr->index;
+   u64 data = msr->data;
switch (ecx) {
case MSR_IA32_TSC:
-   kvm_write_tsc(vcpu, data);
+   kvm_write_tsc(vcpu, msr);
break;
case MSR_STAR:
svm->vmcb->save.star = data;
@@ -3208,20 +3214,23 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned 
ecx, u64 data)
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", 
ecx, data);
break;
default:
-   return kvm_set_msr_common(vcpu, ecx, data);
+   return kvm_set_msr_common(vcpu, msr);
}
return 0;
 }
 
 static int wrmsr_interception(struct vcpu_svm *svm)
 {
+   struct msr_data msr;
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
-
+   msr.data = data;
+   msr.index = ecx;
+   msr.host_initiated = false;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
-   if (svm_set_msr(&s

[PATCH v2 10/18] KVM/MIPS32: Guest interrupt delivery.

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 arch/mips/kvm/kvm_mips_int.c | 243 +++
 arch/mips/kvm/kvm_mips_int.h |  49 +
 2 files changed, 292 insertions(+)
 create mode 100644 arch/mips/kvm/kvm_mips_int.c
 create mode 100644 arch/mips/kvm/kvm_mips_int.h

diff --git a/arch/mips/kvm/kvm_mips_int.c b/arch/mips/kvm/kvm_mips_int.c
new file mode 100644
index 000..12450d9
--- /dev/null
+++ b/arch/mips/kvm/kvm_mips_int.c
@@ -0,0 +1,243 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* KVM/MIPS: Interrupt delivery
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "kvm_mips_int.h"
+
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+{
+   set_bit(priority, &vcpu->arch.pending_exceptions);
+}
+
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+{
+   clear_bit(priority, &vcpu->arch.pending_exceptions);
+}
+
+void kvm_mips_queue_timer_int_cb(struct kvm_vcpu *vcpu)
+{
+   /* Cause bits to reflect the pending timer interrupt,
+* the EXC code will be set when we are actually
+* delivering the interrupt:
+*/
+   kvm_set_c0_guest_cause(vcpu->arch.cop0, (C_IRQ5 | C_TI));
+
+   /* Queue up an INT exception for the core */
+   kvm_mips_queue_irq(vcpu, MIPS_EXC_INT_TIMER);
+
+}
+
+void kvm_mips_dequeue_timer_int_cb(struct kvm_vcpu *vcpu)
+{
+   kvm_clear_c0_guest_cause(vcpu->arch.cop0, (C_IRQ5 | C_TI));
+   kvm_mips_dequeue_irq(vcpu, MIPS_EXC_INT_TIMER);
+}
+
+void
+kvm_mips_queue_io_int_cb(struct kvm_vcpu *vcpu, struct kvm_mips_interrupt *irq)
+{
+   int intr = (int)irq->irq;
+
+   /* Cause bits to reflect the pending IO interrupt,
+* the EXC code will be set when we are actually
+* delivering the interrupt:
+*/
+   switch (intr) {
+   case 2:
+   kvm_set_c0_guest_cause(vcpu->arch.cop0, (C_IRQ0));
+   /* Queue up an INT exception for the core */
+   kvm_mips_queue_irq(vcpu, MIPS_EXC_INT_IO);
+   break;
+
+   case 3:
+   kvm_set_c0_guest_cause(vcpu->arch.cop0, (C_IRQ1));
+   kvm_mips_queue_irq(vcpu, MIPS_EXC_INT_IPI_1);
+   break;
+
+   case 4:
+   kvm_set_c0_guest_cause(vcpu->arch.cop0, (C_IRQ2));
+   kvm_mips_queue_irq(vcpu, MIPS_EXC_INT_IPI_2);
+   break;
+
+   default:
+   break;
+   }
+
+}
+
+void
+kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
+  struct kvm_mips_interrupt *irq)
+{
+   int intr = (int)irq->irq;
+   switch (intr) {
+   case -2:
+   kvm_clear_c0_guest_cause(vcpu->arch.cop0, (C_IRQ0));
+   kvm_mips_dequeue_irq(vcpu, MIPS_EXC_INT_IO);
+   break;
+
+   case -3:
+   kvm_clear_c0_guest_cause(vcpu->arch.cop0, (C_IRQ1));
+   kvm_mips_dequeue_irq(vcpu, MIPS_EXC_INT_IPI_1);
+   break;
+
+   case -4:
+   kvm_clear_c0_guest_cause(vcpu->arch.cop0, (C_IRQ2));
+   kvm_mips_dequeue_irq(vcpu, MIPS_EXC_INT_IPI_2);
+   break;
+
+   default:
+   break;
+   }
+
+}
+
+/* Deliver the interrupt of the corresponding priority, if possible. */
+int
+kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
+   uint32_t cause)
+{
+   int allowed = 0;
+   uint32_t exccode;
+
+   struct kvm_vcpu_arch *arch = &vcpu->arch;
+   struct mips_coproc *cop0 __unused = vcpu->arch.cop0;
+
+   switch (priority) {
+   case MIPS_EXC_INT_TIMER:
+   if ((kvm_read_c0_guest_status(cop0) & ST0_IE)
+   && (!(kvm_read_c0_guest_status(cop0) & (ST0_EXL | ST0_ERL)))
+   && (kvm_read_c0_guest_status(cop0) & IE_IRQ5)) {
+   allowed = 1;
+   exccode = T_INT;
+   }
+   break;
+
+   case MIPS_EXC_INT_IO:
+   if ((kvm_read_c0_guest_status(cop0) & ST0_IE)
+   && (!(kvm_read_c0_guest_status(cop0) & (ST0_EXL | ST0_ERL)))
+   && (kvm_read_c0_guest_status(cop0) & IE_IRQ0)) {
+   allowed = 1;
+   exccode = T_INT;
+   }
+   break;
+
+   case MIPS_EXC_INT_IPI_1:
+   if ((kvm_read_c0_guest_status(cop0) & ST0_IE)
+   && (!(kvm_read_c0_guest_status(cop0) & (ST0_EXL | ST0_ERL)))
+   && (kvm_read_c0_guest_status(cop0) & IE_IRQ1)) {
+   allowed = 1;
+   exccode = T_INT;
+   }
+   break;
+
+   case MIPS_EXC_INT_

[PATCH v2 11/18] KVM/MIPS32: Routines to handle specific traps/exceptions while executing the guest.

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 arch/mips/kvm/kvm_cb.c|  14 ++
 arch/mips/kvm/kvm_trap_emul.c | 482 ++
 2 files changed, 496 insertions(+)
 create mode 100644 arch/mips/kvm/kvm_cb.c
 create mode 100644 arch/mips/kvm/kvm_trap_emul.c

diff --git a/arch/mips/kvm/kvm_cb.c b/arch/mips/kvm/kvm_cb.c
new file mode 100644
index 000..313c2e3
--- /dev/null
+++ b/arch/mips/kvm/kvm_cb.c
@@ -0,0 +1,14 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Yann Le Du 
+ */
+
+#include 
+#include 
+
+struct kvm_mips_callbacks *kvm_mips_callbacks;
+EXPORT_SYMBOL(kvm_mips_callbacks);
diff --git a/arch/mips/kvm/kvm_trap_emul.c b/arch/mips/kvm/kvm_trap_emul.c
new file mode 100644
index 000..e20fff0
--- /dev/null
+++ b/arch/mips/kvm/kvm_trap_emul.c
@@ -0,0 +1,482 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* KVM/MIPS: Deliver/Emulate exceptions to the guest kernel
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "kvm_mips_opcode.h"
+#include "kvm_mips_int.h"
+
+static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
+{
+   gpa_t gpa;
+   uint32_t kseg = KSEGX(gva);
+
+   if ((kseg == CKSEG0) || (kseg == CKSEG1))
+   gpa = CPHYSADDR(gva);
+   else {
+   printk("%s: cannot find GPA for GVA: %#lx\n", __func__, gva);
+   kvm_mips_dump_host_tlbs();
+   gpa = KVM_INVALID_ADDR;
+   }
+
+#ifdef DEBUG
+   kvm_debug("%s: gva %#lx, gpa: %#llx\n", __func__, gva, gpa);
+#endif
+
+   return gpa;
+}
+
+
+static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
+{
+   struct kvm_run *run = vcpu->run;
+   uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+   ulong cause = vcpu->arch.host_cp0_cause;
+   enum emulation_result er = EMULATE_DONE;
+   int ret = RESUME_GUEST;
+
+   if (((cause & CAUSEF_CE) >> CAUSEB_CE) == 1) {
+   er = kvm_mips_emulate_fpu_exc(cause, opc, run, vcpu);
+   } else
+   er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
+
+   switch (er) {
+   case EMULATE_DONE:
+   ret = RESUME_GUEST;
+   break;
+
+   case EMULATE_FAIL:
+   run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+   ret = RESUME_HOST;
+   break;
+
+   case EMULATE_WAIT:
+   run->exit_reason = KVM_EXIT_INTR;
+   ret = RESUME_HOST;
+   break;
+
+   default:
+   BUG();
+   }
+   return ret;
+}
+
+static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
+{
+   struct kvm_run *run = vcpu->run;
+   uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+   ulong cause = vcpu->arch.host_cp0_cause;
+   ulong badvaddr = vcpu->arch.host_cp0_badvaddr;
+   enum emulation_result er = EMULATE_DONE;
+   int ret = RESUME_GUEST;
+
+   if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
+   || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
+#ifdef DEBUG
+   kvm_debug
+   ("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, 
BadVaddr: %#lx\n",
+cause, opc, badvaddr);
+#endif
+   er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu);
+
+   if (er == EMULATE_DONE)
+   ret = RESUME_GUEST;
+   else {
+   run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+   ret = RESUME_HOST;
+   }
+   } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+   /* XXXKYMA: The guest kernel does not expect to get this fault 
when we are not
+* using HIGHMEM. Need to address this in a HIGHMEM kernel
+*/
+   printk
+   ("TLB MOD fault not handled, cause %#lx, PC: %p, BadVaddr: 
%#lx\n",
+cause, opc, badvaddr);
+   kvm_mips_dump_host_tlbs();
+   kvm_arch_vcpu_dump_regs(vcpu);
+   run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+   ret = RESUME_HOST;
+   } else {
+   printk
+   ("Illegal TLB Mod fault address , cause %#lx, PC: %p, 
BadVaddr: %#lx\n",
+cause, opc, badvaddr);
+   kvm_mips_dump_host_tlbs();
+   kvm_arch_vcpu_dump_regs(vcpu);
+   run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+   ret = RESUME_HOST;
+   }
+   return ret;
+}
+
+static int kvm_trap_emul_handle_tlb_st_mis

[PATCH v2 09/18] KVM/MIPS32: COP0 accesses profiling.

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 arch/mips/kvm/kvm_mips_stats.c | 81 ++
 1 file changed, 81 insertions(+)
 create mode 100644 arch/mips/kvm/kvm_mips_stats.c

diff --git a/arch/mips/kvm/kvm_mips_stats.c b/arch/mips/kvm/kvm_mips_stats.c
new file mode 100644
index 000..e442a26
--- /dev/null
+++ b/arch/mips/kvm/kvm_mips_stats.c
@@ -0,0 +1,81 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* KVM/MIPS: COP0 access histogram
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#include 
+
+char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = {
+   "WAIT",
+   "CACHE",
+   "Signal",
+   "Interrupt",
+   "COP0/1 Unusable",
+   "TLB Mod",
+   "TLB Miss (LD)",
+   "TLB Miss (ST)",
+   "Address Err (ST)",
+   "Address Error (LD)",
+   "System Call",
+   "Reserved Inst",
+   "Break Inst",
+   "D-Cache Flushes",
+};
+
+char *kvm_cop0_str[N_MIPS_COPROC_REGS] = {
+   "Index",
+   "Random",
+   "EntryLo0",
+   "EntryLo1",
+   "Context",
+   "PG Mask",
+   "Wired",
+   "HWREna",
+   "BadVAddr",
+   "Count",
+   "EntryHI",
+   "Compare",
+   "Status",
+   "Cause",
+   "EXC PC",
+   "PRID",
+   "Config",
+   "LLAddr",
+   "Watch Lo",
+   "Watch Hi",
+   "X Context",
+   "Reserved",
+   "Impl Dep",
+   "Debug",
+   "DEPC",
+   "PerfCnt",
+   "ErrCtl",
+   "CacheErr",
+   "TagLo",
+   "TagHi",
+   "ErrorEPC",
+   "DESAVE"
+};
+
+int kvm_mips_dump_stats(struct kvm_vcpu *vcpu)
+{
+   int i, j __unused;
+#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
+   printk("\nKVM VCPU[%d] COP0 Access Profile:\n", vcpu->vcpu_id);
+   for (i = 0; i < N_MIPS_COPROC_REGS; i++) {
+   for (j = 0; j < N_MIPS_COPROC_SEL; j++) {
+   if (vcpu->arch.cop0->stat[i][j])
+   printk("%s[%d]: %lu\n", kvm_cop0_str[i], j,
+  vcpu->arch.cop0->stat[i][j]);
+   }
+   }
+#endif
+
+   return 0;
+}
-- 
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 17/18] KVM/MIPS32: Do not call vcpu_load when injecting interrupts.

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index be70035..ecd96ce 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1880,7 +1880,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (vcpu->kvm->mm != current->mm)
return -EIO;
 
-#if defined(CONFIG_S390) || defined(CONFIG_PPC)
+#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
/*
 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
 * so vcpu_load() would break it.
-- 
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 15/18] MIPS: Pull in MIPS fix: fix endless loop when processing signals for kernel tasks.

2012-11-22 Thread Sanjay Lal
This bug is discussed in: 
http://lkml.indiana.edu/hypermail/linux/kernel/1205.2/00719.html

Signed-off-by: Sanjay Lal 
---
 arch/mips/kernel/entry.S | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kernel/entry.S b/arch/mips/kernel/entry.S
index a6c1332..9b00362 100644
--- a/arch/mips/kernel/entry.S
+++ b/arch/mips/kernel/entry.S
@@ -36,6 +36,11 @@ FEXPORT(ret_from_exception)
 FEXPORT(ret_from_irq)
LONG_S  s0, TI_REGS($28)
 FEXPORT(__ret_from_irq)
+/*
+ * We can be coming here from a syscall done in the kernel space,
+ * e.g. a failed kernel_execve().
+ */
+resume_userspace_check:
LONG_L  t0, PT_STATUS(sp)   # returning to kernel mode?
andit0, t0, KU_USER
beqzt0, resume_kernel
@@ -162,7 +167,7 @@ work_notifysig: # deal with 
pending signals and
movea0, sp
li  a1, 0
jal do_notify_resume# a2 already loaded
-   j   resume_userspace
+   j   resume_userspace_check
 
 FEXPORT(syscall_exit_partial)
local_irq_disable   # make sure need_resched doesn't
-- 
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 06/18] KVM/MIPS32: Privileged instruction/target branch emulation.

2012-11-22 Thread Sanjay Lal
- The Guest kernel is run in UM and privileged instructions cause a trap.
- If the instruction causing the trap is in a branch delay slot, the branch 
needs to be emulated to figure
out the PC at which the guest will resume execution.

Signed-off-by: Sanjay Lal 
---
 arch/mips/kvm/kvm_mips_emul.c   | 1840 +++
 arch/mips/kvm/kvm_mips_opcode.h |   24 +
 2 files changed, 1864 insertions(+)
 create mode 100644 arch/mips/kvm/kvm_mips_emul.c
 create mode 100644 arch/mips/kvm/kvm_mips_opcode.h

diff --git a/arch/mips/kvm/kvm_mips_emul.c b/arch/mips/kvm/kvm_mips_emul.c
new file mode 100644
index 000..dc4960b
--- /dev/null
+++ b/arch/mips/kvm/kvm_mips_emul.c
@@ -0,0 +1,1840 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* KVM/MIPS: Instruction/Exception emulation
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#undef CONFIG_MIPS_MT
+#include 
+#define CONFIG_MIPS_MT
+
+#include "kvm_mips_opcode.h"
+#include "kvm_mips_int.h"
+#include "kvm_mips_comm.h"
+
+#include "trace.h"
+
+static int debug __unused;
+
+/*
+ * Compute the return address and do emulate branch simulation, if required.
+ * This function should be called only in branch delay slot active.
+ */
+u_long kvm_compute_return_epc(struct kvm_vcpu *vcpu, u_long instpc)
+{
+   unsigned int dspcontrol;
+   union mips_instruction insn;
+   struct kvm_vcpu_arch *arch = &vcpu->arch;
+   long epc = instpc;
+   long nextpc = KVM_INVALID_INST;
+
+   if (epc & 3)
+   goto unaligned;
+
+   /*
+* Read the instruction
+*/
+   insn.word = kvm_get_inst((uint32_t *) epc, vcpu);
+
+   if (insn.word == KVM_INVALID_INST)
+   return KVM_INVALID_INST;
+
+   switch (insn.i_format.opcode) {
+   /*
+* jr and jalr are in r_format format.
+*/
+   case spec_op:
+   switch (insn.r_format.func) {
+   case jalr_op:
+   arch->gprs[insn.r_format.rd] = epc + 8;
+   /* Fall through */
+   case jr_op:
+   nextpc = arch->gprs[insn.r_format.rs];
+   break;
+   }
+   break;
+
+   /*
+* This group contains:
+* bltz_op, bgez_op, bltzl_op, bgezl_op,
+* bltzal_op, bgezal_op, bltzall_op, bgezall_op.
+*/
+   case bcond_op:
+   switch (insn.i_format.rt) {
+   case bltz_op:
+   case bltzl_op:
+   if ((long)arch->gprs[insn.i_format.rs] < 0)
+   epc = epc + 4 + (insn.i_format.simmediate << 2);
+   else
+   epc += 8;
+   nextpc = epc;
+   break;
+
+   case bgez_op:
+   case bgezl_op:
+   if ((long)arch->gprs[insn.i_format.rs] >= 0)
+   epc = epc + 4 + (insn.i_format.simmediate << 2);
+   else
+   epc += 8;
+   nextpc = epc;
+   break;
+
+   case bltzal_op:
+   case bltzall_op:
+   arch->gprs[31] = epc + 8;
+   if ((long)arch->gprs[insn.i_format.rs] < 0)
+   epc = epc + 4 + (insn.i_format.simmediate << 2);
+   else
+   epc += 8;
+   nextpc = epc;
+   break;
+
+   case bgezal_op:
+   case bgezall_op:
+   arch->gprs[31] = epc + 8;
+   if ((long)arch->gprs[insn.i_format.rs] >= 0)
+   epc = epc + 4 + (insn.i_format.simmediate << 2);
+   else
+   epc += 8;
+   nextpc = epc;
+   break;
+   case bposge32_op:
+   if (!cpu_has_dsp)
+   goto sigill;
+
+   dspcontrol = rddsp(0x01);
+
+   if (dspcontrol >= 32) {
+   epc = epc + 4 + (insn.i_format.simmediate << 2);
+   } else
+   epc += 8;
+   nextpc = epc;
+   break;
+   }
+   break;
+
+   /*
+* These are unconditional and in j_format.
+*/
+   case jal_op:
+   arch->gprs[31] 

[PATCH v2 04/18] KVM/MIPS32: MIPS arch specific APIs for KVM

2012-11-22 Thread Sanjay Lal
- Implements the arch specific APIs for KVM, some are stubs for MIPS
- kvm_mips_handle_exit(): Main 'C' dispatch routine for handling exceptions 
while in "Guest" mode.
- Also implements in-kernel timer interrupt support for the guest.

Signed-off-by: Sanjay Lal 
---
 arch/mips/kvm/kvm_mips.c | 965 +++
 arch/mips/kvm/trace.h|  46 +++
 2 files changed, 1011 insertions(+)
 create mode 100644 arch/mips/kvm/kvm_mips.c
 create mode 100644 arch/mips/kvm/trace.h

diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
new file mode 100644
index 000..e239d73
--- /dev/null
+++ b/arch/mips/kvm/kvm_mips.c
@@ -0,0 +1,965 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS: MIPS specific KVM APIs
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Sanjay Lal 
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "kvm_mips_int.h"
+#include "kvm_mips_comm.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#ifndef VECTORSPACING
+#define VECTORSPACING 0x100/* for EI/VI mode */
+#endif
+
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+   { "wait", VCPU_STAT(wait_exits) },
+   { "cache", VCPU_STAT(cache_exits) },
+   { "signal", VCPU_STAT(signal_exits) },
+   { "interrupt", VCPU_STAT(int_exits) },
+   { "cop_unsuable", VCPU_STAT(cop_unusable_exits) },
+   { "tlbmod", VCPU_STAT(tlbmod_exits) },
+   { "tlbmiss_ld", VCPU_STAT(tlbmiss_ld_exits) },
+   { "tlbmiss_st", VCPU_STAT(tlbmiss_st_exits) },
+   { "addrerr_st", VCPU_STAT(addrerr_st_exits) },
+   { "addrerr_ld", VCPU_STAT(addrerr_ld_exits) },
+   { "syscall", VCPU_STAT(syscall_exits) },
+   { "resvd_inst", VCPU_STAT(resvd_inst_exits) },
+   { "break_inst", VCPU_STAT(break_inst_exits) },
+   { "flush_dcache", VCPU_STAT(flush_dcache_exits) },
+   { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+   {NULL}
+};
+
+static int kvm_mips_reset_vcpu(struct kvm_vcpu *vcpu)
+{
+   int i;
+   for_each_possible_cpu(i) {
+   vcpu->arch.guest_kernel_asid[i] = 0;
+   vcpu->arch.guest_user_asid[i] = 0;
+   }
+   return 0;
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+   return gfn;
+}
+
+/* XXXKYMA: We are simulatoring a processor that has the WII bit set in 
Config7, so we
+ * are "runnable" if interrupts are pending
+ */
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+   return !!(vcpu->arch.pending_exceptions);
+}
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+   return 1;
+}
+
+int kvm_arch_hardware_enable(void *garbage)
+{
+   return 0;
+}
+
+void kvm_arch_hardware_disable(void *garbage)
+{
+}
+
+int kvm_arch_hardware_setup(void)
+{
+   return 0;
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+   int *r = (int *)rtn;
+   *r = 0;
+   return;
+}
+
+static void kvm_mips_init_tlbs(struct kvm *kvm)
+{
+   ulong wired;
+
+   /* Add a wired entry to the TLB, it is used to map the commpage to the 
Guest kernel */
+   wired = read_c0_wired();
+   write_c0_wired(wired + 1);
+   mtc0_tlbw_hazard();
+   kvm->arch.commpage_tlb = wired;
+
+   kvm_debug("[%d] commpage TLB: %d\n", smp_processor_id(),
+ kvm->arch.commpage_tlb);
+}
+
+static void kvm_mips_init_vm_percpu(void *arg)
+{
+   struct kvm *kvm = (struct kvm *)arg;
+
+   kvm_mips_init_tlbs(kvm);
+   kvm_mips_callbacks->vm_init(kvm);
+
+}
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+   if (atomic_inc_return(&kvm_mips_instance) == 1) {
+   kvm_info("%s: 1st KVM instance, setup host TLB parameters\n",
+__func__);
+   on_each_cpu(kvm_mips_init_vm_percpu, kvm, 1);
+   }
+
+
+   return 0;
+}
+
+void kvm_mips_free_vcpus(struct kvm *kvm)
+{
+   unsigned int i;
+   struct kvm_vcpu *vcpu;
+
+   /* Put the pages we reserved for the guest pmap */
+   for (i = 0; i < kvm->arch.guest_pmap_npages; i++) {
+   if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
+   kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]);
+   }
+
+   if (kvm->arch.guest_pmap)
+   kfree(kvm->arch.guest_pmap);
+
+   kvm_for_each_vcpu(i, vcpu, kvm) {
+   kvm_arch_vcpu_free(vcpu);
+   }
+
+   mutex_lock(&kvm->lock);
+
+   for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+   kvm->vcpus[i] = NULL;
+
+   atomic_set(&kvm->online_vcpus, 0);
+
+   mutex_unlock(&kvm->lock);
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
+s

[PATCH v2 05/18] KVM/MIPS32: KVM Guest kernel support.

2012-11-22 Thread Sanjay Lal
Both Guest kernel and Guest Userspace execute in UM. The memory map is as 
follows:
Guest User address space:   0x -> 0x4000
Guest Kernel Unmapped:  0x4000 -> 0x6000
Guest Kernel Mapped:0x6000 -> 0x8000
- Guest Usermode virtual memory is limited to 1GB.

Signed-off-by: Sanjay Lal 
---
 arch/mips/include/asm/mach-generic/spaces.h |  9 -
 arch/mips/include/asm/processor.h   |  5 +
 arch/mips/include/asm/uaccess.h | 11 ++-
 arch/mips/kernel/binfmt_elfo32.c|  4 
 arch/mips/kernel/cevt-r4k.c |  4 
 arch/mips/kernel/traps.c|  7 ++-
 arch/mips/mti-malta/malta-time.c| 13 +
 7 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/arch/mips/include/asm/mach-generic/spaces.h 
b/arch/mips/include/asm/mach-generic/spaces.h
index d7a9efd..ff64289 100644
--- a/arch/mips/include/asm/mach-generic/spaces.h
+++ b/arch/mips/include/asm/mach-generic/spaces.h
@@ -20,14 +20,21 @@
 #endif
 
 #ifdef CONFIG_32BIT
-
+#ifdef CONFIG_KVM_GUEST
+#define CAC_BASE   _AC(0x4000, UL)
+#else
 #define CAC_BASE   _AC(0x8000, UL)
+#endif
 #define IO_BASE_AC(0xa000, UL)
 #define UNCAC_BASE _AC(0xa000, UL)
 
 #ifndef MAP_BASE
+#ifdef CONFIG_KVM_GUEST
+#define MAP_BASE   _AC(0x6000, UL)
+#else
 #define MAP_BASE   _AC(0xc000, UL)
 #endif
+#endif
 
 /*
  * Memory above this physical address will be considered highmem.
diff --git a/arch/mips/include/asm/processor.h 
b/arch/mips/include/asm/processor.h
index 5e33fab..7df9f06 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -44,11 +44,16 @@ extern unsigned int vced_count, vcei_count;
 #define SPECIAL_PAGES_SIZE PAGE_SIZE
 
 #ifdef CONFIG_32BIT
+#ifdef CONFIG_KVM_GUEST
+/* User space process size is limited to 1GB in KVM Guest Mode */
+#define TASK_SIZE  0x3fff8000UL
+#else
 /*
  * User space process size: 2GB. This is hardcoded into a few places,
  * so don't change it unless you know what you are doing.
  */
 #define TASK_SIZE  0x7fff8000UL
+#endif
 
 #ifdef __KERNEL__
 #define STACK_TOP_MAX  TASK_SIZE
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index 3b92efe..61ec84d 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -23,7 +23,11 @@
  */
 #ifdef CONFIG_32BIT
 
-#define __UA_LIMIT 0x8000UL
+#ifdef CONFIG_KVM_GUEST
+#define __UA_LIMIT 0x4000UL
+#else
+#define __UA_LIMIT 0x8000UL
+#endif
 
 #define __UA_ADDR  ".word"
 #define __UA_LA"la"
@@ -55,8 +59,13 @@ extern u64 __ua_limit;
  * address in this range it's the process's problem, not ours :-)
  */
 
+#ifdef CONFIG_KVM_GUEST
+#define KERNEL_DS  ((mm_segment_t) { 0x8000UL })
+#define USER_DS((mm_segment_t) { 0xC000UL })
+#else
 #define KERNEL_DS  ((mm_segment_t) { 0UL })
 #define USER_DS((mm_segment_t) { __UA_LIMIT })
+#endif
 
 #define VERIFY_READ0
 #define VERIFY_WRITE   1
diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c
index ff44823..8793ead 100644
--- a/arch/mips/kernel/binfmt_elfo32.c
+++ b/arch/mips/kernel/binfmt_elfo32.c
@@ -48,7 +48,11 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
__res;  \
 })
 
+#ifdef CONFIG_KVM_GUEST
+#define TASK32_SIZE0x3fff8000UL
+#else
 #define TASK32_SIZE0x7fff8000UL
+#endif
 #undef ELF_ET_DYN_BASE
 #define ELF_ET_DYN_BASE (TASK32_SIZE / 3 * 2)
 
diff --git a/arch/mips/kernel/cevt-r4k.c b/arch/mips/kernel/cevt-r4k.c
index 7532392..eebb05b 100644
--- a/arch/mips/kernel/cevt-r4k.c
+++ b/arch/mips/kernel/cevt-r4k.c
@@ -118,6 +118,10 @@ int c0_compare_int_usable(void)
unsigned int delta;
unsigned int cnt;
 
+#ifdef CONFIG_KVM_GUEST
+return 1;
+#endif
+
/*
 * IP7 already pending?  Try to clear it by acking the timer.
 */
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 9260986..1413aef 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -1709,7 +1709,12 @@ void __init trap_init(void)
ebase = (unsigned long)
__alloc_bootmem(size, 1 << fls(size), 0);
} else {
-   ebase = CKSEG0;
+#ifdef CONFIG_KVM_GUEST
+#define KVM_GUEST_KSEG0 0x4000
+ebase = KVM_GUEST_KSEG0;
+#else
+ebase = CKSEG0;
+#endif
if (cpu_has_mips_r2)
ebase += (read_c0_ebase() & 0x3000);
}
diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c
index 115f5bc..1e6bd99 100644
--- a/arch/mips/mti-malta/malta-time.c
+++ b/arch/mips/mti-malta/malta-time.c
@@ -72,6 +72,19 @@ static unsigned int __init estimate_cpu_freque

[PATCH v2 13/18] MIPS: If KVM is enabled then use the KVM specific routine to flush the TLBs on a ASID wrap.

2012-11-22 Thread Sanjay Lal

Signed-off-by: Sanjay Lal 
---
 arch/mips/include/asm/mmu_context.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/mips/include/asm/mmu_context.h 
b/arch/mips/include/asm/mmu_context.h
index 9b02cfb..10a3fd2 100644
--- a/arch/mips/include/asm/mmu_context.h
+++ b/arch/mips/include/asm/mmu_context.h
@@ -112,15 +112,21 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, 
struct task_struct *tsk)
 static inline void
 get_new_mmu_context(struct mm_struct *mm, unsigned long cpu)
 {
+   extern void kvm_local_flush_tlb_all(void);
unsigned long asid = asid_cache(cpu);
 
if (! ((asid += ASID_INC) & ASID_MASK) ) {
if (cpu_has_vtag_icache)
flush_icache_all();
+#ifdef CONFIG_VIRTUALIZATION
+   kvm_local_flush_tlb_all();  /* start new asid cycle */
+#else
local_flush_tlb_all();  /* start new asid cycle */
+#endif
if (!asid)  /* fix version if needed */
asid = ASID_FIRST_VERSION;
}
+
cpu_context(cpu, mm) = asid_cache(cpu) = asid;
 }
 
-- 
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 03/18] KVM/MIPS32: Entry point for trampolining to the guest and trap handlers.

2012-11-22 Thread Sanjay Lal
- __kvm_mips_vcpu_run: main entry point to enter guest, we save kernel context, 
load
  up guest context from and ERET to guest context.
- mips32_exception: L1 exception handler(s), save k0/k1 and jump to main 
handlers.
- mips32_GuestException: Generic exception handlers for exceptions/interrupts 
while in
  guest context.  Save guest context, restore some kernel context and jump to
  main 'C' handler: kvm_mips_handle_exit()

Signed-off-by: Sanjay Lal 
---
 arch/mips/kvm/kvm_locore.S | 651 +
 1 file changed, 651 insertions(+)
 create mode 100644 arch/mips/kvm/kvm_locore.S

diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
new file mode 100644
index 000..43e7d3f
--- /dev/null
+++ b/arch/mips/kvm/kvm_locore.S
@@ -0,0 +1,651 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* Main entry point for the guest, exception handling.
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+#define _C_LABEL(x) x
+#define MIPSX(name) mips32_ ## name
+#define CALLFRAME_SIZ   32
+
+/*
+ * VECTOR
+ *  exception vector entrypoint
+ */
+#define VECTOR(x, regmask)  \
+.ent_C_LABEL(x),0;  \
+EXPORT(x);
+
+#define VECTOR_END(x)  \
+EXPORT(x);
+
+/* Overload, Danger Will Robinson!! */
+#define PT_HOST_ASIDPT_BVADDR
+#define PT_HOST_USERLOCAL   PT_EPC
+
+#define CP0_DDATA_LO$28,3
+#define CP0_EBASE   $15,1
+
+#define CP0_INTCTL  $12,1
+#define CP0_SRSCTL  $12,2
+#define CP0_SRSMAP  $12,3
+#define CP0_HWRENA  $7,0
+
+/* Resume Flags */
+#define RESUME_FLAG_HOST(1<<1)  /* Resume host? */
+
+#define RESUME_GUEST0
+#define RESUME_HOST RESUME_FLAG_HOST
+
+/*
+ * __kvm_mips_vcpu_run: entry point to the guest
+ * a0: run
+ * a1: vcpu
+ */
+
+FEXPORT(__kvm_mips_vcpu_run)
+.setpush
+.setnoreorder
+.setnoat
+
+/* k0/k1 not being used in host kernel context */
+   addiu   k1,sp, -PT_SIZE
+LONG_S $0, PT_R0(k1)
+LONG_S $1, PT_R1(k1)
+LONG_S $2, PT_R2(k1)
+LONG_S $3, PT_R3(k1)
+
+LONG_S $4, PT_R4(k1)
+LONG_S $5, PT_R5(k1)
+LONG_S $6, PT_R6(k1)
+LONG_S $7, PT_R7(k1)
+
+LONG_S $8,  PT_R8(k1)
+LONG_S $9,  PT_R9(k1)
+LONG_S $10, PT_R10(k1)
+LONG_S $11, PT_R11(k1)
+LONG_S $12, PT_R12(k1)
+LONG_S $13, PT_R13(k1)
+LONG_S $14, PT_R14(k1)
+LONG_S $15, PT_R15(k1)
+LONG_S $16, PT_R16(k1)
+LONG_S $17, PT_R17(k1)
+
+LONG_S $18, PT_R18(k1)
+LONG_S $19, PT_R19(k1)
+LONG_S $20, PT_R20(k1)
+LONG_S $21, PT_R21(k1)
+LONG_S $22, PT_R22(k1)
+LONG_S $23, PT_R23(k1)
+LONG_S $24, PT_R24(k1)
+LONG_S $25, PT_R25(k1)
+
+   /* XXXKYMA k0/k1 not saved, not being used if we got here through an 
ioctl() */
+
+LONG_S $28, PT_R28(k1)
+LONG_S $29, PT_R29(k1)
+LONG_S $30, PT_R30(k1)
+LONG_S $31, PT_R31(k1)
+
+/* Save hi/lo */
+   mflov0
+   LONG_S  v0, PT_LO(k1)
+   mfhiv1
+   LONG_S  v1, PT_HI(k1)
+
+   /* Save host status */
+   mfc0v0, CP0_STATUS
+   LONG_S  v0, PT_STATUS(k1)
+
+   /* Save host ASID, shove it into the BVADDR location */
+   mfc0v1,CP0_ENTRYHI
+   andiv1, 0xff
+   LONG_S  v1, PT_HOST_ASID(k1)
+
+/* Save DDATA_LO, will be used to store pointer to vcpu */
+mfc0v1, CP0_DDATA_LO
+LONG_S  v1, PT_HOST_USERLOCAL(k1)
+
+/* DDATA_LO has pointer to vcpu */
+mtc0a1,CP0_DDATA_LO
+
+/* Offset into vcpu->arch */
+   addiu   k1, a1, VCPU_HOST_ARCH
+
+/* Save the host stack to VCPU, used for exception processing when we exit 
from the Guest */
+LONG_S  sp, VCPU_HOST_STACK(k1)
+
+/* Save the kernel gp as well */
+LONG_S  gp, VCPU_HOST_GP(k1)
+
+   /* Setup status register for running the guest in UM, interrupts are 
disabled */
+   li  k0,(ST0_EXL | KSU_USER| ST0_BEV)
+   mtc0k0,CP0_STATUS
+ehb
+
+/* load up the new EBASE */
+LONG_L  k0, VCPU_GUEST_EBASE(k1)
+mtc0k0,CP0_EBASE
+
+/* Now that the new EBASE has been loaded, unset BEV, set interrupt mask 
as it was 
+ * but make sure that timer interrupts are enabled
+ */
+li   

Re: Interrupt controller updates

2012-11-22 Thread Peter Maydell
On 22 November 2012 21:00, Benjamin Herrenschmidt
 wrote:
> Oh it's simple enough initially, just move the ioctl call from generic
> kvm init to machine init. The problem is then to add an argument, since
> that essentially means changing the ioctl number, but we need that for
> all archs where the interrupt subsystem can be fundamentally different
> based on the platform.

I cynically suspect there may need to be some disentangling of x86/qemu
code assumptions about what happens when, in order to do this "just
move" step :-)

> Basically, what was discussed in the BOF was that we split the init:
>
>  * The existing ioctl moves to early machine init (before VCPUs) and
> gets that argument to define the type of interrupt subsystem to use. It
> causes powerpc to instantiate ICPs per VCPUs for example. On archs that
> don't have a per-vcpu structure (equivalent of local APIC or ICP), all
> it does is enable subsequent irq related ioctls to work (it's just an
> "enable" flag).
>
>  * A new ioctl is used to actually instantiate external interrupt
> controllers (GIC on ARM, ICS for ppc/pseries, MPIC for ppc/mpic, ...).
> This is used later by the PIC code itself when the former ioctl has
> enabled "in kernel PIC"

For ARM we could move to use this but it would just be for the
benefit of nicer fallback behaviour (you could say "no in kernel
GIC" if the user runs qemu with a guest CPU which doesn't have a
GIC, rather than having to exit saying "incompatible options"
if 'in-kernel irqchip' and 'cpu with no irqchip' were both
specified).

>  * A new ioctl is used for platforms that need to be able to adjust the
> base address of a PIC (arm/GIC, ppc/mpic)

We have the ABI for this already in the kvm/arm patches which are
heading into final review, by the way.

-- PMM
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 1/4] nested vmx: clean up for vmcs12 read and write

2012-11-22 Thread Xu, Dongxiao


> -Original Message-
> From: Gleb Natapov [mailto:g...@redhat.com]
> Sent: Wednesday, November 21, 2012 9:27 PM
> To: Xu, Dongxiao
> Cc: kvm@vger.kernel.org; mtosa...@redhat.com
> Subject: Re: [PATCH 1/4] nested vmx: clean up for vmcs12 read and write
> 
> On Wed, Nov 21, 2012 at 05:04:34PM +0800, Dongxiao Xu wrote:
> > abstract vmcs12_read and vmcs12_write functions to do the vmcs12
> > read/write operations.
> >
> > Signed-off-by: Dongxiao Xu 
> > ---
> >  arch/x86/kvm/vmx.c |   86
> +++-
> >  1 files changed, 45 insertions(+), 41 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index
> > f858159..d8670e4 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -5407,32 +5407,67 @@ static inline int vmcs_field_readonly(unsigned
> long field)
> >   * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
> >   * 64-bit fields are to be returned).
> >   */
> > -static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
> > -   unsigned long field, u64 *ret)
> > +static inline u64 vmcs12_read(struct kvm_vcpu *vcpu, unsigned long
> > +field)
> >  {
> > short offset = vmcs_field_to_offset(field);
> > char *p;
> >
> > -   if (offset < 0)
> > +   if (offset < 0) {
> > +   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > +   skip_emulated_instruction(vcpu);
> > return 0;
> > +   }
> >
> > p = ((char *)(get_vmcs12(vcpu))) + offset;
> >
> > switch (vmcs_field_type(field)) {
> > case VMCS_FIELD_TYPE_NATURAL_WIDTH:
> > -   *ret = *((natural_width *)p);
> > +   return *((natural_width *)p);
> > +   case VMCS_FIELD_TYPE_U16:
> > +   return *((u16 *)p);
> > +   case VMCS_FIELD_TYPE_U32:
> > +   return *((u32 *)p);
> > +   case VMCS_FIELD_TYPE_U64:
> > +   return *((u64 *)p);
> > +   default:
> > +   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > +   skip_emulated_instruction(vcpu);
> > +   return 0; /* can never happen. */
> > +   }
> > +}
> > +
> > +static inline int vmcs12_write(struct kvm_vcpu *vcpu,
> > +   unsigned long field,
> > +   u64 value)
> > +{
> > +   short offset = vmcs_field_to_offset(field);
> > +   char *p;
> > +
> > +   if (offset < 0) {
> > +   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > +   skip_emulated_instruction(vcpu);
> > +   return 0;
> > +   }
> > +
> > +   p = ((char *)(get_vmcs12(vcpu))) + offset;
> > +
> > +   switch (vmcs_field_type(field)) {
> > +   case VMCS_FIELD_TYPE_NATURAL_WIDTH:
> > +   *(natural_width *)p = value;
> > return 1;
> > case VMCS_FIELD_TYPE_U16:
> > -   *ret = *((u16 *)p);
> > +   *(u16 *)p = value;
> > return 1;
> > case VMCS_FIELD_TYPE_U32:
> > -   *ret = *((u32 *)p);
> > +   *(u32 *)p = value;
> > return 1;
> > case VMCS_FIELD_TYPE_U64:
> > -   *ret = *((u64 *)p);
> > +   *(u64 *)p = value;
> > return 1;
> > default:
> > -   return 0; /* can never happen. */
> > +   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > +   skip_emulated_instruction(vcpu);
> > +   return 0;
> > }
> >  }
> >
> > @@ -5466,11 +5501,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
> > /* Decode instruction info and find the field to read */
> > field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
> > /* Read the field, zero-extended to a u64 field_value */
> > -   if (!vmcs12_read_any(vcpu, field, &field_value)) {
> > -   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > -   skip_emulated_instruction(vcpu);
> > -   return 1;
> > -   }
> > +   field_value = vmcs12_read(vcpu, field);
> You do not handle failure here and always write back field_value even if
> vmcs12_read() failed. Actually now it is impossible to detect a failure. Call 
> to
> nested_vmx_failValid() in vmcs12_read() will be overwritten by call to
> nested_vmx_succeed() at the end of
> handle_vmread() and skip_emulated_instruction() will be called twice.

Thanks Gleb and Orit to raise this issue.

What about moving the offset check outside the vmcs12_read() and vmcs12_write() 
function, and put it directly in handle_vmread() and handle_vmwrite()?
I think we only need to do offset error check in handle_vmread() and 
handle_vmwrite() since they are to emulate correct behavior for guest VMM. For 
example, if guest VMM reads a field that is not valid or writes a field that is 
read only, then in emulation code handle_vmread() and handle_vmwrite, we need 
to raise error to guest VMM.
For other calling of vmcs12_read() and vmcs12_write() functions in KVM 
hypervisor (see PATCH 3/4), actually the caller needs 

Re: [PATCH v2 09/18] KVM/MIPS32: COP0 accesses profiling.

2012-11-22 Thread Sergei Shtylyov

Hello.

On 22-11-2012 6:34, Sanjay Lal wrote:


Signed-off-by: Sanjay Lal 
---
  arch/mips/kvm/kvm_mips_stats.c | 81 ++
  1 file changed, 81 insertions(+)
  create mode 100644 arch/mips/kvm/kvm_mips_stats.c



diff --git a/arch/mips/kvm/kvm_mips_stats.c b/arch/mips/kvm/kvm_mips_stats.c
new file mode 100644
index 000..e442a26
--- /dev/null
+++ b/arch/mips/kvm/kvm_mips_stats.c
@@ -0,0 +1,81 @@
+/*
+* This file is subject to the terms and conditions of the GNU General Public
+* License.  See the file "COPYING" in the main directory of this archive
+* for more details.
+*
+* KVM/MIPS: COP0 access histogram
+*
+* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+* Authors: Sanjay Lal 
+*/
+
+#include 
+
+char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = {
+   "WAIT",
+   "CACHE",
+   "Signal",
+   "Interrupt",
+   "COP0/1 Unusable",
+   "TLB Mod",
+   "TLB Miss (LD)",
+   "TLB Miss (ST)",
+   "Address Err (ST)",
+   "Address Error (LD)",


   I guess it should be "Error" in both cases.


+   "System Call",
+   "Reserved Inst",
+   "Break Inst",
+   "D-Cache Flushes",
+};
+
+char *kvm_cop0_str[N_MIPS_COPROC_REGS] = {
+   "Index",
+   "Random",
+   "EntryLo0",
+   "EntryLo1",
+   "Context",
+   "PG Mask",
+   "Wired",
+   "HWREna",
+   "BadVAddr",
+   "Count",
+   "EntryHI",


   EntryHi.


+int kvm_mips_dump_stats(struct kvm_vcpu *vcpu)
+{
+   int i, j __unused;


   Empty line after declarations wouldn't hurt.


+#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS


WBR, Sergei


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Interrupt controller updates

2012-11-22 Thread Paolo Bonzini
Il 21/11/2012 02:07, Benjamin Herrenschmidt ha scritto:
> David (CC) want to make some progress with our in-kernel PIC. From
> memory, one of the outcomes of the BOF was that we need to move the
> existing "enable in-kernel PIC" from generic KVM init to machine init in
> order to be able to add an argument indicating the "model" use by the
> arch/platform since some like ours support several different models and
> since that all needs to be selected before the VCPUs are created.
> 
> Again, from memory, you were volunteered to do the initial x86 change so
> we could piggy back on it :-) Or do I remember wrong ?

Please suggest an API, then we can work out the x86 changes.  I can
volunteer myself, but I wasn't in the BOF so I need something more concrete.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/6] x86, apicv: add virtual interrupt delivery support

2012-11-22 Thread Gleb Natapov
On Wed, Nov 21, 2012 at 04:09:36PM +0800, Yang Zhang wrote:
> Virtual interrupt delivery avoids KVM to inject vAPIC interrupts
> manually, which is fully taken care of by the hardware. This needs
> some special awareness into existing interrupt injection path:
> 
> - for pending interrupt, instead of direct injection, we may need
>   update architecture specific indicators before resuming to guest.
> 
> - A pending interrupt, which is masked by ISR, should be also
>   considered in above update action, since hardware will decide
>   when to inject it at right time. Current has_interrupt and
>   get_interrupt only returns a valid vector from injection p.o.v.
> 
> Signed-off-by: Yang Zhang 
> Signed-off-by: Kevin Tian 
> ---
>  arch/x86/include/asm/kvm_host.h |4 +
>  arch/x86/include/asm/vmx.h  |   11 
>  arch/x86/kvm/irq.c  |   44 ++
>  arch/x86/kvm/lapic.c|   44 +-
>  arch/x86/kvm/lapic.h|   13 
>  arch/x86/kvm/svm.c  |6 ++
>  arch/x86/kvm/vmx.c  |  125 
> ++-
>  arch/x86/kvm/x86.c  |   16 +-
>  virt/kvm/ioapic.c   |1 +
>  9 files changed, 260 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index b2e11f4..8e07a86 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -682,6 +682,10 @@ struct kvm_x86_ops {
>   void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>   void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>   void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> + int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
> + void (*update_irq)(struct kvm_vcpu *vcpu);
> + void (*set_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector,
> + int need_eoi, int global);
>   int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>   int (*get_tdp_level)(void);
>   u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 21101b6..1003341 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -62,6 +62,7 @@
>  #define EXIT_REASON_MCE_DURING_VMENTRY  41
>  #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
>  #define EXIT_REASON_APIC_ACCESS 44
> +#define EXIT_REASON_EOI_INDUCED 45
>  #define EXIT_REASON_EPT_VIOLATION   48
>  #define EXIT_REASON_EPT_MISCONFIG   49
>  #define EXIT_REASON_WBINVD  54
> @@ -143,6 +144,7 @@
>  #define SECONDARY_EXEC_WBINVD_EXITING0x0040
>  #define SECONDARY_EXEC_UNRESTRICTED_GUEST0x0080
>  #define SECONDARY_EXEC_APIC_REGISTER_VIRT   0x0100
> +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY0x0200
>  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING0x0400
>  #define SECONDARY_EXEC_ENABLE_INVPCID0x1000
>  
> @@ -180,6 +182,7 @@ enum vmcs_field {
>   GUEST_GS_SELECTOR   = 0x080a,
>   GUEST_LDTR_SELECTOR = 0x080c,
>   GUEST_TR_SELECTOR   = 0x080e,
> + GUEST_INTR_STATUS   = 0x0810,
>   HOST_ES_SELECTOR= 0x0c00,
>   HOST_CS_SELECTOR= 0x0c02,
>   HOST_SS_SELECTOR= 0x0c04,
> @@ -207,6 +210,14 @@ enum vmcs_field {
>   APIC_ACCESS_ADDR_HIGH   = 0x2015,
>   EPT_POINTER = 0x201a,
>   EPT_POINTER_HIGH= 0x201b,
> + EOI_EXIT_BITMAP0= 0x201c,
> + EOI_EXIT_BITMAP0_HIGH   = 0x201d,
> + EOI_EXIT_BITMAP1= 0x201e,
> + EOI_EXIT_BITMAP1_HIGH   = 0x201f,
> + EOI_EXIT_BITMAP2= 0x2020,
> + EOI_EXIT_BITMAP2_HIGH   = 0x2021,
> + EOI_EXIT_BITMAP3= 0x2022,
> + EOI_EXIT_BITMAP3_HIGH   = 0x2023,
>   GUEST_PHYSICAL_ADDRESS  = 0x2400,
>   GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
>   VMCS_LINK_POINTER   = 0x2800,
> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> index 7e06ba1..c7356a3 100644
> --- a/arch/x86/kvm/irq.c
> +++ b/arch/x86/kvm/irq.c
> @@ -60,6 +60,29 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
>  EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
>  
>  /*
> + * check if there is pending interrupt without
> + * intack. This _apicv version is used when hardware
> + * supports APIC virtualization with virtual interrupt
> + * delivery support. In such case, KVM is not required
> + * to poll pending APIC interrupt, and thus this
> + * interface is used to poll pending interupts from
> + * non-APIC source.
> + */
> +int kvm_cpu_has_extint(struct kvm_vcpu *v)
> +{
> + struct kvm_pic *s;
> +
> + if (!irqchip_in_kernel(v->kvm))
> +   

[Bug 50891] New: The smp_affinity cannot work correctly when PCI passthrough device using msi/msi-x with KVM

2012-11-22 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=50891

   Summary: The smp_affinity cannot work correctly when PCI
passthrough device using msi/msi-x with KVM
   Product: Virtualization
   Version: unspecified
Kernel Version: 3.7-rc6
  Platform: All
OS/Version: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: kvm
AssignedTo: virtualization_...@kernel-bugs.osdl.org
ReportedBy: yiliker...@gmail.com
CC: kvm@vger.kernel.org
Regression: No


1: passthrough a netcard (Brodcom BCM5716S) to the guest os

2: ifup the netcard, the card will use msi-x interrupt default, and close the
irqbalance service

3:  echo 4 > cat /proc/irq/NETCARDIRQ/smp_affinity, so we assume the vcpu2
handle the irq.

4: we have set  and set the irq kvm:pci-bus to
the pcpu1 on the host.

We think this configuration will reduce IPI interrupts when injecting interrupts into
the guest OS, but this IRQ is not handled only on vcpu2; this is not expected.

-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are on the CC list for the bug.
You are watching the assignee of the bug.--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 50891] The smp_affinity cannot work correctly on guest os when PCI passthrough device using msi/msi-x with KVM

2012-11-22 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=50891


liyi  changed:

   What|Removed |Added

Summary|The smp_affinity cannot |The smp_affinity cannot
   |work correctly when PCI |work correctly on guest os
   |passthrough device using|when PCI passthrough device
   |msi/msi-x with KVM  |using msi/msi-x with KVM




-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are on the CC list for the bug.
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/8] kvm tools: don't bother including linux/compiler.h

2012-11-22 Thread Will Deacon
linux/compiler.h will never give us a definition for __compiler_offsetof
because __KERNEL__ isn't defined, so just use the simple definition that
we have already.

This patch removes the redundant code.

Signed-off-by: Will Deacon 
---
 tools/kvm/include/linux/stddef.h | 6 --
 1 file changed, 6 deletions(-)

diff --git a/tools/kvm/include/linux/stddef.h b/tools/kvm/include/linux/stddef.h
index 60ea512..39da808 100644
--- a/tools/kvm/include/linux/stddef.h
+++ b/tools/kvm/include/linux/stddef.h
@@ -1,16 +1,10 @@
 #ifndef _LINUX_STDDEF_H
 #define _LINUX_STDDEF_H
 
-#include 
-
 #undef NULL
 #define NULL ((void *)0)
 
 #undef offsetof
-#ifdef __compiler_offsetof
-#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
-#else
 #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
-#endif
 
 #endif
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 5/8] kvm tools: keep track of registered memory banks in struct kvm

2012-11-22 Thread Will Deacon
When registering memory banks for a guest, it is useful to keep the
range information around for translating between guest and host address
spaces.

This patch adds a list of kvm_mem_bank structures to struct kvm, which
is updated when a new bank is registered.

Signed-off-by: Will Deacon 
---
 tools/kvm/include/kvm/kvm.h |  8 
 tools/kvm/kvm.c | 23 ++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
index cf959ea..9b4a9a4 100644
--- a/tools/kvm/include/kvm/kvm.h
+++ b/tools/kvm/include/kvm/kvm.h
@@ -35,6 +35,13 @@ struct kvm_ext {
int code;
 };
 
+struct kvm_mem_bank {
+   struct list_headlist;
+   u64 guest_phys_addr;
+   void*host_addr;
+   u64 size;
+};
+
 struct kvm {
struct kvm_arch arch;
struct kvm_config   cfg;
@@ -49,6 +56,7 @@ struct kvm {
u64 ram_size;
void*ram_start;
u64 ram_pagesize;
+   struct list_headmem_banks;
 
boolnmi_disabled;
 
diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c
index b283171..1a10ec0 100644
--- a/tools/kvm/kvm.c
+++ b/tools/kvm/kvm.c
@@ -6,7 +6,9 @@
 #include "kvm/kvm-cpu.h"
 #include "kvm/kvm-ipc.h"
 
+#include 
 #include 
+#include 
 #include 
 
 #include 
@@ -133,9 +135,16 @@ struct kvm *kvm__new(void)
 
 int kvm__exit(struct kvm *kvm)
 {
+   struct kvm_mem_bank *bank, *tmp;
+
kvm__arch_delete_ram(kvm);
-   free(kvm);
 
+   list_for_each_entry_safe(bank, tmp, &kvm->mem_banks, list) {
+   list_del(&bank->list);
+   free(bank);
+   }
+
+   free(kvm);
return 0;
 }
 core_exit(kvm__exit);
@@ -148,8 +157,18 @@ core_exit(kvm__exit);
 int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void 
*userspace_addr)
 {
struct kvm_userspace_memory_region mem;
+   struct kvm_mem_bank *bank;
int ret;
 
+   bank = malloc(sizeof(*bank));
+   if (!bank)
+   return -ENOMEM;
+
+   INIT_LIST_HEAD(&bank->list);
+   bank->guest_phys_addr   = guest_phys;
+   bank->host_addr = userspace_addr;
+   bank->size  = size;
+
mem = (struct kvm_userspace_memory_region) {
.slot   = kvm->mem_slots++,
.guest_phys_addr= guest_phys,
@@ -161,6 +180,7 @@ int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 
size, void *userspace
if (ret < 0)
return -errno;
 
+   list_add(&bank->list, &kvm->mem_banks);
return 0;
 }
 
@@ -245,6 +265,7 @@ int kvm__init(struct kvm *kvm)
 
kvm__arch_init(kvm, kvm->cfg.hugetlbfs_path, kvm->cfg.ram_size);
 
+   INIT_LIST_HEAD(&kvm->mem_banks);
kvm__init_ram(kvm);
 
if (!kvm->cfg.firmware_filename) {
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/8] kvm tools: add support for ARMv7 processors

2012-11-22 Thread Will Deacon
Hello,

This is version two of the patches I originally posted here:

  http://www.spinics.net/lists/kvm/msg82447.html

Changes since version one include:

- MAX_MEMORY no longer needlessly page-aligned for ARM
- Use xread instead of read for reading the kernel image
- Limit virtual CPUs to 8 due to hardware limitations of the GIC
- Use hugetlbfs_path instead of NULL
- Improved member type consistency between kvm_mem_bank and
  kvm_userspace_memory_region
- New tree-based device registration which moves device number
  allocation out of the arch-dependent IRQ registration code
- Included dependency from mainline kernel to use rbtrees without
  including linux/compiler.h
- SMP secondary boot now initialises the GIC CPU interface (this
  worked by fluke previously due to a bug in kvm)
- Added a dummy set_size_vq function to the balloon driver,
  which I accidentally missed in the original patch (now merged)
- Fixed header guard consistency
- Dropped the RFC tag
- Rebased onto latest kvmtool code

As usual, all comments welcome.

Cheers,

Will


Will Deacon (8):
  rbtree: include linux/compiler.h for definition of __always_inline
  kvm tools: don't bother including linux/compiler.h
  kvm tools: balloon: add dummy set_size_vq implementation
  kvm tools: add generic device registration mechanism
  kvm tools: keep track of registered memory banks in struct kvm
  kvm tools: teach guest_flat_to_host about memory banks starting above
0
  kvm tools: provide a mechanism for translating host to guest
addresses
  kvm tools: add support for ARMv7 processors

 include/linux/rbtree_augmented.h |   1 +
 tools/kvm/Makefile   |  23 ++-
 tools/kvm/arm/aarch32/cortex-a15.c   |  98 ++
 tools/kvm/arm/aarch32/include/kvm/barrier.h  |  10 +
 tools/kvm/arm/aarch32/include/kvm/kvm-arch.h |  30 +++
 tools/kvm/arm/aarch32/kvm-cpu.c  | 111 +++
 tools/kvm/arm/aarch32/smp-pen.S  |  30 +++
 tools/kvm/arm/fdt.c  | 266 +++
 tools/kvm/arm/gic.c  |  92 +
 tools/kvm/arm/include/arm-common/gic.h   |  34 
 tools/kvm/arm/include/arm-common/kvm-arch.h  |  34 
 tools/kvm/arm/include/kvm/kvm-cpu-arch.h |  47 +
 tools/kvm/arm/ioport.c   |   5 +
 tools/kvm/arm/irq.c  |  17 ++
 tools/kvm/arm/kvm-cpu.c  | 107 +++
 tools/kvm/arm/kvm.c  |  69 +++
 tools/kvm/arm/smp.c  |  21 +++
 tools/kvm/devices.c  |  86 +
 tools/kvm/hw/pci-shmem.c |  12 +-
 tools/kvm/hw/vesa.c  |  12 +-
 tools/kvm/include/kvm/devices.h  |  27 +++
 tools/kvm/include/kvm/irq.h  |   2 +-
 tools/kvm/include/kvm/kvm.h  |  16 +-
 tools/kvm/include/kvm/pci.h  |   2 -
 tools/kvm/include/kvm/virtio-mmio.h  |   1 +
 tools/kvm/include/kvm/virtio-pci.h   |   2 +
 tools/kvm/include/linux/stddef.h |   6 -
 tools/kvm/kvm.c  |  56 +-
 tools/kvm/pci.c  |  39 ++--
 tools/kvm/powerpc/irq.c  |  10 +-
 tools/kvm/powerpc/spapr_pci.c|   2 +-
 tools/kvm/virtio/balloon.c   |   7 +
 tools/kvm/virtio/mmio.c  |  11 +-
 tools/kvm/virtio/pci.c   |  11 +-
 tools/kvm/x86/include/kvm/kvm-arch.h |   9 -
 tools/kvm/x86/irq.c  |   4 +-
 tools/kvm/x86/kvm.c  |   7 +
 37 files changed, 1242 insertions(+), 75 deletions(-)
 create mode 100644 tools/kvm/arm/aarch32/cortex-a15.c
 create mode 100644 tools/kvm/arm/aarch32/include/kvm/barrier.h
 create mode 100644 tools/kvm/arm/aarch32/include/kvm/kvm-arch.h
 create mode 100644 tools/kvm/arm/aarch32/kvm-cpu.c
 create mode 100644 tools/kvm/arm/aarch32/smp-pen.S
 create mode 100644 tools/kvm/arm/fdt.c
 create mode 100644 tools/kvm/arm/gic.c
 create mode 100644 tools/kvm/arm/include/arm-common/gic.h
 create mode 100644 tools/kvm/arm/include/arm-common/kvm-arch.h
 create mode 100644 tools/kvm/arm/include/kvm/kvm-cpu-arch.h
 create mode 100644 tools/kvm/arm/ioport.c
 create mode 100644 tools/kvm/arm/irq.c
 create mode 100644 tools/kvm/arm/kvm-cpu.c
 create mode 100644 tools/kvm/arm/kvm.c
 create mode 100644 tools/kvm/arm/smp.c
 create mode 100644 tools/kvm/devices.c
 create mode 100644 tools/kvm/include/kvm/devices.h

-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/8] rbtree: include linux/compiler.h for definition of __always_inline

2012-11-22 Thread Will Deacon
Commit 29fc7c5a4f516d388fb6e1f6d24bfb04b8093e54 upstream.

rb_erase_augmented() is a static function annotated with
__always_inline.  This causes a compile failure when attempting to use
the rbtree implementation as a library (e.g.  kvm tool):

  rbtree_augmented.h:125:24: error: expected `=', `,', `;', `asm' or 
`__attribute__' before `void'

Include linux/compiler.h in rbtree_augmented.h so that the __always_inline
macro is resolved correctly.

Signed-off-by: Will Deacon 
Cc: Pekka Enberg 
Reviewed-by: Michel Lespinasse 
Cc: Ingo Molnar 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 
---
 include/linux/rbtree_augmented.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 214caa3..2ac60c9 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -24,6 +24,7 @@
 #ifndef _LINUX_RBTREE_AUGMENTED_H
 #define _LINUX_RBTREE_AUGMENTED_H
 
+#include 
 #include 
 
 /*
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Re: Re: Re: [RFC PATCH 0/2] kvm/vmx: Output TSC offset

2012-11-22 Thread Yoshihiro YUNOMAE

Hi Marcelo,

(2012/11/21 7:51), Marcelo Tosatti wrote:

On Tue, Nov 20, 2012 at 07:36:33PM +0900, Yoshihiro YUNOMAE wrote:

Hi Marcelo,

Sorry for the late reply.

(2012/11/17 4:15), Marcelo Tosatti wrote:

On Wed, Nov 14, 2012 at 05:26:10PM +0900, Yoshihiro YUNOMAE wrote:

Thank you for commenting on my patch set.

(2012/11/14 11:31), Steven Rostedt wrote:

On Tue, 2012-11-13 at 18:03 -0800, David Sharp wrote:

On Tue, Nov 13, 2012 at 6:00 PM, Steven Rostedt  wrote:

On Wed, 2012-11-14 at 10:36 +0900, Yoshihiro YUNOMAE wrote:


To merge the data like previous pattern, we apply this patch set. Then, we can
get TSC offset of the guest as follows:

$ dmesg | grep kvm
[   57.717180] kvm: (2687) write TSC offset 18446743360465545001, now clock ##
     |
  PID TSC offset |
HOST TSC value --+



Using printk to export something like this is IMO a nasty hack.

Can't we create a /sys or /proc file to export the same thing?


Since the value changes over the course of the trace, and seems to be
part of the context of the trace, I think I'd include it as a
tracepoint.



I'm fine with that too.


Using some tracepoint is a nice idea, but there is one problem. Here,
our discussion point is "the event which TSC offset is changed does not
frequently occur, but the buffer must keep the event data."

There are two ideas for using tracepoint. First, we define new
tracepoint for changed TSC offset. This is simple and the overhead will
be low. However, this trace event stored in the buffer will be
overwritten by other trace events because this TSC offset event does
not frequently occur. Second, we add TSC offset information to the
tracepoint that frequently occurs. For example, we assume that TSC offset
information is added to arguments of trace_kvm_exit().


The TSC offset is in the host trace. So given a host trace with two TSC
offset updates, how do you know which events in the guest trace
(containing a number of events) refer to which tsc offset update?

Unless i am missing something, you can't solve this easily (well, except
exporting information to the guest that allows it to transform RDTSC ->
host TSC value, which can be done via pvclock).


As you say, TSC offset events are in the host trace, but we don't need
to notify guests of updating TSC offset. The offset event will output
the next TSC offset value and the current TSC value, so we can
calculate the guest TSC (T1) for the event. Guest TSCs since T1 can be
converted to host TSC using the TSC offset, so we can integrate those
trace data.


Think of this scenario:

host trace
1h. event tsc write tsc_offset=-1000
3h. vmenter
4h. vmexit
... (event sequence)
99h. vmexit
100h. event tsc_write tsc_offset=-2000
101h. vmenter
... (event sequence).
500h. event tsc_write tsc_offset=-3000

Then a guest trace containing events with a TSC timestamp.
Which tsc_offset to use?

(that is the problem, which unless i am mistaken can only be solved
easily if the guest can convert RDTSC -> TSC of host).


There are three following cases of changing TSC offset:
 1. Reset TSC at guest boot time
 2. Adjust TSC offset due to some host's problems
 3. Write TSC on guests
The scenario which you mentioned is case 3, so we'll discuss this case.
Here, we assume that a guest is allocated single CPU for the sake of
ease.

If a guest executes write_tsc, TSC values jumps to forward or backward.
For the forward case, trace data are as follows:

   <   guest   >
cyclesevents   cycles   events
 3000   tsc_offset=-2950
 3001   kvm_enter
 53 eventX
 
100 (write_tsc=+900)
 3060   kvm_exit
 3075   tsc_offset=-2050
 3080   kvm_enter
   1050 event1
   1055 event2
 ...


This case is simple. The guest TSC of the first kvm_enter is calculated
as follows:

  (host TSC of kvm_enter) + (current tsc_offset) = 3001 - 2950 = 51

Similarly, the guest TSC of the second kvm_enter is 130. So, the guest
events between 51 and 130, that is, 53 eventX is inserted between the
first pair of kvm_enter and kvm_exit. To insert events of the guests
between 51 and 130, we convert the guest TSC to the host TSC using TSC
offset 2950.

For the backward case, trace data are as follows:

   <   guest   >
cyclesevents   cycles   events
 3000   tsc_offset=-2950
 3001   kvm_enter
 53 eventX
 
100 (write_tsc=-50)
 3060   kvm_exit
 3075   tsc_offset=-2050
 3080   kvm_enter
 90 event1
 95 event2

RE: [PATCH 1/4] nested vmx: clean up for vmcs12 read and write

2012-11-22 Thread Xu, Dongxiao


> -Original Message-
> From: Gleb Natapov [mailto:g...@redhat.com]
> Sent: Wednesday, November 21, 2012 9:04 PM
> To: Xu, Dongxiao
> Cc: kvm@vger.kernel.org; mtosa...@redhat.com
> Subject: Re: [PATCH 1/4] nested vmx: clean up for vmcs12 read and write
> 
> On Wed, Nov 21, 2012 at 05:04:34PM +0800, Dongxiao Xu wrote:
> > abstract vmcs12_read and vmcs12_write functions to do the vmcs12
> > read/write operations.
> >
> > Signed-off-by: Dongxiao Xu 
> > ---
> >  arch/x86/kvm/vmx.c |   86
> +++-
> >  1 files changed, 45 insertions(+), 41 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index
> > f858159..d8670e4 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -5407,32 +5407,67 @@ static inline int vmcs_field_readonly(unsigned
> long field)
> >   * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
> >   * 64-bit fields are to be returned).
> >   */
> > -static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
> > -   unsigned long field, u64 *ret)
> > +static inline u64 vmcs12_read(struct kvm_vcpu *vcpu, unsigned long
> > +field)
> >  {
> > short offset = vmcs_field_to_offset(field);
> > char *p;
> >
> > -   if (offset < 0)
> > +   if (offset < 0) {
> > +   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > +   skip_emulated_instruction(vcpu);
> > return 0;
> > +   }
> >
> > p = ((char *)(get_vmcs12(vcpu))) + offset;
> >
> > switch (vmcs_field_type(field)) {
> > case VMCS_FIELD_TYPE_NATURAL_WIDTH:
> > -   *ret = *((natural_width *)p);
> > +   return *((natural_width *)p);
> > +   case VMCS_FIELD_TYPE_U16:
> > +   return *((u16 *)p);
> > +   case VMCS_FIELD_TYPE_U32:
> > +   return *((u32 *)p);
> > +   case VMCS_FIELD_TYPE_U64:
> > +   return *((u64 *)p);
> > +   default:
> > +   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > +   skip_emulated_instruction(vcpu);
> > +   return 0; /* can never happen. */
> > +   }
> > +}
> > +
> > +static inline int vmcs12_write(struct kvm_vcpu *vcpu,
> > +   unsigned long field,
> > +   u64 value)
> > +{
> > +   short offset = vmcs_field_to_offset(field);
> > +   char *p;
> > +
> > +   if (offset < 0) {
> > +   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > +   skip_emulated_instruction(vcpu);
> > +   return 0;
> > +   }
> > +
> Shouldn't vmcs_field_readonly() check be in vmcs12_write() instead of a 
> caller?

Well, you can see from the later patch that, we construct vmcs12 through the 
vmcs12_write() function. Some fields like the exit reason, exit qualification 
are needed to be written into the vmcs12 area. Therefore we could not put the 
read only check into the vmcs12_write() function.

Thanks,
Dongxiao

> 
> > +   p = ((char *)(get_vmcs12(vcpu))) + offset;
> > +
> > +   switch (vmcs_field_type(field)) {
> > +   case VMCS_FIELD_TYPE_NATURAL_WIDTH:
> > +   *(natural_width *)p = value;
> > return 1;
> > case VMCS_FIELD_TYPE_U16:
> > -   *ret = *((u16 *)p);
> > +   *(u16 *)p = value;
> > return 1;
> > case VMCS_FIELD_TYPE_U32:
> > -   *ret = *((u32 *)p);
> > +   *(u32 *)p = value;
> > return 1;
> > case VMCS_FIELD_TYPE_U64:
> > -   *ret = *((u64 *)p);
> > +   *(u64 *)p = value;
> > return 1;
> > default:
> > -   return 0; /* can never happen. */
> > +   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > +   skip_emulated_instruction(vcpu);
> > +   return 0;
> > }
> >  }
> >
> > @@ -5466,11 +5501,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
> > /* Decode instruction info and find the field to read */
> > field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
> > /* Read the field, zero-extended to a u64 field_value */
> > -   if (!vmcs12_read_any(vcpu, field, &field_value)) {
> > -   nested_vmx_failValid(vcpu,
> VMXERR_UNSUPPORTED_VMCS_COMPONENT);
> > -   skip_emulated_instruction(vcpu);
> > -   return 1;
> > -   }
> > +   field_value = vmcs12_read(vcpu, field);
> > /*
> >  * Now copy part of this value to register or memory, as requested.
> >  * Note that the number of bits actually copied is 32 or 64
> > depending @@ -5500,8 +5531,6 @@ static int handle_vmwrite(struct
> kvm_vcpu *vcpu)
> > gva_t gva;
> > unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> > u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> > -   char *p;
> > -   short offset;
> > /* The value to write might be 32 or 64 bits, depending on L1's long
> >  * mode, and eventually we need to write that into a field of several
> >  *

KVM Disk i/o or VM activities causes soft lockup?

2012-11-22 Thread Vincent Li
Hi,

We have users running on redhat based distro (Kernel
2.6.32-131.21.1.el6.x86_64 ) with kvm, when customer made cron job
script to copy large files between kvm guest or some other user space
program leads to disk i/o or VM activities, users get following soft
lockup message from console:

Nov 17 13:44:46 slot1/luipaard100a err kernel: BUG: soft lockup -
CPU#4 stuck for 61s! [qemu-kvm:6795]
Nov 17 13:44:46 slot1/luipaard100a warning kernel: Modules linked in:
ebt_vlan nls_utf8 isofs ebtable_filter ebtables 8021q garp bridge stp
llc ipt_REJECT iptable_filter xt_NOTRACK nf_conntrack iptable_raw
ip_tables loop ext2 binfmt_misc hed womdict(U) vnic(U) parport_pc lp
parport predis(U) lasthop(U) ipv6 toggler vhost_net tun kvm_intel kvm
jiffies(U) sysstats hrsleep i2c_dev datastor(U) linux_user_bde(P)(U)
linux_kernel_bde(P)(U) tg3 libphy serio_raw i2c_i801 i2c_core ehci_hcd
raid1 raid0 virtio_pci virtio_blk virtio virtio_ring mvsas libsas
scsi_transport_sas mptspi mptscsih mptbase scsi_transport_spi 3w_9xxx
sata_svw(U) ahci serverworks sata_sil ata_piix libata sd_mod
crc_t10dif amd74xx piix ide_gd_mod ide_core dm_snapshot dm_mirror
dm_region_hash dm_log dm_mod ext3 jbd mbcache
Nov 17 13:44:46 slot1/luipaard100a warning kernel: Pid: 6795, comm:
qemu-kvm Tainted: P   
2.6.32-131.21.1.el6.f5.x86_64 #1
Nov 17 13:44:46 slot1/luipaard100a warning kernel: Call Trace:
Nov 17 13:44:46 slot1/luipaard100a warning kernel: 
[] ? get_timestamp+0x9/0xf
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? watchdog_timer_fn+0x130/0x178
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? __run_hrtimer+0xa3/0xff
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? hrtimer_interrupt+0xe6/0x190
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? hrtimer_interrupt+0xa9/0x190
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? hpet_interrupt_handler+0x26/0x2d
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? hrtimer_peek_ahead_timers+0x9/0xd
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? __do_softirq+0xc5/0x17a
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? call_softirq+0x1c/0x28
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? do_softirq+0x31/0x66
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? call_function_interrupt+0x13/0x20
Nov 17 13:44:46 slot1/luipaard100a warning kernel: 
[] ? vmx_get_msr+0x0/0x123 [kvm_intel]
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? kvm_arch_vcpu_ioctl_run+0x80e/0xaf1 [kvm]
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? kvm_arch_vcpu_ioctl_run+0x802/0xaf1 [kvm]
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? inode_has_perm+0x65/0x72
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? kvm_vcpu_ioctl+0xf2/0x5ba [kvm]
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? file_has_perm+0x9a/0xac
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? vfs_ioctl+0x21/0x6b
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? do_vfs_ioctl+0x487/0x4da
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? sys_ioctl+0x51/0x70
Nov 17 13:44:46 slot1/luipaard100a warning kernel:
[] ? system_call_fastpath+0x3c/0x41

or

BUG: soft lockup - CPU#2 stuck for 11s! [ksoftirqd/2:13]
Modules linked in: ebt_vlan ebtable_filter ebtables 8021q garp bridge
stp llc ipt_REJECT iptable_filter xt_NOTRACK nf_conntrack iptable_raw
ip_tables loop ext2 binfmt_misc hed womdict(U) vnic(U) parport_pc lp
parport predis(U) lasthop(U) ipv6 toggler vhost_net tun kvm_intel kvm
jiffies(U) sysstats hrsleep i2c_dev datastor(U) linux_user_bde(P)(U)
linux_kernel_bde(P)(U) serio_raw tg3 libphy ehci_hcd i2c_i801 i2c_core
raid1 raid0 virtio_pci virtio_blk virtio virtio_ring mvsas libsas
scsi_transport_sas mptspi mptscsih mptbase scsi_transport_spi 3w_9xxx
sata_svw(U) ahci serverworks sata_sil ata_piix libata sd_mod
crc_t10dif amd74xx piix ide_gd_mod ide_core dm_snapshot dm_mirror
dm_region_hash dm_log dm_mod ext3 jbd mbcache
Pid: 13, comm: ksoftirqd/2 Tainted: P   
2.6.32-131.21.1.el6.x86_64 #1
Call Trace:
   [] ? get_timestamp+0x9/0xf
 [] ? watchdog_timer_fn+0x130/0x178
 [] ? __run_hrtimer+0xa3/0xff
 [] ? __rcu_process_callbacks+0xdd/0x252
 [] ? hrtimer_interrupt+0xa9/0x190
 [] ? __do_softirq+0x13a/0x17a
 [] ? hrtimer_peek_ahead_timers+0x9/0xd
 [] ? __do_softirq+0xc5/0x17a
 [] ? ksoftirqd+0x0/0xbe
 [] ? call_softirq+0x1c/0x28
   [] ? do_softirq+0x31/0x66
 [] ? ksoftirqd+0x0/0xbe
 [] ? ksoftirqd+0x55/0xbe
 [] ? ksoftirqd+0x0/0xbe
 [] ? kthread+0x79/0x84
 [] ? child_rip+0xa/0x20
 [] ? kthread+0x0/0x84
 [] ? child_rip+0x0/0x20

I am wondering if there is known issue in kernel world, any info would
be helpful.

Thanks

Vincent
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 8/8] kvm tools: add support for ARMv7 processors

2012-11-22 Thread Will Deacon
This patch adds initial support for ARMv7 processors (more specifically,
Cortex-A15) to kvmtool.

Everything is driven by FDT, including dynamic generation of virtio nodes
for MMIO devices (PCI is not used due to lack of a suitable host-bridge).

The virtual timers and virtual interrupt controller (VGIC) are provided
by the kernel and require very little in terms of userspace code.

Signed-off-by: Will Deacon 
---
 tools/kvm/Makefile   |  22 ++-
 tools/kvm/arm/aarch32/cortex-a15.c   |  98 ++
 tools/kvm/arm/aarch32/include/kvm/barrier.h  |  10 +
 tools/kvm/arm/aarch32/include/kvm/kvm-arch.h |  30 +++
 tools/kvm/arm/aarch32/kvm-cpu.c  | 111 +++
 tools/kvm/arm/aarch32/smp-pen.S  |  30 +++
 tools/kvm/arm/fdt.c  | 266 +++
 tools/kvm/arm/gic.c  |  92 +
 tools/kvm/arm/include/arm-common/gic.h   |  34 
 tools/kvm/arm/include/arm-common/kvm-arch.h  |  34 
 tools/kvm/arm/include/kvm/kvm-cpu-arch.h |  47 +
 tools/kvm/arm/ioport.c   |   5 +
 tools/kvm/arm/irq.c  |  17 ++
 tools/kvm/arm/kvm-cpu.c  | 107 +++
 tools/kvm/arm/kvm.c  |  69 +++
 tools/kvm/arm/smp.c  |  21 +++
 16 files changed, 992 insertions(+), 1 deletion(-)
 create mode 100644 tools/kvm/arm/aarch32/cortex-a15.c
 create mode 100644 tools/kvm/arm/aarch32/include/kvm/barrier.h
 create mode 100644 tools/kvm/arm/aarch32/include/kvm/kvm-arch.h
 create mode 100644 tools/kvm/arm/aarch32/kvm-cpu.c
 create mode 100644 tools/kvm/arm/aarch32/smp-pen.S
 create mode 100644 tools/kvm/arm/fdt.c
 create mode 100644 tools/kvm/arm/gic.c
 create mode 100644 tools/kvm/arm/include/arm-common/gic.h
 create mode 100644 tools/kvm/arm/include/arm-common/kvm-arch.h
 create mode 100644 tools/kvm/arm/include/kvm/kvm-cpu-arch.h
 create mode 100644 tools/kvm/arm/ioport.c
 create mode 100644 tools/kvm/arm/irq.c
 create mode 100644 tools/kvm/arm/kvm-cpu.c
 create mode 100644 tools/kvm/arm/kvm.c
 create mode 100644 tools/kvm/arm/smp.c

diff --git a/tools/kvm/Makefile b/tools/kvm/Makefile
index 3f25a14..a83dd10 100644
--- a/tools/kvm/Makefile
+++ b/tools/kvm/Makefile
@@ -102,7 +102,8 @@ OBJS+= builtin-sandbox.o
 OBJS   += virtio/mmio.o
 
 # Translate uname -m into ARCH string
-ARCH ?= $(shell uname -m | sed -e s/i.86/i386/ -e s/ppc.*/powerpc/)
+ARCH ?= $(shell uname -m | sed -e s/i.86/i386/ -e s/ppc.*/powerpc/ \
+ -e s/armv7.*/arm/)
 
 ifeq ($(ARCH),i386)
ARCH := x86
@@ -157,6 +158,25 @@ ifeq ($(ARCH), powerpc)
CFLAGS  += -m64
 endif
 
+# ARM
+OBJS_ARM_COMMON:= arm/fdt.o arm/gic.o arm/ioport.o arm/irq.o \
+  arm/kvm.o arm/kvm-cpu.o arm/smp.o
+HDRS_ARM_COMMON:= arm/include
+ifeq ($(ARCH), arm)
+   DEFINES += -DCONFIG_ARM
+   OBJS+= $(OBJS_ARM_COMMON)
+   OBJS+= arm/aarch32/cortex-a15.o
+   OBJS+= arm/aarch32/kvm-cpu.o
+   OBJS+= arm/aarch32/smp-pen.o
+   ARCH_INCLUDE:= $(HDRS_ARM_COMMON)
+   ARCH_INCLUDE+= -Iarm/aarch32/include
+   ASFLAGS += -D__ASSEMBLY__
+   ASFLAGS += -I$(ARCH_INCLUDE)
+   CFLAGS  += -march=armv7-a
+   CFLAGS  += -I../../scripts/dtc/libfdt
+   OTHEROBJS   += $(LIBFDT_OBJS)
+endif
+
 ###
 
 ifeq (,$(ARCH_INCLUDE))
diff --git a/tools/kvm/arm/aarch32/cortex-a15.c 
b/tools/kvm/arm/aarch32/cortex-a15.c
new file mode 100644
index 000..eac0bb9
--- /dev/null
+++ b/tools/kvm/arm/aarch32/cortex-a15.c
@@ -0,0 +1,98 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+
+#include 
+#include 
+
+#define CPU_NAME_MAX_LEN 8
+static void generate_cpu_nodes(void *fdt, struct kvm *kvm)
+{
+   int cpu;
+
+   _FDT(fdt_begin_node(fdt, "cpus"));
+   _FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+   _FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+
+   for (cpu = 0; cpu < kvm->nrcpus; ++cpu) {
+   char cpu_name[CPU_NAME_MAX_LEN];
+
+   if (kvm->cpus[cpu]->cpu_type != KVM_ARM_TARGET_CORTEX_A15) {
+   pr_warning("Ignoring unknown type for CPU %d\n", cpu);
+   continue;
+   }
+
+   snprintf(cpu_name, CPU_NAME_MAX_LEN, "cpu@%d", cpu);
+
+   _FDT(fdt_begin_node(fdt, cpu_name));
+   _FDT(fdt_property_string(fdt, "device_type", "cpu"));
+   _FDT(fdt_property_string(fdt, "compatible", "arm,cortex-a15"));
+
+   if (kvm->nrcpus > 1) {
+   _FDT(fdt_property_string(fdt, "enable-method",
+"spin-table"));
+   _FDT(fdt_property_cell(fdt, "cpu-release-ad

[PATCH v2 4/8] kvm tools: add generic device registration mechanism

2012-11-22 Thread Will Deacon
PCI devices are currently registered into the pci_devices array via the
pci__register function, which can then be indexed later by architecture
code to construct device tree nodes. For MMIO devices, there is no such
utility.

Rather than invent a similar mechanism for MMIO, this patch creates a
global device registration mechanism, which allows the device type to be
specified when registered or indexing a device. Current users of the pci
registration code are migrated to the new infrastructure and virtio MMIO
devices are registered at init time.

As part of the device registration, allocation of the device number is
moved out of irq__register_device and performed when adding the device
header to the relevant bus tree, allowing us to maintain separate device
numberspaces for each bus.

Signed-off-by: Will Deacon 
---
 tools/kvm/Makefile  |  1 +
 tools/kvm/devices.c | 86 +
 tools/kvm/hw/pci-shmem.c| 12 --
 tools/kvm/hw/vesa.c | 12 --
 tools/kvm/include/kvm/devices.h | 27 
 tools/kvm/include/kvm/irq.h |  2 +-
 tools/kvm/include/kvm/pci.h |  2 -
 tools/kvm/include/kvm/virtio-mmio.h |  1 +
 tools/kvm/include/kvm/virtio-pci.h  |  2 +
 tools/kvm/pci.c | 39 ++---
 tools/kvm/powerpc/irq.c | 10 +
 tools/kvm/powerpc/spapr_pci.c   |  2 +-
 tools/kvm/virtio/mmio.c | 11 -
 tools/kvm/virtio/pci.c  | 11 +++--
 tools/kvm/x86/irq.c |  4 +-
 15 files changed, 169 insertions(+), 53 deletions(-)
 create mode 100644 tools/kvm/devices.c
 create mode 100644 tools/kvm/include/kvm/devices.h

diff --git a/tools/kvm/Makefile b/tools/kvm/Makefile
index de11060..3f25a14 100644
--- a/tools/kvm/Makefile
+++ b/tools/kvm/Makefile
@@ -50,6 +50,7 @@ OBJS  += builtin-run.o
 OBJS   += builtin-setup.o
 OBJS   += builtin-stop.o
 OBJS   += builtin-version.o
+OBJS   += devices.o
 OBJS   += disk/core.o
 OBJS   += framebuffer.o
 OBJS   += guest_compat.o
diff --git a/tools/kvm/devices.c b/tools/kvm/devices.c
new file mode 100644
index 000..9f1941d
--- /dev/null
+++ b/tools/kvm/devices.c
@@ -0,0 +1,86 @@
+#include "kvm/devices.h"
+#include "kvm/kvm.h"
+
+#include 
+#include 
+
+struct device_bus {
+   struct rb_root  root;
+   int dev_num;
+};
+
+static struct device_bus device_trees[DEVICE_BUS_MAX] = {
+   [0 ... (DEVICE_BUS_MAX - 1)] = { RB_ROOT, 0 },
+};
+
+int device__register(struct device_header *dev)
+{
+   struct device_bus *bus;
+   struct rb_node **node, *parent = NULL;
+
+   if (dev->bus_type >= DEVICE_BUS_MAX) {
+   pr_warning("Ignoring device registration on unknown bus %d\n",
+  dev->bus_type);
+   return -EINVAL;
+   }
+
+   bus = &device_trees[dev->bus_type];
+   dev->dev_num = bus->dev_num++;
+
+   node = &bus->root.rb_node;
+   while (*node) {
+   int num = rb_entry(*node, struct device_header, node)->dev_num;
+   int result = dev->dev_num - num;
+
+   if (result < 0)
+   node = &((*node)->rb_left);
+   else if (result > 0)
+   node = &((*node)->rb_right);
+   else
+   return -EEXIST;
+   }
+
+   rb_link_node(&dev->node, parent, node);
+   rb_insert_color(&dev->node, &bus->root);
+   return 0;
+}
+
+struct device_header *device__find_dev(enum device_bus_type bus_type, u8 
dev_num)
+{
+   struct rb_node *node;
+
+   if (bus_type >= DEVICE_BUS_MAX)
+   return ERR_PTR(-EINVAL);
+
+   node = device_trees[bus_type].root.rb_node;
+   while (node) {
+   struct device_header *dev = rb_entry(node, struct device_header,
+node);
+   if (dev_num < dev->dev_num) {
+   node = node->rb_left;
+   } else if (dev_num > dev->dev_num) {
+   node = node->rb_right;
+   } else {
+   return dev;
+   }
+   }
+
+   return NULL;
+}
+
+struct device_header *device__first_dev(enum device_bus_type bus_type)
+{
+   struct rb_node *node;
+
+   if (bus_type >= DEVICE_BUS_MAX)
+   return NULL;
+
+   node = rb_first(&device_trees[bus_type].root);
+   return node ? rb_entry(node, struct device_header, node) : NULL;
+}
+
+struct device_header *device__next_dev(struct device_header *dev)
+{
+   struct rb_node *node = rb_next(&dev->node);
+   return node ? rb_entry(node, struct device_header, node) : NULL;
+}
diff --git a/tools/kvm/hw/pci-shmem.c b/tools/kvm/hw/pci-shmem.c
index 4161335..00e5d93 100644
--- a/tools/kvm/hw/pci-shmem.c
+++ b/tools/kvm/hw/pci-shmem.c
@@ -1,3 +1,4 @@
+#include "kvm/devices.h"
 #include "kvm/pci-shmem.h"
 #include "kvm/virtio-pci-dev.h"
 

[PATCH v2 6/8] kvm tools: teach guest_flat_to_host about memory banks starting above 0

2012-11-22 Thread Will Deacon
Running a guest with multiple banks of memory based above 0 causes the
guest_flat_to_host address conversion to fail, as it is assumed that
guest memory addresses are offset linearly from 0.

This patch changes the translation function so that the kvm_mem_bank
structures registered by kvm__register_mem are used to translate guest
addresses, rather than use an offset from the start of host memory.

Signed-off-by: Will Deacon 
---
 tools/kvm/include/kvm/kvm.h  |  7 ++-
 tools/kvm/kvm.c  | 17 +
 tools/kvm/x86/include/kvm/kvm-arch.h |  9 -
 tools/kvm/x86/kvm.c  |  7 +++
 4 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
index 9b4a9a4..5fb2fb2 100644
--- a/tools/kvm/include/kvm/kvm.h
+++ b/tools/kvm/include/kvm/kvm.h
@@ -105,6 +105,8 @@ int kvm__arch_free_firmware(struct kvm *kvm);
 bool kvm__arch_cpu_supports_vm(void);
 void kvm__arch_periodic_poll(struct kvm *kvm);
 
+void *guest_flat_to_host(struct kvm *kvm, u64 offset);
+
 int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char 
*kernel_cmdline);
 bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, const char 
*kernel_cmdline, u16 vidmode);
 
@@ -120,11 +122,6 @@ static inline bool host_ptr_in_ram(struct kvm *kvm, void 
*p)
return kvm->ram_start <= p && p < (kvm->ram_start + kvm->ram_size);
 }
 
-static inline void *guest_flat_to_host(struct kvm *kvm, unsigned long offset)
-{
-   return kvm->ram_start + offset;
-}
-
 bool kvm__supports_extension(struct kvm *kvm, unsigned int extension);
 
 static inline void kvm__set_thread_name(const char *name)
diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c
index 1a10ec0..a7e2628 100644
--- a/tools/kvm/kvm.c
+++ b/tools/kvm/kvm.c
@@ -184,6 +184,23 @@ int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 
size, void *userspace
return 0;
 }
 
+void *guest_flat_to_host(struct kvm *kvm, u64 offset)
+{
+   struct kvm_mem_bank *bank;
+
+   list_for_each_entry(bank, &kvm->mem_banks, list) {
+   u64 bank_start = bank->guest_phys_addr;
+   u64 bank_end = bank_start + bank->size;
+
+   if (offset >= bank_start && offset < bank_end)
+   return bank->host_addr + (offset - bank_start);
+   }
+
+   pr_warning("unable to translate guest address 0x%llx to host",
+   (unsigned long long)offset);
+   return NULL;
+}
+
 int kvm__recommended_cpus(struct kvm *kvm)
 {
int ret;
diff --git a/tools/kvm/x86/include/kvm/kvm-arch.h 
b/tools/kvm/x86/include/kvm/kvm-arch.h
index 2aaedcc..1e0949e 100644
--- a/tools/kvm/x86/include/kvm/kvm-arch.h
+++ b/tools/kvm/x86/include/kvm/kvm-arch.h
@@ -33,13 +33,4 @@ struct kvm_arch {
struct interrupt_table  interrupt_table;
 };
 
-static inline void *guest_flat_to_host(struct kvm *kvm, unsigned long offset); 
/* In kvm.h */
-
-static inline void *guest_real_to_host(struct kvm *kvm, u16 selector, u16 
offset)
-{
-   unsigned long flat = segment_to_flat(selector, offset);
-
-   return guest_flat_to_host(kvm, flat);
-}
-
 #endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/x86/kvm.c b/tools/kvm/x86/kvm.c
index ecada45..9971ffd 100644
--- a/tools/kvm/x86/kvm.c
+++ b/tools/kvm/x86/kvm.c
@@ -199,6 +199,13 @@ void kvm__irq_trigger(struct kvm *kvm, int irq)
 #define BOOT_PROTOCOL_REQUIRED 0x206
 #define LOAD_HIGH  0x01
 
+static inline void *guest_real_to_host(struct kvm *kvm, u16 selector, u16 
offset)
+{
+   unsigned long flat = segment_to_flat(selector, offset);
+
+   return guest_flat_to_host(kvm, flat);
+}
+
 int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char 
*kernel_cmdline)
 {
void *p;
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 7/8] kvm tools: provide a mechanism for translating host to guest addresses

2012-11-22 Thread Will Deacon
When generating a device tree for a guest, it is useful to have a helper
for converting host addresses to guest addresses in order to populate
the device nodes correctly.

This patch adds such a helper, following a similar implementation to the
reverse translation function that already exists.

Signed-off-by: Will Deacon 
---
 tools/kvm/include/kvm/kvm.h |  1 +
 tools/kvm/kvm.c | 16 
 2 files changed, 17 insertions(+)

diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
index 5fb2fb2..b54ac03 100644
--- a/tools/kvm/include/kvm/kvm.h
+++ b/tools/kvm/include/kvm/kvm.h
@@ -106,6 +106,7 @@ bool kvm__arch_cpu_supports_vm(void);
 void kvm__arch_periodic_poll(struct kvm *kvm);
 
 void *guest_flat_to_host(struct kvm *kvm, u64 offset);
+u64 host_to_guest_flat(struct kvm *kvm, void *ptr);
 
 int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char 
*kernel_cmdline);
 bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, const char 
*kernel_cmdline, u16 vidmode);
diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c
index a7e2628..af19e37 100644
--- a/tools/kvm/kvm.c
+++ b/tools/kvm/kvm.c
@@ -201,6 +201,22 @@ void *guest_flat_to_host(struct kvm *kvm, u64 offset)
return NULL;
 }
 
+u64 host_to_guest_flat(struct kvm *kvm, void *ptr)
+{
+   struct kvm_mem_bank *bank;
+
+   list_for_each_entry(bank, &kvm->mem_banks, list) {
+   void *bank_start = bank->host_addr;
+   void *bank_end = bank_start + bank->size;
+
+   if (ptr >= bank_start && ptr < bank_end)
+   return bank->guest_phys_addr + (ptr - bank_start);
+   }
+
+   pr_warning("unable to translate host address %p to guest", ptr);
+   return 0;
+}
+
 int kvm__recommended_cpus(struct kvm *kvm)
 {
int ret;
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/8] kvm tools: balloon: add dummy set_size_vq implementation

2012-11-22 Thread Will Deacon
Commit 5e980d372620 ("kvm tools: virtio: add dummy set_size_vq
implementations") added dummy set_size_vq implementations for a number
of devices now that they can use virtio MMIO as their transport.

Unfortunately, it missed the balloon driver, so this patch adds the same
implementation there.

Signed-off-by: Will Deacon 
---
 tools/kvm/virtio/balloon.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/tools/kvm/virtio/balloon.c b/tools/kvm/virtio/balloon.c
index 3965b24..9edce87 100644
--- a/tools/kvm/virtio/balloon.c
+++ b/tools/kvm/virtio/balloon.c
@@ -232,6 +232,12 @@ static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
return VIRTIO_BLN_QUEUE_SIZE;
 }
 
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+   /* FIXME: dynamic */
+   return size;
+}
+
 struct virtio_ops bln_dev_virtio_ops = (struct virtio_ops) {
.get_config = get_config,
.get_host_features  = get_host_features,
@@ -240,6 +246,7 @@ struct virtio_ops bln_dev_virtio_ops = (struct virtio_ops) {
.notify_vq  = notify_vq,
.get_pfn_vq = get_pfn_vq,
.get_size_vq= get_size_vq,
+   .set_size_vq= set_size_vq,
 };
 
 int virtio_bln__init(struct kvm *kvm)
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V3 2/2] Enabling IA32_TSC_ADJUST for KVM guest VM support

2012-11-22 Thread Will Auld
CPUID.7.0.EBX[1]=1 indicates IA32_TSC_ADJUST MSR 0x3b is supported

Basic design is to emulate the MSR by allowing reads and writes to a guest
vcpu specific location to store the value of the emulated MSR while adding
the value to the vmcs tsc_offset. In this way the IA32_TSC_ADJUST value will
be included in all reads to the TSC MSR whether through rdmsr or rdtsc. This
is of course as long as the "use TSC counter offsetting" VM-execution
control is enabled as well as the IA32_TSC_ADJUST control.

However, because hardware will only return the TSC + IA32_TSC_ADJUST + vmcs
tsc_offset for a guest process when it does an rdtsc (with the correct
settings) the value of our virtualized IA32_TSC_ADJUST must be stored in
one of these three locations. The argument against storing it in the actual
MSR is performance. This is likely to be seldom used while the save/restore
is required on every transition. IA32_TSC_ADJUST was created as a way to
solve some issues with writing TSC itself so that is not an option either.
The remaining option, defined above as our solution has the problem of
returning incorrect vmcs tsc_offset values (unless we intercept and fix, not
done here) as mentioned above. However, more problematic is that storing the
data in vmcs tsc_offset will have a different semantic effect on the system
than does using the actual MSR. This is illustrated in the following example:
The hypervisor set the IA32_TSC_ADJUST, then the guest sets it and a guest
process performs a rdtsc. In this case the guest process will get TSC +
IA32_TSC_ADJUST_hypervisor + vmcs tsc_offset including IA32_TSC_ADJUST_guest.
While the total system semantics changed the semantics as seen by the guest
do not and hence this will not cause a problem.

Signed-off-by: Will Auld 
---
 arch/x86/include/asm/cpufeature.h |  1 +
 arch/x86/include/asm/kvm_host.h   |  3 +++
 arch/x86/include/asm/msr-index.h  |  1 +
 arch/x86/kvm/cpuid.c  |  2 ++
 arch/x86/kvm/cpuid.h  |  8 
 arch/x86/kvm/svm.c|  7 +++
 arch/x86/kvm/vmx.c|  9 +
 arch/x86/kvm/x86.c| 22 ++
 8 files changed, 53 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 6b7ee5f..e574d81 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -199,6 +199,7 @@
 
 /* Intel-defined CPU features, CPUID level 0x0007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE   (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST  (9*32+ 1) /* TSC adjustment MSR 0x3b */
 #define X86_FEATURE_BMI1   (9*32+ 3) /* 1st group bit manipulation 
extensions */
 #define X86_FEATURE_HLE(9*32+ 4) /* Hardware Lock Elision */
 #define X86_FEATURE_AVX2   (9*32+ 5) /* AVX2 instructions */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da34027..cf8c7e0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -442,6 +442,8 @@ struct kvm_vcpu_arch {
u32 virtual_tsc_mult;
u32 virtual_tsc_khz;
 
+   s64 ia32_tsc_adjust_msr;
+
atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected;/* Trying to inject an NMI this entry */
@@ -690,6 +692,7 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
 
void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool 
scale);
+   u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 957ec87..6486569 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -231,6 +231,7 @@
 #define MSR_IA32_EBL_CR_POWERON0x002a
 #define MSR_EBC_FREQUENCY_ID   0x002c
 #define MSR_IA32_FEATURE_CONTROL0x003a
+#define MSR_IA32_TSC_ADJUST 0x003b
 
 #define FEATURE_CONTROL_LOCKED (1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX   (1<<1)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0595f13..e817bac 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 
function,
if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9);
+   // TSC_ADJUST is emulated 
+   entry->ebx |= F(TSC_ADJUST);
} else
entry->ebx = 0;
entry->eax = 0;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index a10e460..3a8b504 10

RE: [PATCH] vfio powerpc: enabled and supported on powernv platform

2012-11-22 Thread Sethi Varun-B16395


> -Original Message-
> From: linux-kernel-ow...@vger.kernel.org [mailto:linux-kernel-
> ow...@vger.kernel.org] On Behalf Of Alex Williamson
> Sent: Tuesday, November 20, 2012 11:50 PM
> To: Alexey Kardashevskiy
> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> d...@lists.ozlabs.org; linux-ker...@vger.kernel.org; kvm@vger.kernel.org;
> David Gibson
> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> platform
> 
> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> > VFIO implements platform independent stuff such as a PCI driver, BAR
> > access (via read/write on a file descriptor or direct mapping when
> > possible) and IRQ signaling.
> > The platform dependent part includes IOMMU initialization and
> > handling.
> >
> > This patch initializes IOMMU groups based on the IOMMU configuration
> > discovered during the PCI scan, only POWERNV platform is supported at
> > the moment.
> >
> > Also the patch implements an VFIO-IOMMU driver which manages DMA
> > mapping/unmapping requests coming from the client (now QEMU). It also
> > returns a DMA window information to let the guest initialize the
> > device tree for a guest OS properly. Although this driver has been
> > tested only on POWERNV, it should work on any platform supporting TCE
> > tables.
> >
> > To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> >
> > Cc: David Gibson 
> > Signed-off-by: Alexey Kardashevskiy 
> > ---
> >  arch/powerpc/include/asm/iommu.h |6 +
> >  arch/powerpc/kernel/iommu.c  |  140 +++
> >  arch/powerpc/platforms/powernv/pci.c |  135 +++
> >  drivers/iommu/Kconfig|8 ++
> >  drivers/vfio/Kconfig |6 +
> >  drivers/vfio/Makefile|1 +
> >  drivers/vfio/vfio_iommu_spapr_tce.c  |  247
> ++
> >  include/linux/vfio.h |   20 +++
> >  8 files changed, 563 insertions(+)
> >  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >
> > diff --git a/arch/powerpc/include/asm/iommu.h
> > b/arch/powerpc/include/asm/iommu.h
> > index cbfe678..5ba66cb 100644
> > --- a/arch/powerpc/include/asm/iommu.h
> > +++ b/arch/powerpc/include/asm/iommu.h
> > @@ -64,30 +64,33 @@ struct iommu_pool {  }
> > cacheline_aligned_in_smp;
> >
> >  struct iommu_table {
> > unsigned long  it_busno; /* Bus number this table belongs to */
> > unsigned long  it_size;  /* Size of iommu table in entries */
> > unsigned long  it_offset;/* Offset into global table */
> > unsigned long  it_base;  /* mapped address of tce table */
> > unsigned long  it_index; /* which iommu table this is */
> > unsigned long  it_type;  /* type: PCI or Virtual Bus */
> > unsigned long  it_blocksize; /* Entries in each block (cacheline)
> */
> > unsigned long  poolsize;
> > unsigned long  nr_pools;
> > struct iommu_pool large_pool;
> > struct iommu_pool pools[IOMMU_NR_POOLS];
> > unsigned long *it_map;   /* A simple allocation bitmap for now
> */
> > +#ifdef CONFIG_IOMMU_API
> > +   struct iommu_group *it_group;
> > +#endif
> >  };
> >
> >  struct scatterlist;
> >
> >  static inline void set_iommu_table_base(struct device *dev, void
> > *base)  {
> > dev->archdata.dma_data.iommu_table_base = base;  }
> >
> >  static inline void *get_iommu_table_base(struct device *dev)  {
> > return dev->archdata.dma_data.iommu_table_base;
> >  }
> >
> >  /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> > static inline void pci_iommu_init(void) { }  extern void
> > alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
> > defined(CONFIG_PM)  static inline void iommu_save(void)  {
> > if (ppc_md.iommu_save)
> > ppc_md.iommu_save();
> >  }
> >
> >  static inline void iommu_restore(void)  {
> > if (ppc_md.iommu_restore)
> > ppc_md.iommu_restore();
> >  }
> >  #endif
> >
> > +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> entry, uint64_t tce,
> > +   enum dma_data_direction direction, unsigned long pages);
> > +
> >  #endif /* __KERNEL__ */
> >  #endif /* _ASM_IOMMU_H */
> > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > index ff5a6ce..94f614b 100644
> > --- a/arch/powerpc/kernel/iommu.c
> > +++ b/arch/powerpc/kernel/iommu.c
> > @@ -32,30 +32,31 @@
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >
> >  #define DBG(...)
> >
> >  static int novmerge;
> >
> >  static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> > int);
> >
> >  static int __init setup_iommu(char *str)  {
> > if (!strcmp(str, "novmerge"))
> > novmerge = 1;
> > else if (!strcmp(str, "vmerge"))
> > novmerge 

[PATCH v2 0/4] nested vmx code clean up and restructure

2012-11-22 Thread Dongxiao Xu
This patch series clean up and restructure part of the nested vmx code.
The main purpose is to abstract the vmcs12_read() and vmcs12_write() functions.
With this change, we have a unified API to get/set field values from/to vmcs12.

Changes from v1 to v2:
Move the VMCS field valid check into handle_vmread() and handle_vmwrite() 
functions.

Thanks,
Dongxiao

Dongxiao Xu (4):
  nested vmx: clean up for vmcs12 read and write
  nested vmx: clean up for nested_cpu_has_xxx functions
  nested vmx: use vmcs12_read/write() to operate VMCS fields
  nested vmx: use a list to store the launched vmcs12 for L1 VMM

 arch/x86/kvm/vmx.c |  811 ++--
 1 files changed, 463 insertions(+), 348 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/1] s390 fix

2012-11-22 Thread Christian Borntraeger
Marcelo,

here is a kvm related fix for s390. Please consider for the next merge window.
Since it is s390 core kernel, I will also push it into Martins s390 tree.


Christian Borntraeger (1):
  Subject: [PATCH] s390/kvm: Fix address space mixup

 arch/s390/kernel/entry64.S | 25 -
 1 file changed, 20 insertions(+), 5 deletions(-)

-- 
1.7.12.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/1] Subject: [PATCH] s390/kvm: Fix address space mixup

2012-11-22 Thread Christian Borntraeger
I was chasing down a bug of random validity intercepts on s390.
(guest prefix page not mapped in the host virtual aspace). Turns out
that the problem was a wrong address space control element. The
cause was quite complex:

During paging activity a DAT protection during SIE caused a program
interrupt. Normally, the sie retry loop tries to catch all
interrupts during and shortly before sie to rerun the setup. The
problem is now that protection causes a suppressing program interrupt,
causing the PSW to point to the instruction AFTER SIE in case of DAT
protection. This confused the logic of the retry loop to not trigger,
instead we jumped directly back to SIE after return from
the program  interrupt. (the protection fault handler itself did
a rewind of the psw). This usually works quite well, but:

If now the protection fault handler has to wait, another program
might be scheduled in. Later on the sie process will be scheduled
in again. In that case the content of CR1 (primary address space)
will be wrong because switch_to will put the user space ASCE into CR1
and not the guest ASCE.

In addition the program parameter is also wrong for every protection
fault of a guest, since we don't issue the SPP instruction.

So lets also check for PSW == instruction after SIE in the program
check handler. Instead of expensively checking all program
interruption codes that might be suppressing we assume that a program
interrupt pointing after SIE was always a program interrupt in SIE.
(Otherwise we have a kernel bug anyway).

We also have to compensate the rewinding, since the C-level handlers
will do that. Therefore we need to add a nop with the same length
as SIE before the sie_loop.

Signed-off-by: Christian Borntraeger 
CC: sta...@vger.kernel.org
CC: Martin Schwidefsky 
CC: Heiko Carstens 
---
 arch/s390/kernel/entry64.S | 25 -
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 07d8de3..19b6080 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -80,14 +80,21 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | 
_TIF_MCCK_PENDING)
 #endif
.endm
 
-   .macro  HANDLE_SIE_INTERCEPT scratch
+   .macro  HANDLE_SIE_INTERCEPT scratch,pgmcheck
 #if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
tmhh%r8,0x0001  # interrupting from user ?
jnz .+42
lgr \scratch,%r9
slg \scratch,BASED(.Lsie_loop)
clg \scratch,BASED(.Lsie_length)
+   .if \pgmcheck
+   # Some program interrupts are suppressing (e.g. protection).
+   # We must also check the instruction after SIE in that case.
+   # do_protection_exception will rewind to rewind_pad
+   jh  .+22
+   .else
jhe .+22
+   .endif
lg  %r9,BASED(.Lsie_loop)
SPP BASED(.Lhost_id)# set host id
 #endif
@@ -391,7 +398,7 @@ ENTRY(pgm_check_handler)
lg  %r12,__LC_THREAD_INFO
larl%r13,system_call
lmg %r8,%r9,__LC_PGM_OLD_PSW
-   HANDLE_SIE_INTERCEPT %r14
+   HANDLE_SIE_INTERCEPT %r14,1
tmhh%r8,0x0001  # test problem state bit
jnz 1f  # -> fault in user space
tmhh%r8,0x4000  # PER bit set in old PSW ?
@@ -467,7 +474,7 @@ ENTRY(io_int_handler)
lg  %r12,__LC_THREAD_INFO
larl%r13,system_call
lmg %r8,%r9,__LC_IO_OLD_PSW
-   HANDLE_SIE_INTERCEPT %r14
+   HANDLE_SIE_INTERCEPT %r14,0
SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
tmhh%r8,0x0001  # interrupting from user?
jz  io_skip
@@ -613,7 +620,7 @@ ENTRY(ext_int_handler)
lg  %r12,__LC_THREAD_INFO
larl%r13,system_call
lmg %r8,%r9,__LC_EXT_OLD_PSW
-   HANDLE_SIE_INTERCEPT %r14
+   HANDLE_SIE_INTERCEPT %r14,0
SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
tmhh%r8,0x0001  # interrupting from user ?
jz  ext_skip
@@ -661,7 +668,7 @@ ENTRY(mcck_int_handler)
lg  %r12,__LC_THREAD_INFO
larl%r13,system_call
lmg %r8,%r9,__LC_MCK_OLD_PSW
-   HANDLE_SIE_INTERCEPT %r14
+   HANDLE_SIE_INTERCEPT %r14,0
tm  __LC_MCCK_CODE,0x80 # system damage?
jo  mcck_panic  # yes -> rest of mcck code invalid
lghi%r14,__LC_CPU_TIMER_SAVE_AREA
@@ -960,6 +967,13 @@ ENTRY(sie64a)
stg %r3,__SF_EMPTY+8(%r15)  # save guest register save area
xc  __SF_EMPTY+16(8,%r15),__SF_EMPTY+16(%r15) # host id == 0
lmg %r0,%r13,0(%r3) # load guest gprs 0-13
+# some program checks are suppressing. C code (e.g. do_protection_exception)
+# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other
+# instructions in the sie_loo

Re: [PATCH 0/3] KVM: x86: improve reexecute_instruction

2012-11-22 Thread Marcelo Tosatti
On Tue, Nov 20, 2012 at 07:57:48AM +0800, Xiao Guangrong wrote:
> The current reexecute_instruction can not well detect the failed instruction
> emulation. It allows guest to retry all the instructions except it accesses
> on error pfn.
> 
> For example, these cases can not be detected:
> - for tdp used
>   currently, it refused to retry all instructions. If nested npt is used, the
>   emulation may be caused by shadow page, it can be fixed by unshadow the
>   shadow page.
> 
> - for shadow mmu
>   some cases are nested-write-protect, for example, if the page we want to
>   write is used as PDE but it chains to itself. Under this case, we should
>   stop the emulation and report the case to userspace.
> 
> This test case based on kvm-unit-test can trigger an infinite loop on current
> code (ept = 0), after this patchset, it can report the error to Qemu.
> 
> Marcelo, I am afraid this test case can not be put into kvm-unit-test,
> autotest is confused about this case since it can abort Qemu.

That is OK, kvm-unit-test only executes tests listed at
x86/unittests.cfg.

> 
> Subject: [PATCH] access test: test unhandleable instruction
> 
> Test the instruction which can not be handled by kvm
> 
> Signed-off-by: Xiao Guangrong 
> ---
>  x86/access.c |   27 ++-
>  1 files changed, 26 insertions(+), 1 deletions(-)
> 
> diff --git a/x86/access.c b/x86/access.c
> index 23a5995..e88db6b 100644
> --- a/x86/access.c
> +++ b/x86/access.c
> @@ -2,6 +2,7 @@
>  #include "libcflat.h"
>  #include "desc.h"
>  #include "processor.h"
> +#include "vm.h"
> 
>  #define smp_id() 0
> 
> @@ -739,6 +740,28 @@ err:
>   return 0;
>  }
> 
> +static int check_retry_unhandleable_ins(ac_pool_t *pool)
> +{
> + unsigned long mem = 30 * 1024 * 1024;
> + unsigned long esp;
> + ac_test_t at;
> +
> + ac_test_init(&at, (void *)(0x123406003000));
> + at.flags[AC_PDE_PRESENT] = at.flags[AC_PDE_WRITABLE] = 1;
> + at.flags[AC_PTE_PRESENT] = at.flags[AC_PTE_WRITABLE] = 1;
> + at.flags[AC_CPU_CR0_WP] = 1;
> +
> + at.phys = mem;
> + ac_setup_specific_pages(&at, pool, mem, 0);
> +
> + asm volatile("mov %%rsp, %%rax  \n\t" : "=a"(esp));
> + asm volatile("mov %%rax, %%rsp  \n\t" : : "a"(0x123406003000 + 0xf0));
> + asm volatile ("int $0x3 \n\t");
> + asm volatile("mov %%rax, %%rsp  \n\t" : : "a"(esp));
> +
> + return 1;
> +}
> +
>  int ac_test_exec(ac_test_t *at, ac_pool_t *pool)
>  {
>  int r;
> @@ -756,7 +779,8 @@ const ac_test_fn ac_test_cases[] =
>  {
>   corrupt_hugepage_triger,
>   check_pfec_on_prefetch_pte,
> - check_smep_andnot_wp
> + check_smep_andnot_wp,
> + check_retry_unhandleable_ins
>  };
> 
>  int ac_test_run(void)
> @@ -770,6 +794,7 @@ int ac_test_run(void)
>  tests = successes = 0;
>  ac_env_int(&pool);
>  ac_test_init(&at, (void *)(0x1234 + 16 * smp_id()));
> +
>  do {
>   if (at.flags[AC_CPU_CR4_SMEP] && (ptl2[2] & 0x4))
>   ptl2[2] -= 0x4;
> -- 
> 1.7.7.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

2012-11-22 Thread Alexey Kardashevskiy

On 22/11/12 22:56, Sethi Varun-B16395 wrote:




-Original Message-
From: linux-kernel-ow...@vger.kernel.org [mailto:linux-kernel-
ow...@vger.kernel.org] On Behalf Of Alex Williamson
Sent: Tuesday, November 20, 2012 11:50 PM
To: Alexey Kardashevskiy
Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
d...@lists.ozlabs.org; linux-ker...@vger.kernel.org; kvm@vger.kernel.org;
David Gibson
Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
platform

On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:

VFIO implements platform independent stuff such as a PCI driver, BAR
access (via read/write on a file descriptor or direct mapping when
possible) and IRQ signaling.
The platform dependent part includes IOMMU initialization and
handling.

This patch initializes IOMMU groups based on the IOMMU configuration
discovered during the PCI scan, only POWERNV platform is supported at
the moment.

Also the patch implements an VFIO-IOMMU driver which manages DMA
mapping/unmapping requests coming from the client (now QEMU). It also
returns a DMA window information to let the guest initialize the
device tree for a guest OS properly. Although this driver has been
tested only on POWERNV, it should work on any platform supporting TCE
tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.

Cc: David Gibson 
Signed-off-by: Alexey Kardashevskiy 
---
  arch/powerpc/include/asm/iommu.h |6 +
  arch/powerpc/kernel/iommu.c  |  140 +++
  arch/powerpc/platforms/powernv/pci.c |  135 +++
  drivers/iommu/Kconfig|8 ++
  drivers/vfio/Kconfig |6 +
  drivers/vfio/Makefile|1 +
  drivers/vfio/vfio_iommu_spapr_tce.c  |  247

++

  include/linux/vfio.h |   20 +++
  8 files changed, 563 insertions(+)
  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/arch/powerpc/include/asm/iommu.h
b/arch/powerpc/include/asm/iommu.h
index cbfe678..5ba66cb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -64,30 +64,33 @@ struct iommu_pool {  }
cacheline_aligned_in_smp;

  struct iommu_table {
unsigned long  it_busno; /* Bus number this table belongs to */
unsigned long  it_size;  /* Size of iommu table in entries */
unsigned long  it_offset;/* Offset into global table */
unsigned long  it_base;  /* mapped address of tce table */
unsigned long  it_index; /* which iommu table this is */
unsigned long  it_type;  /* type: PCI or Virtual Bus */
unsigned long  it_blocksize; /* Entries in each block (cacheline)

*/

unsigned long  poolsize;
unsigned long  nr_pools;
struct iommu_pool large_pool;
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now

*/

+#ifdef CONFIG_IOMMU_API
+   struct iommu_group *it_group;
+#endif
  };

  struct scatterlist;

  static inline void set_iommu_table_base(struct device *dev, void
*base)  {
dev->archdata.dma_data.iommu_table_base = base;  }

  static inline void *get_iommu_table_base(struct device *dev)  {
return dev->archdata.dma_data.iommu_table_base;
  }

  /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
static inline void pci_iommu_init(void) { }  extern void
alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
defined(CONFIG_PM)  static inline void iommu_save(void)  {
if (ppc_md.iommu_save)
ppc_md.iommu_save();
  }

  static inline void iommu_restore(void)  {
if (ppc_md.iommu_restore)
ppc_md.iommu_restore();
  }
  #endif

+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long

entry, uint64_t tce,

+   enum dma_data_direction direction, unsigned long pages);
+
  #endif /* __KERNEL__ */
  #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..94f614b 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -32,30 +32,31 @@
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
+#include 

  #define DBG(...)

  static int novmerge;

  static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
int);

  static int __init setup_iommu(char *str)  {
if (!strcmp(str, "novmerge"))
novmerge = 1;
else if (!strcmp(str, "vmerge"))
novmerge = 0;
return 1;
  }
@@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
struct iommu_table *tbl,  }

  void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 void *vaddr, dma_addr_t dma_handle)  {
if (tbl) {
unsigned int nio_pa

The smp_affinity cannot work correctly on guest os when PCI passthrough device using msi/msi-x with KVM

2012-11-22 Thread yi li
Hi Guys,

there is an issue where smp_affinity cannot work correctly on the guest
OS when a PCI passthrough device uses msi/msi-x with KVM.

My reason:
the pcpu will generate a lot of IPI interrupts to find the vcpu to handle
the irq, so the guest OS will VM_EXIT frequently. Right?

if smp_affinity can work correctly on guest os,  the best way is that
the vcpu handle the irq is cputune at the pcpu which handle the
kvm:pci-bus irq on the host.but  unfortunly, i find that smp_affinity
can not work correctly on guest os when msi/msi-x.

how to reproduce:
1: passthrough a netcard (Brodcom BCM5716S) to the guest os

2: ifup the netcard, the card will use msi-x interrupt default, and close the
irqbalance service

3:  echo 4 > cat /proc/irq/NETCARDIRQ/smp_affinity, so we assume the vcpu2
handle the irq.

4: we have set  and set the irq kvm:pci-bus to
the pcpu1 on the host.

We think this configuration will reduce IPI interrupts when injecting
interrupts into the guest OS, but this irq is not handled only on vcpu2;
maybe it is not what we expect.

YiLi
Thanks
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V3 0/2] IA32_TSC_ADJUST for KVM

2012-11-22 Thread Will Auld
Marcelo,

I have addressed your comments for this patch set, the following patch for 
QEMU-KVM and for adding a test 
case for tsc_adjust also to follow today. 

Thanks,

Will

Will Auld (2):
  Add code to track call origin for msr assignment.
  Enabling IA32_TSC_ADJUST for KVM guest VM support

 arch/x86/include/asm/cpufeature.h |  1 +
 arch/x86/include/asm/kvm_host.h   | 15 ++---
 arch/x86/include/asm/msr-index.h  |  1 +
 arch/x86/kvm/cpuid.c  |  2 ++
 arch/x86/kvm/cpuid.h  |  8 +++
 arch/x86/kvm/svm.c| 28 ++--
 arch/x86/kvm/vmx.c| 33 ++--
 arch/x86/kvm/x86.c| 45 +--
 arch/x86/kvm/x86.h|  2 +-
 9 files changed, 112 insertions(+), 23 deletions(-)

-- 
1.8.0.rc0



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv4] virtio-spec: virtio network device RFS support

2012-11-22 Thread Jason Wang

On 11/22/2012 10:46 PM, Michael S. Tsirkin wrote:

Add RFS support to virtio network device.
Add a new feature flag VIRTIO_NET_F_RFS for this feature, a new
configuration field max_virtqueue_pairs to detect supported number of
virtqueues as well as a new command VIRTIO_NET_CTRL_RFS to program
packet steering for unidirectional protocols.

Signed-off-by: Michael S. Tsirkin 

--

Changes from v3:
- rename multiqueue -> rfs this is what we support
- Be more explicit about what driver should do.
- Simplify layout making VQs functionality depend on feature.
- Remove unused commands, only leave in programming # of queues

Changes from v2:
Address Jason's comments on v2:
- Changed STEERING_HOST to STEERING_RX_FOLLOWS_TX:
   this is both clearer and easier to support.
   It does not look like we need a separate steering command
   since host can just watch tx packets as they go.
- Moved RX and TX steering sections near each other.
- Add motivation for other changes in v2

Changes from Jason's rfc:
- reserved vq 3: this makes all rx vqs even and tx vqs odd, which
   looks nicer to me.
- documented packet steering, added a generalized steering programming
   command. Current modes are single queue and host driven multiqueue,
   but I envision support for guest driven multiqueue in the future.
- make default vqs unused when in mq mode - this wastes some memory
   but makes it more efficient to switch between modes as
   we can avoid this causing packet reordering.

Rusty, could you please take a look and comment soon?
If this looks OK to everyone, we can proceed with finalizing the
implementation. Would be nice to try and put it in 3.8.

---

diff --git a/virtio-spec.lyx b/virtio-spec.lyx
index d2f0da9..c1fa3e4 100644
--- a/virtio-spec.lyx
+++ b/virtio-spec.lyx
@@ -59,6 +59,7 @@
  \author -608949062 "Rusty Russell,,,"
  \author -385801441 "Cornelia Huck" cornelia.h...@de.ibm.com
  \author 1531152142 "Paolo Bonzini,,,"
+\author 1986246365 "Michael S. Tsirkin"
  \end_header
  
  \begin_body

@@ -4170,9 +4171,42 @@ ID 1
  \end_layout
  
  \begin_layout Description

-Virtqueues 0:receiveq.
- 1:transmitq.
- 2:controlq
+Virtqueues 0:receiveq
+\change_inserted 1986246365 1352742829
+0
+\change_unchanged
+.
+ 1:transmitq
+\change_inserted 1986246365 1352742832
+0
+\change_deleted 1986246365 1352742947
+.
+
+\change_inserted 1986246365 1352742952
+.
+ 
+ 2N
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1352743187
+N=0 if VIRTIO_NET_F_RFS is not negotiated, otherwise N is indicated by max_
+\emph on
+virtqueue_pairs control
+\emph default
+ field.
+
+\end_layout
+
+\end_inset
+
+: receivqN.
+ 2N+1: transmitqN.
+ 2N+
+\change_unchanged
+2:controlq
  \begin_inset Foot
  status open
  
@@ -4343,6 +4377,16 @@ VIRTIO_NET_F_CTRL_VLAN
  
  \begin_layout Description

  VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous packets.
+\change_inserted 1986246365 1352742767
+
+\end_layout
+
+\begin_layout Description
+
+\change_inserted 1986246365 1352742808
+VIRTIO_NET_F_RFS(2) Device supports Receive Flow Steering.
+\change_unchanged


should be 22

+
  \end_layout
  
  \end_deeper

@@ -4355,11 +4399,44 @@ configuration
  \begin_inset space ~
  \end_inset
  
-layout Two configuration fields are currently defined.

+layout
+\change_deleted 1986246365 1352743300
+Two
+\change_inserted 1986246365 1352743301
+Four
+\change_unchanged
+ configuration fields are currently defined.
   The mac address field always exists (though is only valid if VIRTIO_NET_F_MAC
   is set), and the status field only exists if VIRTIO_NET_F_STATUS is set.
   Two read-only bits are currently defined for the status field: 
VIRTIO_NET_S_LIN
  K_UP and VIRTIO_NET_S_ANNOUNCE.
+
+\change_inserted 1986246365 1353595219
+ The following read-only field,
+\emph on
+max_virtqueue_pairs
+\emph default
+ only exists if VIRTIO_NET_F_RFS is set.
+ This field specifies the maximum number of each of transmit and receive
+ virtqueues (receiveq0..receiveq
+\emph on
+N
+\emph default
+ and transmitq0..transmitq
+\emph on
+N
+\emph default
+ respectively;
+\emph on
+N
+\emph default
+=
+\emph on
+max_virtqueue_pairs
+\emph default
+) that can be configured once VIRTIO_NET_F_RFS is negotiated.
+
+\change_unchanged


So the virt queues used in single queue mode are still reserved in 
multiqueue mode, since when max_virtqueue_pairs is N, we finally get N+1 
virt queue pairs? And this looks in conflict with the description in 
"Packet Receive Flow Steering":


"specifying the number of the last transmit and receive queue that is 
going to be used; thus out of transmitq0..transmitqn and 
receiveq0..receiveqn where n=virtqueue_pairs will be used."


In this description, looks like n+1 virtqueue pairs (include receiveq0 
and transmitq0) could be used in RFS mode.
   
  \begin_inset listings

  inline false
@@ -4410,7 +4487,24 @@ Device Initialization
  
  \begin_layout Enumerate

  The initialization routine should identify the receive and 

RE: [PATCH v2 5/6] x86: Enable ack interrupt on vmexit

2012-11-22 Thread Zhang, Yang Z
Gleb Natapov wrote on 2012-11-22:
> On Wed, Nov 21, 2012 at 04:09:38PM +0800, Yang Zhang wrote:
>> Ack interrupt on vmexit is required by Posted Interrupt. With it,
>> when external interrupt caused vmexit, the cpu will acknowledge the
>> interrupt controller and save the interrupt's vector in vmcs.
>> 
>> There are several approaches to enable it. This patch uses a simply
>> way: re-generate an interrupt via self ipi.
>> 
>> Signed-off-by: Yang Zhang 
>> ---
>>  arch/x86/kvm/vmx.c |   11 ++-
>>  1 files changed, 10 insertions(+), 1 deletions(-)
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 7949d21..f6ef090 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2525,7 +2525,8 @@ static __init int setup_vmcs_config(struct
> vmcs_config *vmcs_conf)
>>  #ifdef CONFIG_X86_64
>>  min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
>>  #endif
>> -opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
>> +opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
>> +VM_EXIT_ACK_INTR_ON_EXIT;
> Always? Do it only if posted interrupts are actually available
> and going to be used.

Right. 
But the currently interrupt handler path is too long:
vm exit -> KVM vmexit handler(interrupt disabled) -> KVM re-enable interrupt -> 
cpu ack the interrupt and interrupt deliver through the host IDT
This brings extra cost for interrupts belonging to the guest. After enabling 
"acknowledge interrupt on exit", we can inject the interrupt right away after vm 
exit if the interrupt is for the guest (this patch doesn't do this).

Since we only want to enable "acknowledge interrupt on exit" for Posted 
Interrupt, probably, we can enable it when PI is available.

>>  if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
>>  &_vmexit_control) < 0)
>>  return -EIO;
>> @@ -4457,6 +4458,14 @@ static int handle_exception(struct kvm_vcpu *vcpu)
>> 
>>  static int handle_external_interrupt(struct kvm_vcpu *vcpu)
>>  {
>> +unsigned int vector;
>> +
>> +vector = vmcs_read32(VM_EXIT_INTR_INFO);
>> +vector &= INTR_INFO_VECTOR_MASK;
> Valid bit is guarantied to be set here?
> 
>> +
>> +apic_eoi();
> This is way to late. handle_external_interrupt() is called longs after
> preemption and local irqs are enabled. vcpu process may be scheduled out
> and apic_eoi() will not be called for a long time leaving interrupt
> stuck in ISR and blocking other interrupts.

I will move it to vmx_complete_atomic_exit().

>> +apic->send_IPI_self(vector);
> For level interrupt this is not needed, no?

If we enable "ack interrupt on exit" only when apicv is available, then all 
interrupts are edge-triggered (interrupt remapping will set up all remap entries 
to deliver edge interrupts; interrupt remapping is required by x2apic, and 
x2apic is required by PI).

/*
 * Trigger mode in the IRTE will always be edge, and for IO-APIC,
the
 * actual level or edge trigger will be setup in the IO-APIC
 * RTE. This will help simplify level triggered irq migration.
 * For more details, see the comments (in io_apic.c) explainig
IO-APIC
 * irq migration in the presence of interrupt-remapping.
*/

>> +
>>  ++vcpu->stat.irq_exits;
>>  return 1;
>>  }
>> --
>> 1.7.1
> 
> --
>   Gleb.


Best regards,
Yang

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] Enabling IA32_TSC_ADJUST for KVM guest VM support

2012-11-22 Thread Will Auld
CPUID.7.0.EBX[1]=1 indicates IA32_TSC_ADJUST MSR 0x3b is supported

Basic design is to emulate the MSR by allowing reads and writes to a guest
vcpu specific location to store the value of the emulated MSR while adding
the value to the vmcs tsc_offset. In this way the IA32_TSC_ADJUST value will
be included in all reads to the TSC MSR whether through rdmsr or rdtsc. This
is of course as long as the "use TSC counter offsetting" VM-execution
control is enabled as well as the IA32_TSC_ADJUST control.

However, because hardware will only return the TSC + IA32_TSC_ADJUST + vmcs
tsc_offset for a guest process when it does a rdtsc (with the correct
settings) the value of our virtualized IA32_TSC_ADJUST must be stored in
one of these three locations. The argument against storing it in the actual
MSR is performance. This is likely to be seldom used while the save/restore
is required on every transition. IA32_TSC_ADJUST was created as a way to
solve some issues with writing TSC itself so that is not an option either.
The remaining option, defined above as our solution has the problem of
returning incorrect vmcs tsc_offset values (unless we intercept and fix, not
done here) as mentioned above. However, more problematic is that storing the
data in vmcs tsc_offset will have a different semantic effect on the system
than does using the actual MSR. This is illustrated in the following example:
The hypervisor sets the IA32_TSC_ADJUST, then the guest sets it and a guest
process performs a rdtsc. In this case the guest process will get TSC +
IA32_TSC_ADJUST_hypervisor + vmcs tsc_offset including IA32_TSC_ADJUST_guest.
While the total system semantics changed, the semantics as seen by the guest
do not and hence this will not cause a problem.

Signed-off-by: Will Auld 
---
 arch/x86/include/asm/cpufeature.h |  1 +
 arch/x86/include/asm/kvm_host.h   |  3 +++
 arch/x86/include/asm/msr-index.h  |  1 +
 arch/x86/kvm/cpuid.c  |  2 ++
 arch/x86/kvm/cpuid.h  |  8 
 arch/x86/kvm/svm.c|  7 +++
 arch/x86/kvm/vmx.c|  9 +
 arch/x86/kvm/x86.c| 22 ++
 8 files changed, 53 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 6b7ee5f..e574d81 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -199,6 +199,7 @@
 
 /* Intel-defined CPU features, CPUID level 0x0007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE   (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST  (9*32+ 1) /* TSC adjustment MSR 0x3b */
 #define X86_FEATURE_BMI1   (9*32+ 3) /* 1st group bit manipulation 
extensions */
 #define X86_FEATURE_HLE(9*32+ 4) /* Hardware Lock Elision */
 #define X86_FEATURE_AVX2   (9*32+ 5) /* AVX2 instructions */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da34027..cf8c7e0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -442,6 +442,8 @@ struct kvm_vcpu_arch {
u32 virtual_tsc_mult;
u32 virtual_tsc_khz;
 
+   s64 ia32_tsc_adjust_msr;
+
atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected;/* Trying to inject an NMI this entry */
@@ -690,6 +692,7 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
 
void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool 
scale);
+   u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 957ec87..6486569 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -231,6 +231,7 @@
 #define MSR_IA32_EBL_CR_POWERON0x002a
 #define MSR_EBC_FREQUENCY_ID   0x002c
 #define MSR_IA32_FEATURE_CONTROL0x003a
+#define MSR_IA32_TSC_ADJUST 0x003b
 
 #define FEATURE_CONTROL_LOCKED (1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX   (1<<1)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0595f13..e817bac 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 
function,
if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9);
+   // TSC_ADJUST is emulated 
+   entry->ebx |= F(TSC_ADJUST);
} else
entry->ebx = 0;
entry->eax = 0;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index a10e460..3a8b504 10

Re: qemu-kvm-1.2.0: double free or corruption in VNC code

2012-11-22 Thread Stefan Hajnoczi
On Wed, Nov 21, 2012 at 07:43:16AM +0100, Nikola Ciprich wrote:
> Hello Stefan,
> 
> thanks! here it goes..
> 
> > > *** glibc detected *** /usr/bin/qemu-kvm: double free or corruption 
> > > (!prev): 0x7fc634008cd0 ***
> > > === Backtrace: =
> > > /lib64/libc.so.6(+0x75916)[0x7fc9026f4916]
> > > /lib64/libc.so.6(+0x78443)[0x7fc9026f7443]
> > > /usr/bin/qemu-kvm(+0x1faeb1)[0x7fc907187eb1]
> > > /usr/bin/qemu-kvm(+0x1f0e1a)[0x7fc90717de1a]
> > > /usr/bin/qemu-kvm(+0x1fb681)[0x7fc907188681]
> > > /usr/bin/qemu-kvm(+0xed6a7)[0x7fc90707a6a7]
> > > /usr/bin/qemu-kvm(+0x195c31)[0x7fc907122c31]
> > > /usr/bin/qemu-kvm(main+0x106c)[0x7fc90711e5fc]
> > > /lib64/libc.so.6(__libc_start_main+0xfd)[0x7fc90269dcdd]
> > > /usr/bin/qemu-kvm(+0x749f9)[0x7fc9070019f9]
> > [...]
> 
> [root@blg qemu-kvm-1.2.0]# addr2line -e /usr/lib/debug/usr/bin/qemu-kvm.debug 
> 0x1faeb1 0x1f0e1a 0x1fb681 0xed6a7 0x195c31 0x106c
> /usr/src/debug/qemu-kvm-1.2.0/ui/vnc.c:499
> /usr/src/debug/qemu-kvm-1.2.0/ui/vnc-enc-zrle.c:364
> /usr/src/debug/qemu-kvm-1.2.0/ui/vnc.c:1037
> /usr/src/debug/qemu-kvm-1.2.0/iohandler.c:159
> /usr/src/debug/qemu-kvm-1.2.0/main-loop.c:499

Please also post the exact package version you are using - the line
numbers change between releases and depend on which patches have been
applied to the source tree.  The distro exact package version allows me
to download the source tree that was used to build this binary and check
the correct line numbers.

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Performance issue

2012-11-22 Thread Stefan Hajnoczi
On Thu, Nov 22, 2012 at 09:17:34PM +0200, George-Cristian Bîrzan wrote:
> I'm trying to understand a performance problem (50% degradation in the
> VM) that I'm experiencing some systems with qemu-kvm. Running Fedora
> with 3.5.3-1.fc17.x86_64 or 3.6.6-1.fc17.x86_64, qemu 1.0.1 or 1.2.1
> on AMD Opteron 6176 and 6174, and all of them behave identically.
> 
> A Windows guest is receiving a UDP MPEG stream that is being processed
> by TSReader. The stream comes in at about 73Mbps, but the VM cannot
> process more than 43Mbps. It's not a networking issue, the packets
> reach the guest and with iperf we can easily do 80Mbps. Also, with
> iperf, it can receive the packets from the streamer (even though it
> doesn't detect things properly, but it was just a way to see ).

Hi George-Cristian,
On IRC you mentioned you found a solution.  Any updates?  Are you still
seeing the performance problem?

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM Disk i/o or VM activities causes soft lockup?

2012-11-22 Thread Stefan Hajnoczi
On Wed, Nov 21, 2012 at 03:36:50PM -0800, Vincent Li wrote:
> We have users running on redhat based distro (Kernel
> 2.6.32-131.21.1.el6.x86_64 ) with kvm, when customer made cron job
> script to copy large files between kvm guest or some other user space
> program leads to disk i/o or VM activities, users get following soft
> lockup message from console:
> 
> Nov 17 13:44:46 slot1/luipaard100a err kernel: BUG: soft lockup -
> CPU#4 stuck for 61s! [qemu-kvm:6795]
> Nov 17 13:44:46 slot1/luipaard100a warning kernel: Modules linked in:
> ebt_vlan nls_utf8 isofs ebtable_filter ebtables 8021q garp bridge stp
> llc ipt_REJECT iptable_filter xt_NOTRACK nf_conntrack iptable_raw
> ip_tables loop ext2 binfmt_misc hed womdict(U) vnic(U) parport_pc lp
> parport predis(U) lasthop(U) ipv6 toggler vhost_net tun kvm_intel kvm
> jiffies(U) sysstats hrsleep i2c_dev datastor(U) linux_user_bde(P)(U)
> linux_kernel_bde(P)(U) tg3 libphy serio_raw i2c_i801 i2c_core ehci_hcd
> raid1 raid0 virtio_pci virtio_blk virtio virtio_ring mvsas libsas
> scsi_transport_sas mptspi mptscsih mptbase scsi_transport_spi 3w_9xxx
> sata_svw(U) ahci serverworks sata_sil ata_piix libata sd_mod
> crc_t10dif amd74xx piix ide_gd_mod ide_core dm_snapshot dm_mirror
> dm_region_hash dm_log dm_mod ext3 jbd mbcache
> Nov 17 13:44:46 slot1/luipaard100a warning kernel: Pid: 6795, comm:
> qemu-kvm Tainted: P   
> 2.6.32-131.21.1.el6.f5.x86_64 #1
> Nov 17 13:44:46 slot1/luipaard100a warning kernel: Call Trace:
> Nov 17 13:44:46 slot1/luipaard100a warning kernel: 
> [] ? get_timestamp+0x9/0xf
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? watchdog_timer_fn+0x130/0x178
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? __run_hrtimer+0xa3/0xff
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? hrtimer_interrupt+0xe6/0x190
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? hrtimer_interrupt+0xa9/0x190
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? hpet_interrupt_handler+0x26/0x2d
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? hrtimer_peek_ahead_timers+0x9/0xd
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? __do_softirq+0xc5/0x17a
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? call_softirq+0x1c/0x28
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? do_softirq+0x31/0x66
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? call_function_interrupt+0x13/0x20
> Nov 17 13:44:46 slot1/luipaard100a warning kernel: 
> [] ? vmx_get_msr+0x0/0x123 [kvm_intel]
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? kvm_arch_vcpu_ioctl_run+0x80e/0xaf1 [kvm]
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? kvm_arch_vcpu_ioctl_run+0x802/0xaf1 [kvm]
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? inode_has_perm+0x65/0x72
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? kvm_vcpu_ioctl+0xf2/0x5ba [kvm]
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? file_has_perm+0x9a/0xac
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? vfs_ioctl+0x21/0x6b
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? do_vfs_ioctl+0x487/0x4da
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? sys_ioctl+0x51/0x70
> Nov 17 13:44:46 slot1/luipaard100a warning kernel:
> [] ? system_call_fastpath+0x3c/0x41

This soft lockup is report on the host?

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html