Re: [PATCH v12 2/3] x86, apicv: add virtual x2apic support

2013-01-23 Thread Gleb Natapov
On Wed, Jan 23, 2013 at 10:47:25PM +0800, Yang Zhang wrote:
> From: Yang Zhang 
> 
> Basically, to benefit from apicv we need to enable virtualized x2apic mode.
> Currently, we only enable it when the guest is really using x2apic.
> 
> Also, clear the MSR bitmap for the corresponding x2apic MSRs when the guest
> enables x2apic:
> 0x800 - 0x8ff: no read intercept for apicv register virtualization,
>    except APIC ID and TMCCT, which need software's assistance to
>    get the right value.
> 
> Signed-off-by: Kevin Tian 
> Signed-off-by: Yang Zhang 
> ---
>  arch/x86/include/asm/kvm_host.h |1 +
>  arch/x86/include/asm/vmx.h  |1 +
>  arch/x86/kvm/lapic.c|   14 ++-
>  arch/x86/kvm/svm.c  |6 +
>  arch/x86/kvm/vmx.c  |  203 
> +++
>  5 files changed, 201 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f75e1fe..e1306c1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -692,6 +692,7 @@ struct kvm_x86_ops {
>   void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>   void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>   void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> + void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
>   int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>   int (*get_tdp_level)(void);
>   u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 44c3f7e..0a54df0 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -139,6 +139,7 @@
>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001
>  #define SECONDARY_EXEC_ENABLE_EPT   0x0002
>  #define SECONDARY_EXEC_RDTSCP0x0008
> +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x0010
>  #define SECONDARY_EXEC_ENABLE_VPID  0x0020
>  #define SECONDARY_EXEC_WBINVD_EXITING0x0040
>  #define SECONDARY_EXEC_UNRESTRICTED_GUEST0x0080
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 0664c13..83a9547 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1303,6 +1303,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
>  
>  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>  {
> + u64 old_value = vcpu->arch.apic_base;
>   struct kvm_lapic *apic = vcpu->arch.apic;
>  
>   if (!apic) {
> @@ -1324,11 +1325,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 
> value)
>   value &= ~MSR_IA32_APICBASE_BSP;
>  
>   vcpu->arch.apic_base = value;
> - if (apic_x2apic_mode(apic)) {
> - u32 id = kvm_apic_id(apic);
> - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> - kvm_apic_set_ldr(apic, ldr);
> + if ((old_value ^ value) & X2APIC_ENABLE) {
> + if (value & X2APIC_ENABLE) {
> + u32 id = kvm_apic_id(apic);
> + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> + kvm_apic_set_ldr(apic, ldr);
> + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> + } else
> + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
>   }
> +
>   apic->base_address = apic->vcpu->arch.apic_base &
>MSR_IA32_APICBASE_BASE;
>  
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d29d3cd..38407e9 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu 
> *vcpu, int tpr, int irr)
>   set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>  }
>  
> +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
> +{
> + return;
> +}
> +
>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
>  {
>   struct vcpu_svm *svm = to_svm(vcpu);
> @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = {
>   .enable_nmi_window = enable_nmi_window,
>   .enable_irq_window = enable_irq_window,
>   .update_cr8_intercept = update_cr8_intercept,
> + .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
>  
>   .set_tss_addr = svm_set_tss_addr,
>   .get_tdp_level = get_npt_level,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 8a8116a..c2bc989 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -643,6 +643,8 @@ static unsigned long *vmx_io_bitmap_a;
>  static unsigned long *vmx_io_bitmap_b;
>  static unsigned long *vmx_msr_bitmap_legacy;
>  static unsigned long *vmx_msr_bitmap_longmode;
> +static unsigned long *vmx_msr_bitmap_legacy_x2apic;
> +static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>  
>  static bool cpu_has_load_ia32_efer;
>  static bool cpu_has_load_perf_global_ctrl;
> @@ -767,6 

RE: [PATCH v2] KVM: VMX: enable acknowledge interrupt on vmexit

2013-01-23 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-24:
> On Thu, Jan 24, 2013 at 12:47:14AM +, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-23:
>>> On Tue, Jan 22, 2013 at 01:49:31PM +0800, Yang Zhang wrote:
 From: Yang Zhang 
 
 The "acknowledge interrupt on exit" feature controls processor behavior
 for external interrupt acknowledgement. When this control is set, the
 processor acknowledges the interrupt controller to acquire the
 interrupt vector on VM exit.
 
 After enabling this feature, an interrupt which arrives while the target
 cpu is running in vmx non-root mode will be handled by the vmx handler
 instead of the handler in the idt. Currently, the vmx handler only fakes
 an interrupt stack and jumps into the idt table to let the real handler
 handle it. Later, we will recognize the interrupt and only deliver
 interrupts which do not belong to the current vcpu through the idt table;
 an interrupt which belongs to the current vcpu will be handled inside the
 vmx handler. This will reduce the interrupt handling cost of KVM.
 
 Refer to Intel SDM volume 3, chapter 33.2.
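
In short, the dispatch step the quoted diff adds looks roughly like this (a
simplified sketch, not the exact patch code; the real handler also builds a
fake interrupt stack frame in inline asm before the indirect call, which is
omitted here):

static void handle_acked_external_intr(struct kvm_vcpu *vcpu)
{
        u32 info = vmcs_read32(VM_EXIT_INTR_INFO);
        unsigned int vector = info & INTR_INFO_VECTOR_MASK;
        gate_desc *desc;

        /* with "ack interrupt on exit" set, the vector is already latched
         * in VM_EXIT_INTR_INFO, so look up the matching host IDT gate
         * (gate_desc is 16 bytes on 64-bit, 8 bytes on 32-bit) ... */
        desc = (gate_desc *)vcpu->arch.host_idt.address + vector;

        /* ... and invoke the host handler as if the interrupt had been
         * taken through the IDT */
        ((void (*)(void))gate_offset(*desc))();
}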
 
 Signed-off-by: Yang Zhang 
 ---
  arch/x86/include/asm/kvm_host.h |    2 +
  arch/x86/kvm/svm.c              |    6 
  arch/x86/kvm/vmx.c              |   52 ++-
  arch/x86/kvm/x86.c              |    2 +
  4 files changed, 61 insertions(+), 1 deletions(-)
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index c431b33..0b73602 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -345,6 +345,7 @@ struct kvm_vcpu_arch {
 	unsigned long cr8;
 	u32 hflags;
 	u64 efer;
 +	struct desc_ptr host_idt;
>>> Enough to save only host_idt.address.
>>> 
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
 	unsigned long apic_attention;
 @@ -723,6 +724,7 @@ struct kvm_x86_ops {
 	int (*check_intercept)(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage);
 +	void (*handle_external_intr)(struct kvm_vcpu *vcpu);
  };
  
  struct kvm_arch_async_pf {
 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
 index d29d3cd..e286600 100644
 --- a/arch/x86/kvm/svm.c
 +++ b/arch/x86/kvm/svm.c
 @@ -4227,6 +4227,11 @@ out:
return ret;
  }
 +static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
 +{
 +  return;
 +}
 +
  static struct kvm_x86_ops svm_x86_ops = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
 @@ -4318,6 +4323,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_tdp_cr3 = set_tdp_cr3,
 
 	.check_intercept = svm_check_intercept,
 +	.handle_external_intr = svm_handle_external_intr,
  };
  
  static int __init svm_init(void)
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index dd2a85c..ef98392 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2565,7 +2565,8 @@ static __init int setup_vmcs_config(struct
>>> vmcs_config *vmcs_conf)
  #ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
  #endif
 -  opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
 +  opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
 +  VM_EXIT_ACK_INTR_ON_EXIT;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
 @@ -3933,6 +3934,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
set_cr4_guest_host_mask(vmx);
 +  native_store_idt(&vmx->vcpu.arch.host_idt);
 
>>> We already call native_store_idt() in vmx_set_constant_host_state(). No
>>> need to do it twice. Add vcpu parameter to vmx_set_constant_host_state()
>>> to get idt address from there.
>> Sure.
>> 
return 0;
  }
 @@ -6096,6 +6098,53 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 	}
  }
 +
 +static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 +{
 +	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 +	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
 +			== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
 +		unsigned int vector;
 +		unsigned long entry;
 +		gate_desc *desc;
 +
 +		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
 +#ifdef CONFIG_X86_64
 +		desc = (void *)vcpu->arch.host_idt.address + vector * 16;
 +#else
 +		desc = (void *)vcpu->arch.host_idt.address + vector * 8;
 +#endif
 +
 +		entry = gate_offset(*desc);
 +		asm(
 +			"mov %0, %%" _ASM_DX " \n\t"
 +#ifdef CONFIG_X86_64
 +			"mov %%" _ASM

Re: [PATCH v2] KVM: VMX: enable acknowledge interrupt on vmexit

2013-01-23 Thread Gleb Natapov
On Thu, Jan 24, 2013 at 12:47:14AM +, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-01-23:
> > On Tue, Jan 22, 2013 at 01:49:31PM +0800, Yang Zhang wrote:
> >> From: Yang Zhang 
> >> 
> >> The "acknowledge interrupt on exit" feature controls processor behavior
> >> for external interrupt acknowledgement. When this control is set, the
> >> processor acknowledges the interrupt controller to acquire the
> >> interrupt vector on VM exit.
> >> 
> >> After enabling this feature, an interrupt which arrives while the target
> >> cpu is running in vmx non-root mode will be handled by the vmx handler
> >> instead of the handler in the idt. Currently, the vmx handler only fakes
> >> an interrupt stack and jumps into the idt table to let the real handler
> >> handle it. Later, we will recognize the interrupt and only deliver
> >> interrupts which do not belong to the current vcpu through the idt table;
> >> an interrupt which belongs to the current vcpu will be handled inside the
> >> vmx handler. This will reduce the interrupt handling cost of KVM.
> >> 
> >> Refer to Intel SDM volume 3, chapter 33.2.
> >> 
> >> Signed-off-by: Yang Zhang 
> >> ---
> >>  arch/x86/include/asm/kvm_host.h |    2 +
> >>  arch/x86/kvm/svm.c              |    6 
> >>  arch/x86/kvm/vmx.c              |   52 ++-
> >>  arch/x86/kvm/x86.c              |    2 +
> >>  4 files changed, 61 insertions(+), 1 deletions(-)
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index c431b33..0b73602 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -345,6 +345,7 @@ struct kvm_vcpu_arch {
> >>unsigned long cr8;
> >>u32 hflags;
> >>u64 efer;
> >> +  struct desc_ptr host_idt;
> > Enough to save only host_idt.address.
> > 
> >>	u64 apic_base;
> >>	struct kvm_lapic *apic;    /* kernel irqchip context */
> >>	unsigned long apic_attention;
> >> @@ -723,6 +724,7 @@ struct kvm_x86_ops {
> >>	int (*check_intercept)(struct kvm_vcpu *vcpu,
> >>			       struct x86_instruction_info *info,
> >>			       enum x86_intercept_stage stage);
> >> +	void (*handle_external_intr)(struct kvm_vcpu *vcpu);
> >>  };
> >>  
> >>  struct kvm_arch_async_pf {
> >> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> >> index d29d3cd..e286600 100644
> >> --- a/arch/x86/kvm/svm.c
> >> +++ b/arch/x86/kvm/svm.c
> >> @@ -4227,6 +4227,11 @@ out:
> >>return ret;
> >>  }
> >> +static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
> >> +{
> >> +  return;
> >> +}
> >> +
> >>  static struct kvm_x86_ops svm_x86_ops = {
> >>	.cpu_has_kvm_support = has_svm,
> >>	.disabled_by_bios = is_disabled,
> >> @@ -4318,6 +4323,7 @@ static struct kvm_x86_ops svm_x86_ops = {
> >>	.set_tdp_cr3 = set_tdp_cr3,
> >>  
> >>	.check_intercept = svm_check_intercept,
> >> +	.handle_external_intr = svm_handle_external_intr,
> >>  };
> >>  
> >>  static int __init svm_init(void)
> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >> index dd2a85c..ef98392 100644
> >> --- a/arch/x86/kvm/vmx.c
> >> +++ b/arch/x86/kvm/vmx.c
> >> @@ -2565,7 +2565,8 @@ static __init int setup_vmcs_config(struct
> > vmcs_config *vmcs_conf)
> >>  #ifdef CONFIG_X86_64
> >>min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
> >>  #endif
> >> -  opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
> >> +  opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
> >> +  VM_EXIT_ACK_INTR_ON_EXIT;
> >>if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
> >>&_vmexit_control) < 0)
> >>return -EIO;
> >> @@ -3933,6 +3934,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
> >> 
> >>vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
> >>set_cr4_guest_host_mask(vmx);
> >> +  native_store_idt(&vmx->vcpu.arch.host_idt);
> >> 
> > We already call native_store_idt() in vmx_set_constant_host_state(). No
> > need to do it twice. Add vcpu parameter to vmx_set_constant_host_state()
> > to get idt address from there.
> Sure.
> 
> >>return 0;
> >>  }
> >> @@ -6096,6 +6098,53 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
> >>	}
> >>  }
> >> +
> >> +static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
> >> +{
> >> +	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> >> +	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
> >> +			== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
> >> +		unsigned int vector;
> >> +		unsigned long entry;
> >> +		gate_desc *desc;
> >> +
> >> +		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
> >> +#ifdef CONFIG_X86_64
> >> +		desc = (void *)vcpu->arch.host_idt.address + vector * 16;
> >> +#else
> >> +		desc = (void *)vcpu->arch.host_idt.address + vector * 8;
> >> +#endif
> >> +
> >> +		entry = gate_offset(*desc);
> >> +		asm(
> >> +			"mov %0, %%" 

Re: windows 2008 guest causing rcu_sched to emit NMI

2013-01-23 Thread Marcelo Tosatti
On Tue, Jan 22, 2013 at 09:00:25PM +0300, Andrey Korolyov wrote:
> Hi,
> 
> The problem described in the title happens under heavy I/O pressure on the
> host. Without idle=poll the trace is almost always the same, involving
> mwait; with poll and nohz=off the RIP varies from time to time, at the
> previous hang it was tg_throttle_down rather than test_ti_thread_flag as in
> the attached one. Both possible clocksource drivers, hpet and tsc, are able
> to reproduce this with equal probability. The VMs are pinned over one of
> two numa sets on a two-head machine, meaning the emulator thread and each
> of the vcpu threads has its own cpuset cg with '0-5,12-17' or '6-11,18-23'.
> I'll appreciate any suggestions to try.

Andrey,

Can you reproduce with an upstream kernel? Commit
5cfc2aabcb282f fixes a livelock.

 d2 75 c3 eb 03 41 89 c6 48 83 c4 18 44 89 f0 5b 5d 41 5c 41 5d 41 5e 41
5f c3 <31> c0 c3 48 63 ff 48 c7 c2 80 37 01 00 48 8b 0c fd e0 d6 68 81
[12738.508644] Call Trace:
[12738.508648]  [] ? walk_tg_tree_from+0x70/0x99
[12738.508652]  [] ? __switch_to_xtra+0x14c/0x160
[12738.508656]  [] ? throttle_cfs_rq+0x4d/0x109
[12738.508660]  [] ? put_prev_task_fair+0x3f/0x65
[12738.508663]  [] ? __schedule+0x32e/0x5c3
[12738.508666]  [] ? yield_to+0xfa/0x10c
[12738.508669]  [] ? atomic_inc+0x3/0x4
[12738.508678]  [] ? kvm_vcpu_on_spin+0x8c/0xf7 [kvm]
[12738.508684]  [] ? handle_pause+0x11/0x18


RE: [PATCH v2] KVM: VMX: enable acknowledge interrupt on vmexit

2013-01-23 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-23:
> On Tue, Jan 22, 2013 at 01:49:31PM +0800, Yang Zhang wrote:
>> From: Yang Zhang 
>> 
>> The "acknowledge interrupt on exit" feature controls processor behavior
>> for external interrupt acknowledgement. When this control is set, the
>> processor acknowledges the interrupt controller to acquire the
>> interrupt vector on VM exit.
>> 
>> After enabling this feature, an interrupt which arrives while the target
>> cpu is running in vmx non-root mode will be handled by the vmx handler
>> instead of the handler in the idt. Currently, the vmx handler only fakes
>> an interrupt stack and jumps into the idt table to let the real handler
>> handle it. Later, we will recognize the interrupt and only deliver
>> interrupts which do not belong to the current vcpu through the idt table;
>> an interrupt which belongs to the current vcpu will be handled inside the
>> vmx handler. This will reduce the interrupt handling cost of KVM.
>> 
>> Refer to Intel SDM volume 3, chapter 33.2.
>> 
>> Signed-off-by: Yang Zhang 
>> ---
>>  arch/x86/include/asm/kvm_host.h |    2 +
>>  arch/x86/kvm/svm.c              |    6 
>>  arch/x86/kvm/vmx.c              |   52 ++-
>>  arch/x86/kvm/x86.c              |    2 +
>>  4 files changed, 61 insertions(+), 1 deletions(-)
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index c431b33..0b73602 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -345,6 +345,7 @@ struct kvm_vcpu_arch {
>>  unsigned long cr8;
>>  u32 hflags;
>>  u64 efer;
>> +struct desc_ptr host_idt;
> Enough to save only host_idt.address.
> 
>>	u64 apic_base;
>>	struct kvm_lapic *apic;    /* kernel irqchip context */
>>	unsigned long apic_attention;
>> @@ -723,6 +724,7 @@ struct kvm_x86_ops {
>>	int (*check_intercept)(struct kvm_vcpu *vcpu,
>>			       struct x86_instruction_info *info,
>>			       enum x86_intercept_stage stage);
>> +	void (*handle_external_intr)(struct kvm_vcpu *vcpu);
>>  };
>>  
>>  struct kvm_arch_async_pf {
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index d29d3cd..e286600 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -4227,6 +4227,11 @@ out:
>>  return ret;
>>  }
>> +static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
>> +{
>> +return;
>> +}
>> +
>>  static struct kvm_x86_ops svm_x86_ops = {
>>	.cpu_has_kvm_support = has_svm,
>>	.disabled_by_bios = is_disabled,
>> @@ -4318,6 +4323,7 @@ static struct kvm_x86_ops svm_x86_ops = {
>>	.set_tdp_cr3 = set_tdp_cr3,
>>  
>>	.check_intercept = svm_check_intercept,
>> +	.handle_external_intr = svm_handle_external_intr,
>>  };
>>  
>>  static int __init svm_init(void)
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index dd2a85c..ef98392 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2565,7 +2565,8 @@ static __init int setup_vmcs_config(struct
> vmcs_config *vmcs_conf)
>>  #ifdef CONFIG_X86_64
>>  min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
>>  #endif
>> -opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
>> +opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
>> +VM_EXIT_ACK_INTR_ON_EXIT;
>>  if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
>>  &_vmexit_control) < 0)
>>  return -EIO;
>> @@ -3933,6 +3934,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>> 
>>  vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
>>  set_cr4_guest_host_mask(vmx);
>> +native_store_idt(&vmx->vcpu.arch.host_idt);
>> 
> We already call native_store_idt() in vmx_set_constant_host_state(). No
> need to do it twice. Add vcpu parameter to vmx_set_constant_host_state()
> to get idt address from there.
Sure.

>>  return 0;
>>  }
>> @@ -6096,6 +6098,53 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
>> 	}
>>  }
>> +
>> +static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>> +{
>> +	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
>> +	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
>> +			== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
>> +		unsigned int vector;
>> +		unsigned long entry;
>> +		gate_desc *desc;
>> +
>> +		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
>> +#ifdef CONFIG_X86_64
>> +		desc = (void *)vcpu->arch.host_idt.address + vector * 16;
>> +#else
>> +		desc = (void *)vcpu->arch.host_idt.address + vector * 8;
>> +#endif
>> +
>> +		entry = gate_offset(*desc);
>> +		asm(
>> +			"mov %0, %%" _ASM_DX " \n\t"
>> +#ifdef CONFIG_X86_64
>> +			"mov %%" _ASM_SP ", %%" _ASM_BX " \n\t"
>> +			"and $0xfff0, %%" _ASM_SP " \n\t"
>> +			"mov %%ss, %%" _ASM_AX " \n\t"
>> +			"push %%" _ASM_AX 

[PATCH] vfio-pci: Enable PCIe extended config space

2013-01-23 Thread Alex Williamson
We don't know at pre-init time whether the device we're exposing is PCIe
or legacy PCI.  We could ask for it to be specified via a device
option, but that seems like too much to ask of the user.  Instead we
can assume everything will be PCIe, which makes PCI-core allocate
enough config space.  Removing the flag during init leaves the space
allocated, but allows legacy PCI devices to report the real device
config space size to the rest of QEMU.

Signed-off-by: Alex Williamson 
---
 hw/vfio_pci.c |4 
 1 file changed, 4 insertions(+)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index c51ae67..66537b7 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -1899,6 +1899,9 @@ static int vfio_get_device(VFIOGroup *group, const char 
*name, VFIODevice *vdev)
 (unsigned long)reg_info.flags);
 
 vdev->config_size = reg_info.size;
+if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
+vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
+}
 vdev->config_offset = reg_info.offset;
 
 error:
@@ -2121,6 +2124,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, 
void *data)
 pdc->exit = vfio_exitfn;
 pdc->config_read = vfio_pci_read_config;
 pdc->config_write = vfio_pci_write_config;
+pdc->is_express = 1; /* We might be */
 }
 
 static const TypeInfo vfio_pci_dev_info = {



Re: [PATCH 0/9] some more vmx real mode emulation fixes and cleanups

2013-01-23 Thread Marcelo Tosatti
On Mon, Jan 21, 2013 at 03:36:40PM +0200, Gleb Natapov wrote:
> 
> Gleb Natapov (9):
>   KVM: VMX: remove special CPL cache access during transition to real
> mode.
>   KVM: VMX: reset CPL only on CS register write.
>   KVM: VMX: if unrestricted guest is enabled vcpu state is always
> valid.
>   KVM: VMX: remove hack that disables emulation on vcpu reset/init
>   KVM: VMX: skip vmx->rmode.vm86_active check on cr0 write if
> unrestricted guest is enabled
>   KVM: VMX: don't clobber segment AR of unusable segments.
>   KVM: VMX: rename fix_pmode_dataseg to fix_pmode_seg.
>   KVM: x86: fix use of uninitialized memory as segment descriptor in
> emulator.
>   KVM: VMX: set vmx->emulation_required only when needed.
> 
>  arch/x86/kvm/vmx.c |   70 
> +---
>  arch/x86/kvm/x86.c |4 ++-
>  2 files changed, 37 insertions(+), 37 deletions(-)
> 
> -- 
> 1.7.10.4

Reviewed-by: Marcelo Tosatti 



RE: [PATCH v12 0/3] x86, apicv: Add APIC virtualization support

2013-01-23 Thread Zhang, Yang Z
Marcelo Tosatti wrote on 2013-01-24:
> On Wed, Jan 23, 2013 at 10:47:23PM +0800, Yang Zhang wrote:
>> From: Yang Zhang 
>> 
>> APIC virtualization is a new feature which can eliminate most VM exits
>> when a vcpu handles an interrupt:
>> 
>> APIC register virtualization:
>> APIC read access doesn't cause APIC-access VM exits.
>> APIC write becomes trap-like.
>> Virtual interrupt delivery:
>> Virtual interrupt delivery avoids the need for KVM to inject vAPIC
>> interrupts manually; this is fully taken care of by the hardware.
> 
> Can you provide numbers, and also what tests have been performed.
The first patch adds APIC register virtualization support.
The second patch adds virtualized x2apic mode support, since that is
required by APICv when the guest uses the MSR-based way to access the APIC.
The third patch adds virtual interrupt delivery support.

The tests cover booting different guests, including windows 2k3, win7, winxp,
rhel6u3, rhel 5u5 and upstream Linux. We also did some scp inside the guests
and migration, and didn't see any issues.

Also, we did some performance measurements with a previous version of the
patch, and they show about 3%~5% improvement.

Best regards,
Yang



[PATCH] vfio-pci: Enable PCIe extended capabilities on v1

2013-01-23 Thread Alex Williamson
Even PCIe 1.x had extended config space.

Signed-off-by: Alex Williamson 
---
 drivers/vfio/pci/vfio_pci_config.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_config.c 
b/drivers/vfio/pci/vfio_pci_config.c
index 6985149..f1dde2c 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -985,12 +985,12 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 
cap, u8 pos)
if (ret)
return pcibios_err_to_errno(ret);
 
+   vdev->extended_caps = true;
+
if ((word & PCI_EXP_FLAGS_VERS) == 1)
return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
-   else {
-   vdev->extended_caps = true;
+   else
return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
-   }
case PCI_CAP_ID_HT:
ret = pci_read_config_byte(pdev, pos + 3, &byte);
if (ret)



Re: [PATCH v12 0/3] x86, apicv: Add APIC virtualization support

2013-01-23 Thread Marcelo Tosatti
On Wed, Jan 23, 2013 at 10:47:23PM +0800, Yang Zhang wrote:
> From: Yang Zhang 
> 
> APIC virtualization is a new feature which can eliminate most VM exits
> when a vcpu handles an interrupt:
> 
> APIC register virtualization:
> APIC read access doesn't cause APIC-access VM exits.
> APIC write becomes trap-like.
> 
> Virtual interrupt delivery:
> Virtual interrupt delivery avoids the need for KVM to inject vAPIC
> interrupts manually; this is fully taken care of by the hardware.
> 
> Please refer to Intel SDM volume 3, chapter 29 for more details.
> Changes v11 to v12:
>  * Check irqchip in kernel when enabling apicv, if using userspace irq chip,
>apicv cannot be used and must be disabled.
>  * Rename some functions to more descriptive names.
>  * Move ioapic entry parse logic to lapic.c
>  * Rebased on top of KVM upstream
> 
> Changes v10 to v11:
>  * Use two new msr bitmaps for guest that enabling x2apic mode:
>Since msr bitmap is shared by all guests, it will break guest that
>not using x2apic when updating the global msr bitmap. To solve this,
>we use two new msr bitmap for guest which using x2apic.
> 
> Changes v9 to v10:
>  * Enable virtualize x2apic mode when guest is using x2apic and apicv:
>There is no point to enable x2apic mode when apicv is disabled.
>  * Grab ioapic_lock when traversing ioapic entries to set the eoi exit bitmap
>  * Rebased on top of KVM upstream
> 
> Changes v8 to v9:
>  * Update eoi exit bitmap by vcpu itself.
>  * Enable virtualize x2apic mode when guest is using x2apic.
>  * Rebase on top of KVM upstream
> 
> Changes v7 to v8:
>  * According to Marcelo's suggestion, add comments for irr_pending and isr_count,
>    since the two variables have different meanings when using apicv.
>  * Set highest bit in vISR to SVI after migration.
>  * Use spinlock to access eoi exit bitmap synchronously.
>  * Enable virtualize x2apic mode when guest is using x2apic
>  * Rebased on top of KVM upstream.
> 
> Yang Zhang (3):
>   x86, apicv: add APICv register virtualization support
>   x86, apicv: add virtual x2apic support
>   x86, apicv: add virtual interrupt delivery support
> 
>  arch/ia64/kvm/lapic.h   |6 +
>  arch/x86/include/asm/kvm_host.h |8 +
>  arch/x86/include/asm/vmx.h  |   14 ++
>  arch/x86/kvm/irq.c  |   56 ++-
>  arch/x86/kvm/lapic.c|  135 ---
>  arch/x86/kvm/lapic.h|   29 
>  arch/x86/kvm/svm.c  |   37 
>  arch/x86/kvm/vmx.c  |  349 
> ---
>  arch/x86/kvm/x86.c  |   11 +-
>  include/linux/kvm_host.h|3 +
>  virt/kvm/ioapic.c   |   39 +
>  virt/kvm/ioapic.h   |4 +
>  virt/kvm/irq_comm.c |   25 +++
>  virt/kvm/kvm_main.c |5 +
>  14 files changed, 667 insertions(+), 54 deletions(-)

Reviewed-by: Marcelo Tosatti 



Re: [PATCH v12 0/3] x86, apicv: Add APIC virtualization support

2013-01-23 Thread Marcelo Tosatti
On Wed, Jan 23, 2013 at 10:47:23PM +0800, Yang Zhang wrote:
> From: Yang Zhang 
> 
> APIC virtualization is a new feature which can eliminate most VM exits
> when a vcpu handles an interrupt:
> 
> APIC register virtualization:
> APIC read access doesn't cause APIC-access VM exits.
> APIC write becomes trap-like.
> 
> Virtual interrupt delivery:
> Virtual interrupt delivery avoids the need for KVM to inject vAPIC
> interrupts manually; this is fully taken care of by the hardware.

Can you provide numbers, and also what tests have been performed.
Thanks.



Re: [PATCH] vhost-net: fall back to vmalloc if high-order allocation fails

2013-01-23 Thread Michael S. Tsirkin
On Wed, Jan 23, 2013 at 09:46:47PM +0100, Romain Francoise wrote:
> Creating a vhost-net device allocates an object large enough (34320 bytes
> on x86-64) to trigger an order-4 allocation, which may fail if memory is
> fragmented:
> 
>  libvirtd: page allocation failure: order:4, mode:0x2000d0
>  ...
>  SLAB: Unable to allocate memory on node 0 (gfp=0xd0)
>cache: size-65536, object size: 65536, order: 4
>node 0: slabs: 8/8, objs: 8/8, free: 0
> 
> In that situation, rather than forcing the caller to use regular
> virtio-net, try to allocate the descriptor with vmalloc().
> 
> Signed-off-by: Romain Francoise 

Thanks for the patch.
Hmm, I haven't seen this.
Maybe we should try and reduce our memory usage,
I will look into this.

> ---
>  drivers/vhost/net.c | 18 +++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index ebd08b2..1ded79b 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -18,6 +18,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -603,12 +604,23 @@ static void handle_rx_net(struct vhost_work *work)
>   handle_rx(net);
>  }
>  
> +static void vhost_net_kvfree(void *addr)
> +{
> + if (is_vmalloc_addr(addr))
> + vfree(addr);
> + else
> + kfree(addr);
> +}
> +
>  static int vhost_net_open(struct inode *inode, struct file *f)
>  {
> - struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
> + struct vhost_net *n;
>   struct vhost_dev *dev;
>   int r;
>  
> + n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN);
> + if (!n)
> + n = vmalloc(sizeof *n);
>   if (!n)
>   return -ENOMEM;
>  
> @@ -617,7 +629,7 @@ static int vhost_net_open(struct inode *inode, struct 
> file *f)
>   n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
>   r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
>   if (r < 0) {
> - kfree(n);
> + vhost_net_kvfree(n);
>   return r;
>   }
>  
> @@ -719,7 +731,7 @@ static int vhost_net_release(struct inode *inode, struct 
> file *f)
>   /* We do an extra flush before freeing memory,
>* since jobs can re-queue themselves. */
>   vhost_net_flush(n);
> - kfree(n);
> + vhost_net_kvfree(n);
>   return 0;
>  }
>  
> -- 
> 1.8.1.1


[PATCH] vhost-net: fall back to vmalloc if high-order allocation fails

2013-01-23 Thread Romain Francoise
Creating a vhost-net device allocates an object large enough (34320 bytes
on x86-64) to trigger an order-4 allocation, which may fail if memory is
fragmented:

 libvirtd: page allocation failure: order:4, mode:0x2000d0
 ...
 SLAB: Unable to allocate memory on node 0 (gfp=0xd0)
   cache: size-65536, object size: 65536, order: 4
   node 0: slabs: 8/8, objs: 8/8, free: 0

In that situation, rather than forcing the caller to use regular
virtio-net, try to allocate the descriptor with vmalloc().

Signed-off-by: Romain Francoise 
---
 drivers/vhost/net.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ebd08b2..1ded79b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -603,12 +604,23 @@ static void handle_rx_net(struct vhost_work *work)
handle_rx(net);
 }
 
+static void vhost_net_kvfree(void *addr)
+{
+   if (is_vmalloc_addr(addr))
+   vfree(addr);
+   else
+   kfree(addr);
+}
+
 static int vhost_net_open(struct inode *inode, struct file *f)
 {
-   struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+   struct vhost_net *n;
struct vhost_dev *dev;
int r;
 
+   n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN);
+   if (!n)
+   n = vmalloc(sizeof *n);
if (!n)
return -ENOMEM;
 
@@ -617,7 +629,7 @@ static int vhost_net_open(struct inode *inode, struct file 
*f)
n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
if (r < 0) {
-   kfree(n);
+   vhost_net_kvfree(n);
return r;
}
 
@@ -719,7 +731,7 @@ static int vhost_net_release(struct inode *inode, struct 
file *f)
/* We do an extra flush before freeing memory,
 * since jobs can re-queue themselves. */
vhost_net_flush(n);
-   kfree(n);
+   vhost_net_kvfree(n);
return 0;
 }
 
-- 
1.8.1.1



[GIT PULL] KVM/ARM core implementation

2013-01-23 Thread Christoffer Dall
Hi Will,

I've prepared a stable branch for you, for-will/kvm/core, based on
your stable perf branch.

Since the last patch series, I've addressed the reviewer comments, and
rev'ed KVM_CAP_ARM_PSCI to 87, since 86 is already used by PPC in
kvm/next.

kvmtool should probably be updated accordingly.

I rebased the VGIC and the Arch. Timers series on the stable branch
and made these available in kvm/vgic and kvm/vgic-timers,
respectively. These branches are, however, not yet stable.


Thanks,
-Christoffer

The following changes since commit 6abc749f635005be78dfcb562c2235511965db6d:

  Merge branch 'for-rmk/perf' into for-rmk/virt/kvm/core (2013-01-23
17:17:52 +)

are available in the git repository at:


  git://github.com/virtualopensystems/linux-kvm-arm.git for-will/kvm/core

for you to fetch changes up to a749474de5f0f5902f59acb5c7f4dc6b816ac788:

  KVM: ARM: Add maintainer entry for KVM/ARM (2013-01-23 13:29:19 -0500)


Christoffer Dall (13):
  ARM: Add page table and page defines needed by KVM
  ARM: Section based HYP idmap
  KVM: ARM: Initial skeleton to compile KVM support
  KVM: ARM: Hypervisor initialization
  KVM: ARM: Memory virtualization setup
  KVM: ARM: Inject IRQs and FIQs from userspace
  KVM: ARM: World-switch implementation
  KVM: ARM: Emulation framework and CP15 emulation
  KVM: ARM: User space API for getting/setting co-proc registers
  KVM: ARM: Demux CCSIDR in the userspace API
  KVM: ARM: Handle guest faults in KVM
  KVM: ARM: Handle I/O aborts
  KVM: ARM: Add maintainer entry for KVM/ARM

Marc Zyngier (1):
  KVM: ARM: Power State Coordination Interface implementation

Rusty Russell (1):
  KVM: ARM: VFP userspace interface

 Documentation/virtual/kvm/api.txt   |   99 ++-
 MAINTAINERS |9 +
 arch/arm/Kconfig|2 +
 arch/arm/Makefile   |1 +
 arch/arm/include/asm/idmap.h|1 +
 arch/arm/include/asm/kvm_arm.h  |  214 ++
 arch/arm/include/asm/kvm_asm.h  |   82 +++
 arch/arm/include/asm/kvm_coproc.h   |   47 ++
 arch/arm/include/asm/kvm_emulate.h  |   72 ++
 arch/arm/include/asm/kvm_host.h |  161 +
 arch/arm/include/asm/kvm_mmio.h |   56 ++
 arch/arm/include/asm/kvm_mmu.h  |   50 ++
 arch/arm/include/asm/kvm_psci.h |   23 +
 arch/arm/include/asm/pgtable-3level-hwdef.h |5 +
 arch/arm/include/asm/pgtable-3level.h   |   18 +
 arch/arm/include/asm/pgtable.h  |7 +
 arch/arm/include/uapi/asm/kvm.h |  164 +
 arch/arm/kernel/asm-offsets.c   |   25 +
 arch/arm/kernel/vmlinux.lds.S   |6 +-
 arch/arm/kvm/Kconfig|   56 ++
 arch/arm/kvm/Makefile   |   21 +
 arch/arm/kvm/arm.c  | 1015 ++
 arch/arm/kvm/coproc.c   | 1046 +++
 arch/arm/kvm/coproc.h   |  153 
 arch/arm/kvm/coproc_a15.c   |  162 +
 arch/arm/kvm/emulate.c  |  373 ++
 arch/arm/kvm/guest.c|  222 ++
 arch/arm/kvm/init.S |  114 +++
 arch/arm/kvm/interrupts.S   |  478 
 arch/arm/kvm/interrupts_head.S  |  441 +++
 arch/arm/kvm/mmio.c |  153 
 arch/arm/kvm/mmu.c  |  787 
 arch/arm/kvm/psci.c |  108 +++
 arch/arm/kvm/reset.c|   74 ++
 arch/arm/kvm/trace.h|  235 ++
 arch/arm/mm/idmap.c |   55 +-
 arch/arm/mm/mmu.c   |   22 +
 include/uapi/linux/kvm.h|9 +
 38 files changed, 6546 insertions(+), 20 deletions(-)
 create mode 100644 arch/arm/include/asm/kvm_arm.h
 create mode 100644 arch/arm/include/asm/kvm_asm.h
 create mode 100644 arch/arm/include/asm/kvm_coproc.h
 create mode 100644 arch/arm/include/asm/kvm_emulate.h
 create mode 100644 arch/arm/include/asm/kvm_host.h
 create mode 100644 arch/arm/include/asm/kvm_mmio.h
 create mode 100644 arch/arm/include/asm/kvm_mmu.h
 create mode 100644 arch/arm/include/asm/kvm_psci.h
 create mode 100644 arch/arm/include/uapi/asm/kvm.h
 create mode 100644 arch/arm/kvm/Kconfig
 create mode 100644 arch/arm/kvm/Makefile
 create mode 100644 arch/arm/kvm/arm.c
 create mode 100644 arch/arm/kvm/coproc.c
 create mode 100644 arch/arm/kvm/coproc.h
 create mode 100644 arch/arm/kvm/coproc_a15.c
 create mode 100644 arch/arm/kvm/emulate.c
 create mode 100644 arch/arm/kvm/guest.c
 create mode 100644 arch/arm/kvm/init.S
 create mode 100644 arch/arm/kvm/interrupts.S
 create mode 100644 arch/arm/kvm/interrupts_head.

Re: [Qemu-devel] [PATCH for-1.4 qom-cpu 2/9] target-i386: kvm: Set vcpu_id to APIC ID instead of CPU index

2013-01-23 Thread Andreas Färber
On 23.01.2013 11:26, Gleb Natapov wrote:
> On Tue, Jan 22, 2013 at 06:25:02PM -0200, Eduardo Habkost wrote:
>> The CPU ID in KVM is supposed to be the APIC ID, so change the
>> KVM_CREATE_VCPU call to match it. The current behavior didn't break
>> anything yet because today the APIC ID is assumed to be equal to the CPU
>> index, but this won't be true in the future.
>>
>> Signed-off-by: Eduardo Habkost 
>> Reviewed-by: Marcelo Tosatti 
> Acked-by: Gleb Natapov 

Thanks, applied these two to qom-cpu already:
https://github.com/afaerber/qemu-cpu/commits/qom-cpu

Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg


Re: [PATCH 0/8] KVM: Reduce mmu_lock hold time when zapping mmu pages

2013-01-23 Thread Xiao Guangrong
On 01/23/2013 10:49 PM, Takuya Yoshikawa wrote:
> On Wed, 23 Jan 2013 21:45:23 +0800
> Xiao Guangrong  wrote:
> 
>>> The current code which deletes the two link nodes in different functions
>>> looks unnatural to me: traversing the sp->link nodes forces us to break
>>> the loop and sp->hash_link nodes alone is allowed to continue ...
>>>
>>> Making each function semantically clear should be more important than
>>> other things.
>>>
>>
>> The reason the code is like this is that we have a lockless shadow page walker.
> 
> But hash_link needs to be protected by mmu_lock anyway?

The purpose of not deleting the hlist node is to allow continuing the walk of
the hash table entry. Deleting the link is done so it can be reused on the
invalid list, to save memory space. If you really want to be able to keep
walking sp->link in the same way, we can introduce another list for the
invalid-list case, but it is not worthwhile.

To be honest, I do not care much about this; no one asks us to obey the rule
that "all lists should have the same walking behaviour". ;) But a comment for
this code is always appreciated.
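
A minimal sketch of that idea (hypothetical names, not the actual KVM code):
because the page stays on its hash chain, a walker can keep traversing the
bucket even after the page has been moved to the invalid list, and only the
reclaim path looks at the invalid list:

struct shadow_page {
        struct hlist_node hash_link;    /* never unlinked while walkers may run */
        struct list_head link;          /* moved onto the invalid list when zapped */
        bool invalid;
        unsigned long gfn;
};

/* zap: unlink only 'link'; keeping 'hash_link' means an in-progress
 * walk of this hash bucket does not have to restart */
static void zap_page(struct shadow_page *sp, struct list_head *invalid_list)
{
        sp->invalid = true;
        list_move(&sp->link, invalid_list);
}

/* lookup: skip invalid entries but keep walking the same bucket */
static struct shadow_page *find_page(struct hlist_head *bucket, unsigned long gfn)
{
        struct shadow_page *sp;

        hlist_for_each_entry(sp, bucket, hash_link)
                if (!sp->invalid && sp->gfn == gfn)
                        return sp;
        return NULL;
}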




Re: [QEMU PATCH v5 1/3] virtio-net: remove layout assumptions for ctrl vq

2013-01-23 Thread Michael S. Tsirkin
On Tue, Jan 22, 2013 at 11:44:44PM +0800, Amos Kong wrote:
> From: Michael S. Tsirkin 
> 
> Virtio-net code makes assumptions about the virtqueue descriptor layout
> (e.g. sg[0] is the header, sg[1] is the data buffer).
> 
> This patch makes the code not rely on the layout of the descriptors.
> 
> Signed-off-by: Michael S. Tsirkin 
> Signed-off-by: Amos Kong 

Applied all three, thanks.
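
The recurring pattern in the quoted patch below: instead of assuming a field
sits in a particular out_sg[] element, copy it out of the iovec chain at a
logical offset and then skip past the consumed bytes. A hedged sketch of the
shape of it (virtio_net_read_cmd_byte() is a made-up name; iov_to_buf() and
iov_discard_front() are the QEMU helpers used in the diff):

static int virtio_net_read_cmd_byte(struct iovec *iov, unsigned int iov_cnt,
                                    uint8_t *val)
{
    size_t s;

    /* copy one byte no matter how the buffers are split across descriptors */
    s = iov_to_buf(iov, iov_cnt, 0, val, sizeof(*val));
    if (s != sizeof(*val)) {
        return VIRTIO_NET_ERR;
    }
    /* advance past the consumed bytes before parsing whatever follows */
    iov_discard_front(&iov, &iov_cnt, s);
    return VIRTIO_NET_OK;
}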

> ---
>  hw/virtio-net.c |  129 
> ---
>  1 files changed, 75 insertions(+), 54 deletions(-)
> 
> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
> index 3bb01b1..af1f3a1 100644
> --- a/hw/virtio-net.c
> +++ b/hw/virtio-net.c
> @@ -315,44 +315,44 @@ static void virtio_net_set_features(VirtIODevice *vdev, 
> uint32_t features)
>  }
>  
>  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> - VirtQueueElement *elem)
> + struct iovec *iov, unsigned int iov_cnt)
>  {
>  uint8_t on;
> +size_t s;
>  
> -if (elem->out_num != 2 || elem->out_sg[1].iov_len != sizeof(on)) {
> -error_report("virtio-net ctrl invalid rx mode command");
> -exit(1);
> +s = iov_to_buf(iov, iov_cnt, 0, &on, sizeof(on));
> +if (s != sizeof(on)) {
> +return VIRTIO_NET_ERR;
>  }
>  
> -on = ldub_p(elem->out_sg[1].iov_base);
> -
> -if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC)
> +if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) {
>  n->promisc = on;
> -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI)
> +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) {
>  n->allmulti = on;
> -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI)
> +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) {
>  n->alluni = on;
> -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI)
> +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) {
>  n->nomulti = on;
> -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI)
> +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) {
>  n->nouni = on;
> -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST)
> +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) {
>  n->nobcast = on;
> -else
> +} else {
>  return VIRTIO_NET_ERR;
> +}
>  
>  return VIRTIO_NET_OK;
>  }
>  
>  static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
> - VirtQueueElement *elem)
> + struct iovec *iov, unsigned int iov_cnt)
>  {
>  struct virtio_net_ctrl_mac mac_data;
> +size_t s;
>  
> -if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem->out_num != 3 ||
> -elem->out_sg[1].iov_len < sizeof(mac_data) ||
> -elem->out_sg[2].iov_len < sizeof(mac_data))
> +if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) {
>  return VIRTIO_NET_ERR;
> +}
>  
>  n->mac_table.in_use = 0;
>  n->mac_table.first_multi = 0;
> @@ -360,54 +360,72 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t 
> cmd,
>  n->mac_table.multi_overflow = 0;
>  memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
>  
> -mac_data.entries = ldl_p(elem->out_sg[1].iov_base);
> +s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
> +   sizeof(mac_data.entries));
> +mac_data.entries = ldl_p(&mac_data.entries);
> +if (s != sizeof(mac_data.entries)) {
> +return VIRTIO_NET_ERR;
> +}
> +iov_discard_front(&iov, &iov_cnt, s);
>  
> -if (sizeof(mac_data.entries) +
> -(mac_data.entries * ETH_ALEN) > elem->out_sg[1].iov_len)
> +if (mac_data.entries * ETH_ALEN > iov_size(iov, iov_cnt)) {
>  return VIRTIO_NET_ERR;
> +}
>  
>  if (mac_data.entries <= MAC_TABLE_ENTRIES) {
> -memcpy(n->mac_table.macs, elem->out_sg[1].iov_base + 
> sizeof(mac_data),
> -   mac_data.entries * ETH_ALEN);
> +s = iov_to_buf(iov, iov_cnt, 0, n->mac_table.macs,
> +   mac_data.entries * ETH_ALEN);
> +if (s != mac_data.entries * ETH_ALEN) {
> +return VIRTIO_NET_ERR;
> +}
>  n->mac_table.in_use += mac_data.entries;
>  } else {
>  n->mac_table.uni_overflow = 1;
>  }
>  
> +iov_discard_front(&iov, &iov_cnt, mac_data.entries * ETH_ALEN);
> +
>  n->mac_table.first_multi = n->mac_table.in_use;
>  
> -mac_data.entries = ldl_p(elem->out_sg[2].iov_base);
> +s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
> +   sizeof(mac_data.entries));
> +mac_data.entries = ldl_p(&mac_data.entries);
> +if (s != sizeof(mac_data.entries)) {
> +return VIRTIO_NET_ERR;
> +}
> +
> +iov_discard_front(&iov, &iov_cnt, s);
>  
> -if (sizeof(mac_data.entries) +
> -(mac_data.entries * ETH_ALEN) > elem->out_sg[2].iov_len)
> +if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) {
>  return VIRTIO_NET_ERR;
> +}
>  
> -if (mac_data.entries) {
> -  

RE: "external snapshot-delete" questions

2013-01-23 Thread Skardal, Harald
I'm using RHEL7 A2 with the newest libvirt etc., doing some work that uses
external snapshots and qcow2.

I understand that "virsh snapshot-delete  " is not 
implemented yet.
Neither can you destroy/delete a VM that has external snapshots.

Is there another way to delete snapshots?

Is there a clean way to "destroy" a VM with snapshots and all? 

ThX,

    Harald


[PATCH v12 3/3] x86, apicv: add virtual interrupt delivery support

2013-01-23 Thread Yang Zhang
From: Yang Zhang 

Virtual interrupt delivery avoids the need for KVM to inject vAPIC
interrupts manually; this is fully taken care of by the hardware. This
needs some special awareness in the existing interrupt injection path:

- For a pending interrupt, instead of direct injection, we may need to
  update architecture-specific indicators before resuming to the guest.

- A pending interrupt which is masked by the ISR should also be
  considered in the above update action, since the hardware will decide
  when to inject it at the right time. Currently, has_interrupt and
  get_interrupt only return a valid vector from the injection point of
  view (see the sketch below).
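
A hedged sketch of how the injection path changes with the hooks this series
adds (the vid_enabled flag and the surrounding logic are simplified for
illustration; hwapic_irr_update, kvm_cpu_has_injectable_intr and
kvm_lapic_find_highest_irr are taken from the diffs below):

static void update_pending_intr(struct kvm_vcpu *vcpu, bool vid_enabled)
{
        int max_irr = kvm_lapic_find_highest_irr(vcpu); /* highest bit set in vIRR */

        if (max_irr < 0)
                return;

        if (vid_enabled)
                /* tell the hardware about the pending vector; it updates RVI
                 * and delivers the interrupt when the guest can take it */
                kvm_x86_ops->hwapic_irr_update(vcpu, max_irr);
        else if (kvm_cpu_has_injectable_intr(vcpu))
                /* classic path: only an unmasked interrupt is injected directly */
                kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
}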

Signed-off-by: Kevin Tian 
Signed-off-by: Yang Zhang 
---
 arch/ia64/kvm/lapic.h   |6 ++
 arch/x86/include/asm/kvm_host.h |7 ++
 arch/x86/include/asm/vmx.h  |   11 +++
 arch/x86/kvm/irq.c  |   56 +++--
 arch/x86/kvm/lapic.c|  106 ---
 arch/x86/kvm/lapic.h|   27 
 arch/x86/kvm/svm.c  |   31 +
 arch/x86/kvm/vmx.c  |  133 ---
 arch/x86/kvm/x86.c  |   11 +++-
 include/linux/kvm_host.h|3 +
 virt/kvm/ioapic.c   |   39 +++
 virt/kvm/ioapic.h   |4 +
 virt/kvm/irq_comm.c |   25 +++
 virt/kvm/kvm_main.c |5 ++
 14 files changed, 425 insertions(+), 39 deletions(-)

diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index c5f92a9..c3e2935 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -27,4 +27,10 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct 
kvm_lapic_irq *irq);
 #define kvm_apic_present(x) (true)
 #define kvm_lapic_enabled(x) (true)
 
+static inline bool kvm_apic_vid_enabled(void)
+{
+   /* IA64 has no apicv supporting, do nothing here */
+   return false;
+}
+
 #endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e1306c1..a94f8d7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -692,6 +692,12 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+   int (*vcpu_has_apicv)(struct kvm *kvm);
+   void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+   void (*hwapic_isr_update)(struct kvm *kvm, int isr);
+   void (*hwapic_vector_intercept_on_eoi)(struct kvm_vcpu *vcpu,
+   u32 vector, u64 *eoi_exit_bitmap);
+   void (*update_eoi_exitmap)(struct kvm_vcpu *vcpu);
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
@@ -987,6 +993,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 0a54df0..694586c 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -62,6 +62,7 @@
 #define EXIT_REASON_MCE_DURING_VMENTRY  41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
 #define EXIT_REASON_WBINVD  54
@@ -144,6 +145,7 @@
 #define SECONDARY_EXEC_WBINVD_EXITING  0x0040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST  0x0080
 #define SECONDARY_EXEC_APIC_REGISTER_VIRT   0x0100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY0x0200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING  0x0400
 #define SECONDARY_EXEC_ENABLE_INVPCID  0x1000
 
@@ -181,6 +183,7 @@ enum vmcs_field {
GUEST_GS_SELECTOR   = 0x080a,
GUEST_LDTR_SELECTOR = 0x080c,
GUEST_TR_SELECTOR   = 0x080e,
+   GUEST_INTR_STATUS   = 0x0810,
HOST_ES_SELECTOR= 0x0c00,
HOST_CS_SELECTOR= 0x0c02,
HOST_SS_SELECTOR= 0x0c04,
@@ -208,6 +211,14 @@ enum vmcs_field {
APIC_ACCESS_ADDR_HIGH   = 0x2015,
EPT_POINTER = 0x201a,
EPT_POINTER_HIGH= 0x201b,
+   EOI_EXIT_BITMAP0= 0x201c,
+   EOI_EXIT_BITMAP0_HIGH   = 0x201d,
+   EOI_EXIT_BITMAP1= 0x201e,
+   EOI_EXIT_BITMAP1_HI

[PATCH v12 1/3] x86, apicv: add APICv register virtualization support

2013-01-23 Thread Yang Zhang
- APIC read doesn't cause VM-Exit
- APIC write becomes trap-like
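
For illustration, a sketch of how a trap-like APIC-write exit can be handled
with the helper added below (the exit-qualification decoding shown here is an
assumption for the sketch, not taken from this diff; kvm_apic_write_nodecode()
is from this patch):

static int handle_apic_write(struct kvm_vcpu *vcpu)
{
        /* assumption: the low 12 bits of the exit qualification carry the
         * APIC-page offset that the guest wrote */
        u32 offset = vmcs_readl(EXIT_QUALIFICATION) & 0xfff;

        /* the write has already hit the virtual-APIC page; only its side
         * effects need to be emulated, no instruction decode required */
        kvm_apic_write_nodecode(vcpu, offset);
        return 1;
}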

Signed-off-by: Kevin Tian 
Signed-off-by: Yang Zhang 
---
 arch/x86/include/asm/vmx.h |2 ++
 arch/x86/kvm/lapic.c   |   15 +++
 arch/x86/kvm/lapic.h   |2 ++
 arch/x86/kvm/vmx.c |   33 -
 4 files changed, 51 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index e385df9..44c3f7e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,6 +66,7 @@
 #define EXIT_REASON_EPT_MISCONFIG   49
 #define EXIT_REASON_WBINVD  54
 #define EXIT_REASON_XSETBV  55
+#define EXIT_REASON_APIC_WRITE  56
 #define EXIT_REASON_INVPCID 58
 
 #define VMX_EXIT_REASONS \
@@ -141,6 +142,7 @@
 #define SECONDARY_EXEC_ENABLE_VPID  0x0020
 #define SECONDARY_EXEC_WBINVD_EXITING  0x0040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST  0x0080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT   0x0100
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING  0x0400
 #define SECONDARY_EXEC_ENABLE_INVPCID  0x1000
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9392f52..0664c13 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1212,6 +1212,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
 
+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+   u32 val = 0;
+
+   /* hw has done the conditional check and inst decode */
+   offset &= 0xff0;
+
+   apic_reg_read(vcpu->arch.apic, offset, 4, &val);
+
+   /* TODO: optimize to just emulate side effect w/o one more write */
+   apic_reg_write(vcpu->arch.apic, offset, val);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
+
 void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
struct kvm_lapic *apic = vcpu->arch.apic;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e5ebf9f..9a8ee22 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -64,6 +64,8 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
 
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index dd2a85c..8a8116a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,6 +84,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
+static bool __read_mostly enable_apicv_reg = 1;
+module_param(enable_apicv_reg, bool, S_IRUGO);
+
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -764,6 +767,12 @@ static inline bool 
cpu_has_vmx_virtualize_apic_accesses(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 }
 
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+   return vmcs_config.cpu_based_2nd_exec_ctrl &
+   SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
return cpu_has_vmx_tpr_shadow() &&
@@ -2540,7 +2549,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_RDTSCP |
-   SECONDARY_EXEC_ENABLE_INVPCID;
+   SECONDARY_EXEC_ENABLE_INVPCID |
+   SECONDARY_EXEC_APIC_REGISTER_VIRT;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -2551,6 +2561,11 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
 #endif
+
+   if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+   _cpu_based_2nd_exec_control &= ~(
+   SECONDARY_EXEC_APIC_REGISTER_VIRT);
+
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
   enabled */
@@ -2748,6 +2763,9 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ple())
ple_gap = 0;
 
+   if (!cpu_has_vmx_apic_register_virt())
+   enable_apicv_reg = 0;
+
if (nested)
 

[PATCH v12 2/3] x86, apicv: add virtual x2apic support

2013-01-23 Thread Yang Zhang
From: Yang Zhang 

Basically, to benefit from apicv we need to enable virtualized x2apic mode.
Currently, we only enable it when the guest is really using x2apic.

Also, clear the MSR bitmap for the corresponding x2apic MSRs when the guest
enables x2apic:
0x800 - 0x8ff: no read intercept for apicv register virtualization,
   except APIC ID and TMCCT, which need software's assistance to
   get the right value.
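
For illustration, a sketch of the MSR-bitmap policy this describes (the helper
names are hypothetical; the patch below implements it via the new
vmx_msr_bitmap_*_x2apic bitmaps):

static void x2apic_setup_read_intercepts(void)
{
        u32 msr;

        /* x2apic MSRs occupy 0x800-0x8ff; with APIC register virtualization
         * reads can be served from the virtual-APIC page, so drop the read
         * intercepts for the whole range ... */
        for (msr = 0x800; msr <= 0x8ff; msr++)
                vmx_disable_intercept_msr_read_x2apic(msr);

        /* ... except APIC ID (0x802) and TMCCT (0x839), which still need
         * software's assistance to return the right value */
        vmx_enable_intercept_msr_read_x2apic(0x802);
        vmx_enable_intercept_msr_read_x2apic(0x839);
}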

Signed-off-by: Kevin Tian 
Signed-off-by: Yang Zhang 
---
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/include/asm/vmx.h  |1 +
 arch/x86/kvm/lapic.c|   14 ++-
 arch/x86/kvm/svm.c  |6 +
 arch/x86/kvm/vmx.c  |  203 +++
 5 files changed, 201 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f75e1fe..e1306c1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -692,6 +692,7 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+   void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 44c3f7e..0a54df0 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -139,6 +139,7 @@
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001
 #define SECONDARY_EXEC_ENABLE_EPT   0x0002
 #define SECONDARY_EXEC_RDTSCP  0x0008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x0010
 #define SECONDARY_EXEC_ENABLE_VPID  0x0020
 #define SECONDARY_EXEC_WBINVD_EXITING  0x0040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST  0x0080
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0664c13..83a9547 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1303,6 +1303,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 {
+   u64 old_value = vcpu->arch.apic_base;
struct kvm_lapic *apic = vcpu->arch.apic;
 
if (!apic) {
@@ -1324,11 +1325,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 
value)
value &= ~MSR_IA32_APICBASE_BSP;
 
vcpu->arch.apic_base = value;
-   if (apic_x2apic_mode(apic)) {
-   u32 id = kvm_apic_id(apic);
-   u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
-   kvm_apic_set_ldr(apic, ldr);
+   if ((old_value ^ value) & X2APIC_ENABLE) {
+   if (value & X2APIC_ENABLE) {
+   u32 id = kvm_apic_id(apic);
+   u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+   kvm_apic_set_ldr(apic, ldr);
+   kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
+   } else
+   kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
}
+
apic->base_address = apic->vcpu->arch.apic_base &
 MSR_IA32_APICBASE_BASE;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd..38407e9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, 
int tpr, int irr)
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
+static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+   return;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
+   .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
 
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8a8116a..c2bc989 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -643,6 +643,8 @@ static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic;
 
 static bool cpu_has_load_ia32_efer;
 static bool cpu_has_load_perf_global_ctrl;
@@ -767,6 +769,12 @@ static inline bool 
cpu_has_vmx_virtualize_apic_accesses(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 }
 
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+   return vmcs_config.cpu_based_2

[PATCH v12 0/3] x86, apicv: Add APIC virtualization support

2013-01-23 Thread Yang Zhang
From: Yang Zhang 

APIC virtualization is a new feature which can eliminate most VM exits
when a vcpu handles an interrupt:

APIC register virtualization:
APIC read accesses don't cause APIC-access VM exits.
APIC writes become trap-like.

Virtual interrupt delivery:
Virtual interrupt delivery removes the need for KVM to inject vAPIC
interrupts manually; this is fully taken care of by the hardware.

Please refer to Intel SDM volume 3, chapter 29 for more details.
Changes v11 to v12:
 * Check for an in-kernel irqchip when enabling apicv; if a userspace irq chip
   is used, apicv cannot be used and must be disabled.
 * Rename some functions to more descriptive names.
 * Move the ioapic entry parse logic to lapic.c.
 * Rebased on top of KVM upstream

Changes v10 to v11:
 * Use two new MSR bitmaps for guests that enable x2apic mode:
   Since the MSR bitmap is shared by all guests, updating the global MSR
   bitmap would break guests that are not using x2apic. To solve this,
   we use two new MSR bitmaps for guests which use x2apic (see the sketch
   after this changelog).

Changes v9 to v10:
 * Enable virtualized x2apic mode only when the guest is using x2apic and apicv:
   there is no point in enabling virtualized x2apic mode when apicv is disabled.
 * Grab ioapic_lock when traversing ioapic entries to set the eoi exit bitmap
 * Rebased on top of KVM upstream

Changes v8 to v9:
 * Update eoi exit bitmap by vcpu itself.
 * Enable virtualize x2apic mode when guest is using x2apic.
 * Rebase on top of KVM upstream

Changes v7 to v8:
 * According to Marcelo's suggestion, add comments for irr_pending and isr_count,
   since the two variables have different meanings when using apicv.
 * Set the highest bit in vISR to SVI after migration.
 * Use a spinlock to access the eoi exit bitmap synchronously.
 * Enable virtualized x2apic mode when the guest is using x2apic
 * Rebased on top of KVM upstream.
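
To make the two-bitmap approach above concrete, here is a minimal sketch of how
the bitmap could be selected per vcpu. It only uses names that appear in patch
2/3 (vmx_msr_bitmap_legacy/longmode and their _x2apic variants); the exact
conditions are illustrative, not necessarily the final code:

static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
{
	unsigned long *msr_bitmap;

	if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
		/* x2apic guests: reads of 0x800-0x8ff are not intercepted */
		if (is_long_mode(vcpu))
			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
		else
			msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
	} else {
		if (is_long_mode(vcpu))
			msr_bitmap = vmx_msr_bitmap_longmode;
		else
			msr_bitmap = vmx_msr_bitmap_legacy;
	}
	vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
}

Other guests keep using the original bitmaps, so enabling x2apic in one guest
does not change MSR interception for the others.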

Yang Zhang (3):
  x86, apicv: add APICv register virtualization support
  x86, apicv: add virtual x2apic support
  x86, apicv: add virtual interrupt delivery support

 arch/ia64/kvm/lapic.h   |6 +
 arch/x86/include/asm/kvm_host.h |8 +
 arch/x86/include/asm/vmx.h  |   14 ++
 arch/x86/kvm/irq.c  |   56 ++-
 arch/x86/kvm/lapic.c|  135 ---
 arch/x86/kvm/lapic.h|   29 
 arch/x86/kvm/svm.c  |   37 
 arch/x86/kvm/vmx.c  |  349 ---
 arch/x86/kvm/x86.c  |   11 +-
 include/linux/kvm_host.h|3 +
 virt/kvm/ioapic.c   |   39 +
 virt/kvm/ioapic.h   |4 +
 virt/kvm/irq_comm.c |   25 +++
 virt/kvm/kvm_main.c |5 +
 14 files changed, 667 insertions(+), 54 deletions(-)



Re: [PATCH 0/8] KVM: Reduce mmu_lock hold time when zapping mmu pages

2013-01-23 Thread Takuya Yoshikawa
On Wed, 23 Jan 2013 21:45:23 +0800
Xiao Guangrong  wrote:

> > The current code which deletes the two link nodes in different functions
> > looks unnatural to me: traversing the sp->link nodes forces us to break
> > the loop and sp->hash_link nodes alone is allowed to continue ...
> > 
> > Making each function semantically clear should be more important than
> > other things.
> > 
> 
> The reason the code like this is, we have lockless shadow page walker.

But hash_link needs to be protected by mmu_lock anyway?

> > But maybe a matter of taste, so I'll wait for the maintainers' comments.
> > 
> >> http://www.linux-kvm.org/page/TODO: O(1) mmu invalidation using a 
> >> generation number
> >>
> >> I am doing this work for some weeks and will post the patch out during 
> >> these days.
> > 
> > I remember that Avi originally wrote the idea of introducing the
> > generation of mmu pages in his other work.
> > 
> 
> Whatever the original consideration is, the idea can speed up mmu 
> invalidation a lot.
> (Actually, i mentioned this idea to you when discussion fast write protect 
> long time ago.)

It's fine.  I just wanted to know if my memory was correct.

Takuya


Re: [PATCH V3 RESEND RFC 0/2] kvm: Improving undercommit scenarios

2013-01-23 Thread Andrew Jones
On Tue, Jan 22, 2013 at 01:08:54PM +0530, Raghavendra K T wrote:
>  In some special scenarios like #vcpu <= #pcpu, PLE handler may
> prove very costly, because there is no need to iterate over vcpus
> and do unsuccessful yield_to burning CPU.
> 
>  The first patch optimizes all the yield_to by bailing out when there
>  is no need to continue in yield_to (i.e., when there is only one task 
>  in source and target rq).
> 
>  Second patch uses that in PLE handler. Further when a yield_to fails
>  we do not immediately go out of PLE handler instead we try thrice 
>  to have better statistical possibility of false return. Otherwise that
>  would affect moderate overcommit cases.
>  
>  Result on 3.7.0-rc6 kernel shows around 140% improvement for ebizzy 1x and
>  around 51% for dbench 1x  with 32 core PLE machine with 32 vcpu guest.
> 
> 
> base = 3.7.0-rc6 
> machine: 32 core mx3850 x5 PLE mc
> 
> -----+-------------+------------+-------------+------------+------------+
>            ebizzy (rec/sec, higher is better)
> -----+-------------+------------+-------------+------------+------------+
>          base          stdev       patched       stdev       %improve
> -----+-------------+------------+-------------+------------+------------+
> 1x     2511.3000      21.5409     6051.8000     170.2592     140.98276
> 2x     2679.4000     332.4482     2692.3000     251.4005       0.48145
> 3x     2253.5000     266.4243     2192.1667     178.9753      -2.72169
> -----+-------------+------------+-------------+------------+------------+
> 
> -----+-------------+------------+-------------+------------+------------+
>            dbench (throughput in MB/sec, higher is better)
> -----+-------------+------------+-------------+------------+------------+
>          base          stdev       patched       stdev       %improve
> -----+-------------+------------+-------------+------------+------------+
> 1x     6677.4080     638.5048    10098.0060    3449.7026      51.22643
> 2x     2012.6760      64.7642     2019.0440      62.6702       0.31639
> 3x     1302.0783      40.8336     1292.7517      27.0515      -0.71629
> -----+-------------+------------+-------------+------------+------------+
> 
> Here is the reference result with no PLE.
>  ebizzy-1x_nople 7592.6000 rec/sec
>  dbench_1x_nople 7853.6960 MB/sec

I'm not sure how much we should trust the ebizzy results, but even
so, the dbench results are stranger. The percent error is huge
(34%) and somehow we do much better for 1x overcommit with PLE
enabled than without (for the patched version). How does that
happen? How many guests are running in the 1x test? And are the
throughput results the combined throughput of all of them? I
wonder if this jump in throughput is just the guests' perceived
throughput, but wrong due to bad virtual time keeping. Can we
run a long-lasting benchmark and measure the elapsed time with
a clock external to the guests?
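
One way to do that measurement, as a rough host-side sketch (nothing
KVM-specific; the guest name and workload command are placeholders):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	struct timespec start, end;
	double elapsed;

	/* host clock, unaffected by guest time keeping */
	clock_gettime(CLOCK_MONOTONIC, &start);
	if (system("ssh guest1 ./run-dbench.sh") != 0)	/* placeholder workload */
		return 1;
	clock_gettime(CLOCK_MONOTONIC, &end);

	elapsed = (end.tv_sec - start.tv_sec) +
		  (end.tv_nsec - start.tv_nsec) / 1e9;
	printf("elapsed (host view): %.3f s\n", elapsed);
	return 0;
}

Comparing the host-side elapsed time with what the guest reports would show
whether the throughput jump is real or an artifact of guest time keeping.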

Drew

> 
> The result says we can still improve by 60% for ebizzy, but overall we are
> getting impressive performance with the patches.
> 
>  Changes Since V2:
>  - Dropped global measures usage patch (Peter Zijlstra)
>  - Do not bail out on first failure (Avi Kivity)
>  - Try thrice for the failure of yield_to to get statistically more correct
>behaviour.
> 
>  Changes since V1:
>  - Discard the idea of exporting nrrunning and optimize in core scheduler 
> (Peter)
>  - Use yield() instead of schedule in overcommit scenarios (Rik)
>  - Use loadavg knowledge to detect undercommit/overcommit
> 
>  Peter Zijlstra (1):
>   Bail out of yield_to when source and target runqueue has one task
> 
>  Raghavendra K T (1):
>   Handle yield_to failure return for potential undercommit case
> 
>  Please let me know your comments and suggestions.
> 
>  Link for the discussion of V3 original:
>  https://lkml.org/lkml/2012/11/26/166
> 
>  Link for V2:
>  https://lkml.org/lkml/2012/10/29/287
> 
>  Link for V1:
>  https://lkml.org/lkml/2012/9/21/168
> 
>  kernel/sched/core.c | 25 +++--
>  virt/kvm/kvm_main.c | 26 --
>  2 files changed, 35 insertions(+), 16 deletions(-)
> 


Re: [PATCH 0/8] KVM: Reduce mmu_lock hold time when zapping mmu pages

2013-01-23 Thread Xiao Guangrong
On 01/23/2013 09:28 PM, Takuya Yoshikawa wrote:
> On Wed, 23 Jan 2013 18:44:52 +0800
> Xiao Guangrong  wrote:
> 
>> On 01/23/2013 06:12 PM, Takuya Yoshikawa wrote:
>>> This patch set mitigates another mmu_lock hold time issue.  Although
>>> this is not enough and I'm thinking of additional work already, this
>>> alone can reduce the lock hold time to some extent.
>>>
>>
>> It is not worth doing this kind of complex thing, usually, only a few pages 
>> on
>> the invalid list. The *really* heavily case is kvm_mmu_zap_all() which can 
>> be speeded
>> up by using generation number, this is a todo work in kvm wiki:
> 
> I don't think this is so complex.  This is a basic programming technique
> using linked lists.

I just consider that only a few pages exist on the invalid list, so it is not
worth introducing this.

> 
> The current code which deletes the two link nodes in different functions
> looks unnatural to me: traversing the sp->link nodes forces us to break
> the loop and sp->hash_link nodes alone is allowed to continue ...
> 
> Making each function semantically clear should be more important than
> other things.
> 

The reason the code is like this is that we have a lockless shadow page walker.

> But maybe a matter of taste, so I'll wait for the maintainers' comments.
> 
>> http://www.linux-kvm.org/page/TODO: O(1) mmu invalidation using a generation 
>> number
>>
>> I am doing this work for some weeks and will post the patch out during these 
>> days.
> 
> I remember that Avi originally wrote the idea of introducing the
> generation of mmu pages in his other work.
> 

Whatever the original consideration was, the idea can speed up mmu invalidation
a lot. (Actually, I mentioned this idea to you when we discussed fast write
protect a long time ago.)




Re: [PATCH 0/8] KVM: Reduce mmu_lock hold time when zapping mmu pages

2013-01-23 Thread Takuya Yoshikawa
On Wed, 23 Jan 2013 18:44:52 +0800
Xiao Guangrong  wrote:

> On 01/23/2013 06:12 PM, Takuya Yoshikawa wrote:
> > This patch set mitigates another mmu_lock hold time issue.  Although
> > this is not enough and I'm thinking of additional work already, this
> > alone can reduce the lock hold time to some extent.
> > 
> 
> It is not worth doing this kind of complex thing, usually, only a few pages on
> the invalid list. The *really* heavily case is kvm_mmu_zap_all() which can be 
> speeded
> up by using generation number, this is a todo work in kvm wiki:

I don't think this is so complex.  This is a basic programming technique
using linked lists.

The current code, which deletes the two link nodes in different functions,
looks unnatural to me: traversing the sp->link nodes forces us to break
the loop, while the sp->hash_link nodes alone are allowed to continue ...

Making each function semantically clear should be more important than
other things.

But maybe a matter of taste, so I'll wait for the maintainers' comments.

> http://www.linux-kvm.org/page/TODO: O(1) mmu invalidation using a generation 
> number
> 
> I am doing this work for some weeks and will post the patch out during these 
> days.

I remember that Avi originally wrote the idea of introducing the
generation of mmu pages in his other work.

Thanks,
Takuya


Re: [kvmarm] [RFC] KVM/arm64, take #3

2013-01-23 Thread Marc Zyngier
On 23/01/13 11:24, Pranavkumar Sawargaonkar wrote:

Hi Pranav,

> I have tried the kvm-arm64/kvm branch but it seems it is not booting on the
> foundation model.

Hmmm...

root@hot-poop:~# dmesg | head
Initializing cgroup subsys cpu
Linux version 3.8.0-rc4+ (maz@e102391-lin) (gcc version 4.7.1 (0.11.114) ) #761 
SMP PREEMPT Wed Jan 23 11:43:41 GMT 2013
CPU: AArch64 Processor [410fd000] revision 0
Machine: Foundation-v8A
^^^

Seems to be booting well enough for me. Are you sure you used the supplied DT
(as mentioned in the below email)?

> Also there is no "arch/arm64/platforms" folder to have vexpress support.

There is no need for a platforms directory any more. The vexpress support is
now just a configuration option.

> Thanks,
> Pranav

Cheers,

M.

> 
> On 23 January 2013 00:32, Marc Zyngier <marc.zyng...@arm.com> wrote:
> Guys,
> 
> I've once more updated the branches for KVM/arm64
> 
> - kvm-arm/pre-arm64: kvm-arm-master as of today + the "cleanup" branch +
> some basic perf support
> 
> - arm64/soc-armv8-model: Catalin Marinas' arm64 branch
> 
> - arm64/psci: Implementation of PSCI for the above
> 
> - arm64/perf: host/guest discrimination
> 
> - kvm-arm64/kvm-prereq: a bunch of random bits that KVM/arm requires to
> compile on arm64.
> 
> - kvm-arm64/kvm-prereq-merged: all the above, plus Mark Rutland's timer
> rework.
> 
> - kvm-arm64/kvm: KVM/arm64 itself, and the only branch you should use
> unless you're completely hatstand.
> 
> All that is at:
> git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git
> 
> You'll also need Will Deacon's KVM Tool port:
> git://git.kernel.org/pub/scm/linux/kernel/git/will/kvmtool.git
>  kvmtool/arm
> 
> 
> A few random notes:
> - If you're using the Foundation Model, use the provided DTS for your
> host kernel (arch/arm64/boot/dts/foundation-v8.dts).
> - The only supported models are the AEMv8 and the Foundation models. If
> you're using something else and have any issue, first reproduce it with
> one of the supported implementations.
> 
> What's new:
> - Rebased on 3.8-rc4
> - Resynced with kvm-arm-master
> - More 32bit fixes (ThumbEE, check for lack of 32bit support in HW)
> - Some basic perf support
> 
> Enjoy,
> 
> M.
> --
> Jazz is not dead. It just smells funny...
> 
> 
> ___
> kvmarm mailing list
> kvm...@lists.cs.columbia.edu
> https://lists.cs.columbia.edu/cucslists/listinfo/kvmarm
> 
> 


-- 
Jazz is not dead. It just smells funny...



Re: [PATCH v2] KVM: VMX: enable acknowledge interupt on vmexit

2013-01-23 Thread Gleb Natapov
On Tue, Jan 22, 2013 at 01:49:31PM +0800, Yang Zhang wrote:
> From: Yang Zhang 
> 
> The "acknowledge interrupt on exit" feature controls processor behavior
> for external interrupt acknowledgement. When this control is set, the
> processor acknowledges the interrupt controller to acquire the
> interrupt vector on VM exit.
> 
> After enabling this feature, an interrupt that arrives while the target cpu is
> running in vmx non-root mode will be handled by the vmx handler instead of the
> handler in the idt. Currently, the vmx handler only fakes an interrupt stack and
> jumps to the idt table to let the real handler handle it. Further, we will
> recognize the interrupt and deliver through the idt table only interrupts that
> do not belong to the current vcpu; interrupts belonging to the current vcpu
> will be handled inside the vmx handler. This will reduce the interrupt handling
> cost of KVM.
> 
> Refer to Intel SDM volume 3, chapter 33.2.
> 
> Signed-off-by: Yang Zhang 
> ---
>  arch/x86/include/asm/kvm_host.h |2 +
>  arch/x86/kvm/svm.c  |6 
>  arch/x86/kvm/vmx.c  |   52 
> ++-
>  arch/x86/kvm/x86.c  |2 +
>  4 files changed, 61 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c431b33..0b73602 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -345,6 +345,7 @@ struct kvm_vcpu_arch {
>   unsigned long cr8;
>   u32 hflags;
>   u64 efer;
> + struct desc_ptr host_idt;
Enough to save only host_idt.address.

>   u64 apic_base;
>   struct kvm_lapic *apic;/* kernel irqchip context */
>   unsigned long apic_attention;
> @@ -723,6 +724,7 @@ struct kvm_x86_ops {
>   int (*check_intercept)(struct kvm_vcpu *vcpu,
>  struct x86_instruction_info *info,
>  enum x86_intercept_stage stage);
> + void (*handle_external_intr)(struct kvm_vcpu *vcpu);
>  };
>  
>  struct kvm_arch_async_pf {
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d29d3cd..e286600 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -4227,6 +4227,11 @@ out:
>   return ret;
>  }
>  
> +static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
> +{
> + return;
> +}
> +
>  static struct kvm_x86_ops svm_x86_ops = {
>   .cpu_has_kvm_support = has_svm,
>   .disabled_by_bios = is_disabled,
> @@ -4318,6 +4323,7 @@ static struct kvm_x86_ops svm_x86_ops = {
>   .set_tdp_cr3 = set_tdp_cr3,
>  
>   .check_intercept = svm_check_intercept,
> + .handle_external_intr = svm_handle_external_intr,
>  };
>  
>  static int __init svm_init(void)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index dd2a85c..ef98392 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2565,7 +2565,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
> *vmcs_conf)
>  #ifdef CONFIG_X86_64
>   min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
>  #endif
> - opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
> + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
> + VM_EXIT_ACK_INTR_ON_EXIT;
>   if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
>   &_vmexit_control) < 0)
>   return -EIO;
> @@ -3933,6 +3934,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>  
>   vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
>   set_cr4_guest_host_mask(vmx);
> + native_store_idt(&vmx->vcpu.arch.host_idt);
>  
We already call native_store_idt() in vmx_set_constant_host_state(). No
need to do it twice. Add vcpu parameter to vmx_set_constant_host_state()
to get idt address from there.
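
Something along these lines, as a sketch only (the host_idt_base field and the
added vcpu parameter are assumptions for illustration, not code from the patch):

static void vmx_set_constant_host_state(struct kvm_vcpu *vcpu)
{
	struct desc_ptr dt;

	/* ... existing constant host state setup ... */

	native_store_idt(&dt);
	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* already written here today */
	vcpu->arch.host_idt_base = dt.address;     /* new: remember for the intr handler */
}

Then vmx_handle_external_intr() can index the host IDT via
vcpu->arch.host_idt_base, and the extra native_store_idt() call in
vmx_vcpu_setup() goes away.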

>   return 0;
>  }
> @@ -6096,6 +6098,53 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx 
> *vmx)
>   }
>  }
>  
> +
> +static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
> +{
> + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> + if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
> + == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
> + unsigned int vector;
> + unsigned long entry;
> + gate_desc *desc;
> +
> + vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
> +#ifdef CONFIG_X86_64
> + desc = (void *)vcpu->arch.host_idt.address + vector * 16;
> +#else
> + desc = (void *)vcpu->arch.host_idt.address + vector * 8;
> +#endif
> +
> + entry = gate_offset(*desc);
> + asm(
> + "mov %0, %%" _ASM_DX " \n\t"
> +#ifdef CONFIG_X86_64
> + "mov %%" _ASM_SP ", %%" _ASM_BX " \n\t"
> + "and $0xfff0, %%" _ASM_SP " \n\t"
> + "mov %%ss, %%" _ASM_AX " \n\t"
> + "push %%" _ASM_AX " \n\t"
> + "push %%" _ASM_BX " \n\t"
> +#endif

RE: [PATCH v11 3/3] x86, apicv: add virtual interrupt delivery support

2013-01-23 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-23:
> On Wed, Jan 23, 2013 at 12:45:39AM +, Zhang, Yang Z wrote:
>>> We are getting close so please test with userspace irq chip too.
>> Thanks for your suggestion to test with userspace irqchip. I found some
>> issues and will modify the logic: As we know, APICv depends on TPR
>> shadow. But TPR shadow is per VM (it will be disabled when the VM uses a
>> userspace irq chip), which means APICv is also per VM. But in the current
>> implementation, we use the global variable enable_apicv_reg to check
>> whether APICv is used by the target vcpu. This is wrong. Instead, it
>> should read the VMCS to see whether the bit is set or not.
>> 
> No, the apicv and TPR shadow are global for all VMs with irq chip in
> kernel and VMs without irq chip in kernel skip APIC that code entirely.
> If there is generic code that should behave differently with or without
> in-kernel irq chip make the check for the condition there.
Yes, checking irqchip_in_kernel() is enough. No need to check the bits in the
vmcs. Something like this:

static int vmx_vcpu_has_apicv(struct kvm *kvm)
{
return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
}

Best regards,
Yang



Re: [PATCH 0/8] KVM: Reduce mmu_lock hold time when zapping mmu pages

2013-01-23 Thread Xiao Guangrong
On 01/23/2013 06:12 PM, Takuya Yoshikawa wrote:
> This patch set mitigates another mmu_lock hold time issue.  Although
> this is not enough and I'm thinking of additional work already, this
> alone can reduce the lock hold time to some extent.
> 

It is not worth doing this kind of complex thing, usually, only a few pages on
the invalid list. The *really* heavily case is kvm_mmu_zap_all() which can be 
speeded
up by using generation number, this is a todo work in kvm wiki:

http://www.linux-kvm.org/page/TODO: O(1) mmu invalidation using a generation 
number

I am doing this work for some weeks and will post the patch out during these 
days.



Re: [PATCH v11 3/3] x86, apicv: add virtual interrupt delivery support

2013-01-23 Thread Gleb Natapov
On Wed, Jan 23, 2013 at 12:45:39AM +, Zhang, Yang Z wrote:
> > We are getting close so please test with userspace irq chip too.
> Thanks for your suggestion to test with userspace irqchip. I found some
> issues and will modify the logic:
> As we know, APICv depends on TPR shadow. But TPR shadow is per VM (it will be
> disabled when the VM uses a userspace irq chip), which means APICv is also per
> VM. But in the current implementation, we use the global variable
> enable_apicv_reg to check whether APICv is used by the target vcpu. This is
> wrong. Instead, it should read the VMCS to see whether the bit is set or not.
> 
No, apicv and TPR shadow are global for all VMs with an in-kernel irq chip,
and VMs without an in-kernel irq chip skip that APIC code entirely.
If there is generic code that should behave differently with or without an
in-kernel irq chip, make the check for that condition there.

--
Gleb.


Re: [PATCH v11 2/3] x86, apicv: add virtual x2apic support

2013-01-23 Thread Gleb Natapov
On Tue, Jan 22, 2013 at 09:13:06PM -0200, Marcelo Tosatti wrote:
> On Tue, Jan 22, 2013 at 05:55:53PM +0200, Gleb Natapov wrote:
> > On Tue, Jan 22, 2013 at 12:21:47PM +, Zhang, Yang Z wrote:
> > > >> +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
> > > >> +{
> > > >> +  unsigned long *msr_bitmap;
> > > >> +
> > > >> +  if (apic_x2apic_mode(vcpu->arch.apic))
> > > > 
> > > > vcpu->arch.apic can be NULL.
> > > Actually, call apic_x2apic_mode to check whether use x2apic msr bitmap is 
> > > wrong.
> > > VCPU uses x2apic but it may not set virtual x2apic mode bit due to TPR 
> > > shadow not enabled or irqchip not in kernel. Check the virtual x2apic 
> > > mode bit in vmcs directly should be a better choice. How about the follow 
> > > code:
> > > 
> > If TPR shadow it not enabled vmx_msr_bitmap_.*x2apic bitmap will have 
> > x2apic MSRs intercepted.
> 
> And what is the problem? APIC register virt depends on TPR shadow.
No problem. I am saying that it is safe to set the msr bitmap to
vmx_msr_bitmap_.*x2apic in this case, so there is no need to check the vmcs;
apic_x2apic_mode() is sufficient.
--
Gleb.


Re: [PATCH for-1.4 qom-cpu 2/9] target-i386: kvm: Set vcpu_id to APIC ID instead of CPU index

2013-01-23 Thread Gleb Natapov
On Tue, Jan 22, 2013 at 06:25:02PM -0200, Eduardo Habkost wrote:
> The CPU ID in KVM is supposed to be the APIC ID, so change the
> KVM_CREATE_VCPU call to match it. The current behavior didn't break
> anything yet because today the APIC ID is assumed to be equal to the CPU
> index, but this won't be true in the future.
> 
> Signed-off-by: Eduardo Habkost 
> Reviewed-by: Marcelo Tosatti 
Acked-by: Gleb Natapov 

> ---
> Cc: kvm@vger.kernel.org
> Cc: Michael S. Tsirkin 
> Cc: Gleb Natapov 
> Cc: Marcelo Tosatti 
> 
> Changes v2:
>  - Change only i386 code (kvm_arch_vcpu_id())
> 
> Changes v3:
>  - Get CPUState as argument instead of CPUArchState
> ---
>  target-i386/kvm.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 5f3f789..c440809 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -411,9 +411,10 @@ static void cpu_update_state(void *opaque, int running, 
> RunState state)
>  }
>  }
>  
> -unsigned long kvm_arch_vcpu_id(CPUState *cpu)
> +unsigned long kvm_arch_vcpu_id(CPUState *cs)
>  {
> -return cpu->cpu_index;
> +X86CPU *cpu = X86_CPU(cs);
> +return cpu->env.cpuid_apic_id;
>  }
>  
>  int kvm_arch_init_vcpu(CPUState *cs)
> -- 
> 1.8.1

--
Gleb.


Re: [PATCH for-1.4 qom-cpu 1/9] kvm: Create kvm_arch_vcpu_id() function

2013-01-23 Thread Gleb Natapov
On Tue, Jan 22, 2013 at 06:25:01PM -0200, Eduardo Habkost wrote:
> This will allow each architecture to define how the VCPU ID is set on
> the KVM_CREATE_VCPU ioctl call.
> 
> Signed-off-by: Eduardo Habkost 
Acked-by: Gleb Natapov 

> ---
> Cc: kvm@vger.kernel.org
> Cc: Michael S. Tsirkin 
> Cc: Gleb Natapov 
> Cc: Marcelo Tosatti 
> 
> Changes v2:
>  - Get CPUState as argument instead of CPUArchState
> 
> Changes v3:
>  - Convert KVM_CREATE_VCPU ioctl() argument to void*, so
>the argument type matches the type expected by kvm_vm_ioctl()
> ---
>  include/sysemu/kvm.h | 3 +++
>  kvm-all.c| 2 +-
>  target-i386/kvm.c| 5 +
>  target-ppc/kvm.c | 5 +
>  target-s390x/kvm.c   | 5 +
>  5 files changed, 19 insertions(+), 1 deletion(-)
> 
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 22acf91..384ee66 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -196,6 +196,9 @@ int kvm_arch_init(KVMState *s);
>  
>  int kvm_arch_init_vcpu(CPUState *cpu);
>  
> +/* Returns VCPU ID to be used on KVM_CREATE_VCPU ioctl() */
> +unsigned long kvm_arch_vcpu_id(CPUState *cpu);
> +
>  void kvm_arch_reset_vcpu(CPUState *cpu);
>  
>  int kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr);
> diff --git a/kvm-all.c b/kvm-all.c
> index 6278d61..363a358 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -222,7 +222,7 @@ int kvm_init_vcpu(CPUState *cpu)
>  
>  DPRINTF("kvm_init_vcpu\n");
>  
> -ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, cpu->cpu_index);
> +ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
>  if (ret < 0) {
>  DPRINTF("kvm_create_vcpu failed\n");
>  goto err;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 3acff40..5f3f789 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -411,6 +411,11 @@ static void cpu_update_state(void *opaque, int running, 
> RunState state)
>  }
>  }
>  
> +unsigned long kvm_arch_vcpu_id(CPUState *cpu)
> +{
> +return cpu->cpu_index;
> +}
> +
>  int kvm_arch_init_vcpu(CPUState *cs)
>  {
>  struct {
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 2f4f068..2c64c63 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -384,6 +384,11 @@ static inline void kvm_fixup_page_sizes(PowerPCCPU *cpu)
>  
>  #endif /* !defined (TARGET_PPC64) */
>  
> +unsigned long kvm_arch_vcpu_id(CPUState *cpu)
> +{
> +return cpu->cpu_index;
> +}
> +
>  int kvm_arch_init_vcpu(CPUState *cs)
>  {
>  PowerPCCPU *cpu = POWERPC_CPU(cs);
> diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
> index add6a58..99deddf 100644
> --- a/target-s390x/kvm.c
> +++ b/target-s390x/kvm.c
> @@ -76,6 +76,11 @@ int kvm_arch_init(KVMState *s)
>  return 0;
>  }
>  
> +unsigned long kvm_arch_vcpu_id(CPUState *cpu)
> +{
> +return cpu->cpu_index;
> +}
> +
>  int kvm_arch_init_vcpu(CPUState *cpu)
>  {
>  int ret = 0;
> -- 
> 1.8.1

--
Gleb.


[PATCH 8/8] KVM: MMU: Move free_zapped_mmu_pages() out of the protection of mmu_lock

2013-01-23 Thread Takuya Yoshikawa
We noticed that kvm_mmu_zap_all() could take hundreds of milliseconds
for zapping mmu pages with mmu_lock held.

Although we need conditional rescheduling to fix this issue completely,
we can reduce the hold time to some extent by moving
free_zapped_mmu_pages() out of the protection of mmu_lock.  Since
invalid_list can be very long, the effect is not negligible.

Note: this patch does not treat non-trivial cases.
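
For reference, the call sites converted by this patch all follow the same
shape; a minimal sketch of the target pattern (function names are from this
series, surrounding code omitted):

	spin_lock(&kvm->mmu_lock);
	/* ... kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, NULL); ... */
	kvm_mmu_commit_zap_page(kvm, &invalid_list);	/* still needs mmu_lock */
	spin_unlock(&kvm->mmu_lock);

	free_zapped_mmu_pages(kvm, &invalid_list);	/* now done without mmu_lock */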

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |   20 +---
 1 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dd7b455..7976f55 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2185,7 +2185,6 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned 
int goal_nr_mmu_pages)
kvm_mmu_prepare_zap_page(kvm, page, &invalid_list, 
NULL);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   free_zapped_mmu_pages(kvm, &invalid_list);
 
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
@@ -2193,6 +2192,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned 
int goal_nr_mmu_pages)
kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 
spin_unlock(&kvm->mmu_lock);
+
+   free_zapped_mmu_pages(kvm, &invalid_list);
 }
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2213,9 +2214,10 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &npos);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   free_zapped_mmu_pages(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
 
+   free_zapped_mmu_pages(kvm, &invalid_list);
+
return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
@@ -2934,10 +2936,11 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 &invalid_list, NULL);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
}
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
+
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
return;
}
for (i = 0; i < 4; ++i) {
@@ -2954,9 +2957,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
 
spin_unlock(&vcpu->kvm->mmu_lock);
+
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
 
@@ -4054,10 +4058,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
 
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
+
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4254,9 +4259,10 @@ restart:
goto restart;
 
kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   free_zapped_mmu_pages(kvm, &invalid_list);
 
spin_unlock(&kvm->mmu_lock);
+
+   free_zapped_mmu_pages(kvm, &invalid_list);
 }
 
 static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
@@ -4308,10 +4314,10 @@ static int mmu_shrink(struct shrinker *shrink, struct 
shrink_control *sc)
 
kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   free_zapped_mmu_pages(kvm, &invalid_list);
 
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
+   free_zapped_mmu_pages(kvm, &invalid_list);
 
list_move_tail(&kvm->vm_list, &vm_list);
break;
-- 
1.7.5.4



[PATCH 7/8] KVM: MMU: Split out free_zapped_mmu_pages() from kvm_mmu_commit_zap_page()

2013-01-23 Thread Takuya Yoshikawa
Just trivial conversions at this point.  Some of these will be moved out
of the protection of the mmu_lock in the following patch.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |   24 +---
 1 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 97d372a..dd7b455 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1721,8 +1721,10 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
int ret;
 
ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
-   if (ret)
+   if (ret) {
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
+   }
 
return ret;
 }
@@ -1765,6 +1767,8 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t 
gfn)
}
 
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
+
if (flush)
kvm_mmu_flush_tlb(vcpu);
 }
@@ -1852,6 +1856,8 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
mmu_pages_clear_parents(&parents);
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
+
cond_resched_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -2152,8 +2158,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 * page table walks.
 */
kvm_flush_remote_tlbs(kvm);
-
-   free_zapped_mmu_pages(kvm, invalid_list);
 }
 
 /*
@@ -2181,6 +2185,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned 
int goal_nr_mmu_pages)
kvm_mmu_prepare_zap_page(kvm, page, &invalid_list, 
NULL);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+   free_zapped_mmu_pages(kvm, &invalid_list);
+
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
 
@@ -2207,6 +2213,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &npos);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+   free_zapped_mmu_pages(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
 
return r;
@@ -2927,6 +2934,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 &invalid_list, NULL);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
}
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2946,6 +2954,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
+
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
@@ -4042,7 +4052,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
}
mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
+
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
 }
@@ -4076,7 +4089,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list, NULL);
++vcpu->kvm->stat.mmu_recycled;
}
+
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   free_zapped_mmu_pages(vcpu->kvm, &invalid_list);
 }
 
 static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
@@ -4239,6 +4254,8 @@ restart:
goto restart;
 
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+   free_zapped_mmu_pages(kvm, &invalid_list);
+
spin_unlock(&kvm->mmu_lock);
 }
 
@@ -4291,6 +4308,7 @@ static int mmu_shrink(struct shrinker *shrink, struct 
shrink_control *sc)
 
kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+   free_zapped_mmu_pages(kvm, &invalid_list);
 
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
-- 
1.7.5.4



[PATCH 6/8] KVM: MMU: Introduce free_zapped_mmu_pages() for freeing mmu pages in a list

2013-01-23 Thread Takuya Yoshikawa
This will be split out from kvm_mmu_commit_zap_page() and moved out of
the protection of the mmu_lock later.

Note: kvm_mmu_isolate_page() is folded into kvm_mmu_free_page() since it
now does nothing but free sp->gfns.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |   35 +--
 1 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a72c573..97d372a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1461,27 +1461,32 @@ static inline void kvm_mod_used_mmu_pages(struct kvm 
*kvm, int nr)
 }
 
 /*
- * Remove the sp from shadow page cache, after call it,
- * we can not find this sp from the cache, and the shadow
- * page table is still valid.
- * It should be under the protection of mmu lock.
+ * Free the shadow page table and the sp, we can do it
+ * out of the protection of mmu lock.
  */
-static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
ASSERT(is_empty_shadow_page(sp->spt));
+
if (!sp->role.direct)
free_page((unsigned long)sp->gfns);
+
+   list_del(&sp->link);
+   free_page((unsigned long)sp->spt);
+   kmem_cache_free(mmu_page_header_cache, sp);
 }
 
 /*
- * Free the shadow page table and the sp, we can do it
- * out of the protection of mmu lock.
+ * Free zapped mmu pages in @invalid_list.
+ * Call this after releasing mmu_lock if possible.
  */
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+static void free_zapped_mmu_pages(struct kvm *kvm,
+ struct list_head *invalid_list)
 {
-   list_del(&sp->link);
-   free_page((unsigned long)sp->spt);
-   kmem_cache_free(mmu_page_header_cache, sp);
+   struct kvm_mmu_page *sp, *nsp;
+
+   list_for_each_entry_safe(sp, nsp, invalid_list, link)
+   kvm_mmu_free_page(sp);
 }
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -2133,8 +2138,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
 {
-   struct kvm_mmu_page *sp, *nsp;
-
if (list_empty(invalid_list))
return;
 
@@ -2150,11 +2153,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 */
kvm_flush_remote_tlbs(kvm);
 
-   list_for_each_entry_safe(sp, nsp, invalid_list, link) {
-   WARN_ON(!sp->role.invalid || sp->root_count);
-   kvm_mmu_isolate_page(sp);
-   kvm_mmu_free_page(sp);
-   }
+   free_zapped_mmu_pages(kvm, invalid_list);
 }
 
 /*
-- 
1.7.5.4



[PATCH 5/8] KVM: MMU: Delete hash_link node in kvm_mmu_prepare_zap_page()

2013-01-23 Thread Takuya Yoshikawa
Now that we are using for_each_gfn_indirect_valid_sp_safe, we can safely
delete the node by correctly updating the pointer to the next one.

The only case we need to care about is when mmu_zap_unsync_children()
has zapped anything other than the current one.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |7 ++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d5bf373..a72c573 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1469,7 +1469,6 @@ static inline void kvm_mod_used_mmu_pages(struct kvm 
*kvm, int nr)
 static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
 {
ASSERT(is_empty_shadow_page(sp->spt));
-   hlist_del(&sp->hash_link);
if (!sp->role.direct)
free_page((unsigned long)sp->gfns);
 }
@@ -2111,9 +2110,15 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
+
+   /* Next entry might be deleted by mmu_zap_unsync_children(). */
+   if (npos && ret)
+   npos->hn = sp->hash_link.next;
+
if (!sp->root_count) {
/* Count self */
ret++;
+   hlist_del(&sp->hash_link);
list_move(&sp->link, invalid_list);
kvm_mod_used_mmu_pages(kvm, -1);
} else {
-- 
1.7.5.4



[PATCH 4/8] KVM: MMU: Introduce for_each_gfn_indirect_valid_sp_safe macro

2013-01-23 Thread Takuya Yoshikawa
This is a preparation for moving hlist_del(&sp->hash_link) from
kvm_mmu_isolate_page() to kvm_mmu_prepare_zap_page().

All for_each_gfn_indirect_valid_sp loops whose bodies contain a function
call that can reach kvm_mmu_prepare_zap_page(), and that do not break out
of the loop right after the call, are converted to this new macro.

Note: ignored the following checkpatch report:
  ERROR: Macros with complex values should be enclosed in parenthesis

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |   27 +--
 1 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a48533..d5bf373 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1677,6 +1677,18 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
for_each_gfn_sp(_kvm, _sp, _gfn, _pos)  \
if (!(_sp)->role.direct && !(_sp)->role.invalid)
 
+/*
+ * Used for zapping mmu pages while traversing the mmu page hash list.
+ * Users must update @_n so that it points to the new next node after deleting
+ * any entries in such a way that can make the value prepared by
+ * hlist_for_each_entry_safe invalid.
+ */
+#define for_each_gfn_indirect_valid_sp_safe(_kvm, _sp, _gfn, _pos, _n) \
+   hlist_for_each_entry_safe(_sp, _pos, _n,\
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+   if (((_sp)->gfn == (_gfn)) &&   \
+   !(_sp)->role.direct && !(_sp)->role.invalid)
+
 /* @sp->gfn should be write-protected at the call site */
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
   struct list_head *invalid_list, bool clear_unsync)
@@ -1729,10 +1741,11 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  
gfn_t gfn)
 {
struct kvm_mmu_page *s;
struct hlist_node *node;
+   struct sp_next_pos npos;
LIST_HEAD(invalid_list);
bool flush = false;
 
-   for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+   for_each_gfn_indirect_valid_sp_safe(vcpu->kvm, s, gfn, node, npos.hn) {
if (!s->unsync)
continue;
 
@@ -1741,7 +1754,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t 
gfn)
if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
(vcpu->arch.mmu.sync_page(vcpu, s))) {
kvm_mmu_prepare_zap_page(vcpu->kvm, s,
-&invalid_list, NULL);
+&invalid_list, &npos);
continue;
}
flush = true;
@@ -2176,17 +2189,18 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
struct kvm_mmu_page *sp;
struct hlist_node *node;
+   struct sp_next_pos npos;
LIST_HEAD(invalid_list);
int r;
 
pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
spin_lock(&kvm->mmu_lock);
-   for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+   for_each_gfn_indirect_valid_sp_safe(kvm, sp, gfn, node, npos.hn) {
pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 sp->role.word);
r = 1;
-   kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, NULL);
+   kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &npos);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
@@ -3966,6 +3980,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
union kvm_mmu_page_role mask = { .word = 0 };
struct kvm_mmu_page *sp;
struct hlist_node *node;
+   struct sp_next_pos npos;
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
int npte;
@@ -3996,11 +4011,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
-   for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
+   for_each_gfn_indirect_valid_sp_safe(vcpu->kvm, sp, gfn, node, npos.hn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
  detect_write_flooding(sp)) {
zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
-   &invalid_list, NULL);
+   &invalid_list, &npos);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
-- 
1.7.5.4



[PATCH 3/8] KVM: MMU: Add a parameter to kvm_mmu_prepare_zap_page() to update the next position

2013-01-23 Thread Takuya Yoshikawa
Currently we cannot do the isolation of mmu pages, i.e. deleting the
current hash_link node by hlist_del(), in this function, because we
may call it while traversing the linked list; we cannot solve the
problem by hlist_for_each_entry_safe as zapping can happen recursively.

Since the isolation must be done before releasing mmu_lock, we are now
forced to call kvm_mmu_isolate_page() for each mmu page found in the
invalid_list in kvm_mmu_commit_zap_page().

This patch adds a new parameter to kvm_mmu_prepare_zap_page() as a
preparation for solving this issue: all callers just pass NULL now.

Note: the abstraction introduced here, sp_next_pos, makes it possible to
support the other list later.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |   41 +++--
 1 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 46b1435..2a48533 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1653,8 +1653,18 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, 
struct kvm_mmu_page *sp)
--kvm->stat.mmu_unsync;
 }
 
+/*
+ * Used to hold a pointer to the next mmu page's node when traversing through
+ * one of the linked lists.  This must be updated correctly when deleting any
+ * entries from the list.
+ */
+struct sp_next_pos {
+   struct hlist_node *hn;  /* next hash_link node */
+};
+
 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-   struct list_head *invalid_list);
+   struct list_head *invalid_list,
+   struct sp_next_pos *npos);
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
 
@@ -1672,7 +1682,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp,
   struct list_head *invalid_list, bool clear_unsync)
 {
if (sp->role.cr4_pae != !!is_pae(vcpu)) {
-   kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+   kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list, NULL);
return 1;
}
 
@@ -1680,7 +1690,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp,
kvm_unlink_unsync_page(vcpu->kvm, sp);
 
if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
-   kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+   kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list, NULL);
return 1;
}
 
@@ -1730,7 +1740,8 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t 
gfn)
kvm_unlink_unsync_page(vcpu->kvm, s);
if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
(vcpu->arch.mmu.sync_page(vcpu, s))) {
-   kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
+   kvm_mmu_prepare_zap_page(vcpu->kvm, s,
+&invalid_list, NULL);
continue;
}
flush = true;
@@ -2062,7 +2073,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
struct kvm_mmu_page *sp;
 
for_each_sp(pages, sp, parents, i) {
-   kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+   kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, NULL);
mmu_pages_clear_parents(&parents);
zapped++;
}
@@ -2073,7 +2084,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 }
 
 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-   struct list_head *invalid_list)
+   struct list_head *invalid_list,
+   struct sp_next_pos *npos)
 {
int ret;
 
@@ -2149,7 +2161,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned 
int goal_nr_mmu_pages)
 
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
-   kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
+   kvm_mmu_prepare_zap_page(kvm, page, &invalid_list, 
NULL);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
@@ -2174,7 +2186,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 sp->role.word);
r = 1;
-   kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+   kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, NULL);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
@@ -2894

[PATCH 2/8] KVM: MMU: Use list_for_each_entry_safe in kvm_mmu_commit_zap_page()

2013-01-23 Thread Takuya Yoshikawa
We are traversing the linked list, invalid_list, deleting each entry by
kvm_mmu_free_page().  _safe version is there for such a case.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |7 +++
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 376cec8..46b1435 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2103,7 +2103,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
 {
-   struct kvm_mmu_page *sp;
+   struct kvm_mmu_page *sp, *nsp;
 
if (list_empty(invalid_list))
return;
@@ -2120,12 +2120,11 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 */
kvm_flush_remote_tlbs(kvm);
 
-   do {
-   sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+   list_for_each_entry_safe(sp, nsp, invalid_list, link) {
WARN_ON(!sp->role.invalid || sp->root_count);
kvm_mmu_isolate_page(sp);
kvm_mmu_free_page(sp);
-   } while (!list_empty(invalid_list));
+   }
 }
 
 /*
-- 
1.7.5.4



[PATCH 1/8] KVM: MMU: Fix and clean up for_each_gfn_* macros

2013-01-23 Thread Takuya Yoshikawa
The expression (sp)->gfn should not be expanded using @gfn.

Although no user of these macros currently passes anything other than gfn,
this should be fixed before anyone sees strange errors.
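
A tiny illustration of the hazard (the macro names here are made up, not code
from this patch): the preprocessor substitutes every occurrence of a parameter
token, so a parameter literally named gfn also replaces the gfn after "->":

#define BAD_MATCH(sp, gfn)	((sp)->gfn == (gfn))
/* BAD_MATCH(s, base + i) expands to ((s)->base + i == (base + i)) */

#define GOOD_MATCH(_sp, _gfn)	((_sp)->gfn == (_gfn))
/* GOOD_MATCH(s, base + i) expands to ((s)->gfn == (base + i)) */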

Also, the counter-intuitive conditions, which had been used before these
macros were introduced to avoid extra indentations, should not be used.

Note: ignored the following checkpatch report:
  ERROR: Macros with complex values should be enclosed in parenthesis

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |   18 --
 1 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9f628f7..376cec8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1658,16 +1658,14 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
 
-#define for_each_gfn_sp(kvm, sp, gfn, pos) \
-  hlist_for_each_entry(sp, pos,
\
-   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)  \
-   if ((sp)->gfn != (gfn)) {} else
-
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)  \
-  hlist_for_each_entry(sp, pos,
\
-   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)  \
-   if ((sp)->gfn != (gfn) || (sp)->role.direct ||  \
-   (sp)->role.invalid) {} else
+#define for_each_gfn_sp(_kvm, _sp, _gfn, _pos) \
+   hlist_for_each_entry(_sp, _pos, \
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+   if ((_sp)->gfn == (_gfn))
+
+#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn, _pos)  \
+   for_each_gfn_sp(_kvm, _sp, _gfn, _pos)  \
+   if (!(_sp)->role.direct && !(_sp)->role.invalid)
 
 /* @sp->gfn should be write-protected at the call site */
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-- 
1.7.5.4



[PATCH 0/8] KVM: Reduce mmu_lock hold time when zapping mmu pages

2013-01-23 Thread Takuya Yoshikawa
This patch set mitigates another mmu_lock hold time issue.  Although
this is not enough and I'm thinking of additional work already, this
alone can reduce the lock hold time to some extent.

Takuya Yoshikawa (8):
  KVM: MMU: Fix and clean up for_each_gfn_* macros
  KVM: MMU: Use list_for_each_entry_safe in kvm_mmu_commit_zap_page()
  KVM: MMU: Add a parameter to kvm_mmu_prepare_zap_page() to update the next 
position
  KVM: MMU: Introduce for_each_gfn_indirect_valid_sp_safe macro
  KVM: MMU: Delete hash_link node in kvm_mmu_prepare_zap_page()
  KVM: MMU: Introduce free_zapped_mmu_pages() for freeing mmu pages in a list
  KVM: MMU: Split out free_zapped_mmu_pages() from kvm_mmu_commit_zap_page()
  KVM: MMU: Move free_zapped_mmu_pages() out of the protection of mmu_lock

 arch/x86/kvm/mmu.c |  149 +++-
 1 files changed, 101 insertions(+), 48 deletions(-)

-- 
1.7.5.4



[PATCH v2 12/12] KVM: MMU: fast drop all spte on the pte_list

2013-01-23 Thread Xiao Guangrong
If the pte_list needs to be destroyed, there is no need to delete its sptes
one by one; we can directly reset it and free the memory it uses.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   36 +++-
 1 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2c0a786..0afe8da 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -968,6 +968,25 @@ static void pte_list_remove(u64 *spte, unsigned long 
*pte_list)
}
 }

+static void pte_list_destroy(unsigned long *pte_list)
+{
+   struct pte_list_desc *desc;
+   unsigned long list_value = *pte_list;
+
+   *pte_list = 0;
+
+   if (!(list_value & 1))
+   return;
+
+   desc = (struct pte_list_desc *)(list_value & ~1ul);
+   while (desc) {
+   struct pte_list_desc *next_desc = desc->more;
+
+   mmu_free_pte_list_desc(desc);
+   desc = next_desc;
+   }
+}
+
 /*
  * Used by the following functions to iterate through the sptes linked by a
  * pte_list.  All fields are private and not assumed to be used outside.
@@ -1206,17 +1225,17 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
   struct kvm_memory_slot *slot, unsigned long data)
 {
-   u64 *sptep;
struct pte_list_iterator iter;
+   u64 *sptep;
int need_tlb_flush = 0;

-restart:
for_each_spte_in_rmap(*rmapp, iter, sptep) {
-   drop_spte(kvm, sptep);
+   mmu_spte_clear_track_bits(sptep);
need_tlb_flush = 1;
-   goto restart;
}

+   pte_list_destroy(rmapp);
+
return need_tlb_flush;
 }

@@ -2041,11 +2060,10 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, 
struct kvm_mmu_page *sp)
u64 *sptep;
struct pte_list_iterator iter;

-restart:
-   for_each_spte_in_rmap(sp->parent_ptes, iter, sptep) {
-   drop_parent_pte(sp, sptep);
-   goto restart;
-   }
+   for_each_spte_in_rmap(sp->parent_ptes, iter, sptep)
+   mmu_spte_clear_no_track(sptep);
+
+   pte_list_destroy(&sp->parent_ptes);
 }

 static int mmu_zap_unsync_children(struct kvm *kvm,
-- 
1.7.7.6



[PATCH v2 11/12] KVM: MMU: fix spte assertion

2013-01-23 Thread Xiao Guangrong
The PT_PRESENT_MASK bit is not enough to tell whether an spte has
already been mapped into a pte-list, because MMIO sptes also set this
bit. Use is_shadow_present_pte() instead to fix it

Also, this patch moves many assertions to a common place to clean up
the code
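
The assertion is folded into the iterator with a GCC statement
expression evaluated as part of the loop condition.  A small
stand-alone sketch of the same idiom (array-based, names invented for
the demo; it needs GCC/Clang statement expressions, like the kernel
macro):

#include <assert.h>
#include <stdio.h>

/* the check runs before every iteration of the caller's loop body */
#define for_each_positive(arr, n, i, val)				\
	for (i = 0, val = (n) ? (arr)[0] : 0;				\
	     i < (n) && ({ assert(val > 0); 1; });			\
	     val = (++i < (n)) ? (arr)[i] : 0)

int main(void)
{
	int data[] = { 3, 1, 4, 1, 5 };
	int i, v;

	for_each_positive(data, 5, i, v)
		printf("%d\n", v);

	return 0;
}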

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   20 ++--
 1 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b7da3fb..2c0a786 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1032,7 +1032,9 @@ static u64 *pte_list_get_next(struct pte_list_iterator 
*iter)

 #define for_each_spte_in_pte_list(pte_list, iter, spte)\
   for (spte = pte_list_get_first(pte_list, &(iter));   \
- spte != NULL; spte = pte_list_get_next(&(iter)))
+ spte != NULL &&   \
+ ({WARN_ON(!is_shadow_present_pte(*(spte))); 1; });\
+  spte = pte_list_get_next(&iter))

 #define for_each_spte_in_rmap(rmap, iter, spte)\
   for_each_spte_in_pte_list(rmap, iter, spte)
@@ -1151,11 +1153,8 @@ static bool __rmap_write_protect(struct kvm *kvm, 
unsigned long *rmapp,
struct pte_list_iterator iter;
bool flush = false;

-   for_each_spte_in_rmap(*rmapp, iter, sptep) {
-   BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+   for_each_spte_in_rmap(*rmapp, iter, sptep)
spte_write_protect(kvm, sptep, &flush, pt_protect);
-   }

return flush;
 }
@@ -1236,7 +1235,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned 
long *rmapp,

 restart:
for_each_spte_in_rmap(*rmapp, iter, sptep) {
-   BUG_ON(!is_shadow_present_pte(*sptep));
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);

need_flush = 1;
@@ -1361,15 +1359,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long 
*rmapp,
goto out;
}

-   for_each_spte_in_rmap(*rmapp, iter, sptep) {
-   BUG_ON(!is_shadow_present_pte(*sptep));
-
+   for_each_spte_in_rmap(*rmapp, iter, sptep)
if (*sptep & shadow_accessed_mask) {
young = 1;
clear_bit((ffs(shadow_accessed_mask) - 1),
 (unsigned long *)sptep);
}
-   }
 out:
/* @data has hva passed to kvm_age_hva(). */
trace_kvm_age_page(data, slot, young);
@@ -1391,14 +1386,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned 
long *rmapp,
if (!shadow_accessed_mask)
goto out;

-   for_each_spte_in_rmap(*rmapp, iter, sptep) {
-   BUG_ON(!is_shadow_present_pte(*sptep));
-
+   for_each_spte_in_rmap(*rmapp, iter, sptep)
if (*sptep & shadow_accessed_mask) {
young = 1;
break;
}
-   }
 out:
return young;
 }
-- 
1.7.7.6



[PATCH v2 10/12] KVM: MMU: unify the code of walking pte list

2013-01-23 Thread Xiao Guangrong
Walking the parent sptes and walking the rmap share the same logic;
this patch introduces for_each_spte_in_pte_list() to integrate their code
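
The gain is that both walks can use an ordinary loop body instead of
going through a callback.  A simplified stand-alone sketch of the
iterator pattern (a plain array stands in for the tagged pte_list, and
the names are invented for the demo):

#include <stddef.h>
#include <stdio.h>

struct iter {
	const unsigned long *arr;
	size_t pos, len;
};

static const unsigned long *get_first(const unsigned long *arr,
				      size_t len, struct iter *it)
{
	it->arr = arr;
	it->len = len;
	it->pos = 0;
	return len ? &arr[0] : NULL;
}

static const unsigned long *get_next(struct iter *it)
{
	return (++it->pos < it->len) ? &it->arr[it->pos] : NULL;
}

#define for_each_entry(arr, len, it, p)					\
	for (p = get_first(arr, len, &(it)); p; p = get_next(&(it)))

int main(void)
{
	/* the same iterator serves both "rmap" and "parent" callers */
	const unsigned long rmap_sptes[]   = { 0x1000, 0x2000 };
	const unsigned long parent_sptes[] = { 0x3000 };
	const unsigned long *p;
	struct iter it;

	for_each_entry(rmap_sptes, 2, it, p)
		printf("rmap spte   %#lx\n", *p);
	for_each_entry(parent_sptes, 1, it, p)
		printf("parent spte %#lx\n", *p);

	return 0;
}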

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c   |  199 ++
 arch/x86/kvm/mmu_audit.c |5 +-
 2 files changed, 97 insertions(+), 107 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 55198a1..b7da3fb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -968,26 +968,75 @@ static void pte_list_remove(u64 *spte, unsigned long 
*pte_list)
}
 }

-typedef void (*pte_list_walk_fn) (u64 *spte);
-static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * pte_list.  All fields are private and not assumed to be used outside.
+ */
+struct pte_list_iterator {
+   /* private fields */
+   struct pte_list_desc *desc; /* holds the sptep if not NULL */
+   int pos;/* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function.  This should also be used after
+ * removing/dropping sptes from the pte_list link because in such cases the
+ * information in the itererator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *pte_list_get_first(unsigned long pte_list,
+  struct pte_list_iterator *iter)
 {
-   struct pte_list_desc *desc;
-   int i;
+   if (!pte_list)
+   return NULL;

-   if (!*pte_list)
-   return;
+   if (!(pte_list & 1)) {
+   iter->desc = NULL;
+   return (u64 *)pte_list;
+   }

-   if (!(*pte_list & 1))
-   return fn((u64 *)*pte_list);
+   iter->desc = (struct pte_list_desc *)(pte_list & ~1ul);
+   iter->pos = 0;
+   return iter->desc->sptes[iter->pos];
+}

-   desc = (struct pte_list_desc *)(*pte_list & ~1ul);
-   while (desc) {
-   for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
-   fn(desc->sptes[i]);
-   desc = desc->more;
+/*
+ * Must be used with a valid iterator: e.g. after pte_list_get_next().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *pte_list_get_next(struct pte_list_iterator *iter)
+{
+   if (iter->desc) {
+   if (iter->pos < PTE_LIST_EXT - 1) {
+   u64 *sptep;
+
+   ++iter->pos;
+   sptep = iter->desc->sptes[iter->pos];
+   if (sptep)
+   return sptep;
+   }
+
+   iter->desc = iter->desc->more;
+
+   if (iter->desc) {
+   iter->pos = 0;
+   /* desc->sptes[0] cannot be NULL */
+   return iter->desc->sptes[iter->pos];
+   }
}
+
+   return NULL;
 }

+#define for_each_spte_in_pte_list(pte_list, iter, spte)\
+  for (spte = pte_list_get_first(pte_list, &(iter));   \
+ spte != NULL; spte = pte_list_get_next(&(iter)))
+
+#define for_each_spte_in_rmap(rmap, iter, spte)\
+  for_each_spte_in_pte_list(rmap, iter, spte)
+
 static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
 {
@@ -1039,67 +1088,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
pte_list_remove(spte, rmapp);
 }

-/*
- * Used by the following functions to iterate through the sptes linked by a
- * rmap.  All fields are private and not assumed to be used outside.
- */
-struct rmap_iterator {
-   /* private fields */
-   struct pte_list_desc *desc; /* holds the sptep if not NULL */
-   int pos;/* index of the sptep */
-};
-
-/*
- * Iteration must be started by this function.  This should also be used after
- * removing/dropping sptes from the rmap link because in such cases the
- * information in the itererator may not be valid.
- *
- * Returns sptep if found, NULL otherwise.
- */
-static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
-{
-   if (!rmap)
-   return NULL;
-
-   if (!(rmap & 1)) {
-   iter->desc = NULL;
-   return (u64 *)rmap;
-   }
-
-   iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
-   iter->pos = 0;
-   return iter->desc->sptes[iter->pos];
-}
-
-/*
- * Must be used with a valid iterator: e.g. after rmap_get_first().
- *
- * Returns sptep if found, NULL otherwise.
- */
-static u64 *rmap_get_next(struct rmap_iterator *iter)
-{
-   if (iter->desc) {
-   if (iter->pos < PTE_LIST_EXT - 1) {
-   u64 *sptep;
-
-   ++iter->pos;
-   sptep = iter->desc->sptes[iter->pos];
-   if (sptep)
-   return sp

[PATCH v2 09/12] KVM: MMU: introduce mmu_spte_establish

2013-01-23 Thread Xiao Guangrong
It is used to establish the spte if it is not present, to clean up the
code. It also marks the spte present before linking it to the sp's
parent_ptes list, so that the rmap-walking and parent-pte-walking code
can be integrated in a later patch

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   81 ++-
 arch/x86/kvm/paging_tmpl.h |   16 -
 2 files changed, 48 insertions(+), 49 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7d7eb4a..55198a1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1505,9 +1505,6 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *parent_pte)
 {
-   if (!parent_pte)
-   return;
-
pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
 }

@@ -1525,7 +1522,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
 }

 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-  u64 *parent_pte, int direct)
+  int direct)
 {
struct kvm_mmu_page *sp;
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
@@ -1535,7 +1532,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct 
kvm_vcpu *vcpu,
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
sp->parent_ptes = 0;
-   mmu_page_add_parent_pte(vcpu, sp, parent_pte);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
 }
@@ -1868,8 +1864,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
 gva_t gaddr,
 unsigned level,
 int direct,
-unsigned access,
-u64 *parent_pte)
+unsigned access)
 {
union kvm_mmu_page_role role;
unsigned quadrant;
@@ -1899,19 +1894,15 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
break;

-   mmu_page_add_parent_pte(vcpu, sp, parent_pte);
-   if (sp->unsync_children) {
+   if (sp->unsync_children)
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-   kvm_mmu_mark_parents_unsync(sp);
-   } else if (sp->unsync)
-   kvm_mmu_mark_parents_unsync(sp);

__clear_sp_write_flooding_count(sp);
trace_kvm_mmu_get_page(sp, false);
return sp;
}
++vcpu->kvm->stat.mmu_cache_miss;
-   sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
+   sp = kvm_mmu_alloc_page(vcpu, direct);
if (!sp)
return sp;
sp->gfn = gfn;
@@ -1931,6 +1922,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
return sp;
 }

+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+   u64 spte;
+
+   spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+  shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+
+   mmu_spte_set(sptep, spte);
+}
+
+static struct kvm_mmu_page *
+mmu_spte_establish(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, gva_t gaddr,
+  unsigned level, int direct, unsigned access)
+{
+   struct kvm_mmu_page *sp;
+
+   WARN_ON(is_shadow_present_pte(*spte));
+
+   sp = kvm_mmu_get_page(vcpu, gfn, gaddr, level, direct, access);
+
+   link_shadow_page(spte, sp);
+   mmu_page_add_parent_pte(vcpu, sp, spte);
+
+   if (sp->unsync_children || sp->unsync)
+   kvm_mmu_mark_parents_unsync(sp);
+
+   return sp;
+}
+
 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 struct kvm_vcpu *vcpu, u64 addr)
 {
@@ -1980,16 +2000,6 @@ static void shadow_walk_next(struct 
kvm_shadow_walk_iterator *iterator)
return __shadow_walk_next(iterator, *iterator->sptep);
 }

-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
-{
-   u64 spte;
-
-   spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
-  shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
-
-   mmu_spte_set(sptep, spte);
-}
-
 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
   unsigned direct_access)
 {
@@ -2046,11 +2056,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
mmu_page_zap_pte(kvm, sp, sp->spt + i);
 }

-static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
-{
-   mmu_page_remove_parent

[PATCH v2 08/12] KVM: MMU: cleanup __direct_map

2013-01-23 Thread Xiao Guangrong
Use link_shadow_page to link the sp to the spte in __direct_map

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   12 
 1 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c0bb7cf..7d7eb4a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1984,9 +1984,9 @@ static void link_shadow_page(u64 *sptep, struct 
kvm_mmu_page *sp)
 {
u64 spte;

-   spte = __pa(sp->spt)
-   | PT_PRESENT_MASK | PT_ACCESSED_MASK
-   | PT_WRITABLE_MASK | PT_USER_MASK;
+   spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+  shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+
mmu_spte_set(sptep, spte);
 }

@@ -2626,11 +2626,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,
  iterator.level - 1,
  1, ACC_ALL, iterator.sptep);

-   mmu_spte_set(iterator.sptep,
-__pa(sp->spt)
-| PT_PRESENT_MASK | PT_WRITABLE_MASK
-| shadow_user_mask | shadow_x_mask
-| shadow_accessed_mask);
+   link_shadow_page(iterator.sptep, sp);
}
}
return emulate;
-- 
1.7.7.6



[PATCH v2 07/12] KVM: MMU: remove pt_access in mmu_set_spte

2013-01-23 Thread Xiao Guangrong
It is only used in debug code, so drop it

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   15 ++-
 arch/x86/kvm/paging_tmpl.h |9 -
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a8a9c0e..c0bb7cf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2436,15 +2436,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 }

 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-unsigned pt_access, unsigned pte_access,
-int write_fault, int *emulate, int level, gfn_t gfn,
-pfn_t pfn, bool speculative, bool host_writable)
+unsigned pte_access, int write_fault, int *emulate,
+int level, gfn_t gfn, pfn_t pfn, bool speculative,
+bool host_writable)
 {
bool was_rmapped = false;

pgprintk("%s: spte %llx access %x write_fault %d gfn %llx\n",
-__func__, *sptep, pt_access,
-write_fault, gfn);
+__func__, *sptep, write_fault, gfn);

if (is_rmap_spte(*sptep)) {
if (pfn != spte_to_pfn(*sptep)) {
@@ -2547,7 +2546,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;

for (i = 0; i < ret; i++, gfn++, start++)
-   mmu_set_spte(vcpu, start, ACC_ALL, access, 0, NULL,
+   mmu_set_spte(vcpu, start, access, 0, NULL,
 sp->role.level, gfn, page_to_pfn(pages[i]),
 true, true);

@@ -2608,9 +2607,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,

for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
-   unsigned pte_access = ACC_ALL;
-
-   mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
+   mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
 write, &emulate, level, gfn, pfn,
 prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ca69dcc..b9a0df6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -326,8 +326,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp,
 * we call mmu_set_spte() with host_writable = true because
 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
 */
-   mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0,
-NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
+   mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
+gfn, pfn, true, true);

return true;
 }
@@ -473,9 +473,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}

clear_sp_write_flooding_count(it.sptep);
-   mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
-write_fault, &emulate, it.level,
-gw->gfn, pfn, prefault, map_writable);
+   mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
+it.level, gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);

return emulate;
-- 
1.7.7.6



[PATCH v2 06/12] KVM: MMU: introduce a static table to map guest access to spte access

2013-01-23 Thread Xiao Guangrong
It makes set_spte() cleaner and replaces a series of conditional
branches with a single table lookup, reducing branch mispredictions
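
A stand-alone sketch of the pattern (the ACC_*/SPTE_* values below are
made up for the demo, not the kernel's masks): the table is built once
at init time, and the fault path then does a single indexed load
instead of a chain of conditional branches:

#include <stdint.h>
#include <stdio.h>

#define ACC_EXEC	1
#define ACC_WRITE	2
#define ACC_USER	4
#define ACC_ALL		(ACC_EXEC | ACC_WRITE | ACC_USER)

#define SPTE_USER	(1ull << 0)
#define SPTE_W		(1ull << 1)
#define SPTE_X		(1ull << 2)
#define SPTE_NX		(1ull << 63)

static uint64_t access_table[ACC_ALL + 1];

static void build_access_table(void)
{
	int access;

	for (access = 0; access <= ACC_ALL; access++) {
		uint64_t v = (access & ACC_EXEC) ? SPTE_X : SPTE_NX;

		if (access & ACC_USER)
			v |= SPTE_USER;
		if (access & ACC_WRITE)
			v |= SPTE_W;
		access_table[access] = v;
	}
}

int main(void)
{
	build_access_table();
	/* one lookup replaces the three if/else chains on the hot path */
	printf("spte bits for user+write: %#llx\n",
	       (unsigned long long)access_table[ACC_USER | ACC_WRITE]);
	return 0;
}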

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   37 ++---
 1 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 43b7e0c..a8a9c0e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -235,6 +235,29 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL << (e - s + 1)) - 1) << s;
 }

+static u64 gaccess_to_spte_access[ACC_ALL + 1];
+static void build_access_table(void)
+{
+   int access;
+
+   for (access = 0; access < ACC_ALL + 1; access++) {
+   u64 spte_access = 0;
+
+   if (access & ACC_EXEC_MASK)
+   spte_access |= shadow_x_mask;
+   else
+   spte_access |= shadow_nx_mask;
+
+   if (access & ACC_USER_MASK)
+   spte_access |= shadow_user_mask;
+
+   if (access & ACC_WRITE_MASK)
+   spte_access |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+
+   gaccess_to_spte_access[access] = spte_access;
+   }
+}
+
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
@@ -243,6 +266,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
shadow_dirty_mask = dirty_mask;
shadow_nx_mask = nx_mask;
shadow_x_mask = x_mask;
+   build_access_table();
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

@@ -2391,20 +2415,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!speculative)
spte |= shadow_accessed_mask;

-   if (pte_access & ACC_EXEC_MASK)
-   spte |= shadow_x_mask;
-   else
-   spte |= shadow_nx_mask;
-
-   if (pte_access & ACC_USER_MASK)
-   spte |= shadow_user_mask;
-
-   if (pte_access & ACC_WRITE_MASK)
-   spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
-
if (level > PT_PAGE_TABLE_LEVEL)
spte |= PT_PAGE_SIZE_MASK;

+   spte |= gaccess_to_spte_access[pte_access];
+
if (tdp_enabled)
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
-- 
1.7.7.6



[PATCH v2 05/12] KVM: MMU: introduce vcpu_adjust_access

2013-01-23 Thread Xiao Guangrong
Introduce it to split the pte_access adjustment code out of the large
set_spte() function

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   63 +---
 1 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index af8bcb2..43b7e0c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2324,25 +2324,18 @@ static int mmu_need_write_protect(struct kvm_vcpu 
*vcpu, gfn_t gfn,
return 0;
 }

-static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-   unsigned pte_access, int level,
-   gfn_t gfn, pfn_t pfn, bool speculative,
-   bool can_unsync, bool host_writable)
+/*
+ * Return -1 if a race condition is detected, 1 if @gfn need to be
+ * write-protected, otherwise 0 is returned.
+ */
+static int vcpu_adjust_access(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned *pte_access, int level, gfn_t gfn,
+ bool can_unsync, bool host_writable)
 {
-   u64 spte;
-   int ret = 0;
-
-   if (set_mmio_spte(sptep, gfn, pfn, pte_access))
-   return 0;
+   if (!host_writable)
+   *pte_access &= ~ACC_WRITE_MASK;

-   spte = PT_PRESENT_MASK;
-
-   if (host_writable)
-   spte |= SPTE_HOST_WRITEABLE;
-   else
-   pte_access &= ~ACC_WRITE_MASK;
-
-   if (pte_access & ACC_WRITE_MASK) {
+   if (*pte_access & ACC_WRITE_MASK) {
/*
 * Other vcpu creates new sp in the window between
 * mapping_level() and acquiring mmu-lock. We can
@@ -2351,7 +2344,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 */
if (level > PT_PAGE_TABLE_LEVEL &&
  has_wrprotected_page(vcpu->kvm, gfn, level))
-   goto done;
+   return -1;

/*
 * Optimization: for pte sync, if spte was writable the hash
@@ -2360,17 +2353,41 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 * Same reasoning can be applied to dirty page accounting.
 */
if (!can_unsync && is_writable_pte(*sptep))
-   goto out_access_adjust;
+   return 0;

if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
 __func__, gfn);
-   ret = 1;
-   pte_access &= ~ACC_WRITE_MASK;
+
+   *pte_access &= ~ACC_WRITE_MASK;
+   return 1;
}
}

-out_access_adjust:
+   return 0;
+}
+
+static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+   unsigned pte_access, int level,
+   gfn_t gfn, pfn_t pfn, bool speculative,
+   bool can_unsync, bool host_writable)
+{
+   u64 spte;
+   int ret;
+
+   if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+   return 0;
+
+   ret = vcpu_adjust_access(vcpu, sptep, &pte_access, level, gfn,
+can_unsync, host_writable);
+   if (ret < 0)
+   return 0;
+
+   spte = PT_PRESENT_MASK;
+
+   if (host_writable)
+   spte |= SPTE_HOST_WRITEABLE;
+
if (!speculative)
spte |= shadow_accessed_mask;

@@ -2399,7 +2416,7 @@ out_access_adjust:

if (mmu_spte_update(sptep, spte))
kvm_flush_remote_tlbs(vcpu->kvm);
-done:
+
return ret;
 }

-- 
1.7.7.6



[PATCH v2 04/12] KVM: MMU: simplify set_spte

2013-01-23 Thread Xiao Guangrong
Logically, the function can be divided into two parts: one adjusts
pte_access, the other sets the spte according to that pte_access. This
makes the code more readable

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   51 ++-
 1 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a999755..af8bcb2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2336,32 +2336,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
return 0;

spte = PT_PRESENT_MASK;
-   if (!speculative)
-   spte |= shadow_accessed_mask;
-
-   if (pte_access & ACC_EXEC_MASK)
-   spte |= shadow_x_mask;
-   else
-   spte |= shadow_nx_mask;
-
-   if (pte_access & ACC_USER_MASK)
-   spte |= shadow_user_mask;
-
-   if (level > PT_PAGE_TABLE_LEVEL)
-   spte |= PT_PAGE_SIZE_MASK;
-   if (tdp_enabled)
-   spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
-   kvm_is_mmio_pfn(pfn));

if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
else
pte_access &= ~ACC_WRITE_MASK;

-   spte |= (u64)pfn << PAGE_SHIFT;
-
if (pte_access & ACC_WRITE_MASK) {
-
/*
 * Other vcpu creates new sp in the window between
 * mapping_level() and acquiring mmu-lock. We can
@@ -2369,11 +2350,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 * be fixed if guest refault.
 */
if (level > PT_PAGE_TABLE_LEVEL &&
-   has_wrprotected_page(vcpu->kvm, gfn, level))
+ has_wrprotected_page(vcpu->kvm, gfn, level))
goto done;

-   spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
-
/*
 * Optimization: for pte sync, if spte was writable the hash
 * lookup is unnecessary (and expensive). Write protection
@@ -2381,21 +2360,43 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 * Same reasoning can be applied to dirty page accounting.
 */
if (!can_unsync && is_writable_pte(*sptep))
-   goto set_pte;
+   goto out_access_adjust;

if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
 __func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
-   spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
}
}

+out_access_adjust:
+   if (!speculative)
+   spte |= shadow_accessed_mask;
+
+   if (pte_access & ACC_EXEC_MASK)
+   spte |= shadow_x_mask;
+   else
+   spte |= shadow_nx_mask;
+
+   if (pte_access & ACC_USER_MASK)
+   spte |= shadow_user_mask;
+
if (pte_access & ACC_WRITE_MASK)
+   spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+
+   if (level > PT_PAGE_TABLE_LEVEL)
+   spte |= PT_PAGE_SIZE_MASK;
+
+   if (tdp_enabled)
+   spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
+   kvm_is_mmio_pfn(pfn));
+
+   spte |= (u64)pfn << PAGE_SHIFT;
+
+   if (is_writable_pte(spte))
mark_page_dirty(vcpu->kvm, gfn);

-set_pte:
if (mmu_spte_update(sptep, spte))
kvm_flush_remote_tlbs(vcpu->kvm);
 done:
-- 
1.7.7.6



[PATCH v2 03/12] KVM: MMU: simplify mmu_set_spte

2013-01-23 Thread Xiao Guangrong
To detect spte remapping, we can simply check whether the spte already
points to the pfn, even if the spte is not the last-level spte, because
a middle spte points to a kernel pfn which cannot be mapped to
userspace

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   28 +++-
 1 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8dca8af..a999755 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2407,33 +2407,20 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,
 int write_fault, int *emulate, int level, gfn_t gfn,
 pfn_t pfn, bool speculative, bool host_writable)
 {
-   int was_rmapped = 0;
-   int rmap_count;
+   bool was_rmapped = false;

pgprintk("%s: spte %llx access %x write_fault %d gfn %llx\n",
 __func__, *sptep, pt_access,
 write_fault, gfn);

if (is_rmap_spte(*sptep)) {
-   /*
-* If we overwrite a PTE page pointer with a 2MB PMD, unlink
-* the parent of the now unreachable PTE.
-*/
-   if (level > PT_PAGE_TABLE_LEVEL &&
-   !is_large_pte(*sptep)) {
-   struct kvm_mmu_page *child;
-   u64 pte = *sptep;
+   if (pfn != spte_to_pfn(*sptep)) {
+   struct kvm_mmu_page *sp = page_header(__pa(sptep));

-   child = page_header(pte & PT64_BASE_ADDR_MASK);
-   drop_parent_pte(child, sptep);
-   kvm_flush_remote_tlbs(vcpu->kvm);
-   } else if (pfn != spte_to_pfn(*sptep)) {
-   pgprintk("hfn old %llx new %llx\n",
-spte_to_pfn(*sptep), pfn);
-   drop_spte(vcpu->kvm, sptep);
-   kvm_flush_remote_tlbs(vcpu->kvm);
+   if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+   kvm_flush_remote_tlbs(vcpu->kvm);
} else
-   was_rmapped = 1;
+   was_rmapped = true;
}

if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
@@ -2456,8 +2443,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,

if (is_shadow_present_pte(*sptep)) {
if (!was_rmapped) {
-   rmap_count = rmap_add(vcpu, sptep, gfn);
-   if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+   if (rmap_add(vcpu, sptep, gfn) > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
}
}
-- 
1.7.7.6



[PATCH v2 02/12] KVM: MMU: cleanup mapping-level

2013-01-23 Thread Xiao Guangrong
Use min() to clean up mapping_level()

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0f90269..8dca8af 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -831,8 +831,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t 
large_gfn)
if (host_level == PT_PAGE_TABLE_LEVEL)
return host_level;

-   max_level = kvm_x86_ops->get_lpage_level() < host_level ?
-   kvm_x86_ops->get_lpage_level() : host_level;
+   max_level = min(kvm_x86_ops->get_lpage_level(), host_level);

for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
-- 
1.7.7.6



[PATCH v2 01/12] KVM: MMU: lazily drop large spte

2013-01-23 Thread Xiao Guangrong
Do not drop a large spte until it can be replaced by small pages, so
that the guest can happily keep reading memory through it

The idea is from Avi:
| As I mentioned before, write-protecting a large spte is a good idea,
| since it moves some work from protect-time to fault-time, so it reduces
| jitter.  This removes the need for the return value.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   21 ++---
 1 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9f628f7..0f90269 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1105,7 +1105,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 
*sptep)

 /*
  * Write-protect on the specified @sptep, @pt_protect indicates whether
- * spte writ-protection is caused by protecting shadow page table.
+ * spte write-protection is caused by protecting shadow page table.
  * @flush indicates whether tlb need be flushed.
  *
  * Note: write protection is difference between drity logging and spte
@@ -1114,31 +1114,23 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 
*sptep)
  *   its dirty bitmap is properly set.
  * - for spte protection, the spte can be writable only after unsync-ing
  *   shadow page.
- *
- * Return true if the spte is dropped.
  */
-static bool
+static void
 spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
 {
u64 spte = *sptep;

if (!is_writable_pte(spte) &&
  !(pt_protect && spte_is_locklessly_modifiable(spte)))
-   return false;
+   return;

rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);

-   if (__drop_large_spte(kvm, sptep)) {
-   *flush |= true;
-   return true;
-   }
-
if (pt_protect)
spte &= ~SPTE_MMU_WRITEABLE;
spte = spte & ~PT_WRITABLE_MASK;

*flush |= mmu_spte_update(sptep, spte);
-   return false;
 }

 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
@@ -1150,11 +1142,8 @@ static bool __rmap_write_protect(struct kvm *kvm, 
unsigned long *rmapp,

for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
-   if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
-   sptep = rmap_get_first(*rmapp, &iter);
-   continue;
-   }

+   spte_write_protect(kvm, sptep, &flush, pt_protect);
sptep = rmap_get_next(&iter);
}

@@ -2611,6 +2600,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,
break;
}

+   drop_large_spte(vcpu, iterator.sptep);
+
if (!is_shadow_present_pte(*iterator.sptep)) {
u64 base_addr = iterator.addr;

-- 
1.7.7.6



Re: [PATCH v2 8/8] KVM: x86 emulator: convert a few freestanding emulations to fastop

2013-01-23 Thread Avi Kivity
On Wed, Jan 23, 2013 at 2:21 AM, Marcelo Tosatti  wrote:
> Missing signed off by.

Signed-off-by: Avi Kivity 


Re: [QEMU PATCH v5 0/3] virtio-net: fix of ctrl commands

2013-01-23 Thread Stefan Hajnoczi
On Tue, Jan 22, 2013 at 11:44:43PM +0800, Amos Kong wrote:
> Currently virtio-net code relies on the descriptor layout;
> this patchset removed the assumptions and introduced a control
> command to set the MAC address. The last patch is a trivial renaming.
> 
> V2: check guest's iov_len
> V3: fix of migration compatibility
> make mac field in config space read-only when new feature is acked
> V4: add fix of descriptor layout assumptions, trivial rename
> V5: fix endianness after iov_to_buf copy
> 
> Amos Kong (2):
>   virtio-net: introduce a new macaddr control
>   virtio-net: rename ctrl rx commands
> 
> Michael S. Tsirkin (1):
>   virtio-net: remove layout assumptions for ctrl vq
> 
>  hw/pc_piix.c|4 ++
>  hw/virtio-net.c |  142 +-
>  hw/virtio-net.h |   26 +++
>  3 files changed, 108 insertions(+), 64 deletions(-)
> 

Reviewed-by: Stefan Hajnoczi 