Re: [PATCH] KVM: VMX: enable nested virtualization by default

2018-10-17 Thread Wincy Van
On Wed, Oct 17, 2018 at 6:57 AM Paolo Bonzini  wrote:
>
> With live migration support and finally a good solution for CR2/DR6
> exception payloads, nested VMX should finally be ready for having a stable
> userspace ABI.  The results of syzkaller fuzzing are not perfect but not
> horrible either (and might be partially due to running on GCE, so that
> effectively we're testing three-level nesting on a fork of upstream KVM!).
> Enabling it by default seems like a nice way to conclude the 4.20
> pull request. :)
>
> Unfortunately, enabling nested SVM in 2009 was a bit premature.  However,
> until live migration support is in place we can reasonably expect that
> it does not offer much in terms of ABI guarantees.  Therefore we are
> still in time to break things and conform as much as possible to the
> interface used for VMX.
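With this change applied, nesting can still be turned off at module load
time; a minimal usage example (assuming the Intel module name, kvm_intel):

    modprobe kvm_intel nested=0
    cat /sys/module/kvm_intel/parameters/nested   # prints N

Note the parameter is S_IRUGO (read-only at runtime), so it can only be
changed at load time.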
>
> Suggested-by: Jim Mattson 
> Suggested-by: Liran Alon 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/kvm/vmx.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index e665aa7167cf..89fc2a744d7f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -107,7 +107,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
>   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
>   * use VMX instructions.
>   */
> -static bool __read_mostly nested = 0;
> +static bool __read_mostly nested = 1;
>  module_param(nested, bool, S_IRUGO);
>
>  static u64 __read_mostly host_xss;
> --
> 2.17.1
>


bravo!   :-)

Thanks,
Wincy


Re: [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC

2016-08-09 Thread Wincy Van
On Tue, Aug 9, 2016 at 5:32 PM, Yang Zhang  wrote:
> On 2016/8/9 2:16, Radim Krčmář wrote:
>>
>> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
>> accesses.  In some configurations of VMX controls, the guest can even
>> directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
>> APIC ACCESSES.
>>
>> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
>> To do so, L1 would first trick KVM to disable all possible interceptions
>> by enabling APICv features and then would turn those features off;
>> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
>> not intercept previously enabled MSRs even though they were not safe
>> with the new configuration.
>>
>> Correctly re-enabling interceptions is not enough as a second bug would
>> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
>> VMCSs, so L1 could trigger a race to get the desired combination of msr
>> bitmap and VMX controls.
>>
>> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
>> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
>> have to intercept everything anyway.
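
A minimal sketch of the per-vCPU allocation the fix implies (hypothetical
placement at vCPU creation; the names follow the diff below):

	/* Sketch: one merged-intercept bitmap per L1 vCPU, assuming it is
	 * allocated when the vCPU is created and freed with the vCPU. */
	if (nested) {
		vmx->nested.msr_bitmap =
			(unsigned long *)__get_free_page(GFP_KERNEL);
		if (!vmx->nested.msr_bitmap)
			goto free_vmcs;	/* hypothetical error label */
		/* intercept everything until a safe merge clears bits */
		memset(vmx->nested.msr_bitmap, 0xff, PAGE_SIZE);
	}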
>>
>> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
>> Reported-by: Jim Mattson 
>> Suggested-by: Wincy Van 
>> Signed-off-by: Radim Krčmář 
>> ---
>>  arch/x86/kvm/vmx.c | 107 ++---
>>  1 file changed, 44 insertions(+), 63 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index a45d8580f91e..c66ac2c70d22 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -435,6 +435,8 @@ struct nested_vmx {
>> bool pi_pending;
>> u16 posted_intr_nv;
>>
>> +   unsigned long *msr_bitmap;
>> +
>> struct hrtimer preemption_timer;
>> bool preemption_timer_expired;
>>
>> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>>  static unsigned long *vmx_msr_bitmap_longmode;
>>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>> -static unsigned long *vmx_msr_bitmap_nested;
>>  static unsigned long *vmx_vmread_bitmap;
>>  static unsigned long *vmx_vmwrite_bitmap;
>>
>> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>> unsigned long *msr_bitmap;
>>
>> if (is_guest_mode(vcpu))
>> -   msr_bitmap = vmx_msr_bitmap_nested;
>> +   msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>> else if (cpu_has_secondary_exec_ctrls() &&
>>  (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>>   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
>> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>> if (!vmx_msr_bitmap_longmode_x2apic)
>> goto out4;
>>
>> -   if (nested) {
>> -   vmx_msr_bitmap_nested =
>> -   (unsigned long *)__get_free_page(GFP_KERNEL);
>> -   if (!vmx_msr_bitmap_nested)
>> -   goto out5;
>> -   }
>> -
>> vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>> if (!vmx_vmread_bitmap)
>> goto out6;
>> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>>
>> memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>> memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
>> -   if (nested)
>> -   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>>
>> if (setup_vmcs_config(&vmcs_config) < 0) {
>> r = -EIO;
>> @@ -6529,9 +6521,6 @@ out8:
>>  out7:
>> free_page((unsigned long)vmx_vmread_bitmap);
>>  out6:
>> -   if (nested)
>> -   free_page((unsigned long)vmx_msr_bitmap_nested);
>> -out5:
>> free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>>  out4:
>> free_page((unsigned long)vmx_msr_bitmap_longmode);
>> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>> free_page((unsigned long)vmx_io_bitmap_a);
>> free_page((unsigned long)vmx_vmwrite_bitmap);
>> free_page((unsigned long)vmx_vmread_bitmap);
>> -   if (nested)
>> -   free_page((unsigned long)vmx_msr_bitmap_nested);
>>
>> free_kvm_area();

Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages

2016-04-26 Thread Wincy Van
On Mon, Apr 25, 2016 at 9:30 PM, Andres Lagar-Cavilla
 wrote:
>>
>> We are using kvm + tmpfs to do qemu live upgrading; how does google
>> use this memory model?
>> I think our purpose in using tmpfs may be the same.
>
> Nothing out of the ordinary. Guest memory is an mmap of a tmpfs fd.
> Huge tmpfs naturally gives us a great guest performance boost.
> MAP_SHARED, and having guest memory persist beyond any one given process,
> are what drive us to use tmpfs.
>
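
A minimal userspace illustration of that model (not Google's or QEMU's
actual code; assumes guest RAM backed by a file on a tmpfs mount such as
/dev/shm):

	#include <fcntl.h>
	#include <stddef.h>
	#include <sys/mman.h>
	#include <unistd.h>

	/* Back guest RAM with a tmpfs file mapped MAP_SHARED so the memory
	 * outlives the process and can be remapped by another one. */
	static void *alloc_guest_ram(const char *path, size_t size)
	{
		void *mem;
		int fd = open(path, O_RDWR | O_CREAT, 0600);

		if (fd < 0)
			return NULL;
		if (ftruncate(fd, size) < 0) {
			close(fd);
			return NULL;
		}
		mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   MAP_SHARED, fd, 0);
		close(fd);	/* the mapping keeps the pages alive */
		return mem == MAP_FAILED ? NULL : mem;
	}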

OK. We are also using mmap.

Besides Google's kvm userspace (as far as I know it is not qemu), does Google
have another userspace tool that needs to access guest memory, and is that
why Google uses tmpfs?

If so, what does that other userspace tool do?

Thanks,
Wincy


Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages

2016-04-23 Thread Wincy Van
On Wed, Apr 20, 2016 at 1:07 AM, Andres Lagar-Cavilla
 wrote:
> Andrea, we provide the, ahem, adjustments to
> transparent_hugepage_adjust. Rest assured we aggressively use mmu
> notifiers with no further changes required.
>
> As in: zero changes have been required in the lifetime (years) of
> kvm+huge tmpfs at Google, other than mod'ing
> transparent_hugepage_adjust.

We are using kvm + tmpfs to do qemu live upgrading; how does google
use this memory model?
I think our purpose in using tmpfs may be the same.

And huge tmpfs is a really good improvement for that.

>
> As noted by Paolo, the additions to transparent_hugepage_adjust could
> be lifted outside of kvm (into shmem.c? maybe) for any consumer of
> huge tmpfs with mmu notifiers.
>

Thanks,
Wincy


Re: [PATCH v2] KVM: vmx: Set msr bitmap correctly if vcpu is in guest mode

2015-03-04 Thread Wincy Van
On Wed, Mar 4, 2015 at 11:58 PM, Bandan Das  wrote:
> Hi Wincy,
>
> Wincy Van  writes:
>
>> In commit 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap"),
>> we set MSR_BITMAP in prepare_vmcs02 if we should use the hardware bitmap. This
>> is not enough, since the field will be modified by the following vmx_set_efer.
>>
>> Fix this by setting vmx_msr_bitmap_nested in vmx_set_msr_bitmap if the vcpu is
>> in guest mode.
>>
>> Signed-off-by: Wincy Van 
>> ---
>>  arch/x86/kvm/vmx.c |   11 +++
>>  1 files changed, 7 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index f7b20b4..10a481b 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>>  {
>>   unsigned long *msr_bitmap;
>>
>> - if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
>> + if (is_guest_mode(vcpu))
>> + msr_bitmap = vmx_msr_bitmap_nested;
> Do you think this should be (is_guest_mode(vcpu) &&
>                              (exec_control & CPU_BASED_USE_MSR_BITMAPS)) ?
>

We don't need to do that, because if we don't use the hardware, we will
disable the hardware msr bitmap:

if (cpu_has_vmx_msr_bitmap() &&
exec_control & CPU_BASED_USE_MSR_BITMAPS) {
nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
/* MSR_BITMAP will be set by following vmx_set_efer. */
} else
exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;

Then, the hardware will ignore MSR_BITMAP.
Setting MSR_BITMAP without enabling it may be unnecessary, but it does no
harm, and it reduces the code complexity of vmx_set_msr_bitmap.


Thanks,
Wincy


[PATCH v2] KVM: vmx: Set msr bitmap correctly if vcpu is in guest mode

2015-03-03 Thread Wincy Van
In commit 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap"),
we set MSR_BITMAP in prepare_vmcs02 if we should use the hardware bitmap. This
is not enough, since the field will be modified by the following vmx_set_efer.

Fix this by setting vmx_msr_bitmap_nested in vmx_set_msr_bitmap if the vcpu is
in guest mode.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   11 +++
 1 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b20b4..10a481b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 {
unsigned long *msr_bitmap;
 
-   if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+   if (is_guest_mode(vcpu))
+   msr_bitmap = vmx_msr_bitmap_nested;
+   else if (irqchip_in_kernel(vcpu->kvm) &&
+   apic_x2apic_mode(vcpu->arch.apic)) {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
else
>> @@ -9218,9 +9221,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
}
 
if (cpu_has_vmx_msr_bitmap() &&
-   exec_control & CPU_BASED_USE_MSR_BITMAPS &&
-   nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
-   vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
+   exec_control & CPU_BASED_USE_MSR_BITMAPS) {
+   nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
+   /* MSR_BITMAP will be set by following vmx_set_efer. */
} else
exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
 
-- 
1.7.1



Re: [PATCH] KVM: vmx: Set msr bitmap correctly if vcpu is in guest mode

2015-03-03 Thread Wincy Van
On Wed, Mar 4, 2015 at 12:04 PM, Bandan Das  wrote:
> Wincy Van  writes:
>
>> On Wed, Mar 4, 2015 at 1:39 AM, Bandan Das  wrote:
>>> Wincy Van  writes:
>>>
>>>> In commit 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR 
>>>> bitmap"),
>>>> we are setting MSR_BITMAP in prepare_vmcs02 if we should use hardware. This
>>>> is not enough since the field will be modified by following vmx_set_efer.
>>>>
>>>> Fix this by setting vmx_msr_bitmap_nested in vmx_set_msr_bitmap if vcpu is
>>>> in guest mode.
>>>>
>>>> Signed-off-by: Wincy Van 
>>>> ---
>>>>  arch/x86/kvm/vmx.c |5 -
>>>>  1 files changed, 4 insertions(+), 1 deletions(-)
>>>>
>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>> index f7b20b4..f6e3457 100644
>>>> --- a/arch/x86/kvm/vmx.c
>>>> +++ b/arch/x86/kvm/vmx.c
>>>> @@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>>>>  {
>>>>   unsigned long *msr_bitmap;
>>>>
>>>> - if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
>>>> + if (is_guest_mode(vcpu))
>>>> + msr_bitmap = vmx_msr_bitmap_nested;
>>>> + else if (irqchip_in_kernel(vcpu->kvm) &&
>>>> + apic_x2apic_mode(vcpu->arch.apic)) {
>>>
>>> So, we end up writing the MSR_BITMAP field twice - once when we
>>> call nested_vmx_merge_msr_bitmap() and another here. Why don't we just
>>> remove the former since prepare_vmcs02 will call vmx_set_efer anyway ?
>>>
>>
>> Yes, setting MSR_BITMAP twice is redundant, but we cannot rely on
>> vmx_set_efer to set that field; it is not vmx_set_efer's duty.
> It's not. The change is in vmx_set_msr_bitmap() and vmx_set_efer
> happens to call it. The call to the merge function may very well
> belong to prepare_vmcs02() but the write to the vmcs field could
> belong to vmx_set_msr_bitmap.
>
>> Consider someone who wants to change how L2's efer is loaded:
>> he may be confused by this. We should reduce the degree of
>> code coupling.
> Fine, just add a comment in prepare_vmcs02 that that's where the field
> is being set. No point in doing the same thing twice.
>

Yes, agreed. I'll send v2 ASAP.

Thanks,
Wincy


Re: [PATCH] KVM: vmx: Set msr bitmap correctly if vcpu is in guest mode

2015-03-03 Thread Wincy Van
On Wed, Mar 4, 2015 at 1:39 AM, Bandan Das  wrote:
> Wincy Van  writes:
>
>> In commit 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap"),
>> we set MSR_BITMAP in prepare_vmcs02 if we should use the hardware bitmap. This
>> is not enough, since the field will be modified by the following vmx_set_efer.
>>
>> Fix this by setting vmx_msr_bitmap_nested in vmx_set_msr_bitmap if the vcpu is
>> in guest mode.
>>
>> Signed-off-by: Wincy Van 
>> ---
>>  arch/x86/kvm/vmx.c |5 -
>>  1 files changed, 4 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index f7b20b4..f6e3457 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>>  {
>>   unsigned long *msr_bitmap;
>>
>> - if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
>> + if (is_guest_mode(vcpu))
>> + msr_bitmap = vmx_msr_bitmap_nested;
>> + else if (irqchip_in_kernel(vcpu->kvm) &&
>> + apic_x2apic_mode(vcpu->arch.apic)) {
>
> So, we end up writing the MSR_BITMAP field twice - once when we
> call nested_vmx_merge_msr_bitmap() and another here. Why don't we just
> remove the former since prepare_vmcs02 will call vmx_set_efer anyway ?
>

Yes, setting MSR_BITMAP twice is redundant, but we cannot rely on
vmx_set_efer to set that field; it is not vmx_set_efer's duty.
Consider someone who wants to change how L2's efer is loaded:
he may be confused by this. We should reduce the degree of
code coupling.

Thanks,
Wincy


[PATCH] KVM: vmx: Set msr bitmap correctly if vcpu is in guest mode

2015-03-02 Thread Wincy Van
In commit 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap"),
we set MSR_BITMAP in prepare_vmcs02 if we should use the hardware bitmap. This
is not enough, since the field will be modified by the following vmx_set_efer.

Fix this by setting vmx_msr_bitmap_nested in vmx_set_msr_bitmap if the vcpu is
in guest mode.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |5 -
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b20b4..f6e3457 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 {
unsigned long *msr_bitmap;
 
-   if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+   if (is_guest_mode(vcpu))
+   msr_bitmap = vmx_msr_bitmap_nested;
+   else if (irqchip_in_kernel(vcpu->kvm) &&
+   apic_x2apic_mode(vcpu->arch.apic)) {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
else
-- 
1.7.1



Re: [PATCH resend v5 6/6] KVM: nVMX: Enable nested posted interrupt processing

2015-02-27 Thread Wincy Van
On Sun, Feb 15, 2015 at 2:27 PM, Yong Wang  wrote:
>
> Wincy, our QA found regressions with this patch that 64bit L2 linux guest
> fails to boot up when running nested kvm on kvm.
>
> Environment:
> 
> Host OS (ia32/ia32e/IA64):ia32e
> Guest OS (ia32/ia32e/IA64):ia32e
> Guest OS Type (Linux/Windows):Linux
> kvm.git Commit:6557bada461afeaa920a189fae2cff7c8fdce39f
> qemu.kvm Commit:5c697ae74170d43928cb185f5ac1a9058adcae0b
> Host Kernel Version:3.19.0-rc3
> Hardware:Ivytown_EP, Haswell_EP
>
>
> Bug detailed description:
> --
> create a 64bit linux guest as an L2 guest; the guest fails to boot up
>
> note:
> 1. create a 32bit linux guest as L2 guest, the guest boots up fine.
> 2. create a 64bit windows guest as L2 guest, the guest boots up fine.
> 3. this should be a kernel bug:
> kvm   + qemu = result
> 6557bada  + 5c697ae7 = bad
> 8fff5e37  + 5c697ae7 = good
>
> Reproduce steps:
> 
> 1 create L1 guest:
> qemu-system-x86_64 -enable-kvm -m 8G -smp 4 -net nic,macaddr=00:12:31:34:51:31 -net tap,script=/etc/kvm/qemu-ifup nested-kvm.qcow -cpu host
>
> 2. create L2 guest
> qemu-system-x86_64 -enable-kvm -m 2G -smp 2 -net none rhel6u5.qcow
>
> Current result:
> 
> create a 64bit linux guest as an L2 guest; the guest fails to boot up
>
> Expected result:
> 
> create a 64bit linux guest as an L2 guest; the guest boots up fine
>
> Please take a look.
>

Yong, according to the logs, I found that L1 may have disabled x2apic,
and the MSR_BITMAP field will be modified by the following vmx_set_efer in
prepare_vmcs02.
So I think we can fix this issue by:

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b20b4..f6e3457 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 {
unsigned long *msr_bitmap;

-   if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+   if (is_guest_mode(vcpu))
+   msr_bitmap = vmx_msr_bitmap_nested;
+   else if (irqchip_in_kernel(vcpu->kvm) &&
+   apic_x2apic_mode(vcpu->arch.apic)) {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
else


Thanks,
Wincy


Re: [PATCH resend v5 6/6] KVM: nVMX: Enable nested posted interrupt processing

2015-02-15 Thread Wincy Van
On Sun, Feb 15, 2015 at 2:27 PM, Yong Wang  wrote:
> On Tue, Feb 03, 2015 at 11:58:17PM +0800, Wincy Van wrote:
>> If a vcpu has an interrupt pending in vmx non-root mode, we
>> kick that vcpu to inject the interrupt in a timely manner. With posted
>> interrupt processing, the kick is not needed, and
>> interrupts are fully taken care of by hardware.
>>
>> In nested vmx, this feature avoids many more vmexits
>> than non-nested vmx.
>>
>> This patch uses L0's POSTED_INTR_NV to avoid unexpected
>> interrupts if L1's vector is different from L0's. If the vcpu
>> is in hardware non-root mode, we use a physical IPI to
>> deliver posted interrupts; otherwise we complete the
>> posted interrupt manually at nested vm-entry.
>>
>> Signed-off-by: Wincy Van 
>> ---
>>  arch/x86/kvm/lapic.c |   13 +++-
>>  arch/x86/kvm/lapic.h |1 +
>>  arch/x86/kvm/vmx.c   |  151 -
>>  3 files changed, 158 insertions(+), 7 deletions(-)
>>
>
> Wincy, our QA found regressions with this patch that 64bit L2 linux guest
> fails to boot up when running nested kvm on kvm.
>
> Environment:
> 
> Host OS (ia32/ia32e/IA64):ia32e
> Guest OS (ia32/ia32e/IA64):ia32e
> Guest OS Type (Linux/Windows):Linux
> kvm.git Commit:6557bada461afeaa920a189fae2cff7c8fdce39f
> qemu.kvm Commit:5c697ae74170d43928cb185f5ac1a9058adcae0b
> Host Kernel Version:3.19.0-rc3
> Hardware:Ivytown_EP, Haswell_EP
>
>
> Bug detailed description:
> --
> create a 64bit linux guest as an L2 guest; the guest fails to boot up
>
> note:
> 1. create a 32bit linux guest as L2 guest, the guest boots up fine.
> 2. create a 64bit windows guest as L2 guest, the guest boots up fine.
> 3. this should be a kernel bug:
> kvm   + qemu = result
> 6557bada  + 5c697ae7 = bad
> 8fff5e37  + 5c697ae7 = good
>
> Reproduce steps:
> 
> 1 create L1 guest:
> qemu-system-x86_64 -enable-kvm -m 8G -smp 4 -net nic,macaddr=00:12:31:34:51:31 -net tap,script=/etc/kvm/qemu-ifup nested-kvm.qcow -cpu host
>
> 2. create L2 guest
> qemu-system-x86_64 -enable-kvm -m 2G -smp 2 -net none rhel6u5.qcow
>
> Current result:
> 
> create a 64bit linux guest as an L2 guest; the guest fails to boot up
>
> Expected result:
> 
> create a 64bit linux guest as an L2 guest; the guest boots up fine
>
> Please take a look.
>

Certainly, will do.

Thanks,
Wincy


Re: [PATCH resend v5 0/6] KVM: nVMX: Enable nested apicv support

2015-02-03 Thread Wincy Van
On Wed, Feb 4, 2015 at 12:15 AM, Paolo Bonzini  wrote:
>
>
> On 03/02/2015 16:46, Wincy Van wrote:
>> v1 ---> v2:
>>   Use spin lock to ensure vmcs12 is safe when doing nested
>>   posted interrupt delivery.
>>
>> v2 ---> v3:
>>   1. Add a new field in nested_vmx to avoid the spin lock in v2.
>>   2. Drop send eoi to L1 when doing nested interrupt delivery.
>>   3. Use hardware MSR bitmap to enable nested virtualize x2apic
>>  mode.
>>
>> v3 ---> v4:
>>   1. Optimize nested msr bitmap merging.
>>   2. Allocate nested msr bitmap only when nested == 1.
>>   3. Inline the nested vmx control checking functions.
>>
>> v4 ---> v5:
>>   1. Move EXIT_REASON_APIC_WRITE to the apic register
>>  virtualization patch.
>>   2. Accomplish nested posted interrupts manually if
>>  they are not recognized by hardware.
>
> Thanks, will apply soon to kvm/queue.
>

Thanks a lot for your review, Paolo and Yang!

Wincy


[PATCH resend v5 3/6] KVM: nVMX: Make nested control MSRs per-cpu

2015-02-03 Thread Wincy Van
To enable nested apicv support, we need per-cpu vmx
control MSRs:
  1. If the in-kernel irqchip is enabled, we can enable nested
     posted interrupts, and we should set the posted intr bit in
     nested_vmx_pinbased_ctls_high.
  2. If the in-kernel irqchip is disabled, we cannot enable
     nested posted interrupts, and the posted intr bit in
     nested_vmx_pinbased_ctls_high will be cleared.

Since in-kernel irqchip settings may differ between VMs,
different nested control MSRs are needed, as sketched below.
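
A hedged sketch of the irqchip-dependent adjustment this describes
(assuming the vmx_vm_has_apicv() helper referenced later in this series;
not the literal patch code):

	/* Sketch: advertise the posted-interrupt bit in the per-vCPU MSR
	 * value only when the in-kernel irqchip (and thus apicv) is there. */
	if (vmx_vm_has_apicv(vmx->vcpu.kvm))
		vmx->nested.nested_vmx_pinbased_ctls_high |=
			PIN_BASED_POSTED_INTR;
	else
		vmx->nested.nested_vmx_pinbased_ctls_high &=
			~PIN_BASED_POSTED_INTR;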

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  215 +++-
 1 files changed, 129 insertions(+), 86 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 784a552..40f7951 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -411,6 +411,23 @@ struct nested_vmx {
 
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+
+   u32 nested_vmx_procbased_ctls_low;
+   u32 nested_vmx_procbased_ctls_high;
+   u32 nested_vmx_true_procbased_ctls_low;
+   u32 nested_vmx_secondary_ctls_low;
+   u32 nested_vmx_secondary_ctls_high;
+   u32 nested_vmx_pinbased_ctls_low;
+   u32 nested_vmx_pinbased_ctls_high;
+   u32 nested_vmx_exit_ctls_low;
+   u32 nested_vmx_exit_ctls_high;
+   u32 nested_vmx_true_exit_ctls_low;
+   u32 nested_vmx_entry_ctls_low;
+   u32 nested_vmx_entry_ctls_high;
+   u32 nested_vmx_true_entry_ctls_low;
+   u32 nested_vmx_misc_low;
+   u32 nested_vmx_misc_high;
+   u32 nested_vmx_ept_caps;
 };
 
 #define POSTED_INTR_ON  0
@@ -2297,20 +2314,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
  * if the corresponding bit in the (32-bit) control field *must* be on, and a
  * bit in the high half is on if the corresponding bit in the control field
  * may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
  */
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_true_procbased_ctls_low;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_true_exit_ctls_low;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static u32 nested_vmx_true_entry_ctls_low;
-static u32 nested_vmx_misc_low, nested_vmx_misc_high;
-static u32 nested_vmx_ept_caps;
-static __init void nested_vmx_setup_ctls_msrs(void)
+static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 {
/*
 * Note that as a general rule, the high half of the MSRs (bits in
@@ -2329,57 +2334,71 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 
/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
-   nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
-   nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-   PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
-   nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_pinbased_ctls_low,
+   vmx->nested.nested_vmx_pinbased_ctls_high);
+   vmx->nested.nested_vmx_pinbased_ctls_low |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_pinbased_ctls_high &=
+   PIN_BASED_EXT_INTR_MASK |
+   PIN_BASED_NMI_EXITING |
+   PIN_BASED_VIRTUAL_NMIS;
+   vmx->nested.nested_vmx_pinbased_ctls_high |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
 
/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
-   nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
-   nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_exit_ctls_low,
+   vmx->nested.nested_vmx_exit_ctls_high);
+   vmx->nested.nested_vmx_exit_ctls_low =
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 
-   nested_vmx_exit_ctls_high &=
+   vmx->nested.nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_exit_ctls_high |=
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
 
if (vmx_mpx_supported())
-   nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+  

[PATCH resend v5 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-02-03 Thread Wincy Van
Currently, if L1 enables MSR_BITMAP, we emulate this feature:
all of L2's MSR accesses are intercepted by L0. Since many features
like virtualize x2apic mode have complicated logic that is
difficult for us to emulate, we should use the hardware bitmap
and merge L0's and L1's bitmaps.

This patch introduces nested_vmx_merge_msr_bitmap for future use.
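
The merge semantics are worth spelling out: a set bit means "intercept",
so L2 may access an MSR directly only when both L0 and L1 allow it. A
conceptual sketch (illustration only; the series actually starts from an
all-set bitmap and clears bits selectively):

	/* Set bit == intercept, so a per-word OR keeps the stricter
	 * choice of the two hypervisors. */
	static void merge_msr_bitmap_sketch(unsigned long *dst,
					    const unsigned long *l0,
					    const unsigned long *l1)
	{
		int i;

		for (i = 0; i < PAGE_SIZE / sizeof(unsigned long); i++)
			dst[i] = l0[i] | l1[i];
	}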

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   77 ---
 1 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81152a0..4108676 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -805,6 +805,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_nested;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
 
@@ -5828,13 +5829,21 @@ static __init int hardware_setup(void)
(unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode_x2apic)
goto out4;
+
+   if (nested) {
+   vmx_msr_bitmap_nested =
+   (unsigned long *)__get_free_page(GFP_KERNEL);
+   if (!vmx_msr_bitmap_nested)
+   goto out5;
+   }
+
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
-   goto out5;
+   goto out6;
 
vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmwrite_bitmap)
-   goto out6;
+   goto out7;
 
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5850,10 +5859,12 @@ static __init int hardware_setup(void)
 
memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+   if (nested)
+   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
 
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
-   goto out7;
+   goto out8;
}
 
if (boot_cpu_has(X86_FEATURE_NX))
@@ -5974,10 +5985,13 @@ static __init int hardware_setup(void)
 
return alloc_kvm_area();
 
-out7:
+out8:
free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
+out7:
free_page((unsigned long)vmx_vmread_bitmap);
+out6:
+   if (nested)
+   free_page((unsigned long)vmx_msr_bitmap_nested);
 out5:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
@@ -6004,6 +6018,8 @@ static __exit void hardware_unsetup(void)
free_page((unsigned long)vmx_io_bitmap_a);
free_page((unsigned long)vmx_vmwrite_bitmap);
free_page((unsigned long)vmx_vmread_bitmap);
+   if (nested)
+   free_page((unsigned long)vmx_msr_bitmap_nested);
 
free_kvm_area();
 }
@@ -8468,6 +8484,38 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
  ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }
 
+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+   struct vmcs12 *vmcs12)
+{
+   int maxphyaddr;
+   u64 addr;
+
+   if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+   return 0;
+
+   if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
+   WARN_ON(1);
+   return -EINVAL;
+   }
+   maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+   if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
+  ((addr + PAGE_SIZE) >> maxphyaddr))
+   return -EINVAL;
+
+   return 0;
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   return false;
+}
+
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
   unsigned long count_field,
   unsigned long addr_field,
@@ -8800,11 +8848,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
}
 
+   if (cpu_has_vmx_msr_bitmap() &&
+   exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+   nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
+   vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
+   } else
+   exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+
/*
-* Merging of IO and MSR bitmaps not currently supported.
+* Merging of IO bitmap not currently supported.
  

[PATCH resend v5 0/6] KVM: nVMX: Enable nested apicv support

2015-02-03 Thread Wincy Van
v1 ---> v2:
  Use spin lock to ensure vmcs12 is safe when doing nested
  posted interrupt delivery.

v2 ---> v3:
  1. Add a new field in nested_vmx to avoid the spin lock in v2.
  2. Drop send eoi to L1 when doing nested interrupt delivery.
  3. Use hardware MSR bitmap to enable nested virtualize x2apic
 mode.

v3 ---> v4:
  1. Optimize nested msr bitmap merging.
  2. Allocate nested msr bitmap only when nested == 1.
  3. Inline the nested vmx control checking functions.

v4 ---> v5:
  1. Move EXIT_REASON_APIC_WRITE to the apic register
 virtualization patch.
  2. Accomplish nested posted interrupts manually if
 they are not recognized by hardware.

Wincy Van (6):
  KVM: nVMX: Use hardware MSR bitmap
  KVM: nVMX: Enable nested virtualize x2apic mode
  KVM: nVMX: Make nested control MSRs per-cpu
  KVM: nVMX: Enable nested apic register virtualization
  KVM: nVMX: Enable nested virtual interrupt delivery
  KVM: nVMX: Enable nested posted interrupt processing

 arch/x86/kvm/lapic.c |   13 +-
 arch/x86/kvm/lapic.h |1 +
 arch/x86/kvm/vmx.c   |  647 ++
 3 files changed, 557 insertions(+), 104 deletions(-)



[PATCH resend v5 6/6] KVM: nVMX: Enable nested posted interrupt processing

2015-02-03 Thread Wincy Van
If a vcpu has an interrupt pending in vmx non-root mode, we
kick that vcpu to inject the interrupt in a timely manner. With posted
interrupt processing, the kick is not needed, and
interrupts are fully taken care of by hardware.

In nested vmx, this feature avoids many more vmexits
than non-nested vmx.

This patch uses L0's POSTED_INTR_NV to avoid unexpected
interrupts if L1's vector is different from L0's. If the vcpu
is in hardware non-root mode, we use a physical IPI to
deliver posted interrupts; otherwise we complete the
posted interrupt manually at nested vm-entry.
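
A hedged sketch of the delivery decision described above (simplified; the
real function in this patch also updates the PIR in L1's posted-interrupt
descriptor and handles races):

	/* Sketch: if the vCPU is running nested and the vector matches the
	 * notification vector L1 programmed, mark the posted interrupt as
	 * pending; the kick either reaches the vCPU via the physical IPI
	 * path or forces completion at the next nested vm-entry. */
	static int deliver_nested_posted_interrupt_sketch(struct kvm_vcpu *vcpu,
							  int vector)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);

		if (is_guest_mode(vcpu) &&
		    vector == vmx->nested.posted_intr_nv) {
			vmx->nested.pi_pending = true;
			kvm_make_request(KVM_REQ_EVENT, vcpu);
			kvm_vcpu_kick(vcpu);
			return 0;
		}
		return -1;	/* fall back to the non-nested path */
	}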

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/lapic.c |   13 +++-
 arch/x86/kvm/lapic.h |1 +
 arch/x86/kvm/vmx.c   |  151 -
 3 files changed, 158 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 555956c..2f83384 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -325,17 +325,24 @@ static u8 count_vectors(void *bitmap)
return count;
 }
 
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+void __kvm_apic_update_irr(u32 *pir, void *regs)
 {
u32 i, pir_val;
-   struct kvm_lapic *apic = vcpu->arch.apic;
 
for (i = 0; i <= 7; i++) {
pir_val = xchg(&pir[i], 0);
if (pir_val)
-   *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+   *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
}
 }
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
+
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+   struct kvm_lapic *apic = vcpu->arch.apic;
+
+   __kvm_apic_update_irr(pir, apic->regs);
+}
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
 static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c1ef25c..0bc6c65 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -57,6 +57,7 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 
 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
+void __kvm_apic_update_irr(u32 *pir, void *regs);
 void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
unsigned long *dest_map);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e22e159..45c3437 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -218,6 +218,7 @@ struct __packed vmcs12 {
u64 tsc_offset;
u64 virtual_apic_page_addr;
u64 apic_access_addr;
+   u64 posted_intr_desc_addr;
u64 ept_pointer;
u64 eoi_exit_bitmap0;
u64 eoi_exit_bitmap1;
@@ -337,6 +338,7 @@ struct __packed vmcs12 {
u32 vmx_preemption_timer_value;
u32 padding32[7]; /* room for future expansion */
u16 virtual_processor_id;
+   u16 posted_intr_nv;
u16 guest_es_selector;
u16 guest_cs_selector;
u16 guest_ss_selector;
@@ -409,6 +411,10 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct page *virtual_apic_page;
+   struct page *pi_desc_page;
+   struct pi_desc *pi_desc;
+   bool pi_pending;
+   u16 posted_intr_nv;
u64 msr_ia32_feature_control;
 
struct hrtimer preemption_timer;
@@ -628,6 +634,7 @@ static int max_shadow_read_write_fields =
 
 static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+   FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -653,6 +660,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(TSC_OFFSET, tsc_offset),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+   FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
FIELD64(EPT_POINTER, ept_pointer),
FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
@@ -805,6 +813,7 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1162,6 +1171,11 @@ static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 }
 
+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+   return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
 static inline bool

[PATCH resend v5 5/6] KVM: nVMX: Enable nested virtual interrupt delivery

2015-02-03 Thread Wincy Van
With virtual interrupt delivery, the hardware frees KVM from
the low-efficiency interrupt injection path. In nested vmx, it is
an important feature: we can avoid many more nested vmexits,
especially in high-throughput scenarios.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   67 ++-
 1 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5d20518..e22e159 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -219,6 +219,10 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 ept_pointer;
+   u64 eoi_exit_bitmap0;
+   u64 eoi_exit_bitmap1;
+   u64 eoi_exit_bitmap2;
+   u64 eoi_exit_bitmap3;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
@@ -341,6 +345,7 @@ struct __packed vmcs12 {
u16 guest_gs_selector;
u16 guest_ldtr_selector;
u16 guest_tr_selector;
+   u16 guest_intr_status;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -631,6 +636,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+   FIELD(GUEST_INTR_STATUS, guest_intr_status),
FIELD(HOST_ES_SELECTOR, host_es_selector),
FIELD(HOST_CS_SELECTOR, host_cs_selector),
FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -648,6 +654,10 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(EPT_POINTER, ept_pointer),
+   FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+   FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+   FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+   FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1147,6 +1157,11 @@ static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
 }
 
+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2441,6 +2456,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
+   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;
 
@@ -7454,7 +7470,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_APIC_WRITE:
-   /* apic_write should exit unconditionally. */
+   case EXIT_REASON_EOI_INDUCED:
+   /* apic_write and eoi_induced should exit unconditionally. */
return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
@@ -8646,6 +8663,19 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_R | MSR_TYPE_W);
+   if (nested_cpu_has_vid(vmcs12)) {
+   /* EOI and self-IPI are allowed */
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_EOI >> 4),
+   MSR_TYPE_W);
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+   MSR_TYPE_W);
+   }
} else {
/*
 * Enable reading intercept of all the x2apic
@@ -8663,6 +8693,14 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_W);
+   __vmx_enable_intercept_for_msr(
+

[PATCH resend v5 4/6] KVM: nVMX: Enable nested apic register virtualization

2015-02-03 Thread Wincy Van
We can reduce the cost of apic register virtualization with this feature;
it is also a requirement for virtual interrupt delivery and posted
interrupt processing.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   39 +++
 1 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 40f7951..5d20518 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1142,6 +1142,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 }
 
+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2435,6 +2440,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+   SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;
 
@@ -7447,6 +7453,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+   case EXIT_REASON_APIC_WRITE:
+   /* apic_write should exit unconditionally. */
+   return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
 * L0 always deals with the EPT violation. If nested EPT is
@@ -8606,6 +8615,7 @@ static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
+   int msr;
struct page *page;
unsigned long *msr_bitmap;
 
@@ -8625,16 +8635,35 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
}
 
if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+   if (nested_cpu_has_apic_reg_virt(vmcs12))
+   for (msr = 0x800; msr <= 0x8ff; msr++)
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   msr, MSR_TYPE_R);
/* TPR is allowed */
nested_vmx_disable_intercept_for_msr(msr_bitmap,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_R | MSR_TYPE_W);
-   } else
+   } else {
+   /*
+* Enable reading intercept of all the x2apic
+* MSRs. We should not rely on vmcs12 to do any
+* optimizations here, it may have been modified
+* by L1.
+*/
+   for (msr = 0x800; msr <= 0x8ff; msr++)
+   __vmx_enable_intercept_for_msr(
+   vmx_msr_bitmap_nested,
+   msr,
+   MSR_TYPE_R);
+
__vmx_enable_intercept_for_msr(
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-   MSR_TYPE_R | MSR_TYPE_W);
+   MSR_TYPE_W);
+   }
kunmap(page);
nested_release_page_clean(page);
 
@@ -8644,14 +8673,16 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
-   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+   !nested_cpu_has_apic_reg_virt(vmcs12))
return 0;
 
/*
 * If virtualize x2apic mode is enabled,
 * virtualize apic access must be disabled.
 */
-   if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+   nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
return -EINVAL;
 
/* tpr shadow is needed by all apicv features. */
-- 
1.7.1



[PATCH resend v5 2/6] KVM: nVMX: Enable nested virtualize x2apic mode

2015-02-03 Thread Wincy Van
When L2 is using x2apic, we can use virtualize x2apic mode to
gain higher performance, especially in the apicv case.

This patch also introduces nested_vmx_check_apicv_controls
for the nested apicv patches.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  114 +++-
 1 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4108676..784a552 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1120,6 +1120,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 
*vmcs12)
vmx_xsaves_supported();
 }
 
+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2407,6 +2412,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;
 
@@ -4168,6 +4174,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
}
 }
 
+/*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ */
+static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
+  unsigned long *msr_bitmap_nested,
+  u32 msr, int type)
+{
+   int f = sizeof(unsigned long);
+
+   if (!cpu_has_vmx_msr_bitmap()) {
+   WARN_ON(1);
+   return;
+   }
+
+   /*
+* See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+* have the write-low and read-high bitmap offsets the wrong way round.
+* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+*/
+   if (msr <= 0x1fff) {
+   if (type & MSR_TYPE_R &&
+  !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+   /* read-low */
+   __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
+
+   if (type & MSR_TYPE_W &&
+  !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+   /* write-low */
+   __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
+
+   } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+   msr &= 0x1fff;
+   if (type & MSR_TYPE_R &&
+  !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+   /* read-high */
+   __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+
+   if (type & MSR_TYPE_W &&
+  !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+   /* write-high */
+   __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+
+   }
+}
+
 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
if (!longmode_only)
@@ -8513,7 +8565,59 @@ static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
-   return false;
+   struct page *page;
+   unsigned long *msr_bitmap;
+
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   return false;
+
+   page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+   if (!page) {
+   WARN_ON(1);
+   return false;
+   }
+   msr_bitmap = (unsigned long *)kmap(page);
+   if (!msr_bitmap) {
+   nested_release_page_clean(page);
+   WARN_ON(1);
+   return false;
+   }
+
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+   /* TPR is allowed */
+   nested_vmx_disable_intercept_for_msr(msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+   MSR_TYPE_R | MSR_TYPE_W);
+   } else
+   __vmx_enable_intercept_for_msr(
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+   MSR_TYPE_R | MSR_TYPE_W);
+   kunmap(page);
+   nested_release_page_clean(page);
+
+   return true;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   if (!nested_cpu_has_virt

Re: [PATCH v5 6/6] KVM: nVMX: Enable nested posted interrupt processing

2015-02-03 Thread Wincy Van
On Tue, Feb 3, 2015 at 10:02 PM, Paolo Bonzini  wrote:
>
>
> On 03/02/2015 10:17, Wincy Van wrote:
>> +static int vmx_accomp_nested_posted_interrupt(struct kvm_vcpu *vcpu)
>
> Replace accomp with complete.

Will do.

>
>> +{
>> +   struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +   int max_irr;
>> +   void *vapic_page;
>> +   u16 status;
>> +
>> +   if (vmx->nested.posted_intr_nv != -1 &&
>
> Testing posted_intr_nv is not necessary.
>

Indeed.

>> +   vmx->nested.pi_desc &&
>> +   vmx->nested.accomp_pir) {
>
> Replace accomp_pir with pi_pending.
>

Will do.

> The patch has corrupted spaces and tabs like the others, too.
>

I'm sorry; may I resend them via qq.com with the same version (v5) and
make the changes above?


Thanks,
Wincy


Re: [PATCH v5 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-02-03 Thread Wincy Van
On Tue, Feb 3, 2015 at 9:57 PM, Paolo Bonzini  wrote:
>
>
> On 03/02/2015 10:11, Wincy Van wrote:
>> @@ -8468,6 +8484,38 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
>>   ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
>>  }
>
> Hi Wincy, the patch is corrupted here.  Also it has spaces instead of
> tabs earlier.  Please send it with git-send-email.
>

Ouch, my bad: gmail cut the lines that are over 78 chars.
Since gmail is blocked by the GFW (VPN does not work, and I only have a
web proxy), I was sending the patches to my gmail via git-send-email from
another mail account and forwarding them to the community.

May I use another mail account (qq.com) to send these patches and use gmail to
talk with you?

Thanks,
Wincy


[PATCH v5 6/6] KVM: nVMX: Enable nested posted interrupt processing

2015-02-03 Thread Wincy Van
If a vcpu has an interrupt pending in vmx non-root mode, we
kick that vcpu to inject the interrupt in a timely manner. With posted
interrupt processing, the kick is not needed, and
interrupts are fully taken care of by hardware.

In nested vmx, this feature avoids many more vmexits
than non-nested vmx.

This patch uses L0's POSTED_INTR_NV to avoid unexpected
interrupts if L1's vector is different from L0's. If the vcpu
is in hardware non-root mode, we use a physical IPI to
deliver posted interrupts; otherwise we complete the
posted interrupt manually at nested vm-entry.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/lapic.c |   13 +++-
 arch/x86/kvm/lapic.h |1 +
 arch/x86/kvm/vmx.c   |  152 -
 3 files changed, 159 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 555956c..2f83384 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -325,17 +325,24 @@ static u8 count_vectors(void *bitmap)
return count;
 }

-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+void __kvm_apic_update_irr(u32 *pir, void *regs)
 {
u32 i, pir_val;
-   struct kvm_lapic *apic = vcpu->arch.apic;

for (i = 0; i <= 7; i++) {
pir_val = xchg(&pir[i], 0);
if (pir_val)
-   *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+   *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
}
 }
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
+
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+   struct kvm_lapic *apic = vcpu->arch.apic;
+
+   __kvm_apic_update_irr(pir, apic->regs);
+}
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);

 static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c1ef25c..0bc6c65 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -57,6 +57,7 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);

 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
+void __kvm_apic_update_irr(u32 *pir, void *regs);
 void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
unsigned long *dest_map);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e22e159..dc3bf94 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -218,6 +218,7 @@ struct __packed vmcs12 {
u64 tsc_offset;
u64 virtual_apic_page_addr;
u64 apic_access_addr;
+   u64 posted_intr_desc_addr;
u64 ept_pointer;
u64 eoi_exit_bitmap0;
u64 eoi_exit_bitmap1;
@@ -337,6 +338,7 @@ struct __packed vmcs12 {
u32 vmx_preemption_timer_value;
u32 padding32[7]; /* room for future expansion */
u16 virtual_processor_id;
+   u16 posted_intr_nv;
u16 guest_es_selector;
u16 guest_cs_selector;
u16 guest_ss_selector;
@@ -409,6 +411,10 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct page *virtual_apic_page;
+   struct page *pi_desc_page;
+   struct pi_desc *pi_desc;
+   bool accomp_pir;
+   u16 posted_intr_nv;
u64 msr_ia32_feature_control;

struct hrtimer preemption_timer;
@@ -628,6 +634,7 @@ static int max_shadow_read_write_fields =

 static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+   FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -653,6 +660,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(TSC_OFFSET, tsc_offset),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+   FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
FIELD64(EPT_POINTER, ept_pointer),
FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
@@ -805,6 +813,7 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1162,6 +1171,11 @@ static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 }

+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+   return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
 static inline bool is_excepti

[PATCH v5 5/6] KVM: nVMX: Enable nested virtual interrupt delivery

2015-02-03 Thread Wincy Van
With virtual interrupt delivery, the hardware frees KVM from
the low-efficiency interrupt injection path. In nested vmx, it is
an important feature: we can avoid many more nested vmexits,
especially in high-throughput scenarios.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   67 ++-
 1 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5d20518..e22e159 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -219,6 +219,10 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 ept_pointer;
+   u64 eoi_exit_bitmap0;
+   u64 eoi_exit_bitmap1;
+   u64 eoi_exit_bitmap2;
+   u64 eoi_exit_bitmap3;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
@@ -341,6 +345,7 @@ struct __packed vmcs12 {
u16 guest_gs_selector;
u16 guest_ldtr_selector;
u16 guest_tr_selector;
+   u16 guest_intr_status;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -631,6 +636,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+   FIELD(GUEST_INTR_STATUS, guest_intr_status),
FIELD(HOST_ES_SELECTOR, host_es_selector),
FIELD(HOST_CS_SELECTOR, host_cs_selector),
FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -648,6 +654,10 @@ static const unsigned short
vmcs_field_to_offset_table[] = {
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(EPT_POINTER, ept_pointer),
+   FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+   FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+   FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+   FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1147,6 +1157,11 @@ static inline bool
nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
 }

+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2441,6 +2456,7 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
+   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -7454,7 +7470,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_APIC_WRITE:
-   /* apic_write should exit unconditionally. */
+   case EXIT_REASON_EOI_INDUCED:
+   /* apic_write and eoi_induced should exit unconditionally. */
return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
@@ -8646,6 +8663,19 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_R | MSR_TYPE_W);
+   if (nested_cpu_has_vid(vmcs12)) {
+   /* EOI and self-IPI are allowed */
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_EOI >> 4),
+   MSR_TYPE_W);
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+   MSR_TYPE_W);
+   }
} else {
/*
 * Enable reading intercept of all the x2apic
@@ -8663,6 +8693,14 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_W);
+   __vmx_enable_intercept_for_msr(
+

[PATCH v5 4/6] KVM: nVMX: Enable nested apic register virtualization

2015-02-03 Thread Wincy Van
We can reduce the cost of APIC register virtualization with this
feature; it is also a requirement for virtual interrupt delivery
and posted interrupt processing.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   39 +++
 1 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 40f7951..5d20518 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1142,6 +1142,11 @@ static inline bool
nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 }

+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2435,6 +2440,7 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+   SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -7447,6 +7453,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+   case EXIT_REASON_APIC_WRITE:
+   /* apic_write should exit unconditionally. */
+   return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
 * L0 always deals with the EPT violation. If nested EPT is
@@ -8606,6 +8615,7 @@ static int
nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
+   int msr;
struct page *page;
unsigned long *msr_bitmap;

@@ -8625,16 +8635,35 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
}

if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+   if (nested_cpu_has_apic_reg_virt(vmcs12))
+   for (msr = 0x800; msr <= 0x8ff; msr++)
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   msr, MSR_TYPE_R);
/* TPR is allowed */
nested_vmx_disable_intercept_for_msr(msr_bitmap,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_R | MSR_TYPE_W);
-   } else
+   } else {
+   /*
+* Enable reading intercept of all the x2apic
+* MSRs. We should not rely on vmcs12 to do any
+* optimizations here, it may have been modified
+* by L1.
+*/
+   for (msr = 0x800; msr <= 0x8ff; msr++)
+   __vmx_enable_intercept_for_msr(
+   vmx_msr_bitmap_nested,
+   msr,
+   MSR_TYPE_R);
+
__vmx_enable_intercept_for_msr(
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-   MSR_TYPE_R | MSR_TYPE_W);
+   MSR_TYPE_W);
+   }
kunmap(page);
nested_release_page_clean(page);

@@ -8644,14 +8673,16 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
-   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+   !nested_cpu_has_apic_reg_virt(vmcs12))
return 0;

/*
 * If virtualize x2apic mode is enabled,
 * virtualize apic access must be disabled.
 */
-   if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+   nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
return -EINVAL;

/* tpr shadow is needed by all apicv features. */
--
1.7.1


[PATCH v5 3/6] KVM: nVMX: Make nested control MSRs per-cpu

2015-02-03 Thread Wincy Van
To enable nested apicv support, we need per-cpu VMX
control MSRs:
  1. If the in-kernel irqchip is enabled, we can enable nested
     posted interrupts, so we should set the posted-interrupt
     bit in nested_vmx_pinbased_ctls_high.
  2. If the in-kernel irqchip is disabled, we cannot enable
     nested posted interrupts, and the posted-interrupt bit
     in nested_vmx_pinbased_ctls_high will be cleared.

Since the in-kernel irqchip setting may differ between VMs,
per-cpu nested control MSRs are needed.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  215 +++-
 1 files changed, 129 insertions(+), 86 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 784a552..40f7951 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -411,6 +411,23 @@ struct nested_vmx {

/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+
+   u32 nested_vmx_procbased_ctls_low;
+   u32 nested_vmx_procbased_ctls_high;
+   u32 nested_vmx_true_procbased_ctls_low;
+   u32 nested_vmx_secondary_ctls_low;
+   u32 nested_vmx_secondary_ctls_high;
+   u32 nested_vmx_pinbased_ctls_low;
+   u32 nested_vmx_pinbased_ctls_high;
+   u32 nested_vmx_exit_ctls_low;
+   u32 nested_vmx_exit_ctls_high;
+   u32 nested_vmx_true_exit_ctls_low;
+   u32 nested_vmx_entry_ctls_low;
+   u32 nested_vmx_entry_ctls_high;
+   u32 nested_vmx_true_entry_ctls_low;
+   u32 nested_vmx_misc_low;
+   u32 nested_vmx_misc_high;
+   u32 nested_vmx_ept_caps;
 };

 #define POSTED_INTR_ON  0
@@ -2297,20 +2314,8 @@ static inline bool nested_vmx_allowed(struct
kvm_vcpu *vcpu)
  * if the corresponding bit in the (32-bit) control field *must* be on, and a
  * bit in the high half is on if the corresponding bit in the control field
  * may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
  */
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_true_procbased_ctls_low;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_true_exit_ctls_low;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static u32 nested_vmx_true_entry_ctls_low;
-static u32 nested_vmx_misc_low, nested_vmx_misc_high;
-static u32 nested_vmx_ept_caps;
-static __init void nested_vmx_setup_ctls_msrs(void)
+static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 {
/*
 * Note that as a general rule, the high half of the MSRs (bits in
@@ -2329,57 +2334,71 @@ static __init void nested_vmx_setup_ctls_msrs(void)

/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
-   nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
-   nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-   PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
-   nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_pinbased_ctls_low,
+   vmx->nested.nested_vmx_pinbased_ctls_high);
+   vmx->nested.nested_vmx_pinbased_ctls_low |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_pinbased_ctls_high &=
+   PIN_BASED_EXT_INTR_MASK |
+   PIN_BASED_NMI_EXITING |
+   PIN_BASED_VIRTUAL_NMIS;
+   vmx->nested.nested_vmx_pinbased_ctls_high |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;

/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
-   nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
-   nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_exit_ctls_low,
+   vmx->nested.nested_vmx_exit_ctls_high);
+   vmx->nested.nested_vmx_exit_ctls_low =
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

-   nested_vmx_exit_ctls_high &=
+   vmx->nested.nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_exit_ctls_high |=
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

if (vmx_mpx_supported())
-   nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+  

[PATCH v5 2/6] KVM: nVMX: Enable nested virtualize x2apic mode

2015-02-03 Thread Wincy Van
When L2 is using x2apic, we can use virtualize x2apic mode to
gain higher performance, especially in the APICv case.

This patch also introduces nested_vmx_check_apicv_controls
for the nested apicv patches.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  114 +++-
 1 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4108676..784a552 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1120,6 +1120,11 @@ static inline bool nested_cpu_has_xsaves(struct
vmcs12 *vmcs12)
vmx_xsaves_supported();
 }

+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2407,6 +2412,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -4168,6 +4174,52 @@ static void
__vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
}
 }

+/*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ */
+static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
+  unsigned long *msr_bitmap_nested,
+  u32 msr, int type)
+{
+   int f = sizeof(unsigned long);
+
+   if (!cpu_has_vmx_msr_bitmap()) {
+   WARN_ON(1);
+   return;
+   }
+
+   /*
+* See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+* have the write-low and read-high bitmap offsets the wrong way round.
+* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+*/
+   if (msr <= 0x1fff) {
+   if (type & MSR_TYPE_R &&
+  !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+   /* read-low */
+   __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
+
+   if (type & MSR_TYPE_W &&
+  !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+   /* write-low */
+   __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
+
+   } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+   msr &= 0x1fff;
+   if (type & MSR_TYPE_R &&
+  !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+   /* read-high */
+   __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+
+   if (type & MSR_TYPE_W &&
+  !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+   /* write-high */
+   __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+
+   }
+}
+
 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
if (!longmode_only)
@@ -8513,7 +8565,59 @@ static int
nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
-   return false;
+   struct page *page;
+   unsigned long *msr_bitmap;
+
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   return false;
+
+   page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+   if (!page) {
+   WARN_ON(1);
+   return false;
+   }
+   msr_bitmap = (unsigned long *)kmap(page);
+   if (!msr_bitmap) {
+   nested_release_page_clean(page);
+   WARN_ON(1);
+   return false;
+   }
+
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+   /* TPR is allowed */
+   nested_vmx_disable_intercept_for_msr(msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+   MSR_TYPE_R | MSR_TYPE_W);
+   } else
+   __vmx_enable_intercept_for_msr(
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+   MSR_TYPE_R | MSR_TYPE_W);
+   kunmap(page);
+   nested_release_page_clean(page);
+
+   return true;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   if (!nested_cpu_has_virt_x2ap

[PATCH v5 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-02-03 Thread Wincy Van
Currently, if L1 enables MSR_BITMAP, we emulate this feature: all
of L2's MSR accesses are intercepted by L0. Since many features,
such as virtualize x2apic mode, have complicated logic that is
difficult for us to emulate, we should use the hardware bitmap
and merge L0's and L1's bitmaps.

This patch introduces nested_vmx_merge_msr_bitmap for future use.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   77 ---
 1 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81152a0..4108676 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -805,6 +805,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_nested;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;

@@ -5828,13 +5829,21 @@ static __init int hardware_setup(void)
(unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode_x2apic)
goto out4;
+
+   if (nested) {
+   vmx_msr_bitmap_nested =
+   (unsigned long *)__get_free_page(GFP_KERNEL);
+   if (!vmx_msr_bitmap_nested)
+   goto out5;
+   }
+
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
-   goto out5;
+   goto out6;

vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmwrite_bitmap)
-   goto out6;
+   goto out7;

memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5850,10 +5859,12 @@ static __init int hardware_setup(void)

memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+   if (nested)
+   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);

if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
-   goto out7;
+   goto out8;
}

if (boot_cpu_has(X86_FEATURE_NX))
@@ -5974,10 +5985,13 @@ static __init int hardware_setup(void)

return alloc_kvm_area();

-out7:
+out8:
free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
+out7:
free_page((unsigned long)vmx_vmread_bitmap);
+out6:
+   if (nested)
+   free_page((unsigned long)vmx_msr_bitmap_nested);
 out5:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
@@ -6004,6 +6018,8 @@ static __exit void hardware_unsetup(void)
free_page((unsigned long)vmx_io_bitmap_a);
free_page((unsigned long)vmx_vmwrite_bitmap);
free_page((unsigned long)vmx_vmread_bitmap);
+   if (nested)
+   free_page((unsigned long)vmx_msr_bitmap_nested);

free_kvm_area();
 }
@@ -8468,6 +8484,38 @@ static void vmx_start_preemption_timer(struct
kvm_vcpu *vcpu)
  ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }

+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+   struct vmcs12 *vmcs12)
+{
+   int maxphyaddr;
+   u64 addr;
+
+   if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+   return 0;
+
+   if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
+   WARN_ON(1);
+   return -EINVAL;
+   }
+   maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+   if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
+  ((addr + PAGE_SIZE) >> maxphyaddr))
+   return -EINVAL;
+
+   return 0;
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   return false;
+}
+
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
   unsigned long count_field,
   unsigned long addr_field,
@@ -8800,11 +8848,17 @@ static void prepare_vmcs02(struct kvm_vcpu
*vcpu, struct vmcs12 *vmcs12)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
}

+   if (cpu_has_vmx_msr_bitmap() &&
+   exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+   nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
+   vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
+   } else
+   exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+
/*
-* Merging of IO and MSR bitmaps not currently supported.
+* Merging of IO bitmap not currently supported.
 * Rather, exit every time.

[PATCH v5 0/6] KVM: nVMX: Enable nested apicv support

2015-02-03 Thread Wincy Van
v1 ---> v2:
  Use spin lock to ensure vmcs12 is safe when doing nested
  posted interrupt delivery.

v2 ---> v3:
  1. Add a new field in nested_vmx to avoid the spin lock in v2.
  2. Drop send eoi to L1 when doing nested interrupt delivery.
  3. Use hardware MSR bitmap to enable nested virtualize x2apic
 mode.

v3 ---> v4:
  1. Optimize nested msr bitmap merging.
  2. Allocate nested msr bitmap only when nested == 1.
  3. Inline the nested vmx control checking functions.

v4 ---> v5:
  1. Move EXIT_REASON_APIC_WRITE to the apic register
 virtualization patch.
  2. Accomplish nested posted interrupts manually if
 they are not recognized by hardware.

Wincy Van (6):
  KVM: nVMX: Use hardware MSR bitmap
  KVM: nVMX: Enable nested virtualize x2apic mode
  KVM: nVMX: Make nested control MSRs per-cpu
  KVM: nVMX: Enable nested apic register virtualization
  KVM: nVMX: Enable nested virtual interrupt delivery
  KVM: nVMX: Enable nested posted interrupt processing

 arch/x86/kvm/lapic.c |   13 +-
 arch/x86/kvm/lapic.h |1 +
 arch/x86/kvm/vmx.c   |  648 ++
 3 files changed, 558 insertions(+), 104 deletions(-)


Re: [PATCH v4 6/6] KVM: nVMX: Enable nested posted interrupt processing

2015-02-02 Thread Wincy Van
On Tue, Feb 3, 2015 at 9:21 AM, Zhang, Yang Z  wrote:
> Paolo Bonzini wrote on 2015-02-03:
>>
>>
>> On 02/02/2015 16:33, Wincy Van wrote:
>>> static void vmx_accomp_nested_posted_intr(struct kvm_vcpu *vcpu) {
>>> struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>
>>> if (is_guest_mode(vcpu) &&
>>> vmx->nested.posted_intr_nv != -1 &&
>>> pi_test_on(vmx->nested.pi_desc))
>>> kvm_apic_set_irr(vcpu,
>>> vmx->nested.posted_intr_nv); }
>>> Then we will get an nested-vmexit in vmx_check_nested_events, that
>>> posted intr will be handled by L1 immediately.
>>> This mechanism will also emulate the hardware's behavior: If a
>>> posted intr was not accomplished by hardware, we will get an
>
> Actually, we cannot say "not accomplished by hardware". It's more like we
> didn't do the job well. See my answer below.
>

Yes, exactly.

>>> interrupt with POSTED_INTR_NV.
>>
>> Yes.
>
> This is not enough. From L1's point of view, L2 is in VMX non-root mode. So
> we should emulate the posted interrupt in L0 correctly, say:
> 1. clear the ON bit
> 2. ack the interrupt
> 3. sync the PIR to the vIRR
> 4. update RVI.
> Then let the hardware (virtual interrupt delivery) accomplish the
> interrupt injection.
>
> Forcing a vmexit is more like a trick. It's better to follow the hardware's
> behavior unless we cannot do it.
>

Yes, I will try again to do this.


Thanks,
Wincy


Re: [PATCH v4 6/6] KVM: nVMX: Enable nested posted interrupt processing

2015-02-02 Thread Wincy Van
On Mon, Feb 2, 2015 at 7:03 PM, Paolo Bonzini  wrote:
>
>
> On 28/01/2015 17:02, Wincy Van wrote:
>> +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
>> +   int vector)
>> +{
>> +   if (is_guest_mode(vcpu) &&
>> +   vector == to_vmx(vcpu)->nested.posted_intr_nv &&
>> +   vcpu->mode == IN_GUEST_MODE) {
>> +   /* the PIR and ON have been set by L1. */
>
> What happens if there is a L2->L0->L2 exit on the target VCPU, and the
> guest exits before apic->send_IPI_mask sends the IPI?
>
> The L1 hypervisor might "know" somehow that there cannot be a concurrent
> L2->L1->L2 exit, and not do the equivalent of KVM's
>

In the non-nested case, if a posted interrupt was not accomplished
by hardware, we sync the PIR to the IRR and set RVI to accomplish
it. The current implementation may delay some of the nested posted
interrupts for a short time (waiting for a nested vmexit).
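As a rough standalone sketch of what that non-nested sync amounts
to (the struct and function names are illustrative, not KVM's, and
the atomic xchg the real code uses is reduced to a plain
read-and-clear):

#include <stdint.h>

/* 256 vectors = 8 32-bit chunks for both the PIR and the IRR. */
#define APIC_VEC_CHUNKS 8

struct pi_sketch {
	uint32_t control;		/* bit 0: ON (outstanding notification) */
	uint32_t pir[APIC_VEC_CHUNKS];	/* posted-interrupt requests */
	uint32_t irr[APIC_VEC_CHUNKS];	/* interrupt request register */
};

/*
 * Move pending posted interrupts into the IRR. The highest vector
 * found is what RVI would be raised to; -1 means nothing was pending.
 */
static int pir_sync_to_irr(struct pi_sketch *s)
{
	int chunk, bit, max_vec = -1;

	s->control &= ~1u;	/* step 1: clear the ON bit */
	/* step 2, acking the notification vector, is omitted here */

	for (chunk = 0; chunk < APIC_VEC_CHUNKS; chunk++) {
		uint32_t val = s->pir[chunk];	/* real code: atomic xchg */

		s->pir[chunk] = 0;	/* step 3: sync PIR into the vIRR */
		s->irr[chunk] |= val;
		for (bit = 31; bit >= 0; bit--) {
			if (val & (1u << bit)) {
				if (chunk * 32 + bit > max_vec)
					max_vec = chunk * 32 + bit;
				break;
			}
		}
	}
	return max_vec;	/* step 4: what RVI would be raised to */
}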

> kvm_make_request(KVM_REQ_EVENT, vcpu);
>
> after it sets ON.
>
> So I think you have to do something like
>
> static bool vmx_is_nested_posted_interrupt(struct kvm_vcpu *vcpu,
>int vector)
> {
> return (is_guest_mode(vcpu) &&
> vector == to_vmx(vcpu)->nested.posted_intr_nv);
> }
>
> and in vmx_deliver_posted_interrupt:
>
> r = 0;
> if (!vmx_is_nested_posted_interrupt(vcpu, vector)) {
> if (pi_test_and_set_pir(vector, &vmx->pi_desc))
> return;
>
> r = pi_test_and_set_on(&vmx->pi_desc);
> }
> kvm_make_request(KVM_REQ_EVENT, vcpu);
> #ifdef CONFIG_SMP
> if (!r && (vcpu->mode == IN_GUEST_MODE))
> apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> POSTED_INTR_VECTOR);
> else
> #endif
> kvm_vcpu_kick(vcpu);
>
>
> What do you think?
>

I think there is a way to avoid that delay, but it may hurt
performance: when doing nested posted interrupt delivery, we can
set a request bit:

	if (is_guest_mode(vcpu) &&
	    vector == to_vmx(vcpu)->nested.posted_intr_nv &&
	    vcpu->mode == IN_GUEST_MODE) {
		/* the PIR and ON have been set by L1. */
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
				POSTED_INTR_VECTOR);
+		kvm_make_request(KVM_REQ_ACCOMP_POSTED_INTR, vcpu);
		return 0;
	}

If a posted interrupt was not accomplished by hardware, we can
check that bit before checking KVM_REQ_EVENT, and if it is set,
we can do:

static void vmx_accomp_nested_posted_intr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (is_guest_mode(vcpu) &&
	    vmx->nested.posted_intr_nv != -1 &&
	    pi_test_on(vmx->nested.pi_desc))
		kvm_apic_set_irr(vcpu,
				 vmx->nested.posted_intr_nv);
}

Then we will get a nested vmexit in vmx_check_nested_events, and
that posted interrupt will be handled by L1 immediately.
This mechanism also emulates the hardware's behavior: if a posted
interrupt was not accomplished by hardware, we will get an
interrupt with POSTED_INTR_NV.

Would this be better?

Thanks,
Wincy


Re: [PATCH v4 0/6] KVM: nVMX: Enable nested apicv support

2015-01-28 Thread Wincy Van
On Thu, Jan 29, 2015 at 11:17 AM, Zhang, Yang Z  wrote:
> Wincy Van wrote on 2015-01-28:
>> v1 ---> v2:
>>   Use spin lock to ensure vmcs12 is safe when doing nested
>>   posted interrupt delivery.
>> v2 ---> v3:
>>   1. Add a new field in nested_vmx to avoid the spin lock in v2.
>>   2. Drop send eoi to L1 when doing nested interrupt delivery.
>>   3. Use hardware MSR bitmap to enable nested virtualize x2apic
>>  mode.
>> v3 ---> v4:
>>   1. Optimize nested msr bitmap merging.
>>   2. Allocate nested msr bitmap only when nested == 1.
>>   3. Inline the nested vmx control checking functions.
>
> This version looks good to me. Only one minor comment: the
> EXIT_REASON_APIC_WRITE vmexit is introduced by apic register
> virtualization, not virtual interrupt delivery, so it's better to add it in
> the 4th patch, not the 5th. (If there are no other comments, I guess Paolo
> can help do it when applying.)
>
> Reviewed-by: Yang Zhang 
>

Yes, thank you for pointing it out ; )
Paolo, what's your opinion?


Thanks,
Wincy


Re: [PATCH v4 2/6] KVM: nVMX: Enable nested virtualize x2apic mode

2015-01-28 Thread Wincy Van
On Thu, Jan 29, 2015 at 10:54 AM, Zhang, Yang Z  wrote:
>> -8646,7 +8750,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
>> struct vmcs12 *vmcs12)
>> else
>> vmcs_write64(APIC_ACCESS_ADDR,
>>
>> page_to_phys(vmx->nested.apic_access_page));
>> -   } else if
>> (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
>> +   } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
>> +
>> + (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
>> exec_control |=
>
> You don't load L2's apic_page in your patch correctly when x2apic mode is 
> used. Here is the right change for prepare_vmcs02()(maybe other place also 
> need change too):
>
> @@ -8585,7 +8585,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
> struct vmcs12 *vmcs12)
> CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
> exec_control |= vmcs12->secondary_vm_exec_control;
>
> -   if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
> +   if (exec_control & (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> +   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
> /*
>  * If translation failed, no matter: This feature asks
>  * to exit when accessing the given address, and if it
> @@ -8594,7 +8595,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
> struct vmcs12 *vmcs12)
>  */
> if (!vmx->nested.apic_access_page)
> exec_control &=
> - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> + ~ (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> +SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
> else
> vmcs_write64(APIC_ACCESS_ADDR,
>   page_to_phys(vmx->nested.apic_access_page));
>

I think we don't need to do this: if L1 enables x2apic mode, we
have already checked that vmcs12's
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES bit is 0. "exec_control |=
vmcs12->secondary_vm_exec_control;" merges L1's settings,
including x2apic mode. The special case is
vm_need_virtualize_apic_accesses: if it returns true,
nested_cpu_has_virt_x2apic_mode will prevent us from setting the
apic access bit.
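
As a self-contained sketch of the guarantee relied on here
(mirroring the check nested_vmx_check_apicv_controls makes in the
series; the helper name and the use of the SDM's secondary-control
bit positions are my own):

#include <stdbool.h>
#include <stdint.h>

#define SEC_VIRTUALIZE_APIC_ACCESSES	(1u << 0)	/* SDM bit 0 */
#define SEC_VIRTUALIZE_X2APIC_MODE	(1u << 4)	/* SDM bit 4 */

/*
 * Virtualize x2apic mode and virtualize APIC accesses must not be
 * enabled together, so merging L1's secondary controls with
 * "exec_control |= vmcs12->secondary_vm_exec_control;" can never
 * end up with both bits set.
 */
static bool apicv_controls_consistent(uint32_t secondary_ctls)
{
	return !((secondary_ctls & SEC_VIRTUALIZE_X2APIC_MODE) &&
		 (secondary_ctls & SEC_VIRTUALIZE_APIC_ACCESSES));
}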


Thanks,
Wincy


Re: [PATCH v3 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-28 Thread Wincy Van
On Wed, Jan 28, 2015 at 4:05 PM, Zhang, Yang Z  wrote:
>> @@ -8344,7 +8394,68 @@ static int
>> nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,  static
>> inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>struct vmcs12
>> *vmcs12)  {
>> -   return false;
>> +   struct page *page;
>> +   unsigned long *msr_bitmap;
>> +
>> +   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>> +   return false;
>> +
>> +   page = nested_get_page(vcpu, vmcs12->msr_bitmap);
>> +   if (!page) {
>> +   WARN_ON(1);
>> +   return false;
>> +   }
>> +   msr_bitmap = (unsigned long *)kmap(page);
>> +   if (!msr_bitmap) {
>> +   nested_release_page_clean(page);
>> +   WARN_ON(1);
>> +   return false;
>> +   }
>> +
>> +   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>> +
>> +   if (nested_cpu_has_virt_x2apic_mode(vmcs12))
>> +   /* TPR is allowed */
>> +   nested_vmx_disable_intercept_for_msr(msr_bitmap,
>> +   vmx_msr_bitmap_nested,
>> +   APIC_BASE_MSR + (APIC_TASKPRI >>
>> 4),
>> +   MSR_TYPE_R | MSR_TYPE_W);
>
> I don't understand what this function does. Per my understanding, you only
> need to set (vmx_msr_bitmap_nested = vmcs01->msr_bitmap |
> vmcs12->msr_bitmap) and inject a nested vmexit to L1 if the bit in
> vmcs12->msr_bitmap is set. Am I missing some patches?

In the beginning, I wanted to do "vmcs01->msr_bitmap |
vmcs12->msr_bitmap", but there isn't an instruction that can OR
two whole pages efficiently, so I do the bit-or operation in
nested_vmx_disable_intercept_for_msr. If the hardware does not
support this, I think it is faster to deal with the bits on
demand. nested_vmx_merge_msr_bitmap is used to merge L0's and
L1's bitmaps; any feature can put its logic there.

If there is a faster way, please teach me how to do it : )
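
For comparison, a minimal sketch of the rejected whole-page merge
(the names are illustrative; without a single instruction that ORs
two 4 KiB pages, a word-sized loop is the obvious fallback):

#include <stddef.h>

#define MSR_BITMAP_BYTES 4096

/*
 * Word-at-a-time OR of two MSR bitmaps: a bit set in either input
 * (i.e. intercepted by L0 or by L1) stays set in the result.
 */
static void msr_bitmap_merge(unsigned long *dst,
			     const unsigned long *l0,
			     const unsigned long *l1)
{
	size_t i, words = MSR_BITMAP_BYTES / sizeof(unsigned long);

	for (i = 0; i < words; i++)
		dst[i] = l0[i] | l1[i];
}

The on-demand approach in nested_vmx_disable_intercept_for_msr
avoids touching the whole page when only a handful of MSRs are
forwarded.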

Thanks,
Wincy


>
> Best regards,
> Yang
>
>


[PATCH v4 5/6] KVM: nVMX: Enable nested virtual interrupt delivery

2015-01-28 Thread Wincy Van
With virtual interrupt delivery, the hardware frees KVM from
the low-efficiency interrupt injection path. In nested VMX this
is an important feature: it can eliminate many nested vmexits,
especially in high-throughput scenarios.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   68 +++-
 1 files changed, 67 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 68783e0..ab131f3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -216,6 +216,10 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 ept_pointer;
+   u64 eoi_exit_bitmap0;
+   u64 eoi_exit_bitmap1;
+   u64 eoi_exit_bitmap2;
+   u64 eoi_exit_bitmap3;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
@@ -338,6 +342,7 @@ struct __packed vmcs12 {
u16 guest_gs_selector;
u16 guest_ldtr_selector;
u16 guest_tr_selector;
+   u16 guest_intr_status;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -624,6 +629,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+   FIELD(GUEST_INTR_STATUS, guest_intr_status),
FIELD(HOST_ES_SELECTOR, host_es_selector),
FIELD(HOST_CS_SELECTOR, host_cs_selector),
FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -641,6 +647,10 @@ static const unsigned short
vmcs_field_to_offset_table[] = {
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(EPT_POINTER, ept_pointer),
+   FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+   FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+   FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+   FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1135,6 +1145,11 @@ static inline bool
nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
 }

+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2429,6 +2444,7 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
+   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -7397,6 +7413,10 @@ static bool nested_vmx_exit_handled(struct
kvm_vcpu *vcpu)
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+   case EXIT_REASON_APIC_WRITE:
+   case EXIT_REASON_EOI_INDUCED:
+   /* apic_write and eoi_induced should exit unconditionally. */
+   return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
 * L0 always deals with the EPT violation. If nested EPT is
@@ -8480,6 +8500,19 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_R | MSR_TYPE_W);
+   if (nested_cpu_has_vid(vmcs12)) {
+   /* EOI and self-IPI are allowed */
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_EOI >> 4),
+   MSR_TYPE_W);
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+   MSR_TYPE_W);
+   }
} else {
/*
 * Enable reading intercept of all the x2apic
@@ -8497,6 +8530,14 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_W);
+   __

[PATCH v4 0/6] KVM: nVMX: Enable nested apicv support

2015-01-28 Thread Wincy Van
v1 ---> v2:
  Use spin lock to ensure vmcs12 is safe when doing nested
  posted interrupt delivery.

v2 ---> v3:
  1. Add a new field in nested_vmx to avoid the spin lock in v2.
  2. Drop send eoi to L1 when doing nested interrupt delivery.
  3. Use hardware MSR bitmap to enable nested virtualize x2apic
 mode.

v3 ---> v4:
  1. Optimize nested msr bitmap merging.
  2. Allocate nested msr bitmap only when nested == 1.
  3. Inline the nested vmx control checking functions.

Wincy Van (6):
  KVM: nVMX: Use hardware MSR bitmap
  KVM: nVMX: Enable nested virtualize x2apic mode
  KVM: nVMX: Make nested control MSRs per-cpu
  KVM: nVMX: Enable nested apic register virtualization
  KVM: nVMX: Enable nested virtual interrupt delivery
  KVM: nVMX: Enable nested posted interrupt processing

 arch/x86/kvm/vmx.c |  580 +++-
 1 files changed, 480 insertions(+), 100 deletions(-)


Re: [PATCH v4 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-28 Thread Wincy Van
Sorry, please ignore this mail; the subject is wrong : (

On Wed, Jan 28, 2015 at 11:50 PM, Wincy Van  wrote:
> Currently, if L1 enables MSR_BITMAP, we emulate this feature: all
> of L2's MSR accesses are intercepted by L0. Since many features,
> such as virtualize x2apic mode, have complicated logic that is
> difficult for us to emulate, we should use the hardware bitmap
> and merge L0's and L1's bitmaps.
>
> This patch introduces nested_vmx_merge_msr_bitmap for future use.
>
> Signed-off-by: Wincy Van 
> ---
>  arch/x86/kvm/vmx.c |   77 ---
>  1 files changed, 66 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index c987374..787f886 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -798,6 +798,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
>  static unsigned long *vmx_msr_bitmap_longmode;
>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
> +static unsigned long *vmx_msr_bitmap_nested;
>  static unsigned long *vmx_vmread_bitmap;
>  static unsigned long *vmx_vmwrite_bitmap;
>
> @@ -5812,13 +5813,21 @@ static __init int hardware_setup(void)
> (unsigned long *)__get_free_page(GFP_KERNEL);
> if (!vmx_msr_bitmap_longmode_x2apic)
> goto out4;
> +
> +   if (nested) {
> +   vmx_msr_bitmap_nested =
> +   (unsigned long *)__get_free_page(GFP_KERNEL);
> +   if (!vmx_msr_bitmap_nested)
> +   goto out5;
> +   }
> +
> vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
> if (!vmx_vmread_bitmap)
> -   goto out5;
> +   goto out6;
>
> vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
> if (!vmx_vmwrite_bitmap)
> -   goto out6;
> +   goto out7;
>
> memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
> memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
> @@ -5834,10 +5843,12 @@ static __init int hardware_setup(void)
>
> memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
> memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
> +   if (nested)
> +   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>
> if (setup_vmcs_config(&vmcs_config) < 0) {
> r = -EIO;
> -   goto out7;
> +   goto out8;
> }
>
> if (boot_cpu_has(X86_FEATURE_NX))
> @@ -5944,10 +5955,13 @@ static __init int hardware_setup(void)
>
> return alloc_kvm_area();
>
> -out7:
> +out8:
> free_page((unsigned long)vmx_vmwrite_bitmap);
> -out6:
> +out7:
> free_page((unsigned long)vmx_vmread_bitmap);
> +out6:
> +   if (nested)
> +   free_page((unsigned long)vmx_msr_bitmap_nested);
>  out5:
> free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>  out4:
> @@ -5974,6 +5988,8 @@ static __exit void hardware_unsetup(void)
> free_page((unsigned long)vmx_io_bitmap_a);
> free_page((unsigned long)vmx_vmwrite_bitmap);
> free_page((unsigned long)vmx_vmread_bitmap);
> +   if (nested)
> +   free_page((unsigned long)vmx_msr_bitmap_nested);
>
> free_kvm_area();
>  }
> @@ -8305,6 +8321,38 @@ static void vmx_start_preemption_timer(struct
> kvm_vcpu *vcpu)
>   ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
>  }
>
> +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
> +   struct vmcs12 *vmcs12)
> +{
> +   int maxphyaddr;
> +   u64 addr;
> +
> +   if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
> +   return 0;
> +
> +   if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
> +   WARN_ON(1);
> +   return -EINVAL;
> +   }
> +   maxphyaddr = cpuid_maxphyaddr(vcpu);
> +
> +   if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
> +  ((addr + PAGE_SIZE) >> maxphyaddr))
> +   return -EINVAL;
> +
> +   return 0;
> +}
> +
> +/*
> + * Merge L0's and L1's MSR bitmap, return false to indicate that
> + * we do not use the hardware.
> + */
> +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
> +  struct vmcs12 *vmcs12)
> +{
> +   return false;
> +}
> +
>  static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
>unsigned long cou

Re: [PATCH v3 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-28 Thread Wincy Van
On Wed, Jan 28, 2015 at 8:33 PM, Zhang, Yang Z  wrote:
>>>
>>> You are right, but this is not fit for all the cases, we should
>>> custom the nested_msr_bitmap.
>>> e.g.  Currently L0 wants to intercept some of the x2apic msrs' reading:
>>>  if (enable_apicv) {
>>> for (msr = 0x800; msr <= 0x8ff; msr++)
>> vmx_disable_intercept_msr_read_x2apic(msr);
>>> /* According SDM, in x2apic mode, the whole id reg
>>> is
>> used.
>>>  * But in KVM, it only use the highest eight bits. Need to
>>>  * intercept it */
>>> vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT
>>> */ vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR
>>> */ vmx_disable_intercept_msr_write_x2apic(0x808); /*
>>> EOI
>> */
>>> vmx_disable_intercept_msr_write_x2apic(0x80b); /*
>>> SELF-IPI */
>>> vmx_disable_intercept_msr_write_x2apic(0x83f);
>>> }
>>> But L1 may not want this. So I think we are better to deal with the
>>
>> Actually, from L0's point, it is totally unaware of the L2. The only
>> thing L0 aware is that the CPU should follow L0's configuration when
>> VCPU is running. So if L0 wants to trap a msr, then the read operation
>> to this msr should cause vmexit unconditionally no matter who is running(who 
>> means L1, L2, L3.).
>>
>>> msrs seperately, there is not a common way suit for all the cases.
>>> If other features want to intercept a msr in nested entry, they can
>>> put the custom code in nested_vmx_merge_msr_bitmap.
>>
>> Yes, if other features want to do it in 'nested' entry, they can fill
>> nested_vmx_merge_msr_bitmap. But if in non-nested case, it should be
>> our responsibly to handle it correctly, how about add following check:
>>
>> if (type & MSR_TYPE_R && !test_bit(msr, vmcs01_msr_bitmap) &&
>> !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
>> __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
>
>
> Anyway, this is not necessary for your current patch. We can consider it 
> later if there really have other features will use it.
>

Yep, I know what you mean now: for other MSRs whose accesses are
not forwarded by a mechanism like the virtual-APIC page, we should
intercept them unconditionally. I think we should ensure the MSR
is allowed before calling nested_vmx_disable_intercept_for_msr; if
L0 wants to intercept it, simply do not call
nested_vmx_disable_intercept_for_msr.

!test_bit(msr, vmcs01_msr_bitmap) would introduce a problem: some
of the MSRs, TMCCT and TPR for example, would be affected by
vmcs01_msr_bitmap.
Intercepting reads of these MSRs is okay, but it is not efficient.
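
To make the layout under discussion explicit, here is a small
standalone sketch of where a given MSR lands in the 4 KiB bitmap
(quadrant offsets as in nested_vmx_disable_intercept_for_msr; the
helper itself is illustrative):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Read intercepts live at offset 0x000 (low MSRs) and 0x400 (high
 * MSRs); write intercepts at 0x800 and 0xc00. MSRs outside the two
 * ranges cannot be covered by the bitmap and always cause a vmexit.
 */
static bool msr_bitmap_slot(uint32_t msr, bool write,
			    size_t *byte, unsigned int *bit)
{
	size_t base;

	if (msr <= 0x1fff) {
		base = write ? 0x800 : 0x000;
	} else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		base = write ? 0xc00 : 0x400;
		msr &= 0x1fff;
	} else {
		return false;
	}

	*byte = base + msr / 8;
	*bit = msr % 8;
	return true;
}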

Thanks,
Wincy


Re: [PATCH v3 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-28 Thread Wincy Van
On Wed, Jan 28, 2015 at 4:00 PM, Zhang, Yang Z  wrote:
>> @@ -5812,13 +5813,18 @@ static __init int hardware_setup(void)
>> (unsigned long
>> *)__get_free_page(GFP_KERNEL);
>> if (!vmx_msr_bitmap_longmode_x2apic)
>> goto out4;
>> +
>> +   vmx_msr_bitmap_nested = (unsigned long
>> *)__get_free_page(GFP_KERNEL);
>> +   if (!vmx_msr_bitmap_nested)
>> +   goto out5;
>> +
>
> Since nested virtualization is off by default, it's better to allocate
> the page only when nested is true. Maybe adding the following check is
> better:
>
> if (nested) {
> vmx_msr_bitmap_nested = (unsigned long *)__get_free_page(GFP_KERNEL);
> if (!vmx_msr_bitmap_nested)
> goto out5;
> }

Agreed. Will do.

>
> ...snip...
>
>> +
>> +/*
>> + * Merge L0's and L1's MSR bitmap, return false to indicate that
>> + * we do not use the hardware.
>> + */
>> +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>> +  struct vmcs12
>> *vmcs12) {
>> +   return false;
>> +}
>> +
>
> The following patches have nothing to do with the MSR control. Why leave the 
> function empty here?
>

No. In the patch "Enable nested virtualize x2apic mode", we return
false if L1 has disabled virt_x2apic_mode, and then the hardware
MSR-bitmap control is disabled. This is faster than rebuilding
vmx_msr_bitmap_nested.
The function returns false here to indicate that we do not use the
hardware. Since it is not only virtualize x2apic mode that uses
this hook (other features may use it as well), I think it isn't
suitable to introduce the function in another patch.


> Best regards,
> Yang
>
>


[PATCH v4 3/6] KVM: nVMX: Make nested control MSRs per-cpu

2015-01-28 Thread Wincy Van
To enable nested apicv support, we need per-cpu VMX
control MSRs:
  1. If the in-kernel irqchip is enabled, we can enable nested
     posted interrupts, so we should set the posted-interrupt
     bit in nested_vmx_pinbased_ctls_high.
  2. If the in-kernel irqchip is disabled, we cannot enable
     nested posted interrupts, and the posted-interrupt bit
     in nested_vmx_pinbased_ctls_high will be cleared.

Since the in-kernel irqchip setting may differ between VMs,
per-cpu nested control MSRs are needed.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  215 +++-
 1 files changed, 129 insertions(+), 86 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9d11a93..55111ed 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -408,6 +408,23 @@ struct nested_vmx {

/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+
+   u32 nested_vmx_procbased_ctls_low;
+   u32 nested_vmx_procbased_ctls_high;
+   u32 nested_vmx_true_procbased_ctls_low;
+   u32 nested_vmx_secondary_ctls_low;
+   u32 nested_vmx_secondary_ctls_high;
+   u32 nested_vmx_pinbased_ctls_low;
+   u32 nested_vmx_pinbased_ctls_high;
+   u32 nested_vmx_exit_ctls_low;
+   u32 nested_vmx_exit_ctls_high;
+   u32 nested_vmx_true_exit_ctls_low;
+   u32 nested_vmx_entry_ctls_low;
+   u32 nested_vmx_entry_ctls_high;
+   u32 nested_vmx_true_entry_ctls_low;
+   u32 nested_vmx_misc_low;
+   u32 nested_vmx_misc_high;
+   u32 nested_vmx_ept_caps;
 };

 #define POSTED_INTR_ON  0
@@ -2285,20 +2302,8 @@ static inline bool nested_vmx_allowed(struct
kvm_vcpu *vcpu)
  * if the corresponding bit in the (32-bit) control field *must* be on, and a
  * bit in the high half is on if the corresponding bit in the control field
  * may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
  */
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_true_procbased_ctls_low;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_true_exit_ctls_low;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static u32 nested_vmx_true_entry_ctls_low;
-static u32 nested_vmx_misc_low, nested_vmx_misc_high;
-static u32 nested_vmx_ept_caps;
-static __init void nested_vmx_setup_ctls_msrs(void)
+static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 {
/*
 * Note that as a general rule, the high half of the MSRs (bits in
@@ -2317,57 +2322,71 @@ static __init void nested_vmx_setup_ctls_msrs(void)

/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
-   nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
-   nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-   PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
-   nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_pinbased_ctls_low,
+   vmx->nested.nested_vmx_pinbased_ctls_high);
+   vmx->nested.nested_vmx_pinbased_ctls_low |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_pinbased_ctls_high &=
+   PIN_BASED_EXT_INTR_MASK |
+   PIN_BASED_NMI_EXITING |
+   PIN_BASED_VIRTUAL_NMIS;
+   vmx->nested.nested_vmx_pinbased_ctls_high |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;

/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
-   nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
-   nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_exit_ctls_low,
+   vmx->nested.nested_vmx_exit_ctls_high);
+   vmx->nested.nested_vmx_exit_ctls_low =
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

-   nested_vmx_exit_ctls_high &=
+   vmx->nested.nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_exit_ctls_high |=
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

if (vmx_mpx_supported())
-   nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+  

Re: [PATCH v3 2/6] KVM: nVMX: Enable nested virtualize x2apic mode.

2015-01-28 Thread Wincy Van
On Wed, Jan 28, 2015 at 3:31 PM, Paolo Bonzini  wrote:
>>> >
>>> > No need for this function and nested_cpu_has_virt_x2apic_mode.  Just
>>> > inline them in their caller(s).  Same for other cases throughout the 
>>> > series.
>>> >
>> Do you mean that we should also inline the same functions in the other
>> patches of this patch set?
>> I think these functions will keep the code tidy, just like existing
>> functions such as nested_cpu_has_preemption_timer, nested_cpu_has_ept, etc.
>
> Most of the functions are just used once.  If you want to keep them,
> please place them all close to the existing ones.
>

Yep, I will inline the functions like nested_vmx_check_virt_x2apic and keep
the nested_cpu_has series.


Thanks,
Wincy


[PATCH v4 4/6] KVM: nVMX: Enable nested apic register virtualization

2015-01-28 Thread Wincy Van
We can reduce the cost of APIC register virtualization with this
feature; it is also a requirement for virtual interrupt delivery
and posted interrupt processing.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   36 
 1 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 55111ed..68783e0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1130,6 +1130,11 @@ static inline bool
nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 }

+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2423,6 +2428,7 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+   SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -8443,6 +8449,7 @@ static int
nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
+   int msr;
struct page *page;
unsigned long *msr_bitmap;

@@ -8462,16 +8469,35 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
}

if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+   if (nested_cpu_has_apic_reg_virt(vmcs12))
+   for (msr = 0x800; msr <= 0x8ff; msr++)
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   msr, MSR_TYPE_R);
/* TPR is allowed */
nested_vmx_disable_intercept_for_msr(msr_bitmap,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_R | MSR_TYPE_W);
-   } else
+   } else {
+   /*
+* Enable reading intercept of all the x2apic
+* MSRs. We should not rely on vmcs12 to do any
+* optimizations here, it may have been modified
+* by L1.
+*/
+   for (msr = 0x800; msr <= 0x8ff; msr++)
+   __vmx_enable_intercept_for_msr(
+   vmx_msr_bitmap_nested,
+   msr,
+   MSR_TYPE_R);
+
__vmx_enable_intercept_for_msr(
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-   MSR_TYPE_R | MSR_TYPE_W);
+   MSR_TYPE_W);
+   }
kunmap(page);
nested_release_page_clean(page);

@@ -8481,14 +8507,16 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
-   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+   !nested_cpu_has_apic_reg_virt(vmcs12))
return 0;

/*
 * If virtualize x2apic mode is enabled,
 * virtualize apic access must be disabled.
 */
-   if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+   nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
return -EINVAL;

/* tpr shadow is needed by all apicv features. */
--
1.7.1


[PATCH v4 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-28 Thread Wincy Van
Currently, if L1 enables MSR_BITMAP, we emulate this feature: all
of L2's MSR accesses are intercepted by L0. Since many features,
such as virtualize x2apic mode, have complicated logic that is
difficult for us to emulate, we should use the hardware bitmap
and merge L0's and L1's bitmaps.

This patch introduces nested_vmx_merge_msr_bitmap for future use.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   77 ---
 1 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c987374..787f886 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -798,6 +798,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_nested;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;

@@ -5812,13 +5813,21 @@ static __init int hardware_setup(void)
(unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode_x2apic)
goto out4;
+
+   if (nested) {
+   vmx_msr_bitmap_nested =
+   (unsigned long *)__get_free_page(GFP_KERNEL);
+   if (!vmx_msr_bitmap_nested)
+   goto out5;
+   }
+
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
-   goto out5;
+   goto out6;

vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmwrite_bitmap)
-   goto out6;
+   goto out7;

memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5834,10 +5843,12 @@ static __init int hardware_setup(void)

memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+   if (nested)
+   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);

if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
-   goto out7;
+   goto out8;
}

if (boot_cpu_has(X86_FEATURE_NX))
@@ -5944,10 +5955,13 @@ static __init int hardware_setup(void)

return alloc_kvm_area();

-out7:
+out8:
free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
+out7:
free_page((unsigned long)vmx_vmread_bitmap);
+out6:
+   if (nested)
+   free_page((unsigned long)vmx_msr_bitmap_nested);
 out5:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
@@ -5974,6 +5988,8 @@ static __exit void hardware_unsetup(void)
free_page((unsigned long)vmx_io_bitmap_a);
free_page((unsigned long)vmx_vmwrite_bitmap);
free_page((unsigned long)vmx_vmread_bitmap);
+   if (nested)
+   free_page((unsigned long)vmx_msr_bitmap_nested);

free_kvm_area();
 }
@@ -8305,6 +8321,38 @@ static void vmx_start_preemption_timer(struct
kvm_vcpu *vcpu)
  ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }

+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+   struct vmcs12 *vmcs12)
+{
+   int maxphyaddr;
+   u64 addr;
+
+   if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+   return 0;
+
+   if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
+   WARN_ON(1);
+   return -EINVAL;
+   }
+   maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+   if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
+  ((addr + PAGE_SIZE) >> maxphyaddr))
+   return -EINVAL;
+
+   return 0;
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   return false;
+}
+
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
   unsigned long count_field,
   unsigned long addr_field,
@@ -8637,11 +8685,17 @@ static void prepare_vmcs02(struct kvm_vcpu
*vcpu, struct vmcs12 *vmcs12)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
}

+   if (cpu_has_vmx_msr_bitmap() &&
+   exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+   nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
+   vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
+   } else
+   exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+
/*
-* Merging of IO and MSR bitmaps not currently supported.
+* Merging of IO bitmap not currently supported.
 * Rather, exit every time.

[PATCH v4 2/6] KVM: nVMX: Enable nested virtualize x2apic mode

2015-01-28 Thread Wincy Van
When L2 is using x2apic, we can use virtualize x2apic mode to
gain higher performance, especially in the apicv case.

This patch also introduces nested_vmx_check_apicv_controls
for the nested apicv patches.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  114 +++-
 1 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 787f886..9d11a93 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1108,6 +1108,11 @@ static inline bool nested_cpu_has_xsaves(struct
vmcs12 *vmcs12)
vmx_xsaves_supported();
 }

+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2395,6 +2400,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -4155,6 +4161,52 @@ static void
__vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
}
 }

+/*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ */
+static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
+  unsigned long *msr_bitmap_nested,
+  u32 msr, int type)
+{
+   int f = sizeof(unsigned long);
+
+   if (!cpu_has_vmx_msr_bitmap()) {
+   WARN_ON(1);
+   return;
+   }
+
+   /*
+* See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+* have the write-low and read-high bitmap offsets the wrong way round.
+* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+*/
+   if (msr <= 0x1fff) {
+   if (type & MSR_TYPE_R &&
+  !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+   /* read-low */
+   __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
+
+   if (type & MSR_TYPE_W &&
+  !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+   /* write-low */
+   __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
+
+   } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+   msr &= 0x1fff;
+   if (type & MSR_TYPE_R &&
+  !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+   /* read-high */
+   __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+
+   if (type & MSR_TYPE_W &&
+  !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+   /* write-high */
+   __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+
+   }
+}
+
 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
if (!longmode_only)
@@ -8350,7 +8402,59 @@ static int
nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
-   return false;
+   struct page *page;
+   unsigned long *msr_bitmap;
+
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   return false;
+
+   page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+   if (!page) {
+   WARN_ON(1);
+   return false;
+   }
+   msr_bitmap = (unsigned long *)kmap(page);
+   if (!msr_bitmap) {
+   nested_release_page_clean(page);
+   WARN_ON(1);
+   return false;
+   }
+
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+   /* TPR is allowed */
+   nested_vmx_disable_intercept_for_msr(msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+   MSR_TYPE_R | MSR_TYPE_W);
+   } else
+   __vmx_enable_intercept_for_msr(
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+   MSR_TYPE_R | MSR_TYPE_W);
+   kunmap(page);
+   nested_release_page_clean(page);
+
+   return true;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   if (!nested_cpu_has_virt_x2ap

Re: [PATCH v3 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-28 Thread Wincy Van
On Wed, Jan 28, 2015 at 9:06 PM, Zhang, Yang Z  wrote:
>>>> __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
>>>
>>>
>>> Anyway, this is not necessary for your current patch. We can consider
>>> it later if other features really do use it.
>>>
>>
>> Yep, I know what you mean now: for other msrs whose access is not
>> forwarded by a mechanism like the virtual-apic page, we should intercept
>> them unconditionally. I think we should ensure the msr can be allowed
>> before calling nested_vmx_disable_intercept_for_msr; if L0 wants to
>> intercept it, just do not call nested_vmx_disable_intercept_for_msr.
>
> Yes, this is a solution. But I prefer to handle it in the nested code path,
> since others may not be familiar with nested and may block it by mistake.
>
>>
>>  !test_bit(msr, vmcs01_msr_bitmap) will introduce a problem: some
>> of the msrs will be affected by vmcs01_msr_bitmap, TMCCT and TPR, for example.
>> Intercepting reads of these msrs is okay, but it is not efficient.
>
> TMCCT is always trapped by most VMM. I don't think TPR is trapped in KVM.
>

Oooops. This piece of code confused me and I wasn't thinking clearly
when I wrote those words, just forget it.
( * ^ _ ^ * )


vmx_enable_intercept_msr_read_x2apic(0x802);
/* TMCCT */
vmx_enable_intercept_msr_read_x2apic(0x839);
/* TPR */


If any features need it in the future, let's consider this then.
Thank you for your patience.


Thanks,
Wincy


Re: [PATCH v3 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-28 Thread Wincy Van
On Wed, Jan 28, 2015 at 7:52 PM, Zhang, Yang Z  wrote:
>>>
>>
>> If L0 wants to intercept an msr, we should set
>> vmx_msr_bitmap_legacy(_x2apic) and vmx_msr_bitmap_longmode(_x2apic),
>> and those bitmaps should only be loaded on non-nested entry.
>> Currently we only clear the corresponding bits if L1 enables
>> virtualize x2apic mode; all the other bits are set. On nested
>> entry, we load nested_msr_bitmap; on nested vmexit, we restore L0's
>> bitmap.
>> On nested entry, L0 only cares about L2's msr accesses, not L1's. I
>> think that on nested entry, nested_msr_bitmap is the "vmcs01->msr_bitmap"
>> you mentioned above.
>
> Mmm... I think a bit set in vmcs01->msr_bitmap means that whenever the
> VCPU (no matter whether in nested guest mode or not) accesses this msr, it
> should cause a vmexit, not only when L1 does. That's why we need to construct
> the vmcs02->msr_bitmap based on (vmcs01->msr_bitmap ORed with vmcs12->msr_bitmap).
>

You are right, but this does not fit all cases; we should customize
the nested_msr_bitmap.
E.g., currently L0 wants to intercept reads of some of the x2apic msrs:
 if (enable_apicv) {
for (msr = 0x800; msr <= 0x8ff; msr++)
vmx_disable_intercept_msr_read_x2apic(msr);

/* According SDM, in x2apic mode, the whole id reg is used.
 * But in KVM, it only use the highest eight bits. Need to
 * intercept it */
vmx_enable_intercept_msr_read_x2apic(0x802);
/* TMCCT */
vmx_enable_intercept_msr_read_x2apic(0x839);
/* TPR */
vmx_disable_intercept_msr_write_x2apic(0x808);
/* EOI */
vmx_disable_intercept_msr_write_x2apic(0x80b);
/* SELF-IPI */
vmx_disable_intercept_msr_write_x2apic(0x83f);
}
But L1 may not want this. So I think it is better to deal with the msrs
separately; there is no common way that suits all cases. If other
features want to intercept an msr on nested entry, they can put the
custom code in nested_vmx_merge_msr_bitmap.
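
For reference, the x2apic msr numbers above follow the x2APIC
convention MSR index = 0x800 + (xAPIC register offset >> 4): TASKPRI
0x80 maps to MSR 0x808 (TPR), EOI 0xB0 to 0x80b, and SELF_IPI 0x3F0
to 0x83f, matching the comments in the fragment. A minimal
illustrative helper (x2apic_msr() is a sketch, not a kernel function):

	static inline u32 x2apic_msr(u32 apic_reg)
	{
		/* each 16-byte xAPIC register maps to one x2APIC MSR */
		return APIC_BASE_MSR + (apic_reg >> 4);
	}

This is the same "APIC_BASE_MSR + (APIC_TASKPRI >> 4)" expression
used throughout the series.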


Thanks,
Wincy


Re: [PATCH v3 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-28 Thread Wincy Van
On Wed, Jan 28, 2015 at 7:25 PM, Zhang, Yang Z  wrote:
> Wincy Van wrote on 2015-01-28:
>> On Wed, Jan 28, 2015 at 4:05 PM, Zhang, Yang Z 
>> wrote:
>>>> @@ -8344,7 +8394,68 @@ static int
>>>> nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
>>>>  static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>>>    struct vmcs12 *vmcs12)
>>>>  {
>>>> -   return false;
>>>> +   struct page *page;
>>>> +   unsigned long *msr_bitmap;
>>>> +
>>>> +   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>>>> +   return false;
>>>> +
>>>> +   page = nested_get_page(vcpu, vmcs12->msr_bitmap);
>>>> +   if (!page) {
>>>> +   WARN_ON(1);
>>>> +   return false;
>>>> +   }
>>>> +   msr_bitmap = (unsigned long *)kmap(page);
>>>> +   if (!msr_bitmap) {
>>>> +   nested_release_page_clean(page);
>>>> +   WARN_ON(1);
>>>> +   return false;
>>>> +   }
>>>> +
>>>> +   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>>>> +
>>>> +   if (nested_cpu_has_virt_x2apic_mode(vmcs12))
>>>> +   /* TPR is allowed */
>>>> +   nested_vmx_disable_intercept_for_msr(msr_bitmap,
>>>> +   vmx_msr_bitmap_nested,
>>>> +   APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>>>> +   MSR_TYPE_R | MSR_TYPE_W);
>>>
>>> I didn't understand what this function does? Per my understanding,
>>> you only need to set (vmx_msr_bitmap_nested = vmcs01->msr_bitmap |
>>> vmcs12->msr_bitmap) and inject the nested vmexit to L1 if the bit in
>>> vmcs12->msr_bitmap is set. Am I missing some patches?
>>
>> In the beginning, I wanted to do "vmcs01->msr_bitmap |
>> vmcs12->msr_bitmap", but I remembered that there isn't an instruction to
>> OR two whole pages together efficiently, so I do the bit-or
>> operation in nested_vmx_disable_intercept_for_msr. Since the hardware does
>> not support this, I think it is faster if we deal with the bits on demand.
>> nested_vmx_merge_msr_bitmap is used to merge L0's and L1's bitmaps;
>> any feature can put its logic there.
>
> You construct the nested_msr_bitmap based on vmcs12->msr_bitmap; what happens
> if vmcs01->msr_bitmap wants to trap this msr?
>

If L0 wants to intercept an msr, we should set
vmx_msr_bitmap_legacy(_x2apic) and vmx_msr_bitmap_longmode(_x2apic),
and those bitmaps should only be loaded on non-nested entry.
Currently we only clear the corresponding bits if L1 enables
virtualize x2apic mode; all the other bits are set. On nested
entry, we load nested_msr_bitmap; on nested vmexit, we restore
L0's bitmap.
On nested entry, L0 only cares about L2's msr accesses, not L1's. I
think that on nested entry, nested_msr_bitmap is the "vmcs01->msr_bitmap"
you mentioned above; see the sketch below.
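
For illustration, the page-wide OR under discussion would look roughly
like this (a sketch only; merge_msr_bitmap_or() and the _l0/_l1/_l02
names are illustrative, not kernel symbols -- a set bit means
"intercept", so the OR traps an msr whenever either L0 or L1 wants it
trapped):

	static void merge_msr_bitmap_or(unsigned long *msr_bitmap_l02,
					const unsigned long *msr_bitmap_l0,
					const unsigned long *msr_bitmap_l1)
	{
		int i;

		for (i = 0; i < PAGE_SIZE / sizeof(unsigned long); i++)
			msr_bitmap_l02[i] = msr_bitmap_l0[i] | msr_bitmap_l1[i];
	}

The series instead starts from an all-ones (intercept everything)
bitmap and clears bits on demand in
nested_vmx_disable_intercept_for_msr(), touching only the few msrs
that are actually forwarded.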

Thanks,
Wincy


Re: [PATCH v4 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-28 Thread Wincy Van
On Wed, Jan 28, 2015 at 11:52 PM, Wincy Van  wrote:
> Sorry, please ignore this mail, the subject is wrong : (
>

I was confused by Gmail's conversation view; Gmail put this patch in
the v3 conversation.
Please ignore this.

Thanks,
Wincy

> On Wed, Jan 28, 2015 at 11:50 PM, Wincy Van  wrote:
>> Currently, if L1 enables MSR_BITMAP, we will emulate this feature,
>> all of L2's msr access is intercepted by L0. Since many features
>> like virtualize x2apic mode has a complicated logic and it is
>> difficult for us to emulate, we should use hardware and merge


[PATCH v4 6/6] KVM: nVMX: Enable nested posted interrupt processing

2015-01-28 Thread Wincy Van
If a vcpu has an interrupt pending in vmx non-root mode, we
kick that vcpu so the interrupt is injected in a timely manner.
With posted interrupt processing, the kick is not needed, and
interrupts are fully taken care of by hardware.

In nested vmx, this feature avoids many more vmexits
than in non-nested vmx.

This patch uses L0's POSTED_INTR_NV to avoid unexpected
interrupts if L1's vector is different from L0's. If the vcpu
is in hardware non-root mode, we use a physical ipi to
deliver posted interrupts; otherwise we deliver the
interrupt to L1 and kick that vcpu out of nested
non-root mode.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   84 ++--
 1 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ab131f3..85a163c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -215,6 +215,7 @@ struct __packed vmcs12 {
u64 tsc_offset;
u64 virtual_apic_page_addr;
u64 apic_access_addr;
+   u64 posted_intr_desc_addr;
u64 ept_pointer;
u64 eoi_exit_bitmap0;
u64 eoi_exit_bitmap1;
@@ -334,6 +335,7 @@ struct __packed vmcs12 {
u32 vmx_preemption_timer_value;
u32 padding32[7]; /* room for future expansion */
u16 virtual_processor_id;
+   u16 posted_intr_nv;
u16 guest_es_selector;
u16 guest_cs_selector;
u16 guest_ss_selector;
@@ -406,6 +408,8 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct page *virtual_apic_page;
+   struct page *pi_desc_page;
+   u16 posted_intr_nv;
u64 msr_ia32_feature_control;

struct hrtimer preemption_timer;
@@ -621,6 +625,7 @@ static int max_shadow_read_write_fields =

 static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+   FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -646,6 +651,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(TSC_OFFSET, tsc_offset),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+   FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
FIELD64(EPT_POINTER, ept_pointer),
FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
@@ -798,6 +804,7 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1150,6 +1157,11 @@ static inline bool nested_cpu_has_vid(struct
vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 }

+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+   return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2353,6 +2365,9 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
+   if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+   vmx->nested.nested_vmx_pinbased_ctls_high |=
+   PIN_BASED_POSTED_INTR;

/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
@@ -4304,6 +4319,19 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
return enable_apicv && irqchip_in_kernel(kvm);
 }

+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+   int vector)
+{
+   if (is_guest_mode(vcpu) &&
+   vector == to_vmx(vcpu)->nested.posted_intr_nv &&
+   vcpu->mode == IN_GUEST_MODE) {
+   /* the PIR and ON have been set by L1. */
+   apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+   POSTED_INTR_VECTOR);
+   return 0;
+   }
+   return -1;
+}
 /*
  * Send interrupt to vcpu via posted interrupt way.
  * 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4316,6 +4344,10 @@ static void vmx_deliver_posted_interrupt(struct
kvm_vcpu *vcpu, int vector)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int r;

+   r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+   if (!r)
+   return;
+
if (pi_test_and_set_pir(vector, &

Re: [PATCH v3 0/6] KVM: nVMX: Enable nested apicv support.

2015-01-27 Thread Wincy Van
On Wed, Jan 28, 2015 at 6:06 AM, Paolo Bonzini  wrote:
>
>
> On 24/01/2015 11:18, Wincy Van wrote:
>> v2 ---> v3:
>>   1. Add a new field in nested_vmx to avoid the spin lock in v2.
>>   2. Drop send eoi to L1 when doing nested interrupt delivery.
>>   3. Use hardware MSR bitmap to enable nested virtualize x2apic
>>  mode.
>
> I think the patches are mostly okay.  I made a few comments.
>

Thank you, Paolo and Yang. I couldn't have accomplished this without your help.

> One of the things to do on top could be to avoid rebuilding the whole
> vmcs02 on every entry.  Recomputing the MSR bitmap on every vmentry is
> not particularly nice, for example.  It is not necessary unless the
> execution controls have changed.

Indeed, I planned to do that optimization after this patch set.
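
A minimal sketch of such caching, assuming a hypothetical cache
struct (none of these fields exist in KVM; a real version would also
need to invalidate when L1 rewrites the vmcs12->msr_bitmap contents):

	struct nested_msr_bitmap_cache {
		bool valid;
		u32 secondary_exec_control;	/* controls the merge depends on */
		u64 msr_bitmap_addr;		/* vmcs12->msr_bitmap */
	};

	static bool nested_msr_bitmap_up_to_date(struct nested_msr_bitmap_cache *c,
						 struct vmcs12 *vmcs12)
	{
		return c->valid &&
		       c->secondary_exec_control ==
				vmcs12->secondary_vm_exec_control &&
		       c->msr_bitmap_addr == vmcs12->msr_bitmap;
	}

On a hit, prepare_vmcs02() could simply reuse the previously merged
page instead of recomputing it.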


Thanks,
Wincy

>
> Paolo


Re: [PATCH v3 2/6] KVM: nVMX: Enable nested virtualize x2apic mode.

2015-01-27 Thread Wincy Van
On Wed, Jan 28, 2015 at 5:39 AM, Paolo Bonzini  wrote:
>
>
> On 24/01/2015 11:21, Wincy Van wrote:
>> +static void nested_vmx_disable_intercept_for_msr(unsigned long 
>> *msr_bitmap_l1,
>> +  unsigned long 
>> *msr_bitmap_nested,
>> +  u32 msr, int type)
>> +{
>> +   int f = sizeof(unsigned long);
>> +
>> +   if (!cpu_has_vmx_msr_bitmap())
>> +   return;
>> +
>
> Also, make this a WARN_ON.

Will do.



Thanks,
Wincy


>
> Paolo


Re: [PATCH v3 2/6] KVM: nVMX: Enable nested virtualize x2apic mode.

2015-01-27 Thread Wincy Van
On Wed, Jan 28, 2015 at 5:37 AM, Paolo Bonzini  wrote:
>
>
> On 24/01/2015 11:21, Wincy Van wrote:
>> +   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>
> Most bytes are always 0xff.  It's better to initialize it to 0xff once,
> and set the bit here if !nested_cpu_has_virt_x2apic_mode(vmcs12).

Indeed, will do.

>
>> +   if (nested_cpu_has_virt_x2apic_mode(vmcs12))
>
> Please add braces here, because of the /* */ comment below.

Will do.

>
>> +   /* TPR is allowed */
>> +   nested_vmx_disable_intercept_for_msr(msr_bitmap,
>> +   vmx_msr_bitmap_nested,
>> +   APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>> +   MSR_TYPE_R | MSR_TYPE_W);
>>
>> +static inline int nested_vmx_check_virt_x2apic(struct kvm_vcpu *vcpu,
>> +  struct vmcs12 *vmcs12)
>> +{
>> +   if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
>> +   return -EINVAL;
>
> No need for this function and nested_cpu_has_virt_x2apic_mode.  Just
> inline them in their caller(s).  Same for other cases throughout the series.
>

Do you mean that we should also inline the same functions in the other
patches of this patch set?
I think these functions keep the code tidy, just like
functions such as nested_cpu_has_preemption_timer, nested_cpu_has_ept, etc.


Re: [PATCH v3 6/6] KVM: nVMX: Enable nested posted interrupt processing.

2015-01-27 Thread Wincy Van
On Wed, Jan 28, 2015 at 5:55 AM, Paolo Bonzini  wrote:
>
>
> On 24/01/2015 11:24, Wincy Van wrote:
>> if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
>> !nested_cpu_has_apic_reg_virt(vmcs12) &&
>> -   !nested_cpu_has_vid(vmcs12))
>> +   !nested_cpu_has_vid(vmcs12) &&
>> +   !nested_cpu_has_posted_intr(vmcs12))
>> return 0;
>>
>> if (nested_cpu_has_virt_x2apic_mode(vmcs12))
>> r = nested_vmx_check_virt_x2apic(vcpu, vmcs12);
>> if (nested_cpu_has_vid(vmcs12))
>> r |= nested_vmx_check_vid(vcpu, vmcs12);
>> +   if (nested_cpu_has_posted_intr(vmcs12))
>> +   r |= nested_vmx_check_posted_intr(vcpu, vmcs12);
>
> These "if"s are always true.
>

Why? L1 may configure these features separately, so we should check them one by one.
E.g., L1 may enable posted interrupt processing and virtual interrupt
delivery but leave virtualize x2apic mode disabled; then
nested_cpu_has_virt_x2apic_mode will return false.

> Paolo


[PATCH v3 6/6] KVM: nVMX: Enable nested posted interrupt processing.

2015-01-24 Thread Wincy Van
If a vcpu has an interrupt pending in vmx non-root mode, we
kick that vcpu so the interrupt is injected in a timely manner.
With posted interrupt processing, the kick is not needed, and
interrupts are fully taken care of by hardware.

In nested vmx, this feature avoids many more vmexits
than in non-nested vmx.

This patch uses L0's POSTED_INTR_NV to avoid unexpected
interrupts if L1's vector is different from L0's. If the vcpu
is in hardware non-root mode, we use a physical ipi to
deliver posted interrupts; otherwise we deliver the
interrupt to L1 and kick that vcpu out of nested
non-root mode.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   90 ++--
 1 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5d8500c..4e4b64e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -215,6 +215,7 @@ struct __packed vmcs12 {
u64 tsc_offset;
u64 virtual_apic_page_addr;
u64 apic_access_addr;
+   u64 posted_intr_desc_addr;
u64 ept_pointer;
u64 eoi_exit_bitmap0;
u64 eoi_exit_bitmap1;
@@ -334,6 +335,7 @@ struct __packed vmcs12 {
u32 vmx_preemption_timer_value;
u32 padding32[7]; /* room for future expansion */
u16 virtual_processor_id;
+   u16 posted_intr_nv;
u16 guest_es_selector;
u16 guest_cs_selector;
u16 guest_ss_selector;
@@ -406,6 +408,8 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct page *virtual_apic_page;
+   struct page *pi_desc_page;
+   u16 posted_intr_nv;
u64 msr_ia32_feature_control;

struct hrtimer preemption_timer;
@@ -621,6 +625,7 @@ static int max_shadow_read_write_fields =

 static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+   FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -646,6 +651,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(TSC_OFFSET, tsc_offset),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+   FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
FIELD64(EPT_POINTER, ept_pointer),
FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
@@ -798,6 +804,7 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1150,6 +1157,11 @@ static inline bool nested_cpu_has_vid(struct
vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 }

+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+   return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2353,6 +2365,9 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
+   if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+   vmx->nested.nested_vmx_pinbased_ctls_high |=
+   PIN_BASED_POSTED_INTR;

/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
@@ -4302,6 +4317,19 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
return enable_apicv && irqchip_in_kernel(kvm);
 }

+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+   int vector)
+{
+   if (is_guest_mode(vcpu) &&
+   vector == to_vmx(vcpu)->nested.posted_intr_nv &&
+   vcpu->mode == IN_GUEST_MODE) {
+   /* the PIR and ON have been set by L1. */
+   apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+   POSTED_INTR_VECTOR);
+   return 0;
+   }
+   return -1;
+}
 /*
  * Send interrupt to vcpu via posted interrupt way.
  * 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4314,6 +4342,10 @@ static void vmx_deliver_posted_interrupt(struct
kvm_vcpu *vcpu, int vector)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int r;

+   r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+   if (!r)
+   return;
+
if (pi_test_and_set_pir(vector, &

[PATCH v3 5/6] KVM: nVMX: Enable nested virtual interrupt delivery.

2015-01-24 Thread Wincy Van
With virtual interrupt delivery, the hardware lets KVM avoid
the low-efficiency interrupt injection path. In nested vmx, it is
an important feature: we can avoid many more nested vmexits,
especially in high-throughput scenarios.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   63 +++-
 1 files changed, 62 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d57e370..5d8500c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -216,6 +216,10 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 ept_pointer;
+   u64 eoi_exit_bitmap0;
+   u64 eoi_exit_bitmap1;
+   u64 eoi_exit_bitmap2;
+   u64 eoi_exit_bitmap3;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
@@ -338,6 +342,7 @@ struct __packed vmcs12 {
u16 guest_gs_selector;
u16 guest_ldtr_selector;
u16 guest_tr_selector;
+   u16 guest_intr_status;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -624,6 +629,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+   FIELD(GUEST_INTR_STATUS, guest_intr_status),
FIELD(HOST_ES_SELECTOR, host_es_selector),
FIELD(HOST_CS_SELECTOR, host_cs_selector),
FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -641,6 +647,10 @@ static const unsigned short
vmcs_field_to_offset_table[] = {
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(EPT_POINTER, ept_pointer),
+   FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+   FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+   FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+   FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1135,6 +1145,11 @@ static inline bool
nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
 }

+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2429,6 +2444,7 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
+   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -7389,6 +7405,10 @@ static bool nested_vmx_exit_handled(struct
kvm_vcpu *vcpu)
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+   case EXIT_REASON_APIC_WRITE:
+   case EXIT_REASON_EOI_INDUCED:
+   /* apic_write and eoi_induced should exit unconditionally. */
+   return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
 * L0 always deals with the EPT violation. If nested EPT is
@@ -8474,6 +8494,19 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_R | MSR_TYPE_W);
+   if (nested_cpu_has_vid(vmcs12)) {
+   /* EOI and self-IPI are allowed */
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_EOI >> 4),
+   MSR_TYPE_W);
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+   MSR_TYPE_W);
+   }
}
kunmap(page);
nested_release_page_clean(page);
@@ -8489,17 +8522,29 @@ static inline int
nested_vmx_check_virt_x2apic(struct kvm_vcpu *vcpu,
return 0;
 }

+static inline int nested_vmx_check_vid(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   if (!nested_exit_on_intr(vcpu))
+   return -EINVAL;
+   retu

[PATCH v3 4/6] KVM: nVMX: Enable nested apic register virtualization.

2015-01-24 Thread Wincy Van
We can reduce the cost of apic register virtualization with this
feature; it is also a requirement for virtual interrupt delivery and
posted interrupt processing.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   24 
 1 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6ec0c2c..d57e370 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1130,6 +1130,11 @@ static inline bool
nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 }

+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2423,6 +2428,7 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+   SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -8435,6 +8441,7 @@ static int
nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
+   int msr;
struct page *page;
unsigned long *msr_bitmap;

@@ -8455,12 +8462,19 @@ static inline bool
nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,

memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);

-   if (nested_cpu_has_virt_x2apic_mode(vmcs12))
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+   if (nested_cpu_has_apic_reg_virt(vmcs12))
+   for (msr = 0x800; msr <= 0x8ff; msr++)
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap,
+   vmx_msr_bitmap_nested,
+   msr, MSR_TYPE_R);
/* TPR is allowed */
nested_vmx_disable_intercept_for_msr(msr_bitmap,
vmx_msr_bitmap_nested,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_R | MSR_TYPE_W);
+   }
kunmap(page);
nested_release_page_clean(page);

@@ -8478,12 +8492,14 @@ static inline int
nested_vmx_check_virt_x2apic(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
-   int r;
+   int r = 0;

-   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+   !nested_cpu_has_apic_reg_virt(vmcs12))
return 0;

-   r = nested_vmx_check_virt_x2apic(vcpu, vmcs12);
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12))
+   r = nested_vmx_check_virt_x2apic(vcpu, vmcs12);
if (r)
goto fail;

--
1.7.1


[PATCH v3 3/6] KVM: nVMX: Make nested control MSRs per-cpu.

2015-01-24 Thread Wincy Van
To enable nested apicv support, we need per-cpu vmx
control MSRs:
  1. If the in-kernel irqchip is enabled, we can enable nested
 posted interrupts and should set the posted intr bit in
 nested_vmx_pinbased_ctls_high.
  2. If the in-kernel irqchip is disabled, we cannot enable
 nested posted interrupts, and the posted intr bit
 in nested_vmx_pinbased_ctls_high will be cleared.

Since the in-kernel irqchip setting can differ
between VMs, different nested control MSRs
are needed.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  215 +++-
 1 files changed, 129 insertions(+), 86 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4d8939d..6ec0c2c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -408,6 +408,23 @@ struct nested_vmx {

/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+
+   u32 nested_vmx_procbased_ctls_low;
+   u32 nested_vmx_procbased_ctls_high;
+   u32 nested_vmx_true_procbased_ctls_low;
+   u32 nested_vmx_secondary_ctls_low;
+   u32 nested_vmx_secondary_ctls_high;
+   u32 nested_vmx_pinbased_ctls_low;
+   u32 nested_vmx_pinbased_ctls_high;
+   u32 nested_vmx_exit_ctls_low;
+   u32 nested_vmx_exit_ctls_high;
+   u32 nested_vmx_true_exit_ctls_low;
+   u32 nested_vmx_entry_ctls_low;
+   u32 nested_vmx_entry_ctls_high;
+   u32 nested_vmx_true_entry_ctls_low;
+   u32 nested_vmx_misc_low;
+   u32 nested_vmx_misc_high;
+   u32 nested_vmx_ept_caps;
 };

 #define POSTED_INTR_ON  0
@@ -2285,20 +2302,8 @@ static inline bool nested_vmx_allowed(struct
kvm_vcpu *vcpu)
  * if the corresponding bit in the (32-bit) control field *must* be on, and a
  * bit in the high half is on if the corresponding bit in the control field
  * may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
  */
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_true_procbased_ctls_low;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_true_exit_ctls_low;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static u32 nested_vmx_true_entry_ctls_low;
-static u32 nested_vmx_misc_low, nested_vmx_misc_high;
-static u32 nested_vmx_ept_caps;
-static __init void nested_vmx_setup_ctls_msrs(void)
+static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 {
/*
 * Note that as a general rule, the high half of the MSRs (bits in
@@ -2317,57 +2322,71 @@ static __init void nested_vmx_setup_ctls_msrs(void)

/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
-   nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
-   nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-   PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
-   nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_pinbased_ctls_low,
+   vmx->nested.nested_vmx_pinbased_ctls_high);
+   vmx->nested.nested_vmx_pinbased_ctls_low |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_pinbased_ctls_high &=
+   PIN_BASED_EXT_INTR_MASK |
+   PIN_BASED_NMI_EXITING |
+   PIN_BASED_VIRTUAL_NMIS;
+   vmx->nested.nested_vmx_pinbased_ctls_high |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;

/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
-   nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
-   nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_exit_ctls_low,
+   vmx->nested.nested_vmx_exit_ctls_high);
+   vmx->nested.nested_vmx_exit_ctls_low =
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

-   nested_vmx_exit_ctls_high &=
+   vmx->nested.nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_exit_ctls_high |=
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

if (vmx_mpx_supported())
-   nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+  

[PATCH v3 2/6] KVM: nVMX: Enable nested virtualize x2apic mode.

2015-01-24 Thread Wincy Van
When L2 is using x2apic, we can use virtualize x2apic mode to
gain higher performance, especially in the apicv case.

This patch also introduces nested_vmx_check_apicv_controls
for the nested apicv patches.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  121 +++-
 1 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 36d0724..4d8939d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1108,6 +1108,11 @@ static inline bool nested_cpu_has_xsaves(struct
vmcs12 *vmcs12)
vmx_xsaves_supported();
 }

+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2395,6 +2400,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -4155,6 +4161,50 @@ static void
__vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
}
 }

+/*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ */
+static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
+  unsigned long *msr_bitmap_nested,
+  u32 msr, int type)
+{
+   int f = sizeof(unsigned long);
+
+   if (!cpu_has_vmx_msr_bitmap())
+   return;
+
+   /*
+* See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+* have the write-low and read-high bitmap offsets the wrong way round.
+* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+*/
+   if (msr <= 0x1fff) {
+   if (type & MSR_TYPE_R &&
+  !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+   /* read-low */
+   __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
+
+   if (type & MSR_TYPE_W &&
+  !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+   /* write-low */
+   __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
+
+   } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+   msr &= 0x1fff;
+   if (type & MSR_TYPE_R &&
+  !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+   /* read-high */
+   __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+
+   if (type & MSR_TYPE_W &&
+  !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+   /* write-high */
+   __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+
+   }
+}
+
 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
if (!longmode_only)
@@ -8344,7 +8394,68 @@ static int
nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
-   return false;
+   struct page *page;
+   unsigned long *msr_bitmap;
+
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   return false;
+
+   page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+   if (!page) {
+   WARN_ON(1);
+   return false;
+   }
+   msr_bitmap = (unsigned long *)kmap(page);
+   if (!msr_bitmap) {
+   nested_release_page_clean(page);
+   WARN_ON(1);
+   return false;
+   }
+
+   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
+
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12))
+   /* TPR is allowed */
+   nested_vmx_disable_intercept_for_msr(msr_bitmap,
+   vmx_msr_bitmap_nested,
+   APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+   MSR_TYPE_R | MSR_TYPE_W);
+   kunmap(page);
+   nested_release_page_clean(page);
+
+   return true;
+}
+
+static inline int nested_vmx_check_virt_x2apic(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+   return -EINVAL;
+   return 0;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+  struct v

[PATCH v3 1/6] KVM: nVMX: Use hardware MSR bitmap

2015-01-24 Thread Wincy Van
Currently, if L1 enables MSR_BITMAP, we emulate this feature and
all of L2's MSR accesses are intercepted by L0. Since many features,
such as virtualize x2apic mode, have complicated logic that is
difficult for us to emulate, we should use the hardware bitmap and
merge L0's and L1's bitmaps instead.

This patch introduces nested_vmx_merge_msr_bitmap for future use.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   71 +++
 1 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c987374..36d0724 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -798,6 +798,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_nested;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;

@@ -5812,13 +5813,18 @@ static __init int hardware_setup(void)
(unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode_x2apic)
goto out4;
+
+   vmx_msr_bitmap_nested = (unsigned long *)__get_free_page(GFP_KERNEL);
+   if (!vmx_msr_bitmap_nested)
+   goto out5;
+
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
-   goto out5;
+   goto out6;

vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmwrite_bitmap)
-   goto out6;
+   goto out7;

memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5834,10 +5840,11 @@ static __init int hardware_setup(void)

memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+   memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);

if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
-   goto out7;
+   goto out8;
}

if (boot_cpu_has(X86_FEATURE_NX))
@@ -5944,10 +5951,12 @@ static __init int hardware_setup(void)

return alloc_kvm_area();

-out7:
+out8:
free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
+out7:
free_page((unsigned long)vmx_vmread_bitmap);
+out6:
+   free_page((unsigned long)vmx_msr_bitmap_nested);
 out5:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
@@ -5970,6 +5979,7 @@ static __exit void hardware_unsetup(void)
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
free_page((unsigned long)vmx_msr_bitmap_legacy);
free_page((unsigned long)vmx_msr_bitmap_longmode);
+   free_page((unsigned long)vmx_msr_bitmap_nested);
free_page((unsigned long)vmx_io_bitmap_b);
free_page((unsigned long)vmx_io_bitmap_a);
free_page((unsigned long)vmx_vmwrite_bitmap);
@@ -8305,6 +8315,38 @@ static void vmx_start_preemption_timer(struct
kvm_vcpu *vcpu)
  ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }

+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+   struct vmcs12 *vmcs12)
+{
+   int maxphyaddr;
+   u64 addr;
+
+   if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+   return 0;
+
+   if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
+   WARN_ON(1);
+   return -EINVAL;
+   }
+   maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+   if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
+  ((addr + PAGE_SIZE) >> maxphyaddr))
+   return -EINVAL;
+
+   return 0;
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   return false;
+}
+
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
   unsigned long count_field,
   unsigned long addr_field,
@@ -8637,11 +8679,17 @@ static void prepare_vmcs02(struct kvm_vcpu
*vcpu, struct vmcs12 *vmcs12)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
}

+   if (cpu_has_vmx_msr_bitmap() &&
+   exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+   nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
+   vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
+   } else
+   exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+
/*
-* Merging of IO and MSR bitmaps not currently supported.
+* Merging of IO bitmap not currently supported.
 * Rather, exit every time.

[PATCH v3 0/6] KVM: nVMX: Enable nested apicv support.

2015-01-24 Thread Wincy Van
v1 ---> v2:
  Use spin lock to ensure vmcs12 is safe when doing nested
  posted interrupt delivery.

v2 ---> v3:
  1. Add a new field in nested_vmx to avoid the spin lock in v2.
  2. Drop send eoi to L1 when doing nested interrupt delivery.
  3. Use hardware MSR bitmap to enable nested virtualize x2apic
 mode.

Wincy Van (6):
  KVM: nVMX: Use hardware MSR bitmap
  KVM: nVMX: Enable nested virtualize x2apic mode.
  KVM: nVMX: Make nested control MSRs per-cpu.
  KVM: nVMX: Enable nested apic register virtualization.
  KVM: nVMX: Enable nested virtual interrupt delivery.
  KVM: nVMX: Enable nested posted interrupt processing.

 arch/x86/kvm/vmx.c |  570 +++-
 1 files changed, 470 insertions(+), 100 deletions(-)


Re: [PATCH v2 5/5] KVM: nVMX: Enable nested posted interrupt processing.

2015-01-21 Thread Wincy Van
On Wed, Jan 21, 2015 at 4:49 PM, Zhang, Yang Z  wrote:
>>>> +   if (vector == vmcs12->posted_intr_nv &&
>>>> +   nested_cpu_has_posted_intr(vmcs12)) {
>>>> +   if (vcpu->mode == IN_GUEST_MODE)
>>>> +   apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>>>> +   POSTED_INTR_VECTOR);
>>>> +   else {
>>>> +   r = -1;
>>>> +   goto out;
>>>> +   }
>>>> +
>>>> +   /*
>>>> +* if posted intr is done by hardware, the
>>>> +* corresponding eoi was sent to L0. Thus
>>>> +* we should send eoi to L1 manually.
>>>> +*/
>>>> +   kvm_apic_set_eoi_accelerated(vcpu,
>>>> +   vmcs12->posted_intr_nv);
>>>
>>> Why is this necessary? As your comments mentioned, it is done by
>>> hardware, not L1, so why should L1 be aware of it?
>>>
>>
>> According to SDM 29.6, if the processor recognizes a posted interrupt,
>> it will send an EOI to the LAPIC.
>> If the posted intr is done by hardware, the processor will send the eoi to
>> the hardware LAPIC, not L1's, just like the non-nested case (the physical
>> interrupt is dismissed). So we should take care of L1's LAPIC and send
>> an eoi to it.
>
> No. You are not emulating the PI feature. You just reuse the hardware's
> capability, so you don't need to let L1 know about it.
>

Agreed. I had thought we had already set L1's IRR before this; I was wrong.

BTW, I was trying to complete the nested posted intr manually when the
dest vcpu is in_guest_mode but not IN_GUEST_MODE, but I found that
it is difficult to set the RVI of the destination vcpu in time, because
we have to keep the RVI, PIR and ON in sync :(

I think it is better to do a nested vmexit in the case above, rather
than emulate it, because that case is much rarer than the hardware
case.


Thanks,

Wincy.


Re: [PATCH 2/5] KVM: nVMX: Enable nested virtualize x2apic mode.

2015-01-21 Thread Wincy Van
On Wed, Jan 21, 2015 at 4:35 PM, Zhang, Yang Z  wrote:
> Wincy Van wrote on 2015-01-16:
>> When L2 is using x2apic, we can use virtualize x2apic mode to gain higher
>> performance.
>>
>> This patch also introduces nested_vmx_check_apicv_controls for the nested
>> apicv patches.
>>
>> Signed-off-by: Wincy Van 
>
> To enable x2apic, shouldn't you consider the behavior changes to rdmsr and
> wrmsr? I didn't see your patch do it. Is that correct?

Yes, indeed. I had not noticed that KVM handles the nested msr bitmap
manually; the next version will fix this.

> BTW, this patch has nothing to do with APICv; it's better not to use x2apic
> here and to change to apicv in the following patch.

Do you mean that we should split this patch from the apicv patch set?

>
>> ---
>>  arch/x86/kvm/vmx.c |   49
>> -
>>  1 files changed, 48 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 954dd54..10183ee
>> 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -1134,6 +1134,11 @@ static inline bool nested_cpu_has_xsaves(struct
>> vmcs12 *vmcs12)
>> vmx_xsaves_supported();
>>  }
>>
>> +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12
>> +*vmcs12) {
>> +   return nested_cpu_has2(vmcs12,
>> +SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
>> +}
>> +
>>  static inline bool is_exception(u32 intr_info)  {
>> return (intr_info & (INTR_INFO_INTR_TYPE_MASK |
>> INTR_INFO_VALID_MASK)) @@ -2426,6 +2431,7 @@ static void
>> nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>> vmx->nested.nested_vmx_secondary_ctls_low = 0;
>> vmx->nested.nested_vmx_secondary_ctls_high &=
>> SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
>> +   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>> SECONDARY_EXEC_WBINVD_EXITING |
>> SECONDARY_EXEC_XSAVES;
>>
>> @@ -7333,6 +7339,9 @@ static bool nested_vmx_exit_handled(struct
>> kvm_vcpu *vcpu)
>> case EXIT_REASON_APIC_ACCESS:
>> return nested_cpu_has2(vmcs12,
>>
>> SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
>> +   case EXIT_REASON_APIC_WRITE:
>> +   /* apic_write should exit unconditionally. */
>> +   return 1;
>
> APIC_WRITE vmexit is introduced by APIC register virtualization, not
> virtualize x2apic. Move it to the next patch.

Agreed, will do.

Thanks,

Wincy


Re: [PATCH 1/5] KVM: nVMX: Make nested control MSRs per-cpu.

2015-01-21 Thread Wincy Van
On Wed, Jan 21, 2015 at 4:18 PM, Zhang, Yang Z  wrote:
> Wincy Van wrote on 2015-01-16:
>> To enable nested apicv support, we need per-cpu vmx control MSRs:
>>   1. If in-kernel irqchip is enabled, we can enable nested
>>  posted interrupt, we should set posted intr bit in the
>>  nested_vmx_pinbased_ctls_high. 2. If in-kernel irqchip is disabled,
>>  we can not enable nested posted interrupt, the posted intr bit in
>>  the nested_vmx_pinbased_ctls_high will be cleared.
>> Since there would be different settings about in-kernel irqchip
>> between VMs, different nested control MSRs are needed.
>
> I'd suggest you check irqchip_in_kernel() instead of moving the whole ctrl 
> msr to per vcpu.
>

Yes, moving those msrs looks a bit ugly, but irqchip_in_kernel is
per-VM, not a global setting, and there can be different
kernel_irqchip settings between VMs.
If we used irqchip_in_kernel to check it and set different values of
the ctl msrs each time, I think it would be even worse than moving
the msrs, because this logic belongs in an init function and the
settings should be computed in one place.

Thanks,

Wincy


Re: [PATCH v2 5/5] KVM: nVMX: Enable nested posted interrupt processing.

2015-01-21 Thread Wincy Van
On Wed, Jan 21, 2015 at 4:07 PM, Zhang, Yang Z  wrote:
>> +   if (vector == vmcs12->posted_intr_nv &&
>> +   nested_cpu_has_posted_intr(vmcs12)) {
>> +   if (vcpu->mode == IN_GUEST_MODE)
>> +   apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>> +   POSTED_INTR_VECTOR);
>> +   else {
>> +   r = -1;
>> +   goto out;
>> +   }
>> +
>> +   /*
>> +* if posted intr is done by hardware, the
>> +* corresponding eoi was sent to L0. Thus
>> +* we should send eoi to L1 manually.
>> +*/
>> +   kvm_apic_set_eoi_accelerated(vcpu,
>> +   vmcs12->posted_intr_nv);
>
> Why is this necessary? As your comments mentioned, it is done by hardware, not 
> L1, so why should L1 be aware of it?
>

According to SDM 29.6, if the processor recognizes a posted interrupt,
it will send an EOI to the LAPIC.
If the posted interrupt is handled by hardware, the processor sends
that EOI to the physical LAPIC, not L1's, just like the non-nested
case (the physical interrupt is dismissed). So we should take care of
L1's LAPIC and send an EOI to it manually.
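In code form (names taken from the patch above):

    if (vcpu->mode == IN_GUEST_MODE)
            /* hardware does PIR->VIRR and EOIs the physical LAPIC */
            apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
                                POSTED_INTR_VECTOR);
    ...
    /* L1's virtual LAPIC never saw that EOI, so complete it manually */
    kvm_apic_set_eoi_accelerated(vcpu, vmcs12->posted_intr_nv);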


Thanks,

Wincy


Re: [PATCH v2 5/5] KVM: nVMX: Enable nested posted interrupt processing.

2015-01-20 Thread Wincy Van
On Tue, Jan 20, 2015 at 5:54 PM, Paolo Bonzini  wrote:
>
>
> On 20/01/2015 09:48, Wincy Van wrote:
>> +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
>> +   int vector)
>> +{
>> +   int r = 0;
>> +   struct vmcs12 *vmcs12;
>> +
>> +   /*
>> +* Since posted intr delivery is async,
>> +* we must acquire a spin-lock to avoid
>> +* the race of vmcs12.
>> +*/
>> +   spin_lock(&to_vmx(vcpu)->nested.vmcs12_lock);
>> +   vmcs12 = get_vmcs12(vcpu);
>> +   if (!is_guest_mode(vcpu) || !vmcs12) {
>> +   r = -1;
>> +   goto out;
>> +   }
>
> is_guest_mode should be checked first outside the lock, to avoid
> affecting the non-nested fast path.  You can then recheck it later
> inside the lock.

Agreed, will do.

>
> Another way to avoid the spinlock: in prepare_vmcs02 or a similar place,
> you can save vmcs12->posted_intr_nv in a new field
> vmx->nested.posted_intr_nv; just set it to -1 if
> !nested_cpu_has_posted_intr(vmcs12).  In vmclear, again you just set the
> field to -1, and here you can do
>
> if (!is_guest_mode(vcpu) ||
> vector != to_vmx(vcpu)->nested.posted_intr_nv) {
> r = -1;
> goto out;
> }
>
> You don't need to access vmcs12, and while there is a race, it's okay
> because there is no pointer access.

That's a good idea. I will apply it to the next version.
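For the record, the shape of that suggestion (a sketch only, assuming
the new vmx->nested.posted_intr_nv field is cached wide enough to
hold -1; not the final code):

    /* in prepare_vmcs02(), while vmcs12 is known to be valid: */
    vmx->nested.posted_intr_nv = nested_cpu_has_posted_intr(vmcs12) ?
                                 vmcs12->posted_intr_nv : -1;

    /* in handle_vmclear() and friends: */
    vmx->nested.posted_intr_nv = -1;

    /* in vmx_deliver_nested_posted_interrupt(), no vmcs12 access: */
    if (!is_guest_mode(vcpu) ||
        vector != to_vmx(vcpu)->nested.posted_intr_nv) {
            r = -1;
            goto out;    /* racy, but no pointer is dereferenced */
    }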

>
>>
>> +   if (vcpu->mode == IN_GUEST_MODE)
>> +   apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>> +   POSTED_INTR_VECTOR);
>
> Please add a comment that PIR and ON have been set by the L1 hypervisor.

Will do.

>
> I'll do a full review the other patches as soon as possible.
>

Thank you, I will send v3 after it is done.


Wincy


[PATCH v2 5/5] KVM: nVMX: Enable nested posted interrupt processing.

2015-01-20 Thread Wincy Van
If a vcpu has an interrupt pending in vmx non-root mode, we
kick that vcpu to inject the interrupt in a timely way. With
posted interrupt processing, the kick is not needed, and
interrupts are fully taken care of by hardware.

In nested vmx, this feature avoids far more vmexits than in
the non-nested case.

This patch uses L0's POSTED_INTR_NV to avoid an unexpected
interrupt if L1's vector is different from L0's. If the vcpu
is in hardware non-root mode, we use a physical IPI to
deliver posted interrupts; otherwise we deliver that
interrupt to L1 and kick that vcpu out of nested
non-root mode.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  136 ++--
 1 files changed, 132 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ea56e9f..cda9133 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -215,6 +215,7 @@ struct __packed vmcs12 {
u64 tsc_offset;
u64 virtual_apic_page_addr;
u64 apic_access_addr;
+   u64 posted_intr_desc_addr;
u64 ept_pointer;
u64 eoi_exit_bitmap0;
u64 eoi_exit_bitmap1;
@@ -334,6 +335,7 @@ struct __packed vmcs12 {
u32 vmx_preemption_timer_value;
u32 padding32[7]; /* room for future expansion */
u16 virtual_processor_id;
+   u16 posted_intr_nv;
u16 guest_es_selector;
u16 guest_cs_selector;
u16 guest_ss_selector;
@@ -387,6 +389,7 @@ struct nested_vmx {
/* The host-usable pointer to the above */
struct page *current_vmcs12_page;
struct vmcs12 *current_vmcs12;
+   spinlock_t vmcs12_lock;
struct vmcs *current_shadow_vmcs;
/*
 * Indicates if the shadow vmcs must be updated with the
@@ -406,6 +409,8 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct page *virtual_apic_page;
+   struct page *pi_desc_page;
+   struct pi_desc *pi_desc;
u64 msr_ia32_feature_control;

struct hrtimer preemption_timer;
@@ -621,6 +626,7 @@ static int max_shadow_read_write_fields =

 static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+   FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -646,6 +652,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(TSC_OFFSET, tsc_offset),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+   FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
FIELD64(EPT_POINTER, ept_pointer),
FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
@@ -798,6 +805,7 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1159,6 +1167,11 @@ static inline bool nested_cpu_has_vid(struct
vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 }

+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+   return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2362,6 +2375,9 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
+   if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+   vmx->nested.nested_vmx_pinbased_ctls_high |=
+   PIN_BASED_POSTED_INTR;

/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
@@ -4267,6 +4283,46 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
return enable_apicv && irqchip_in_kernel(kvm);
 }

+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+   int vector)
+{
+   int r = 0;
+   struct vmcs12 *vmcs12;
+
+   /*
+* Since posted intr delivery is async,
+* we must acquire a spin-lock to avoid
+* the race of vmcs12.
+*/
+   spin_lock(&to_vmx(vcpu)->nested.vmcs12_lock);
+   vmcs12 = get_vmcs12(vcpu);
+   if (!is_guest_mode(vcpu) || !vmcs12) {
+   r = -1;
+   goto out;
+   }
+   if (vector == vmcs12->posted_intr_nv &&
+   nested_cpu_has_post

Re: [PATCH 5/5] KVM: nVMX: Enable nested posted interrupt processing.

2015-01-19 Thread Wincy Van
On Tue, Jan 20, 2015 at 3:34 PM, Paolo Bonzini  wrote:
>> Hence, we can disable local interrupts while delivering nested posted
>> interrupts to make sure
>> we are faster than the destination vcpu. This is a bit tricky but it
>> can avoid that race. I think we
>> do not need to add a spin lock here. RCU does not fit this case, since
>> it will introduce a
>> new race window between the rcu handler and handle_vmptr**.
>>
>> I am wondering that whether there is a better way : )
>
> Why not just use a spinlock?
>

Hmm.. it seems that using a spinlock is the best way.
I think we can drop the local_irq_save and use a spinlock instead.
I can send v2 if necessary. Any more ideas?


Thanks,

Wincy


Re: [PATCH 0/5] KVM: nVMX: Enable nested apicv support.

2015-01-19 Thread Wincy Van
Hi, Yang,

Could you please have a look at this patch set?
Your comments are much appreciated!


Thanks,

Wincy


Re: [PATCH 5/5] KVM: nVMX: Enable nested posted interrupt processing.

2015-01-19 Thread Wincy Van
On Mon, Jan 19, 2015 at 7:43 PM, Paolo Bonzini  wrote:
> Hi Wincy,
>
> there is only one thing that I don't understand in this patchset, and it is:
>
> On 16/01/2015 06:59, Wincy Van wrote:
>> +   /*
>> +* if vcpu is in L2, we are fast enough to complete
>> +* before L1 changes/destroys vmcs12.
>> +*/
>
> ... this comment.  What do you mean exactly?
>

Hi, Paolo,

Actually, there is a race window between
vmx_deliver_nested_posted_interrupt and nested_release_vmcs12
since posted intr delivery is async:

cpu 1 (nested posted intr)              cpu 2 (dest vcpu, release vmcs12)
--------------------------              ---------------------------------
vmcs12 = get_vmcs12(vcpu);
if (!is_guest_mode(vcpu) || !vmcs12) {
        r = -1;
        goto out;
}
                                        kunmap(vmx->nested.current_vmcs12_page);
                                        ...

oops! current vmcs12 is invalid.

However, we have already checked that the destination vcpu is in
guest mode, and if L1 wants to destroy vmcs12 (in
handle_vmptrld/clear, etc.), the dest vcpu must first have done a
nested vmexit and then a non-nested vmexit (handle_vmptr***).

Hence, we can disable local interrupts while delivering nested posted
interrupts to make sure we are faster than the destination vcpu. This
is a bit tricky but it can avoid that race. I think we do not need to
add a spin lock here. RCU does not fit this case, since it would
introduce a new race window between the rcu handler and handle_vmptr**.
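In other words, the delivery side would look like this (a sketch only,
using the same names as the patch below):

    local_irq_save(flags);
    vmcs12 = get_vmcs12(vcpu);
    if (is_guest_mode(vcpu) && vmcs12 &&
        vector == vmcs12->posted_intr_nv &&
        nested_cpu_has_posted_intr(vmcs12) &&
        vcpu->mode == IN_GUEST_MODE)
            /*
             * With irqs off, nothing can delay us between the check
             * and the IPI, so we finish before the dest vcpu can do
             * the two vmexits needed to reach a vmclear.
             */
            apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
                                POSTED_INTR_VECTOR);
    local_irq_restore(flags);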

I am wondering whether there is a better way : )

Thanks,

Wincy


[PATCH 5/5] KVM: nVMX: Enable nested posted interrupt processing.

2015-01-15 Thread Wincy Van
If a vcpu has an interrupt pending in vmx non-root mode, we
kick that vcpu to inject the interrupt in a timely way. With
posted interrupt processing, the kick is not needed, and
interrupts are fully taken care of by hardware.

In nested vmx, this feature avoids far more vmexits than in
the non-nested case.

This patch uses L0's POSTED_INTR_NV to avoid an unexpected
interrupt if L1's vector is different from L0's. If the vcpu
is in hardware non-root mode, we use a physical IPI to
deliver posted interrupts; otherwise we deliver that
interrupt to L1 and kick that vcpu out of nested
non-root mode.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  131 ++--
 1 files changed, 127 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ea56e9f..5aeef79 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -215,6 +215,7 @@ struct __packed vmcs12 {
u64 tsc_offset;
u64 virtual_apic_page_addr;
u64 apic_access_addr;
+   u64 posted_intr_desc_addr;
u64 ept_pointer;
u64 eoi_exit_bitmap0;
u64 eoi_exit_bitmap1;
@@ -334,6 +335,7 @@ struct __packed vmcs12 {
u32 vmx_preemption_timer_value;
u32 padding32[7]; /* room for future expansion */
u16 virtual_processor_id;
+   u16 posted_intr_nv;
u16 guest_es_selector;
u16 guest_cs_selector;
u16 guest_ss_selector;
@@ -406,6 +408,8 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct page *virtual_apic_page;
+   struct page *pi_desc_page;
+   struct pi_desc *pi_desc;
u64 msr_ia32_feature_control;

struct hrtimer preemption_timer;
@@ -621,6 +625,7 @@ static int max_shadow_read_write_fields =

 static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+   FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -646,6 +651,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(TSC_OFFSET, tsc_offset),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+   FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
FIELD64(EPT_POINTER, ept_pointer),
FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
@@ -798,6 +804,7 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1159,6 +1166,11 @@ static inline bool nested_cpu_has_vid(struct
vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 }

+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+   return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2362,6 +2374,9 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
+   if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+   vmx->nested.nested_vmx_pinbased_ctls_high |=
+   PIN_BASED_POSTED_INTR;

/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
@@ -4267,6 +4282,46 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
return enable_apicv && irqchip_in_kernel(kvm);
 }

+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+   int vector)
+{
+   int r = 0;
+   unsigned long flags;
+   struct vmcs12 *vmcs12;
+
+   /*
+* if vcpu is in L2, we are fast enough to complete
+* before L1 changes/destroys vmcs12.
+*/
+   local_irq_save(flags);
+   vmcs12 = get_vmcs12(vcpu);
+   if (!is_guest_mode(vcpu) || !vmcs12) {
+   r = -1;
+   goto out;
+   }
+   if (vector == vmcs12->posted_intr_nv &&
+   nested_cpu_has_posted_intr(vmcs12)) {
+   if (vcpu->mode == IN_GUEST_MODE)
+   apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+   POSTED_INTR_VECTOR);
+   else {
+   r = -1;
+   goto out;
+   }
+
+   /*
+* if posted in

[PATCH 4/5] KVM: nVMX: Enable nested virtual interrupt delivery.

2015-01-15 Thread Wincy Van
With virtual interrupt delivery, the hardware saves KVM from the
low-efficiency interrupt injection path. In nested vmx it is an
important feature: we can avoid many more nested vmexits,
especially in high-throughput scenarios.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   49 +++--
 1 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 99e19bb..ea56e9f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -216,6 +216,10 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 ept_pointer;
+   u64 eoi_exit_bitmap0;
+   u64 eoi_exit_bitmap1;
+   u64 eoi_exit_bitmap2;
+   u64 eoi_exit_bitmap3;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
@@ -338,6 +342,7 @@ struct __packed vmcs12 {
u16 guest_gs_selector;
u16 guest_ldtr_selector;
u16 guest_tr_selector;
+   u16 guest_intr_status;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -624,6 +629,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+   FIELD(GUEST_INTR_STATUS, guest_intr_status),
FIELD(HOST_ES_SELECTOR, host_es_selector),
FIELD(HOST_CS_SELECTOR, host_cs_selector),
FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -641,6 +647,10 @@ static const unsigned short
vmcs_field_to_offset_table[] = {
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(EPT_POINTER, ept_pointer),
+   FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+   FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+   FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+   FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1144,6 +1154,11 @@ static inline bool
nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
 }

+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2438,6 +2453,7 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
+   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -7346,7 +7362,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_APIC_WRITE:
-   /* apic_write should exit unconditionally. */
+   case EXIT_REASON_EOI_INDUCED:
+   /* apic_write and eoi_induced should exit unconditionally. */
return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
@@ -8379,17 +8396,29 @@ static inline int
nested_vmx_check_virt_x2apic(struct kvm_vcpu *vcpu,
return 0;
 }

+static inline int nested_vmx_check_vid(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   if (!nested_exit_on_intr(vcpu))
+   return -EINVAL;
+   return 0;
+}
+
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
int r;

if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
-   !nested_cpu_has_apic_reg_virt(vmcs12))
+   !nested_cpu_has_apic_reg_virt(vmcs12) &&
+   !nested_cpu_has_vid(vmcs12))
return 0;

if (nested_cpu_has_virt_x2apic_mode(vmcs12))
r = nested_vmx_check_virt_x2apic(vcpu, vmcs12);
+   if (nested_cpu_has_vid(vmcs12))
+   r |= nested_vmx_check_vid(vcpu, vmcs12);
+
if (r)
goto fail;

@@ -8705,6 +8734,19 @@ static void prepare_vmcs02(struct kvm_vcpu
*vcpu, struct vmcs12 *vmcs12)
kvm_vcpu_reload_apic_access_page(vcpu);
}

+   if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
+   vmcs_write64(EOI_EXIT_BITMAP0,
+   vmcs12->eoi_exit_bitmap0);
+   vmcs_write64(EOI_EXIT_BITMAP1,
+  

[PATCH 3/5] KVM: nVMX: Enable nested apic register virtualization.

2015-01-15 Thread Wincy Van
We can reduce the cost of apic register virtualization with this
feature; it is also a requirement for virtual interrupt delivery
and posted interrupt processing.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   12 ++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 10183ee..99e19bb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1139,6 +1139,11 @@ static inline bool
nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 }

+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2432,6 +2437,7 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+   SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -8378,10 +8384,12 @@ static int
nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 {
int r;

-   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+   !nested_cpu_has_apic_reg_virt(vmcs12))
return 0;

-   r = nested_vmx_check_virt_x2apic(vcpu, vmcs12);
+   if (nested_cpu_has_virt_x2apic_mode(vmcs12))
+   r = nested_vmx_check_virt_x2apic(vcpu, vmcs12);
if (r)
goto fail;

--
1.7.1


[PATCH 2/5] KVM: nVMX: Enable nested virtualize x2apic mode.

2015-01-15 Thread Wincy Van
When L2 is using x2apic, we can use virtualize x2apic mode to
gain higher performance.

This patch also introduces nested_vmx_check_apicv_controls
for the nested apicv patches.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |   49 -
 1 files changed, 48 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 954dd54..10183ee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1134,6 +1134,11 @@ static inline bool nested_cpu_has_xsaves(struct
vmcs12 *vmcs12)
vmx_xsaves_supported();
 }

+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2426,6 +2431,7 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_low = 0;
vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;

@@ -7333,6 +7339,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+   case EXIT_REASON_APIC_WRITE:
+   /* apic_write should exit unconditionally. */
+   return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
 * L0 always deals with the EPT violation. If nested EPT is
@@ -8356,6 +8365,38 @@ static void vmx_start_preemption_timer(struct
kvm_vcpu *vcpu)
  ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }

+static inline int nested_vmx_check_virt_x2apic(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+   return -EINVAL;
+   return 0;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+  struct vmcs12 *vmcs12)
+{
+   int r;
+
+   if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+   return 0;
+
+   r = nested_vmx_check_virt_x2apic(vcpu, vmcs12);
+   if (r)
+   goto fail;
+
+   /* tpr shadow is needed by all apicv features. */
+   if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+   r = -EINVAL;
+   goto fail;
+   }
+
+   return 0;
+
+fail:
+   return r;
+}
+
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
   unsigned long count_field,
   unsigned long addr_field,
@@ -8649,7 +8690,8 @@ static void prepare_vmcs02(struct kvm_vcpu
*vcpu, struct vmcs12 *vmcs12)
else
vmcs_write64(APIC_ACCESS_ADDR,
  page_to_phys(vmx->nested.apic_access_page));
-   } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
+   } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
+   (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
kvm_vcpu_reload_apic_access_page(vcpu);
@@ -8856,6 +8898,11 @@ static int nested_vmx_run(struct kvm_vcpu
*vcpu, bool launch)
return 1;
}

+   if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
+   nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+   return 1;
+   }
+
if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
--
1.7.1


[PATCH 1/5] KVM: nVMX: Make nested control MSRs per-cpu.

2015-01-15 Thread Wincy Van
To enable nested apicv support, we need per-cpu vmx
control MSRs:
  1. If the in-kernel irqchip is enabled, we can enable nested
     posted interrupts and should set the posted intr bit in
     nested_vmx_pinbased_ctls_high.
  2. If the in-kernel irqchip is disabled, we cannot enable
     nested posted interrupts, and the posted intr bit
     in nested_vmx_pinbased_ctls_high will be cleared.

Since different VMs can have different in-kernel irqchip
settings, different nested control MSRs are needed.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/vmx.c |  215 +++-
 1 files changed, 129 insertions(+), 86 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ce35071..954dd54 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -408,6 +408,23 @@ struct nested_vmx {

/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+
+   u32 nested_vmx_procbased_ctls_low;
+   u32 nested_vmx_procbased_ctls_high;
+   u32 nested_vmx_true_procbased_ctls_low;
+   u32 nested_vmx_secondary_ctls_low;
+   u32 nested_vmx_secondary_ctls_high;
+   u32 nested_vmx_pinbased_ctls_low;
+   u32 nested_vmx_pinbased_ctls_high;
+   u32 nested_vmx_exit_ctls_low;
+   u32 nested_vmx_exit_ctls_high;
+   u32 nested_vmx_true_exit_ctls_low;
+   u32 nested_vmx_entry_ctls_low;
+   u32 nested_vmx_entry_ctls_high;
+   u32 nested_vmx_true_entry_ctls_low;
+   u32 nested_vmx_misc_low;
+   u32 nested_vmx_misc_high;
+   u32 nested_vmx_ept_caps;
 };

 #define POSTED_INTR_ON  0
@@ -2289,20 +2306,8 @@ static inline bool nested_vmx_allowed(struct
kvm_vcpu *vcpu)
  * if the corresponding bit in the (32-bit) control field *must* be on, and a
  * bit in the high half is on if the corresponding bit in the control field
  * may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
  */
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_true_procbased_ctls_low;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_true_exit_ctls_low;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static u32 nested_vmx_true_entry_ctls_low;
-static u32 nested_vmx_misc_low, nested_vmx_misc_high;
-static u32 nested_vmx_ept_caps;
-static __init void nested_vmx_setup_ctls_msrs(void)
+static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 {
/*
 * Note that as a general rule, the high half of the MSRs (bits in
@@ -2321,57 +2326,71 @@ static __init void nested_vmx_setup_ctls_msrs(void)

/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
-   nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
-   nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-   PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
-   nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_pinbased_ctls_low,
+   vmx->nested.nested_vmx_pinbased_ctls_high);
+   vmx->nested.nested_vmx_pinbased_ctls_low |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_pinbased_ctls_high &=
+   PIN_BASED_EXT_INTR_MASK |
+   PIN_BASED_NMI_EXITING |
+   PIN_BASED_VIRTUAL_NMIS;
+   vmx->nested.nested_vmx_pinbased_ctls_high |=
+   PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;

/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
-   nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
-   nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   vmx->nested.nested_vmx_exit_ctls_low,
+   vmx->nested.nested_vmx_exit_ctls_high);
+   vmx->nested.nested_vmx_exit_ctls_low =
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

-   nested_vmx_exit_ctls_high &=
+   vmx->nested.nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+   vmx->nested.nested_vmx_exit_ctls_high |=
+   VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

if (vmx_mpx_supported())
-   nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+  

[PATCH 0/5] KVM: nVMX: Enable nested apicv support.

2015-01-15 Thread Wincy Van
In nested vmx, the efficiency of interrupt virtualization is
very important, especially in high-throughput scenarios.

This patch set enables nested apicv support, which brings a
huge improvement to nested interrupt virtualization.

I also have done some simple tests:
L0: Intel Xeon E5-2630 v2
L1: CentOS 6.5 with 3.10.64-1.el6.elrepo.x86_64 kernel
16 vcpus, 32GB memory.

L2: Windows Server 2008 R2 Datacenter
8 vcpus, 16GB memory.

 1. Run wprime 32M, 8 threads.

         original      nested apicv

          7.782s            7.172s

Improvement: 7.8%

 2. Run iperf -s -w 64k in L1,
    iperf -c 10.1.0.2 -p 5001 -i 1 -t 30 -P 8 -w 64k in L2

         original      nested apicv

    2.12 Gbits/s      3.50 Gbits/s

Improvement: 65.0%

_

L2: CentOS 6.5 with 2.6.32-431 kernel
8 vcpus, 16GB memory.

 1. Run iperf -s -w 64k in L1,
iperf -c 10.1.0.2 -p 5001 -i 1 -t 30 -P 8 -w 64k in L2

         original      nested apicv

    6.58 Gbits/s      14.2 Gbits/s

Improvement: 115.8%

Wincy Van (5):
  KVM: nVMX: Make nested control MSRs per-cpu.
  KVM: nVMX: Enable nested virtualize x2apic mode.
  KVM: nVMX: Enable nested apic register virtualization.
  KVM: nVMX: Enable nested virtual interrupt delivery.
  KVM: nVMX: Enable nested posted interrupt processing.

 arch/x86/kvm/vmx.c |  444 +---
 1 files changed, 355 insertions(+), 89 deletions(-)


Re: [PATCH 1/1] KVM: ioapic: Record edge-triggered interrupts delivery status.

2015-01-08 Thread Wincy Van
Ping..

Hi, Paolo, could you please have a look at this patch?


Thanks,

Wincy


Re: [PATCH 1/1] KVM: ioapic: Record edge-triggered interrupts delivery status.

2014-12-25 Thread Wincy Van
2014-12-24 23:29 GMT+08:00 Jan Kiszka :
> On 2014-12-24 04:14, Wincy Van wrote:
>> This patch fixes the bug discussed in
>> https://www.mail-archive.com/kvm@vger.kernel.org/msg109813.html
>>
>> This patch uses a new field named irr_delivered to record the
>> delivery status of edge-triggered interrupts, and clears the
>> delivered interrupts in kvm_get_ioapic. So it has the same effect
>> as commit 0bc830b05c667218d703f2026ec866c49df974fc
>> ("KVM: ioapic: clear IRR for edge-triggered interrupts at delivery")
>> while avoiding the bug with Windows guests.
>>
>> Signed-off-by: Wincy Van 
>> ---
>>  arch/x86/kvm/ioapic.c |7 ++-
>>  arch/x86/kvm/ioapic.h |1 +
>>  2 files changed, 7 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
>> index b1947e0..a2e9d96 100644
>> --- a/arch/x86/kvm/ioapic.c
>> +++ b/arch/x86/kvm/ioapic.c
>> @@ -206,6 +206,8 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, 
>> unsigned int irq,
>>
>>   old_irr = ioapic->irr;
>>   ioapic->irr |= mask;
>> + if (edge)
>> + ioapic->irr_delivered &= ~mask;
>>   if ((edge && old_irr == ioapic->irr) ||
>>   (!edge && entry.fields.remote_irr)) {
>>   ret = 0;
>> @@ -349,7 +351,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int 
>> irq, bool line_status)
>>   irqe.shorthand = 0;
>>
>>   if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
>> - ioapic->irr &= ~(1 << irq);
>> + ioapic->irr_delivered |= 1 << irq;
>>
>>   if (irq == RTC_GSI && line_status) {
>>   /*
>> @@ -597,6 +599,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
>>   ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
>>   ioapic->ioregsel = 0;
>>   ioapic->irr = 0;
>> + ioapic->irr_delivered = 0;
>>   ioapic->id = 0;
>>   memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
>>   rtc_irq_eoi_tracking_reset(ioapic);
>> @@ -654,6 +657,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct 
>> kvm_ioapic_state *state)
>>
>>   spin_lock(&ioapic->lock);
>>   memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
>> + state->irr &= ~ioapic->irr_delivered;
>>   spin_unlock(&ioapic->lock);
>>   return 0;
>>  }
>> @@ -667,6 +671,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct 
>> kvm_ioapic_state *state)
>>   spin_lock(&ioapic->lock);
>>   memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
>>   ioapic->irr = 0;
>> + ioapic->irr_delivered = 0;
>>   update_handled_vectors(ioapic);
>>   kvm_vcpu_request_scan_ioapic(kvm);
>>   kvm_ioapic_inject_all(ioapic, state->irr);
>> diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
>> index 3c91955..a5cdfc0 100644
>> --- a/arch/x86/kvm/ioapic.h
>> +++ b/arch/x86/kvm/ioapic.h
>> @@ -77,6 +77,7 @@ struct kvm_ioapic {
>>   struct rtc_status rtc_status;
>>   struct delayed_work eoi_inject;
>>   u32 irq_eoi[IOAPIC_NUM_PINS];
>> + u32 irr_delivered;
>>  };
>>
>>  #ifdef DEBUG
>>
>
> Does this introduce a state which requires save/restore on migration? If
> so, then you need to extend the existing interface - in a
> backward-compatible way. If not, please leave a remark on the reason.
>

No, we do not need to save/restore irr_delivered.

First of all, irr_delivered affects irr only when saving the ioapic's
state; it does not affect any of the ioapic's logic.

Then, let's see what will happen if we do not save/restore that field:

1. If irr_delivered is 0 before saving, there is no difference at all.
2. If a bit of irr_delivered is 1 before saving, since irr_delivered
   only affects migration, we should check whether a 2nd (or later)
   migration is OK. There are 3 possibilities on the first destination:
   (1) The edge-triggered IRQ is masked: that bit will be cleared, so
       there is no difference for the 2nd migration.
   (2) The edge-triggered IRQ is raised and not masked: that bit will
       be set again, so there is no difference for the 2nd migration.
   (3) The edge-triggered IRQ is lowered and the corresponding bit of
       irr is cleared: the result of
       "state->irr &= ~ioapic->irr_delivered" in kvm_get_ioapic is not
       affected by irr_delivered.

So it is OK to migrate with a stateless irr_delivered.
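As a worked example of the save-time masking (hypothetical bit values):

    /* irr           = 0b0110: pins 1 and 2 pending
     * irr_delivered = 0b0010: pin 1 was edge-triggered and already
     *                 delivered to the vcpu
     * saved irr     = 0b0100: only pin 2 migrates as pending
     */
    state->irr &= ~ioapic->irr_delivered;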


Thanks,

Wincy




> Jan
>
>


[PATCH 1/1] KVM: ioapic: Record edge-triggered interrupts delivery status.

2014-12-23 Thread Wincy Van
This patch fixes the bug discussed in
https://www.mail-archive.com/kvm@vger.kernel.org/msg109813.html

This patch uses a new field named irr_delivered to record the
delivery status of edge-triggered interrupts, and clears the
delivered interrupts in kvm_get_ioapic. So it has the same effect
as commit 0bc830b05c667218d703f2026ec866c49df974fc
("KVM: ioapic: clear IRR for edge-triggered interrupts at delivery")
while avoiding the bug with Windows guests.

Signed-off-by: Wincy Van 
---
 arch/x86/kvm/ioapic.c |7 ++-
 arch/x86/kvm/ioapic.h |1 +
 2 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index b1947e0..a2e9d96 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -206,6 +206,8 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, 
unsigned int irq,
 
old_irr = ioapic->irr;
ioapic->irr |= mask;
+   if (edge)
+   ioapic->irr_delivered &= ~mask;
if ((edge && old_irr == ioapic->irr) ||
(!edge && entry.fields.remote_irr)) {
ret = 0;
@@ -349,7 +351,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int 
irq, bool line_status)
irqe.shorthand = 0;
 
if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
-   ioapic->irr &= ~(1 << irq);
+   ioapic->irr_delivered |= 1 << irq;
 
if (irq == RTC_GSI && line_status) {
/*
@@ -597,6 +599,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
ioapic->ioregsel = 0;
ioapic->irr = 0;
+   ioapic->irr_delivered = 0;
ioapic->id = 0;
memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
rtc_irq_eoi_tracking_reset(ioapic);
@@ -654,6 +657,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state 
*state)
 
spin_lock(&ioapic->lock);
memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+   state->irr &= ~ioapic->irr_delivered;
spin_unlock(&ioapic->lock);
return 0;
 }
@@ -667,6 +671,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state 
*state)
spin_lock(&ioapic->lock);
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
ioapic->irr = 0;
+   ioapic->irr_delivered = 0;
update_handled_vectors(ioapic);
kvm_vcpu_request_scan_ioapic(kvm);
kvm_ioapic_inject_all(ioapic, state->irr);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 3c91955..a5cdfc0 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -77,6 +77,7 @@ struct kvm_ioapic {
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
+   u32 irr_delivered;
 };
 
 #ifdef DEBUG
-- 
1.7.1



[PATCH] KVM: nVMX: nested MSR auto load/restore emulation.

2014-11-22 Thread Wincy Van
Some hypervisors need the MSR auto load/restore feature.

We read MSRs from the vm-entry MSR load area specified by L1
and load them via kvm_set_msr on nested entry. When a nested
exit occurs, we get the MSRs via kvm_get_msr and write them to
L1's MSR store area. After that, we read MSRs from the vm-exit
MSR load area and load them via kvm_set_msr.

VirtualBox will work fine with this patch.

Signed-off-by: Wincy Van 
---
 arch/x86/include/uapi/asm/vmx.h |5 ++
 arch/x86/kvm/vmx.c  |  123 --
 arch/x86/kvm/x86.c  |1 +
 virt/kvm/kvm_main.c |1 +
 4 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 990a2fe..986af3f 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -56,6 +56,7 @@
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE   32
 #define EXIT_REASON_INVALID_STATE   33
+#define EXIT_REASON_MSR_LOAD_FAIL   34
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
 #define EXIT_REASON_MONITOR_INSTRUCTION 39
 #define EXIT_REASON_PAUSE_INSTRUCTION   40
@@ -114,8 +115,12 @@
{ EXIT_REASON_APIC_WRITE,"APIC_WRITE" }, \
{ EXIT_REASON_EOI_INDUCED,   "EOI_INDUCED" }, \
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+   { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \
{ EXIT_REASON_INVD,  "INVD" }, \
{ EXIT_REASON_INVVPID,   "INVVPID" }, \
{ EXIT_REASON_INVPCID,   "INVPCID" }
 
+#define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
+#define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
+
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 330a08a..03daefc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6088,6 +6088,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
 */
 }
 
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+   /* TODO: not to reset guest simply here. */
+   kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+   pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+}
+
 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
 {
struct vcpu_vmx *vmx =
@@ -8215,6 +8222,92 @@ static void vmx_start_preemption_timer(struct kvm_vcpu 
*vcpu)
  ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }
 
+static inline int nested_msr_check_common(struct vmx_msr_entry *e)
+{
+   if (e->index >> 8 == 0x8 || e->reserved != 0)
+   return -EINVAL;
+   return 0;
+}
+
+static inline int nested_load_msr_check(struct vmx_msr_entry *e)
+{
+   if (e->index == MSR_FS_BASE ||
+   e->index == MSR_GS_BASE ||
+   nested_msr_check_common(e))
+   return -EINVAL;
+   return 0;
+}
+
+/* load guest msr at nested entry.
+ * return 0 for success, entry index for failed.
+ */
+static u32 nested_entry_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+   u32 i = 0;
+   struct vmx_msr_entry e;
+   struct msr_data msr;
+
+   msr.host_initiated = false;
+   while (i < count) {
+   kvm_read_guest(vcpu->kvm,
+   gpa + i * sizeof(struct vmx_msr_entry),
+   &e, sizeof(struct vmx_msr_entry));
+   if (nested_load_msr_check(&e))
+   goto fail;
+   msr.index = e.index;
+   msr.data = e.value;
+   if (kvm_set_msr(vcpu, &msr))
+   goto fail;
+   ++i;
+   }
+   return 0;
+fail:
+   return i + 1;
+}
+
+static int nested_exit_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+   u32 i = 0;
+   struct vmx_msr_entry e;
+
+   while (i < count) {
+   kvm_read_guest(vcpu->kvm,
+   gpa + i * sizeof(struct vmx_msr_entry),
+   &e, sizeof(struct vmx_msr_entry));
+   if (nested_msr_check_common(&e))
+   return -EINVAL;
+   if (kvm_get_msr(vcpu, e.index, &e.value))
+   return -EINVAL;
+   kvm_write_guest(vcpu->kvm,
+   gpa + i * sizeof(struct vmx_msr_entry),
+   &e, sizeof(struct vmx_msr_entry));
+   ++i;
+   }
+   return 0;
+}
+
+static int nested_exit_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+   u32 i = 0;
+   struct vmx_msr_entry e;
+   struct msr_data msr;
+
+   msr.host_initiated = false;
+   while (i < count) {
+   kvm_read_guest(vcpu->kvm,
+   gpa 

[PATCH] KVM: nVMX: nested MSR auto load/restore emulation.

2014-11-21 Thread Wincy Van
Some hypervisors need the MSR auto load/restore feature.

We read MSRs from the vm-entry MSR load area specified by L1
and load them via kvm_set_msr on nested entry. When a nested
exit occurs, we get the MSRs via kvm_get_msr and write them to
L1's MSR store area. After that, we read MSRs from the vm-exit
MSR load area and load them via kvm_set_msr.

VirtualBox will work fine with this patch.

Signed-off-by: Wincy Van 

diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 990a2fe..986af3f 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -56,6 +56,7 @@
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE   32
 #define EXIT_REASON_INVALID_STATE   33
+#define EXIT_REASON_MSR_LOAD_FAIL   34
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
 #define EXIT_REASON_MONITOR_INSTRUCTION 39
 #define EXIT_REASON_PAUSE_INSTRUCTION   40
@@ -114,8 +115,12 @@
  { EXIT_REASON_APIC_WRITE,"APIC_WRITE" }, \
  { EXIT_REASON_EOI_INDUCED,   "EOI_INDUCED" }, \
  { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+ { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \
  { EXIT_REASON_INVD,  "INVD" }, \
  { EXIT_REASON_INVVPID,   "INVVPID" }, \
  { EXIT_REASON_INVPCID,   "INVPCID" }

+#define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
+#define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
+
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a951d8..377e405 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6088,6 +6088,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
  */
 }

+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+        /* TODO: not to simply reset guest here. */
+        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+        printk(KERN_WARNING "kvm: nested vmx abort, indicator %d\n", indicator);
+}
+
 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
 {
  struct vcpu_vmx *vmx =
@@ -8215,6 +8222,88 @@ static void vmx_start_preemption_timer(struct
kvm_vcpu *vcpu)
   ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }

+static inline int nested_msr_check_common(struct vmx_msr_entry *e)
+{
+        if (e->index >> 8 == 0x8 || e->reserved != 0)
+                return -EINVAL;
+        return 0;
+}
+
+static inline int nested_load_msr_check(struct vmx_msr_entry *e)
+{
+        if (e->index == MSR_FS_BASE ||
+            e->index == MSR_GS_BASE ||
+            nested_msr_check_common(e))
+                return -EINVAL;
+        return 0;
+}
+
+/* load guest msr at nested entry.
+ * return 0 for success, entry index for failed.
+ */
+static u32 nested_entry_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+        u32 i = 0;
+        struct vmx_msr_entry e;
+        struct msr_data msr;
+
+        msr.host_initiated = false;
+        while (i < count) {
+                kvm_read_guest(vcpu->kvm,
+                               gpa + i * sizeof(struct vmx_msr_entry),
+                               &e, sizeof(struct vmx_msr_entry));
+                if (nested_load_msr_check(&e))
+                        goto fail;
+                msr.index = e.index;
+                msr.data = e.value;
+                if (kvm_set_msr(vcpu, &msr))
+                        goto fail;
+                ++i;
+        }
+        return 0;
+fail:
+        return i + 1;
+}
+
+static int nested_exit_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+        u32 i = 0;
+        struct vmx_msr_entry e;
+
+        while (i < count) {
+                kvm_read_guest(vcpu->kvm,
+                               gpa + i * sizeof(struct vmx_msr_entry),
+                               &e, sizeof(struct vmx_msr_entry));
+                if (nested_msr_check_common(&e))
+                        return -EINVAL;
+                if (kvm_get_msr(vcpu, e.index, &e.value))
+                        return -EINVAL;
+                kvm_write_guest(vcpu->kvm,
+                                gpa + i * sizeof(struct vmx_msr_entry),
+                                &e, sizeof(struct vmx_msr_entry));
+                ++i;
+        }
+        return 0;
+}
+
+static int nested_exit_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+        u32 i = 0;
+        struct vmx_msr_entry e;
+        struct msr_data msr;
+
+        msr.host_initiated = false;
+        while (i < count) {
+                kvm_read_guest(vcpu->kvm,
+                               gpa + i * sizeof(struct vmx_msr_entry),
+                               &e, sizeof(struct vmx_msr_entry));
+                if (nested_load_msr_check(&e))
+                        return -EINVAL;
+                msr.index = e.index;
+                msr.data = e.value;
+                if (kvm_set_msr(vcpu, &msr))
+                        return -EINVAL;
+                ++i;
+        }
+        return 0;
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -8509,6 +8598,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu,
bool launch)
  int cpu;
  struct loaded_vmcs *vmcs02;
  bool ia32e;
+ u32 msr_entry_idx;

  if (!nested_vmx_check_permission(vcpu) ||
 !nested_vmx_check_vmcs12(vcpu))
@@ -8556,11 +8646,12 @@ static int nested_vmx_run(struct kvm_vcpu
*vcpu, bool launch)
  return 1;
  }

- if (vmcs12->vm_entry_msr_load_count > 0 ||
-vmcs12->vm_exit_msr_load_count > 0 ||
-vmcs12->vm_exit_msr_store_count > 0) {
- pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsu