Re: [PATCH v3] KVM: x86/MMU: Do not check unsync status for root SP.

2021-02-09 Thread Yu Zhang
Sorry, I forgot the change log:

Changes in V3:
- fixed a bug in the warning inside mmu_sync_children().
- commit message changes based on Paolo's suggestion.
- added Co-developed-by: Sean Christopherson 

Changes in V2:
- warnings added based on Sean's suggestion.


On Wed, Feb 10, 2021 at 01:01:11AM +0800, Yu Zhang wrote:
> In shadow page table, only leaf SPs may be marked as unsync;
> instead, for non-leaf SPs, we store the number of unsynced
> children in unsync_children. Therefore, in kvm_mmu_sync_root(),
> sp->unsync shall always be zero for the root SP and there is
> no need to check it. Remove the check, and add a warning
> inside mmu_sync_children() to assert that the flags are used
> properly.
> 
> While at it, move the warning from mmu_need_write_protect()
> to kvm_unsync_page().
> 
> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> Signed-off-by: Paolo Bonzini 
> Signed-off-by: Yu Zhang 
> ---
>  arch/x86/kvm/mmu/mmu.c | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 86af58294272..5f482af125b4 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1995,6 +1995,12 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
>   LIST_HEAD(invalid_list);
>   bool flush = false;
>  
> + /*
> +  * Only 4k SPTEs can directly be made unsync, the parent pages
> +  * should never be unsyc'd.
> +  */
> + WARN_ON_ONCE(parent->unsync);
> +
>   while (mmu_unsync_walk(parent, &pages)) {
>   bool protected = false;
>  
> @@ -2502,6 +2508,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
>  
>  static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>  {
> + WARN_ON(sp->role.level != PG_LEVEL_4K);
> +
>   trace_kvm_mmu_unsync_page(sp);
>   ++vcpu->kvm->stat.mmu_unsync;
>   sp->unsync = 1;
> @@ -2524,7 +2532,6 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, 
> gfn_t gfn,
>   if (sp->unsync)
>   continue;
>  
> - WARN_ON(sp->role.level != PG_LEVEL_4K);
>   kvm_unsync_page(vcpu, sp);
>   }
>  
> @@ -3406,8 +3413,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
>* mmu_need_write_protect() describe what could go wrong if this
>* requirement isn't satisfied.
>*/
> - if (!smp_load_acquire(&sp->unsync) &&
> - !smp_load_acquire(&sp->unsync_children))
> + if (!smp_load_acquire(&sp->unsync_children))
>   return;
>  
>   write_lock(&vcpu->kvm->mmu_lock);
> -- 
> 2.17.1
> 


[PATCH v3] KVM: x86/MMU: Do not check unsync status for root SP.

2021-02-09 Thread Yu Zhang
In shadow page table, only leaf SPs may be marked as unsync;
instead, for non-leaf SPs, we store the number of unsynced
children in unsync_children. Therefore, in kvm_mmu_sync_root(),
sp->unsync shall always be zero for the root SP and there is
no need to check it. Remove the check, and add a warning
inside mmu_sync_children() to assert that the flags are used
properly.

While at it, move the warning from mmu_need_write_protect()
to kvm_unsync_page().

Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
Signed-off-by: Paolo Bonzini 
Signed-off-by: Yu Zhang 
---
 arch/x86/kvm/mmu/mmu.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 86af58294272..5f482af125b4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1995,6 +1995,12 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
LIST_HEAD(invalid_list);
bool flush = false;
 
+   /*
+* Only 4k SPTEs can directly be made unsync, the parent pages
+* should never be unsyc'd.
+*/
+   WARN_ON_ONCE(parent->unsync);
+
while (mmu_unsync_walk(parent, &pages)) {
bool protected = false;
 
@@ -2502,6 +2508,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
+   WARN_ON(sp->role.level != PG_LEVEL_4K);
+
trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
@@ -2524,7 +2532,6 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t 
gfn,
if (sp->unsync)
continue;
 
-   WARN_ON(sp->role.level != PG_LEVEL_4K);
kvm_unsync_page(vcpu, sp);
}
 
@@ -3406,8 +3413,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 * mmu_need_write_protect() describe what could go wrong if this
 * requirement isn't satisfied.
 */
-   if (!smp_load_acquire(&sp->unsync) &&
-   !smp_load_acquire(&sp->unsync_children))
+   if (!smp_load_acquire(&sp->unsync_children))
return;
 
write_lock(&vcpu->kvm->mmu_lock);
-- 
2.17.1
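As an aside, the invariant the new warnings rely on can be shown with a small
standalone sketch. This is simplified, hypothetical code (a single parent
pointer instead of the kernel's parent rmaps), not the kernel implementation:
only 4K leaf SPs ever set ->unsync, while every ancestor, including the root,
only counts unsynced descendants in ->unsync_children.

#include <stdbool.h>

/* Simplified stand-in for struct kvm_mmu_page. */
struct sp {
	struct sp *parent;		/* one parent, for brevity */
	int level;			/* 1 == 4K leaf */
	bool unsync;			/* set only on 4K leaf SPs */
	unsigned int unsync_children;	/* count of unsynced descendants */
};

/* Let a guest-writable 4K leaf go unsync and propagate only the count. */
static void mark_unsync(struct sp *leaf)
{
	struct sp *p;

	leaf->unsync = true;
	for (p = leaf->parent; p; p = p->parent)
		p->unsync_children++;	/* ancestors never set ->unsync */
}

So for a root SP, unsync_children may be non-zero but unsync never is, which
is exactly what kvm_mmu_sync_roots() now checks and what the warnings assert.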



Re: [PATCH v2] KVM: x86/MMU: Do not check unsync status for root SP.

2021-02-09 Thread Yu Zhang
On Tue, Feb 09, 2021 at 08:46:42AM +0100, Paolo Bonzini wrote:
> On 09/02/21 04:33, Yu Zhang wrote:
> > On Mon, Feb 08, 2021 at 05:47:22PM +0100, Paolo Bonzini wrote:
> > > On 08/02/21 14:49, Yu Zhang wrote:
> > > > On Mon, Feb 08, 2021 at 12:36:57PM +0100, Paolo Bonzini wrote:
> > > > > On 07/02/21 13:22, Yu Zhang wrote:
> > > > > > In shadow page table, only leaf SPs may be marked as unsync.
> > > > > > And for non-leaf SPs, we use unsync_children to keep the number
> > > > > > of the unsynced children. In kvm_mmu_sync_root(), sp->unsync
> > > > > > shall always be zero for the root SP, hence no need to check
> > > > > > it. Instead, a warning inside mmu_sync_children() is added, in
> > > > > > case someone incorrectly used it.
> > > > > > 
> > > > > > Also, clarify the mmu_need_write_protect(), by moving the warning
> > > > > > into kvm_unsync_page().
> > > > > > 
> > > > > > Signed-off-by: Yu Zhang 
> > > > > > Signed-off-by: Sean Christopherson 
> > > > > 
> > > > > This should really be more of a Co-developed-by, and there are a 
> > > > > couple
> > > > > adjustments that could be made in the commit message.  I've queued 
> > > > > the patch
> > > > > and I'll fix it up later.
> > > > 
> > > > Indeed. Thanks for the reminder, and I'll pay attention in the future. :)
> > > 
> > > Also:
> > > 
> > > arch/x86/kvm/mmu/mmu.c: In function ‘mmu_sync_children’:
> > > arch/x86/kvm/mmu/mmu.c:2002:17: error: ‘sp’ is used uninitialized in this
> > > function [-Werror=uninitialized]
> > >WARN_ON_ONCE(sp->unsync);
> > 
> > Oops. This is wrong. Should be WARN_ON_ONCE(parent->unsync);
> > 
> > > 
> > > so how was this tested?
> > > 
> > 
> > I ran the access test in kvm-unit-tests against the previous version, which
> > does not have this code (also, in my local repo "enable_ept" was explicitly
> > set to 0 in order to test shadow mode). But I did not test this one. I'm
> > truly sorry for the negligence - even an attempt to compile would have
> > caught this!
> > 
> > Should we submit another version? Any suggestions on the test cases?
> 
> Yes, please send v3.
> 
> The commit message can be:
> 
> In shadow page table, only leaf SPs may be marked as unsync; instead, for
> non-leaf SPs, we store the number of unsynced children in unsync_children.
> Therefore, in kvm_mmu_sync_root(), sp->unsync
> shall always be zero for the root SP and there is no need to check
> it.  Remove the check, and add a warning inside mmu_sync_children() to
> assert that the flags are used properly.
> 
> While at it, move the warning from mmu_need_write_protect() to
> kvm_unsync_page().

Thanks Paolo. Will send out v3.

BTW, I just realized that mmu_sync_children() was not triggered by
kvm-unit-tests (the access.flat case), so I ran another test with a
regular VM using shadow paging, in which I observed the synchronization.

B.R.
Yu

> 
> Paolo
> 


Re: [PATCH v2] KVM: x86/MMU: Do not check unsync status for root SP.

2021-02-08 Thread Yu Zhang
On Mon, Feb 08, 2021 at 05:47:22PM +0100, Paolo Bonzini wrote:
> On 08/02/21 14:49, Yu Zhang wrote:
> > On Mon, Feb 08, 2021 at 12:36:57PM +0100, Paolo Bonzini wrote:
> > > On 07/02/21 13:22, Yu Zhang wrote:
> > > > In shadow page table, only leaf SPs may be marked as unsync.
> > > > And for non-leaf SPs, we use unsync_children to keep the number
> > > > of the unsynced children. In kvm_mmu_sync_root(), sp->unsync
> > > > shall always be zero for the root SP, hence no need to check
> > > > it. Instead, a warning inside mmu_sync_children() is added, in
> > > > case someone incorrectly used it.
> > > > 
> > > > Also, clarify the mmu_need_write_protect(), by moving the warning
> > > > into kvm_unsync_page().
> > > > 
> > > > Signed-off-by: Yu Zhang 
> > > > Signed-off-by: Sean Christopherson 
> > > 
> > > This should really be more of a Co-developed-by, and there are a couple
> > > adjustments that could be made in the commit message.  I've queued the 
> > > patch
> > > and I'll fix it up later.
> > 
> > Indeed. Thanks for the reminder, and I'll pay attention in the future. :)
> 
> Also:
> 
> arch/x86/kvm/mmu/mmu.c: In function ‘mmu_sync_children’:
> arch/x86/kvm/mmu/mmu.c:2002:17: error: ‘sp’ is used uninitialized in this
> function [-Werror=uninitialized]
>   WARN_ON_ONCE(sp->unsync);

Oops. This is wrong. Should be WARN_ON_ONCE(parent->unsync);

> 
> so how was this tested?
> 

I ran the access test in kvm-unit-tests against the previous version, which
does not have this code (also, in my local repo "enable_ept" was explicitly
set to 0 in order to test shadow mode). But I did not test this one. I'm
truly sorry for the negligence - even an attempt to compile would have
caught this!

Should we submit another version? Any suggestions on the test cases?

Thanks
Yu

> Paolo
> 


Re: [PATCH v2] KVM: x86/MMU: Do not check unsync status for root SP.

2021-02-08 Thread Yu Zhang
On Mon, Feb 08, 2021 at 12:36:57PM +0100, Paolo Bonzini wrote:
> On 07/02/21 13:22, Yu Zhang wrote:
> > In shadow page table, only leaf SPs may be marked as unsync.
> > And for non-leaf SPs, we use unsync_children to keep the number
> > of the unsynced children. In kvm_mmu_sync_root(), sp->unsync
> > shall always be zero for the root SP, hence no need to check
> > it. Instead, a warning inside mmu_sync_children() is added, in
> > case someone incorrectly used it.
> > 
> > Also, clarify the mmu_need_write_protect(), by moving the warning
> > into kvm_unsync_page().
> > 
> > Signed-off-by: Yu Zhang 
> > Signed-off-by: Sean Christopherson 
> 
> This should really be more of a Co-developed-by, and there are a couple
> adjustments that could be made in the commit message.  I've queued the patch
> and I'll fix it up later.

Indeed. Thanks for the reminder, and I'll pay attention in the future. :)

B.R.
Yu

> 
> Paolo
> 
> > ---
> > Changes in V2:
> > - warnings added based on Sean's suggestion.
> > 
> >   arch/x86/kvm/mmu/mmu.c | 12 +---
> >   1 file changed, 9 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 86af582..c4797a00cc 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -1995,6 +1995,12 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
> > LIST_HEAD(invalid_list);
> > bool flush = false;
> > +   /*
> > +* Only 4k SPTEs can directly be made unsync, the parent pages
> > +* should never be unsyc'd.
> > +*/
> > +   WARN_ON_ONCE(sp->unsync);
> > +
> > while (mmu_unsync_walk(parent, &pages)) {
> > bool protected = false;
> > @@ -2502,6 +2508,8 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
> >   static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page 
> > *sp)
> >   {
> > +   WARN_ON(sp->role.level != PG_LEVEL_4K);
> > +
> > trace_kvm_mmu_unsync_page(sp);
> > ++vcpu->kvm->stat.mmu_unsync;
> > sp->unsync = 1;
> > @@ -2524,7 +2532,6 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, 
> > gfn_t gfn,
> > if (sp->unsync)
> > continue;
> > -   WARN_ON(sp->role.level != PG_LEVEL_4K);
> > kvm_unsync_page(vcpu, sp);
> > }
> > @@ -3406,8 +3413,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> >  * mmu_need_write_protect() describe what could go wrong if this
> >  * requirement isn't satisfied.
> >  */
> > -   if (!smp_load_acquire(&sp->unsync) &&
> > -   !smp_load_acquire(&sp->unsync_children))
> > +   if (!smp_load_acquire(&sp->unsync_children))
> > return;
> > write_lock(&vcpu->kvm->mmu_lock);
> > 
> 


Re: linux-next: build failure after merge of the kvm tree

2021-02-08 Thread Yu Zhang
Thanks a lot for reporting this, Stephen. Just sent out a patch
to fix it in kvmgt.

B.R.
Yu

On Mon, Feb 08, 2021 at 04:33:08PM +1100, Stephen Rothwell wrote:
> Hi all,
> 
> After merging the kvm tree, today's linux-next build (x86_64 allmodconfig)
> failed like this:
> 
> drivers/gpu/drm/i915/gvt/kvmgt.c: In function 'kvmgt_page_track_add':
> drivers/gpu/drm/i915/gvt/kvmgt.c:1706:12: error: passing argument 1 of 
> 'spin_lock' from incompatible pointer type 
> [-Werror=incompatible-pointer-types]
>  1706 |  spin_lock(&kvm->mmu_lock);
>   |^~
>   ||
>   |rwlock_t *
> In file included from include/linux/wait.h:9,
>  from include/linux/pid.h:6,
>  from include/linux/sched.h:14,
>  from include/linux/ratelimit.h:6,
>  from include/linux/dev_printk.h:16,
>  from include/linux/device.h:15,
>  from drivers/gpu/drm/i915/gvt/kvmgt.c:32:
> include/linux/spinlock.h:352:51: note: expected 'spinlock_t *' {aka 'struct 
> spinlock *'} but argument is of type 'rwlock_t *'
>   352 | static __always_inline void spin_lock(spinlock_t *lock)
>   |   ^~~~
> drivers/gpu/drm/i915/gvt/kvmgt.c:1715:14: error: passing argument 1 of 
> 'spin_unlock' from incompatible pointer type 
> [-Werror=incompatible-pointer-types]
>  1715 |  spin_unlock(&kvm->mmu_lock);
>   |  ^~
>   |  |
>   |  rwlock_t *
> In file included from include/linux/wait.h:9,
>  from include/linux/pid.h:6,
>  from include/linux/sched.h:14,
>  from include/linux/ratelimit.h:6,
>  from include/linux/dev_printk.h:16,
>  from include/linux/device.h:15,
>  from drivers/gpu/drm/i915/gvt/kvmgt.c:32:
> include/linux/spinlock.h:392:53: note: expected 'spinlock_t *' {aka 'struct 
> spinlock *'} but argument is of type 'rwlock_t *'
>   392 | static __always_inline void spin_unlock(spinlock_t *lock)
>   | ^~~~
> drivers/gpu/drm/i915/gvt/kvmgt.c: In function 'kvmgt_page_track_remove':
> drivers/gpu/drm/i915/gvt/kvmgt.c:1740:12: error: passing argument 1 of 
> 'spin_lock' from incompatible pointer type 
> [-Werror=incompatible-pointer-types]
>  1740 |  spin_lock(&kvm->mmu_lock);
>   |^~
>   ||
>   |rwlock_t *
> In file included from include/linux/wait.h:9,
>  from include/linux/pid.h:6,
>  from include/linux/sched.h:14,
>  from include/linux/ratelimit.h:6,
>  from include/linux/dev_printk.h:16,
>  from include/linux/device.h:15,
>  from drivers/gpu/drm/i915/gvt/kvmgt.c:32:
> include/linux/spinlock.h:352:51: note: expected 'spinlock_t *' {aka 'struct 
> spinlock *'} but argument is of type 'rwlock_t *'
>   352 | static __always_inline void spin_lock(spinlock_t *lock)
>   |   ^~~~
> drivers/gpu/drm/i915/gvt/kvmgt.c:1749:14: error: passing argument 1 of 
> 'spin_unlock' from incompatible pointer type 
> [-Werror=incompatible-pointer-types]
>  1749 |  spin_unlock(&kvm->mmu_lock);
>   |  ^~
>   |  |
>   |  rwlock_t *
> In file included from include/linux/wait.h:9,
>  from include/linux/pid.h:6,
>  from include/linux/sched.h:14,
>  from include/linux/ratelimit.h:6,
>  from include/linux/dev_printk.h:16,
>  from include/linux/device.h:15,
>  from drivers/gpu/drm/i915/gvt/kvmgt.c:32:
> include/linux/spinlock.h:392:53: note: expected 'spinlock_t *' {aka 'struct 
> spinlock *'} but argument is of type 'rwlock_t *'
>   392 | static __always_inline void spin_unlock(spinlock_t *lock)
>   | ^~~~
> drivers/gpu/drm/i915/gvt/kvmgt.c: In function 'kvmgt_page_track_flush_slot':
> drivers/gpu/drm/i915/gvt/kvmgt.c:1775:12: error: passing argument 1 of 
> 'spin_lock' from incompatible pointer type 
> [-Werror=incompatible-pointer-types]
>  1775 |  spin_lock(&kvm->mmu_lock);
>   |^~
>   ||
>   |rwlock_t *
> In file included from include/linux/wait.h:9,
>  from include/linux/pid.h:6,
>  from include/linux/sched.h:14,
>  from include/linux/ratelimit.h:6,
>  from include/linux/dev_printk.h:16,
>  from include/linux/device.h:15,
>  from drivers/gpu/drm/i915/gvt/kvmgt.c:32:
> include/linux/spinlock.h:352:51: note: expected 'spinlock_t *' {aka 'struct 
> spinlock *'} but argument is of type 'rwlock_t *'
>   352 | static __always_inline void 

[PATCH] drm/i915/gvt/kvmgt: Fix the build failure in kvmgt.

2021-02-08 Thread Yu Zhang
Previously, commit 531810caa9f4 ("KVM: x86/mmu: Use
an rwlock for the x86 MMU") changed the type of KVM's
mmu_lock to rwlock_t. This causes a build failure in
kvmgt, which takes the same lock when adding/removing
GFNs to/from the page tracker. Fix it by using
write_lock()/write_unlock() in kvmgt.

Reported-by: Stephen Rothwell 
Signed-off-by: Yu Zhang 
---
 drivers/gpu/drm/i915/gvt/kvmgt.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 60f1a386dd06..b4348256ae95 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1703,7 +1703,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 
gfn)
return -EINVAL;
}
 
-   spin_lock(&kvm->mmu_lock);
+   write_lock(&kvm->mmu_lock);
 
if (kvmgt_gfn_is_write_protected(info, gfn))
goto out;
@@ -1712,7 +1712,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 
gfn)
kvmgt_protect_table_add(info, gfn);
 
 out:
-   spin_unlock(&kvm->mmu_lock);
+   write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return 0;
 }
@@ -1737,7 +1737,7 @@ static int kvmgt_page_track_remove(unsigned long handle, 
u64 gfn)
return -EINVAL;
}
 
-   spin_lock(&kvm->mmu_lock);
+   write_lock(&kvm->mmu_lock);
 
if (!kvmgt_gfn_is_write_protected(info, gfn))
goto out;
@@ -1746,7 +1746,7 @@ static int kvmgt_page_track_remove(unsigned long handle, 
u64 gfn)
kvmgt_protect_table_del(info, gfn);
 
 out:
-   spin_unlock(&kvm->mmu_lock);
+   write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return 0;
 }
@@ -1772,7 +1772,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm,
struct kvmgt_guest_info *info = container_of(node,
struct kvmgt_guest_info, track_node);
 
-   spin_lock(&kvm->mmu_lock);
+   write_lock(&kvm->mmu_lock);
for (i = 0; i < slot->npages; i++) {
gfn = slot->base_gfn + i;
if (kvmgt_gfn_is_write_protected(info, gfn)) {
@@ -1781,7 +1781,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm,
kvmgt_protect_table_del(info, gfn);
}
}
-   spin_unlock(&kvm->mmu_lock);
+   write_unlock(&kvm->mmu_lock);
 }
 
 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
-- 
2.17.1
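As a side note, the locking pattern the fix converts to can be sketched in a
few lines. This is generic, hypothetical kernel-style code (not the kvmgt
functions themselves), assuming mmu_lock is now an rwlock_t as introduced by
531810caa9f4: paths that modify state take the write lock, while read-only
paths may take the read lock concurrently.

#include <linux/spinlock.h>	/* rwlock_t, read_lock(), write_lock() */
#include <linux/types.h>

static DEFINE_RWLOCK(example_lock);	/* stand-in for kvm->mmu_lock */
static unsigned long tracked_gfns;

static void track_one_gfn(void)
{
	write_lock(&example_lock);	/* exclusive: modifies tracking state */
	tracked_gfns++;
	write_unlock(&example_lock);
}

static bool any_gfn_tracked(void)
{
	bool ret;

	read_lock(&example_lock);	/* shared: readers can run in parallel */
	ret = tracked_gfns != 0;
	read_unlock(&example_lock);

	return ret;
}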



[PATCH v2] KVM: x86/MMU: Do not check unsync status for root SP.

2021-02-06 Thread Yu Zhang
In shadow page table, only leaf SPs may be marked as unsync.
And for non-leaf SPs, we use unsync_children to keep the number
of the unsynced children. In kvm_mmu_sync_root(), sp->unsync
shall always be zero for the root SP, hence no need to check
it. Instead, a warning inside mmu_sync_children() is added, in
case someone incorrectly used it.

Also, clarify the mmu_need_write_protect(), by moving the warning
into kvm_unsync_page().

Signed-off-by: Yu Zhang 
Signed-off-by: Sean Christopherson 
---
Changes in V2:
- warnings added based on Sean's suggestion.

 arch/x86/kvm/mmu/mmu.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 86af582..c4797a00cc 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1995,6 +1995,12 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
LIST_HEAD(invalid_list);
bool flush = false;
 
+   /*
+* Only 4k SPTEs can directly be made unsync, the parent pages
+* should never be unsyc'd.
+*/
+   WARN_ON_ONCE(sp->unsync);
+
while (mmu_unsync_walk(parent, &pages)) {
bool protected = false;
 
@@ -2502,6 +2508,8 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
+   WARN_ON(sp->role.level != PG_LEVEL_4K);
+
trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
@@ -2524,7 +2532,6 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t 
gfn,
if (sp->unsync)
continue;
 
-   WARN_ON(sp->role.level != PG_LEVEL_4K);
kvm_unsync_page(vcpu, sp);
}
 
@@ -3406,8 +3413,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 * mmu_need_write_protect() describe what could go wrong if this
 * requirement isn't satisfied.
 */
-   if (!smp_load_acquire(&sp->unsync) &&
-   !smp_load_acquire(&sp->unsync_children))
+   if (!smp_load_acquire(&sp->unsync_children))
return;
 
write_lock(&vcpu->kvm->mmu_lock);
-- 
1.9.1



Re: [PATCH] KVM: x86/mmu: Remove the defunct update_pte() paging hook

2021-01-30 Thread Yu Zhang
Thanks a lot for the patch, Sean.

I know this has been queued for quite a while. But I just realized I have
another question about kvm_mmu_pte_write():

> Remove the update_pte() shadow paging logic, which was obsoleted by
> commit 4731d4c7a077 ("KVM: MMU: out of sync shadow core"), but never
> removed.  As pointed out by Yu, KVM never write protects leaf page
> tables for the purposes of shadow paging, and instead marks their
> associated shadow page as unsync so that the guest can write PTEs at
> will.
> 
> The update_pte() path, which predates the unsync logic, optimizes COW
> scenarios by refreshing leaf SPTEs when they are written, as opposed to
> zapping the SPTE, restarting the guest, and installing the new SPTE on
> the subsequent fault.  Since KVM no longer write-protects leaf page
> tables, update_pte() is unreachable and can be dropped.
> 
> Reported-by: Yu Zhang 
> Signed-off-by: Sean Christopherson 
> ---
>  arch/x86/include/asm/kvm_host.h |  3 --
>  arch/x86/kvm/mmu/mmu.c  | 49 ++---
>  arch/x86/kvm/x86.c  |  1 -
>  3 files changed, 2 insertions(+), 51 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..ed575c5655dd 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -358,8 +358,6 @@ struct kvm_mmu {
>   int (*sync_page)(struct kvm_vcpu *vcpu,
>struct kvm_mmu_page *sp);
>   void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
> - void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> -u64 *spte, const void *pte);
>   hpa_t root_hpa;
>   gpa_t root_pgd;
>   union kvm_mmu_role mmu_role;
> @@ -1031,7 +1029,6 @@ struct kvm_arch {
>  struct kvm_vm_stat {
>   ulong mmu_shadow_zapped;
>   ulong mmu_pte_write;
> - ulong mmu_pte_updated;
>   ulong mmu_pde_zapped;
>   ulong mmu_flooded;
>   ulong mmu_recycled;
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..3a2c25852b1f 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1723,13 +1723,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
>   return 0;
>  }
>  
> -static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
> -  struct kvm_mmu_page *sp, u64 *spte,
> -  const void *pte)
> -{
> - WARN_ON(1);
> -}
> -
>  #define KVM_PAGE_ARRAY_NR 16
>  
>  struct kvm_mmu_pages {
> @@ -3813,7 +3806,6 @@ static void nonpaging_init_context(struct kvm_vcpu 
> *vcpu,
>   context->gva_to_gpa = nonpaging_gva_to_gpa;
>   context->sync_page = nonpaging_sync_page;
>   context->invlpg = NULL;
> - context->update_pte = nonpaging_update_pte;
>   context->root_level = 0;
>   context->shadow_root_level = PT32E_ROOT_LEVEL;
>   context->direct_map = true;
> @@ -4395,7 +4387,6 @@ static void paging64_init_context_common(struct 
> kvm_vcpu *vcpu,
>   context->gva_to_gpa = paging64_gva_to_gpa;
>   context->sync_page = paging64_sync_page;
>   context->invlpg = paging64_invlpg;
> - context->update_pte = paging64_update_pte;
>   context->shadow_root_level = level;
>   context->direct_map = false;
>  }
> @@ -4424,7 +4415,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
>   context->gva_to_gpa = paging32_gva_to_gpa;
>   context->sync_page = paging32_sync_page;
>   context->invlpg = paging32_invlpg;
> - context->update_pte = paging32_update_pte;
>   context->shadow_root_level = PT32E_ROOT_LEVEL;
>   context->direct_map = false;
>  }
> @@ -4506,7 +4496,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
>   context->page_fault = kvm_tdp_page_fault;
>   context->sync_page = nonpaging_sync_page;
>   context->invlpg = NULL;
> - context->update_pte = nonpaging_update_pte;
>   context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
>   context->direct_map = true;
>   context->get_guest_pgd = get_cr3;
> @@ -4678,7 +4667,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, 
> bool execonly,
>   context->gva_to_gpa = ept_gva_to_gpa;
>   context->sync_page = ept_sync_page;
>   context->invlpg = ept_invlpg;
> - context->update_pte = ept_update_pte;
>   context->root_level = level;
>   context->direct_map = false;
>   context->mmu_role.as_u64 = new_role.as_u64;
> @@ -4826,19 +4814,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
>

[PATCH] KVM: Documentation: Fix documentation for nested.

2021-01-28 Thread Yu Zhang
Nested VMX was enabled by default in commit 1e58e5e59148 ("KVM:
VMX: enable nested virtualization by default"), which was merged
in Linux 4.20. Fix the documentation accordingly.

Signed-off-by: Yu Zhang 
---
 Documentation/virt/kvm/nested-vmx.rst| 6 --
 Documentation/virt/kvm/running-nested-guests.rst | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Documentation/virt/kvm/nested-vmx.rst 
b/Documentation/virt/kvm/nested-vmx.rst
index 6ab4e35..ac2095d 100644
--- a/Documentation/virt/kvm/nested-vmx.rst
+++ b/Documentation/virt/kvm/nested-vmx.rst
@@ -37,8 +37,10 @@ call L2.
 Running nested VMX
 --
 
-The nested VMX feature is disabled by default. It can be enabled by giving
-the "nested=1" option to the kvm-intel module.
+The nested VMX feature is enabled by default since Linux kernel v4.20. For
+older Linux kernel, it can be enabled by giving the "nested=1" option to the
+kvm-intel module.
+
 
 No modifications are required to user space (qemu). However, qemu's default
 emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be
diff --git a/Documentation/virt/kvm/running-nested-guests.rst 
b/Documentation/virt/kvm/running-nested-guests.rst
index d0a1fc7..bd70c69 100644
--- a/Documentation/virt/kvm/running-nested-guests.rst
+++ b/Documentation/virt/kvm/running-nested-guests.rst
@@ -74,7 +74,7 @@ few:
 Enabling "nested" (x86)
 ---
 
-From Linux kernel v4.19 onwards, the ``nested`` KVM parameter is enabled
+From Linux kernel v4.20 onwards, the ``nested`` KVM parameter is enabled
 by default for Intel and AMD.  (Though your Linux distribution might
 override this default.)
 
-- 
1.9.1



Re: [PATCH] KVM: x86/MMU: Do not check unsync status for root SP.

2021-01-26 Thread Yu Zhang
Hi Paolo,

  Any comments? Thanks!

B.R.
Yu

On Sat, Jan 16, 2021 at 08:21:00AM +0800, Yu Zhang wrote:
> In shadow page table, only leaf SPs may be marked as unsync.
> And for non-leaf SPs, we use unsync_children to keep the number
> of the unsynced children. In kvm_mmu_sync_root(), sp->unsync
> shall always be zero for the root SP, hence no need to check it.
> 
> Signed-off-by: Yu Zhang 
> ---
>  arch/x86/kvm/mmu/mmu.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481a..1a6bb03 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3412,8 +3412,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
>* mmu_need_write_protect() describe what could go wrong if this
>* requirement isn't satisfied.
>*/
> - if (!smp_load_acquire(&sp->unsync) &&
> - !smp_load_acquire(&sp->unsync_children))
> + if (!smp_load_acquire(&sp->unsync_children))
>   return;
>  
>   spin_lock(&vcpu->kvm->mmu_lock);
> -- 
> 1.9.1
> 


[PATCH] KVM: x86/MMU: Do not check unsync status for root SP.

2021-01-15 Thread Yu Zhang
In shadow page table, only leaf SPs may be marked as unsync.
And for non-leaf SPs, we use unsync_children to keep the number
of the unsynced children. In kvm_mmu_sync_root(), sp->unsync
shall always be zero for the root SP, hence no need to check it.

Signed-off-by: Yu Zhang 
---
 arch/x86/kvm/mmu/mmu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481a..1a6bb03 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3412,8 +3412,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 * mmu_need_write_protect() describe what could go wrong if this
 * requirement isn't satisfied.
 */
-   if (!smp_load_acquire(&sp->unsync) &&
-   !smp_load_acquire(&sp->unsync_children))
+   if (!smp_load_acquire(&sp->unsync_children))
return;
 
spin_lock(&vcpu->kvm->mmu_lock);
-- 
1.9.1



Re: [PATCH v2 02/20] kvm: x86/mmu: Introduce tdp_iter

2020-10-21 Thread Yu Zhang
On Wed, Oct 21, 2020 at 11:08:52AM -0700, Ben Gardon wrote:
> On Wed, Oct 21, 2020 at 7:59 AM Yu Zhang  wrote:
> >
> > On Wed, Oct 14, 2020 at 11:26:42AM -0700, Ben Gardon wrote:
> > > The TDP iterator implements a pre-order traversal of a TDP paging
> > > structure. This iterator will be used in future patches to create
> > > an efficient implementation of the KVM MMU for the TDP case.
> > >
> > > Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> > > machine. This series introduced no new failures.
> > >
> > > This series can be viewed in Gerrit at:
> > >   https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
> > >
> > > Signed-off-by: Ben Gardon 
> > > ---
> > >  arch/x86/kvm/Makefile   |   3 +-
> > >  arch/x86/kvm/mmu/mmu.c  |  66 
> > >  arch/x86/kvm/mmu/mmu_internal.h |  66 
> > >  arch/x86/kvm/mmu/tdp_iter.c | 176 
> > >  arch/x86/kvm/mmu/tdp_iter.h |  56 ++
> > >  5 files changed, 300 insertions(+), 67 deletions(-)
> > >  create mode 100644 arch/x86/kvm/mmu/tdp_iter.c
> > >  create mode 100644 arch/x86/kvm/mmu/tdp_iter.h
> > >
> > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > index 7f86a14aed0e9..4525c1151bf99 100644
> > > --- a/arch/x86/kvm/Makefile
> > > +++ b/arch/x86/kvm/Makefile
> > > @@ -15,7 +15,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF)  += $(KVM)/async_pf.o
> > >
> > >  kvm-y+= x86.o emulate.o i8259.o irq.o lapic.o \
> > >  i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o 
> > > \
> > > -hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
> > > +hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
> > > +mmu/tdp_iter.o
> > >
> > >  kvm-intel-y  += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o 
> > > vmx/vmcs12.o \
> > >  vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 6c9db349600c8..6d82784ed5679 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -121,28 +121,6 @@ module_param(dbg, bool, 0644);
> > >
> > >  #define PTE_PREFETCH_NUM 8
> > >
> > > -#define PT_FIRST_AVAIL_BITS_SHIFT 10
> > > -#define PT64_SECOND_AVAIL_BITS_SHIFT 54
> > > -
> > > -/*
> > > - * The mask used to denote special SPTEs, which can be either MMIO SPTEs 
> > > or
> > > - * Access Tracking SPTEs.
> > > - */
> > > -#define SPTE_SPECIAL_MASK (3ULL << 52)
> > > -#define SPTE_AD_ENABLED_MASK (0ULL << 52)
> > > -#define SPTE_AD_DISABLED_MASK (1ULL << 52)
> > > -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
> > > -#define SPTE_MMIO_MASK (3ULL << 52)
> > > -
> > > -#define PT64_LEVEL_BITS 9
> > > -
> > > -#define PT64_LEVEL_SHIFT(level) \
> > > - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
> > > -
> > > -#define PT64_INDEX(address, level)\
> > > - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 
> > > 1))
> > > -
> > > -
> > >  #define PT32_LEVEL_BITS 10
> > >
> > >  #define PT32_LEVEL_SHIFT(level) \
> > > @@ -155,19 +133,6 @@ module_param(dbg, bool, 0644);
> > >  #define PT32_INDEX(address, level)\
> > >   (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 
> > > 1))
> > >
> > > -
> > > -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
> > > -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
> > > -#else
> > > -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
> > > -#endif
> > > -#define PT64_LVL_ADDR_MASK(level) \
> > > - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
> > > - * PT64_LEVEL_BITS))) - 1))
> > > -#define PT64_LVL_OFFSET_MASK(level) \
> > > - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
> > > - * PT64_LEVEL_BITS))) - 1))
> > > -
> > >  #define PT32_BASE_ADDR_MASK PAGE_MASK
> > >  #define PT32_DIR_

Re: [PATCH v2 07/20] kvm: x86/mmu: Support zapping SPTEs in the TDP MMU

2020-10-21 Thread Yu Zhang
On Wed, Oct 21, 2020 at 08:00:47PM +0200, Paolo Bonzini wrote:
> On 21/10/20 19:24, Yu Zhang wrote:
> > On Wed, Oct 21, 2020 at 07:20:15PM +0200, Paolo Bonzini wrote:
> >> On 21/10/20 17:02, Yu Zhang wrote:
> >>>>  void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
> >>>>  {
> >>>> +gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - 
> >>>> PAGE_SHIFT);
> >>>> +
> >>> boot_cpu_data.x86_phys_bits is the host address width. Value of the 
> >>> guest's
> >>> may vary. So maybe we should just traverse the memslots and zap the gfn 
> >>> ranges
> >>> in each of them?
> >>>
> >>
> >> It must be smaller than the host value for two-dimensional paging, though.
> > 
> > Yes. And using boot_cpu_data.x86_phys_bits works, but won't it be somewhat
> > overkilling? E.g. for a host with 46 bits and a guest with 39 bits width?
> 
> It would go quickly through extra memory space because the PML4E entries
> above the first would be empty.  So it's just 511 comparisons.
> 

Oh, yes. The overhead seems not as big as I assumed. :)

Yu
> Paolo
> 
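To put rough numbers on the point above, here is a small user-space sketch
with assumed widths (46-bit host, 39-bit guest, 4K pages); the range derived
from the host width is much larger than what the guest can map, but the gap
is covered by non-present upper-level entries, so walking it costs only a few
hundred empty-entry checks rather than per-gfn work.

#include <stdio.h>

int main(void)
{
	unsigned int host_phys_bits  = 46;	/* boot_cpu_data.x86_phys_bits (assumed) */
	unsigned int guest_phys_bits = 39;	/* guest MAXPHYADDR (assumed) */
	unsigned int page_shift      = 12;

	unsigned long long host_max_gfn  = 1ULL << (host_phys_bits - page_shift);
	unsigned long long guest_max_gfn = 1ULL << (guest_phys_bits - page_shift);

	printf("zap range from host width : 0x%llx gfns\n", host_max_gfn);
	printf("gfns the guest can map    : 0x%llx gfns\n", guest_max_gfn);
	return 0;
}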


Re: [PATCH v2 07/20] kvm: x86/mmu: Support zapping SPTEs in the TDP MMU

2020-10-21 Thread Yu Zhang
On Wed, Oct 21, 2020 at 07:20:15PM +0200, Paolo Bonzini wrote:
> On 21/10/20 17:02, Yu Zhang wrote:
> >>  void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
> >>  {
> >> +  gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
> >> +
> > boot_cpu_data.x86_phys_bits is the host address width. Value of the guest's
> > may vary. So maybe we should just traverse the memslots and zap the gfn 
> > ranges
> > in each of them?
> > 
> 
> It must be smaller than the host value for two-dimensional paging, though.

Yes. And using boot_cpu_data.x86_phys_bits works, but won't it be somewhat
overkill? E.g. for a host with a 46-bit width and a guest with only 39 bits?

Any concern about doing the zap by going through the memslots? Thanks. :)

B.R.
Yu
> 
> Paolo
> 


Re: [PATCH v2 04/20] kvm: x86/mmu: Allocate and free TDP MMU roots

2020-10-21 Thread Yu Zhang
On Wed, Oct 14, 2020 at 11:26:44AM -0700, Ben Gardon wrote:
> The TDP MMU must be able to allocate paging structure root pages and track
> the usage of those pages. Implement a similar, but separate system for root
> page allocation to that of the x86 shadow paging implementation. When
> future patches add synchronization model changes to allow for parallel
> page faults, these pages will need to be handled differently from the
> x86 shadow paging based MMU's root pages.
> 
> Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> machine. This series introduced no new failures.
> 
> This series can be viewed in Gerrit at:
>   https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
> 
> Signed-off-by: Ben Gardon 
> ---
>  arch/x86/include/asm/kvm_host.h |   1 +
>  arch/x86/kvm/mmu/mmu.c  |  29 +---
>  arch/x86/kvm/mmu/mmu_internal.h |  24 +++
>  arch/x86/kvm/mmu/tdp_mmu.c  | 114 
>  arch/x86/kvm/mmu/tdp_mmu.h  |   5 ++
>  5 files changed, 162 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 6b6dbc20ce23a..e0ec1dd271a32 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -989,6 +989,7 @@ struct kvm_arch {
>* operations.
>*/
>   bool tdp_mmu_enabled;
> + struct list_head tdp_mmu_roots;
>  };
>  
>  struct kvm_vm_stat {
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index f53d29e09367c..a3340ed59ad1d 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -144,11 +144,6 @@ module_param(dbg, bool, 0644);
>  #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | 
> shadow_user_mask \
>   | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
>  
> -#define ACC_EXEC_MASK1
> -#define ACC_WRITE_MASK   PT_WRITABLE_MASK
> -#define ACC_USER_MASKPT_USER_MASK
> -#define ACC_ALL  (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
> -
>  /* The mask for the R/X bits in EPT PTEs */
>  #define PT64_EPT_READABLE_MASK   0x1ull
>  #define PT64_EPT_EXECUTABLE_MASK 0x4ull
> @@ -209,7 +204,7 @@ struct kvm_shadow_walk_iterator {
>__shadow_walk_next(&(_walker), spte))
>  
>  static struct kmem_cache *pte_list_desc_cache;
> -static struct kmem_cache *mmu_page_header_cache;
> +struct kmem_cache *mmu_page_header_cache;
>  static struct percpu_counter kvm_total_used_mmu_pages;
>  
>  static u64 __read_mostly shadow_nx_mask;
> @@ -3588,9 +3583,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t 
> *root_hpa,
>   return;
>  
>   sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> - --sp->root_count;
> - if (!sp->root_count && sp->role.invalid)
> - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
> +
> + if (kvm_mmu_put_root(sp)) {
> + if (sp->tdp_mmu_page)
> + kvm_tdp_mmu_free_root(kvm, sp);
> + else if (sp->role.invalid)
> + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
> + }
>  
>   *root_hpa = INVALID_PAGE;
>  }
> @@ -3680,8 +3679,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu 
> *vcpu)
>   hpa_t root;
>   unsigned i;
>  
> - if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
> + if (vcpu->kvm->arch.tdp_mmu_enabled) {
> + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> +
> + if (!VALID_PAGE(root))
> + return -ENOSPC;
> + vcpu->arch.mmu->root_hpa = root;
> + } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
> +   true);
> +
>   if (!VALID_PAGE(root))
>   return -ENOSPC;
>   vcpu->arch.mmu->root_hpa = root;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 74ccbf001a42e..6cedf578c9a8d 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -43,8 +43,12 @@ struct kvm_mmu_page {
>  
>   /* Number of writes since the last time traversal visited this page.  */
>   atomic_t write_flooding_count;
> +
> + bool tdp_mmu_page;
>  };
>  
> +extern struct kmem_cache *mmu_page_header_cache;
> +
>  static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
>  {
>   struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
> @@ -96,6 +100,11 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
>   (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
>   * PT64_LEVEL_BITS))) - 1))
>  
> +#define ACC_EXEC_MASK1
> +#define ACC_WRITE_MASK   PT_WRITABLE_MASK
> +#define ACC_USER_MASKPT_USER_MASK
> +#define ACC_ALL  (ACC_EXEC_MASK 

Re: [PATCH v2 07/20] kvm: x86/mmu: Support zapping SPTEs in the TDP MMU

2020-10-21 Thread Yu Zhang
On Wed, Oct 14, 2020 at 11:26:47AM -0700, Ben Gardon wrote:
> Add functions to zap SPTEs to the TDP MMU. These are needed to tear down
> TDP MMU roots properly and implement other MMU functions which require
> tearing down mappings. Future patches will add functions to populate the
> page tables, but as for this patch there will not be any work for these
> functions to do.
> 
> Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> machine. This series introduced no new failures.
> 
> This series can be viewed in Gerrit at:
>   https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
> 
> Signed-off-by: Ben Gardon 
> ---
>  arch/x86/kvm/mmu/mmu.c  |  15 +
>  arch/x86/kvm/mmu/tdp_iter.c |   5 ++
>  arch/x86/kvm/mmu/tdp_iter.h |   1 +
>  arch/x86/kvm/mmu/tdp_mmu.c  | 109 
>  arch/x86/kvm/mmu/tdp_mmu.h  |   2 +
>  5 files changed, 132 insertions(+)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 8bf20723c6177..337ab6823e312 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5787,6 +5787,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
>   kvm_reload_remote_mmus(kvm);
>  
>   kvm_zap_obsolete_pages(kvm);
> +
> + if (kvm->arch.tdp_mmu_enabled)
> + kvm_tdp_mmu_zap_all(kvm);
> +
>   spin_unlock(&kvm->mmu_lock);
>  }
>  
> @@ -5827,6 +5831,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t 
> gfn_start, gfn_t gfn_end)
>   struct kvm_memslots *slots;
>   struct kvm_memory_slot *memslot;
>   int i;
> + bool flush;
>  
>   spin_lock(&kvm->mmu_lock);
>   for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> @@ -5846,6 +5851,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t 
> gfn_start, gfn_t gfn_end)
>   }
>   }
>  
> + if (kvm->arch.tdp_mmu_enabled) {
> + flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
> + if (flush)
> + kvm_flush_remote_tlbs(kvm);
> + }
> +
>   spin_unlock(&kvm->mmu_lock);
>  }
>  
> @@ -6012,6 +6023,10 @@ void kvm_mmu_zap_all(struct kvm *kvm)
>   }
>  
>   kvm_mmu_commit_zap_page(kvm, &invalid_list);
> +
> + if (kvm->arch.tdp_mmu_enabled)
> + kvm_tdp_mmu_zap_all(kvm);
> +
>   spin_unlock(&kvm->mmu_lock);
>  }
>  
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> index b07e9f0c5d4aa..701eb753b701e 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.c
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -174,3 +174,8 @@ void tdp_iter_refresh_walk(struct tdp_iter *iter)
>  iter->root_level, iter->min_level, goal_gfn);
>  }
>  
> +u64 *tdp_iter_root_pt(struct tdp_iter *iter)
> +{
> + return iter->pt_path[iter->root_level - 1];
> +}
> +
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> index d629a53e1b73f..884ed2c70bfed 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.h
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -52,5 +52,6 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, 
> int root_level,
>   int min_level, gfn_t goal_gfn);
>  void tdp_iter_next(struct tdp_iter *iter);
>  void tdp_iter_refresh_walk(struct tdp_iter *iter);
> +u64 *tdp_iter_root_pt(struct tdp_iter *iter);
>  
>  #endif /* __KVM_X86_MMU_TDP_ITER_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f2bd3a6928ce9..9b5cd4a832f1a 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -56,8 +56,13 @@ bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
>   return sp->tdp_mmu_page && sp->root_count;
>  }
>  
> +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
> +   gfn_t start, gfn_t end);
> +
>  void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
>  {
> + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
> +

boot_cpu_data.x86_phys_bits is the host address width; the guest's value
may vary. So maybe we should just traverse the memslots and zap the gfn ranges
in each of them?

>   lockdep_assert_held(&kvm->mmu_lock);
>  
>   WARN_ON(root->root_count);
> @@ -65,6 +70,8 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct 
> kvm_mmu_page *root)
>  
>   list_del(&root->link);
>  
> + zap_gfn_range(kvm, root, 0, max_gfn);
> +
>   free_page((unsigned long)root->spt);
>   kmem_cache_free(mmu_page_header_cache, root);
>  }
> @@ -155,6 +162,11 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu 
> *vcpu)
>  static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>   u64 old_spte, u64 new_spte, int level);
>  
> +static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
> +{
> + return sp->role.smm ? 1 : 0;
> +}
> +
>  /**
>   * handle_changed_spte - handle bookkeeping associated with an SPTE change
>   * @kvm: kvm instance
> @@ -262,3 +274,100 @@ static void handle_changed_spte(struct kvm *kvm, int 
> as_id, gfn_t gfn,
>  {
> 

Re: [PATCH v2 02/20] kvm: x86/mmu: Introduce tdp_iter

2020-10-21 Thread Yu Zhang
On Wed, Oct 14, 2020 at 11:26:42AM -0700, Ben Gardon wrote:
> The TDP iterator implements a pre-order traversal of a TDP paging
> structure. This iterator will be used in future patches to create
> an efficient implementation of the KVM MMU for the TDP case.
> 
> Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> machine. This series introduced no new failures.
> 
> This series can be viewed in Gerrit at:
>   https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
> 
> Signed-off-by: Ben Gardon 
> ---
>  arch/x86/kvm/Makefile   |   3 +-
>  arch/x86/kvm/mmu/mmu.c  |  66 
>  arch/x86/kvm/mmu/mmu_internal.h |  66 
>  arch/x86/kvm/mmu/tdp_iter.c | 176 
>  arch/x86/kvm/mmu/tdp_iter.h |  56 ++
>  5 files changed, 300 insertions(+), 67 deletions(-)
>  create mode 100644 arch/x86/kvm/mmu/tdp_iter.c
>  create mode 100644 arch/x86/kvm/mmu/tdp_iter.h
> 
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 7f86a14aed0e9..4525c1151bf99 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -15,7 +15,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF)  += $(KVM)/async_pf.o
>  
>  kvm-y+= x86.o emulate.o i8259.o irq.o lapic.o \
>  i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
> -hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
> +hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
> +mmu/tdp_iter.o
>  
>  kvm-intel-y  += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o 
> \
>  vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6c9db349600c8..6d82784ed5679 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -121,28 +121,6 @@ module_param(dbg, bool, 0644);
>  
>  #define PTE_PREFETCH_NUM 8
>  
> -#define PT_FIRST_AVAIL_BITS_SHIFT 10
> -#define PT64_SECOND_AVAIL_BITS_SHIFT 54
> -
> -/*
> - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
> - * Access Tracking SPTEs.
> - */
> -#define SPTE_SPECIAL_MASK (3ULL << 52)
> -#define SPTE_AD_ENABLED_MASK (0ULL << 52)
> -#define SPTE_AD_DISABLED_MASK (1ULL << 52)
> -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
> -#define SPTE_MMIO_MASK (3ULL << 52)
> -
> -#define PT64_LEVEL_BITS 9
> -
> -#define PT64_LEVEL_SHIFT(level) \
> - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
> -
> -#define PT64_INDEX(address, level)\
> - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
> -
> -
>  #define PT32_LEVEL_BITS 10
>  
>  #define PT32_LEVEL_SHIFT(level) \
> @@ -155,19 +133,6 @@ module_param(dbg, bool, 0644);
>  #define PT32_INDEX(address, level)\
>   (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
>  
> -
> -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
> -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
> -#else
> -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
> -#endif
> -#define PT64_LVL_ADDR_MASK(level) \
> - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
> - * PT64_LEVEL_BITS))) - 1))
> -#define PT64_LVL_OFFSET_MASK(level) \
> - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
> - * PT64_LEVEL_BITS))) - 1))
> -
>  #define PT32_BASE_ADDR_MASK PAGE_MASK
>  #define PT32_DIR_BASE_ADDR_MASK \
>   (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
> @@ -192,8 +157,6 @@ module_param(dbg, bool, 0644);
>  #define SPTE_HOST_WRITEABLE  (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>  #define SPTE_MMU_WRITEABLE   (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
>  
> -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> -
>  /* make pte_list_desc fit well in cache line */
>  #define PTE_LIST_EXT 3
>  
> @@ -349,11 +312,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 
> access_mask)
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
>  
> -static bool is_mmio_spte(u64 spte)
> -{
> - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
> -}
> -
>  static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
>  {
>   return sp->role.ad_disabled;
> @@ -626,35 +584,11 @@ static int is_nx(struct kvm_vcpu *vcpu)
>   return vcpu->arch.efer & EFER_NX;
>  }
>  
> -static int is_shadow_present_pte(u64 pte)
> -{
> - return (pte != 0) && !is_mmio_spte(pte);
> -}
> -
> -static int is_large_pte(u64 pte)
> -{
> - return pte & PT_PAGE_SIZE_MASK;
> -}
> -
> -static int is_last_spte(u64 pte, int level)
> -{
> - if (level == PG_LEVEL_4K)
> - return 1;
> - if (is_large_pte(pte))
> - return 1;
> - return 0;
> -}
> -
>  static bool is_executable_pte(u64 spte)
>  {
>   return (spte & (shadow_x_mask | shadow_nx_mask)) == 

Re: [RFC PATCH 01/13] kvm: Enable MTRR to work with GFNs with perm bits

2019-10-14 Thread Yu Zhang
On Thu, Oct 03, 2019 at 02:23:48PM -0700, Rick Edgecombe wrote:
> Mask gfn by maxphyaddr in kvm_mtrr_get_guest_memory_type so that the
> guests view of gfn is used when high bits of the physical memory are
> used as extra permissions bits. This supports the KVM XO feature.
> 
> TODO: Since MTRR is emulated using EPT permissions, the XO version of
> the gpa range will not inherrit the MTRR type with this implementation.
> There shouldn't be any legacy use of KVM XO, but hypothetically it could
> interfere with the uncacheable MTRR type.
> 
> Signed-off-by: Rick Edgecombe 
> ---
>  arch/x86/kvm/mtrr.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
> index 25ce3edd1872..da38f3b83e51 100644
> --- a/arch/x86/kvm/mtrr.c
> +++ b/arch/x86/kvm/mtrr.c
> @@ -621,6 +621,14 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, 
> gfn_t gfn)
>   const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
>  | (1 << MTRR_TYPE_WRTHROUGH);
>  
> + /*
> +  * Handle situations where gfn bits are used as permissions bits by
> +  * masking KVMs view of the gfn with the guests physical address bits
> +  * in order to match the guests view of physical address. For normal
> +  * situations this will have no effect.
> +  */
> + gfn &= (1ULL << (cpuid_maxphyaddr(vcpu) - PAGE_SHIFT));
> +

Won't this break the MTRR calculation for normal gfns?
Are you suggesting using the same MTRR value for the XO range as for the normal one?
If so, maybe we should use:

if (guest_cpuid_has(vcpu, X86_FEATURE_KVM_XO))
gfn &= ~(1ULL << (cpuid_maxphyaddr(vcpu) - PAGE_SHIFT));


>   start = gfn_to_gpa(gfn);
>   end = start + PAGE_SIZE;
>  
> -- 
> 2.17.1
> 

B.R.
Yu
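For readers following the masking discussion, a small user-space sketch of
what the suggested mask does, using hypothetical values (a guest-visible
MAXPHYADDR of 36, so the permission bit lands at gfn bit 24 after the 4K page
shift): clearing that bit maps an execute-only alias back to its normal gfn,
while ordinary gfns, which never have the bit set, are left unchanged.

#include <stdio.h>

int main(void)
{
	unsigned int maxphyaddr = 36;	/* guest-visible MAXPHYADDR (assumed) */
	unsigned int page_shift = 12;
	unsigned long long xo_bit = 1ULL << (maxphyaddr - page_shift);

	unsigned long long gfn    = 0x1234;		/* a normal gfn */
	unsigned long long xo_gfn = gfn | xo_bit;	/* its execute-only alias */

	printf("xo alias       : 0x%llx\n", xo_gfn);
	printf("alias, masked  : 0x%llx\n", xo_gfn & ~xo_bit);	/* back to 0x1234 */
	printf("normal, masked : 0x%llx\n", gfn & ~xo_bit);	/* unchanged */
	return 0;
}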


Re: [PATCH] KVM: MMU: record maximum physical address width in kvm_mmu_extended_role

2019-02-20 Thread Yu Zhang
On Wed, Feb 20, 2019 at 03:06:10PM +0100, Vitaly Kuznetsov wrote:
> Yu Zhang  writes:
> 
> > Previously, commit 7dcd57552008 ("x86/kvm/mmu: check if tdp/shadow
> > MMU reconfiguration is needed") offered some optimization to avoid
> > the unnecessary reconfiguration. Yet one scenario is broken - when
> > cpuid changes VM's maximum physical address width, reconfiguration
> > is needed to reset the reserved bits.  Also, the TDP may need to
> > reset its shadow_root_level when this value is changed.
> >
> > To fix this, a new field, maxphyaddr, is introduced in the extended
> > role structure to keep track of the configured guest physical address
> > width.
> >
> > Signed-off-by: Yu Zhang 
> > ---
> > Cc: Paolo Bonzini 
> > Cc: "Radim Krčmář" 
> > Cc: Thomas Gleixner 
> > Cc: Ingo Molnar 
> > Cc: Borislav Petkov 
> > Cc: "H. Peter Anvin" 
> > Cc: linux-kernel@vger.kernel.org
> > ---
> >  arch/x86/include/asm/kvm_host.h | 1 +
> >  arch/x86/kvm/mmu.c  | 1 +
> >  2 files changed, 2 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h 
> > b/arch/x86/include/asm/kvm_host.h
> > index 4660ce9..be87f71 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -299,6 +299,7 @@ struct kvm_mmu_memory_cache {
> > unsigned int cr4_smap:1;
> > unsigned int cr4_smep:1;
> > unsigned int cr4_la57:1;
> > +   unsigned int maxphyaddr:6;
> > };
> >  };
> >  
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index ce770b4..2b74505 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -4769,6 +4769,7 @@ static union kvm_mmu_extended_role 
> > kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
> > ext.cr4_pse = !!is_pse(vcpu);
> > ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
> > ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
> > +   ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
> >  
> > ext.valid = 1;
> 
> It seems that we can now drop 'valid' from role_ext as maxphyaddr can't
> be 0.

Thanks, Vitaly. Yes, we can drop this field. :)

> 
> Reviewed-by: Vitaly Kuznetsov 
> 
> -- 
> Vitaly
> 

B.R.
Yu


Re: [PATCH] KVM: MMU: record maximum physical address width in kvm_mmu_extended_role

2019-02-19 Thread Yu Zhang
Hi Paolo, any comments on this patch? And the other one ("kvm: x86: Return
LA57 feature based on hardware capability")? :-)

On Fri, Feb 01, 2019 at 12:09:23AM +0800, Yu Zhang wrote:
> Previously, commit 7dcd57552008 ("x86/kvm/mmu: check if tdp/shadow
> MMU reconfiguration is needed") offered some optimization to avoid
> the unnecessary reconfiguration. Yet one scenario is broken - when
> cpuid changes VM's maximum physical address width, reconfiguration
> is needed to reset the reserved bits.  Also, the TDP may need to
> reset its shadow_root_level when this value is changed.
> 
> To fix this, a new field, maxphyaddr, is introduced in the extended
> role structure to keep track of the configured guest physical address
> width.
> 
> Signed-off-by: Yu Zhang 
> ---
> Cc: Paolo Bonzini 
> Cc: "Radim Krčmář" 
> Cc: Thomas Gleixner 
> Cc: Ingo Molnar 
> Cc: Borislav Petkov 
> Cc: "H. Peter Anvin" 
> Cc: linux-kernel@vger.kernel.org
> ---
>  arch/x86/include/asm/kvm_host.h | 1 +
>  arch/x86/kvm/mmu.c  | 1 +
>  2 files changed, 2 insertions(+)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 4660ce9..be87f71 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -299,6 +299,7 @@ struct kvm_mmu_memory_cache {
>   unsigned int cr4_smap:1;
>   unsigned int cr4_smep:1;
>   unsigned int cr4_la57:1;
> + unsigned int maxphyaddr:6;
>   };
>  };
>  
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index ce770b4..2b74505 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -4769,6 +4769,7 @@ static union kvm_mmu_extended_role 
> kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
>   ext.cr4_pse = !!is_pse(vcpu);
>   ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
>   ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
> + ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
>  
>   ext.valid = 1;
>  
> -- 
> 1.9.1
> 

Thanks
Yu


Re: linux-next: Fixes tag needs some work in the kvm tree

2019-02-19 Thread Yu Zhang
Thanks for the notification, Stephen.
@Paolo, should I resubmit the patch to correct this?

On Sat, Feb 16, 2019 at 06:34:33PM +1100, Stephen Rothwell wrote:
> Hi all,
> 
> In commit
> 
>   aa8359972cfc ("KVM: x86/mmu: Differentiate between nr zapped and list 
> unstable")
> 
> Fixes tag
> 
>   Fixes: 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return
> 
> has these problem(s):
> 
>   - Subject has leading but no trailing quotes
> Please do not split Fixes tags over more than one line
> 
> In commit
> 
>   4d3f8e4ff75e ("kvm: vmx: Fix typos in vmentry/vmexit control setting")
> 
> Fixes tag
> 
>   Fixes: 'commit f99e3daf94ff ("KVM: x86: Add Intel PT virtualization work 
> mode")'
> 
> has these problem(s):
> 
>   - No SHA1 recognised
> The leading word 'commit' is unexpected and the surrounding single
> quotes likewise.
> Please just use
>   git log -1 --format='Fixes: %h ("%s")' 
> 
> -- 
> Cheers,
> Stephen Rothwell

B.R.
Yu



[PATCH] kvm: x86: Return LA57 feature based on hardware capability

2019-01-31 Thread Yu Zhang
Previously, commit 372fddf70904 ("x86/mm: Introduce the 'no5lvl' kernel
parameter") cleared X86_FEATURE_LA57 in boot_cpu_data if Linux chooses
not to run in 5-level paging mode. Yet boot_cpu_data is later queried by
do_cpuid_ent() as the host capability when creating vcpus, so Qemu
will not be able to detect this feature and create VMs with the LA57 feature.

As discussed earlier, VMs can still benefit from the extended linear address
width, e.g. to enhance features like ASLR. So we would like to fix this
by returning the true hardware capability when Qemu queries.

Signed-off-by: Yu Zhang 
---
Cc: Paolo Bonzini 
Cc: "Radim Krčmář" 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/kvm/cpuid.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index bbffa6c..c07958b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -335,6 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
+   unsigned f_la57 = 0;
 
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -489,7 +490,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
// TSC_ADJUST is emulated
entry->ebx |= F(TSC_ADJUST);
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+   f_la57 = entry->ecx & F(LA57);
cpuid_mask(>ecx, CPUID_7_ECX);
+   /* Set LA57 based on hardware capability. */
+   entry->ecx |= f_la57;
entry->ecx |= f_umip;
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
-- 
1.9.1



[PATCH] kvm: vmx: Fix typos in vmentry/vmexit control setting

2019-01-30 Thread Yu Zhang
Previously, 'commit f99e3daf94ff ("KVM: x86: Add Intel PT
virtualization work mode")' offered the framework to support
Intel PT virtualization. However, the patch has some typos in
vmx_vmentry_ctrl() and vmx_vmexit_ctrl(): it used the wrong
flags and the wrong variable, which will cause VM entry
failures later.

Fixes: 'commit f99e3daf94ff ("KVM: x86: Add Intel PT virtualization work mode")'
Signed-off-by: Yu Zhang 
---
Cc: Paolo Bonzini 
Cc: "Radim Krčmář" 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/kvm/vmx/vmx.h | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9932895..267de48 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -445,7 +445,8 @@ static inline u32 vmx_vmentry_ctrl(void)
 {
u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
if (pt_mode == PT_MODE_SYSTEM)
-   vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 
VM_EXIT_CLEAR_IA32_RTIT_CTL);
+   vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
return vmentry_ctrl &
~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 
VM_ENTRY_LOAD_IA32_EFER);
@@ -455,9 +456,10 @@ static inline u32 vmx_vmexit_ctrl(void)
 {
u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
if (pt_mode == PT_MODE_SYSTEM)
-   vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 
VM_ENTRY_LOAD_IA32_RTIT_CTL);
+   vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+VM_EXIT_CLEAR_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
-   return vmcs_config.vmexit_ctrl &
+   return vmexit_ctrl &
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
 }
 
-- 
1.9.1



Re: [PATCH] KVM: x86: Fix the NULL pointer parameter in check_cr_write()

2017-09-20 Thread Yu Zhang



On 9/20/2017 4:13 PM, Paolo Bonzini wrote:

On 20/09/2017 08:35, Yu Zhang wrote:

2 reasons I did not choose to change kvm_cpuid(): 1> like Jim's
comments, kvm_cpuid() will eventually write the *eax - *edx no
matter a cpuid entry is found or not; 2> currently, return value of
kvm_cpuid() is either true when an entry is found or false otherwise.
We can change kvm_cpuid() to check the pointers of GPRs against NULL
and return false immediately. Then the false value would have 2
different meanings - entry not found, or invalid params.

Paolo, any suggestion? :-)

Radim has already sent this version to Linus. :)


Got it. Thanks. :)

Yu

Paolo





Re: [PATCH] KVM: x86: Fix the NULL pointer parameter in check_cr_write()

2017-09-20 Thread Yu Zhang



On 9/18/2017 11:56 PM, Jim Mattson wrote:

kvm_cpuid ultimately wants to write all four of the GPRs passed in by
reference. I don't see any advantage to allowing some of these
pointers to be NULL.


Thanks for your comments, Jim & David.

2 reasons I did not choose to change kvm_cpuid():
1> like Jim's comments, kvm_cpuid() will eventually write *eax - *edx
no matter whether a cpuid entry is found or not;
2> currently, the return value of kvm_cpuid() is either true when an
entry is found or false otherwise. We can change kvm_cpuid() to check
the pointers of GPRs against NULL and return false immediately. Then
the false value would have 2 different meanings - entry not found, or
invalid params.

Paolo, any suggestion? :-)

Thanks
Yu


Reviewed-by: Jim Mattson <jmatt...@google.com>

On Mon, Sep 18, 2017 at 5:19 AM, David Hildenbrand <da...@redhat.com> wrote:

On 18.09.2017 12:45, Yu Zhang wrote:

Routine check_cr_write() will trigger emulator_get_cpuid()->
kvm_cpuid() to get maxphyaddr, and NULL is passed as values
for ebx/ecx/edx. This is problematic because kvm_cpuid() will
dereference these pointers.

Fixes: d1cd3ce90044 ("KVM: MMU: check guest CR3 reserved bits based on its physical 
address width.")
Reported-by: Jim Mattson <jmatt...@google.com>
Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
  arch/x86/kvm/emulate.c | 8 +---
  1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 16bf665..15f527b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4102,10 +4102,12 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
   ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
   if (efer & EFER_LMA) {
   u64 maxphyaddr;
- u32 eax = 0x80000008;
+ u32 eax, ebx, ecx, edx;

- if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL,
-  NULL, false))
+ eax = 0x80000008;
+ ecx = 0;
+ if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
+  &edx, false))
   maxphyaddr = eax & 0xff;
   else
   maxphyaddr = 36;


Not sure if fixing kvm_cpuid() would be better.

Reviewed-by: David Hildenbrand <da...@redhat.com>

--

Thanks,

David




[PATCH] KVM: x86: Fix the NULL pointer parameter in check_cr_write()

2017-09-18 Thread Yu Zhang
Routine check_cr_write() will trigger emulator_get_cpuid()->
kvm_cpuid() to get maxphyaddr, and NULL is passed as values
for ebx/ecx/edx. This is problematic because kvm_cpuid() will
dereference these pointers.

Fixes: d1cd3ce90044 ("KVM: MMU: check guest CR3 reserved bits based on its 
physical address width.")
Reported-by: Jim Mattson <jmatt...@google.com>
Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/kvm/emulate.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 16bf665..15f527b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4102,10 +4102,12 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
if (efer & EFER_LMA) {
u64 maxphyaddr;
-   u32 eax = 0x80000008;
+   u32 eax, ebx, ecx, edx;
 
-   if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL,
-NULL, false))
+   eax = 0x80000008;
+   ecx = 0;
+   if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
+&edx, false))
maxphyaddr = eax & 0xff;
else
maxphyaddr = 36;
-- 
2.5.0



Re: [PATCH v3 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-09-18 Thread Yu Zhang



On 9/18/2017 4:41 PM, Paolo Bonzini wrote:

On 18/09/2017 10:15, Yu Zhang wrote:

static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool
check_limit)
{
  return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx,
check_limit);
}

And:

bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
 u32 *ecx, u32 *edx, bool check_limit)
{
u32 function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *best;
bool entry_found = true;
...

Doesn't this immediately try to dereference a NULL pointer?  How much
testing have you done of this code?

Thanks Jim.
I have tested this code in a simulator to successfully boot a VM in
shadow mode. Seems this code is not covered(but I am now still
perplexed why this is not covered). Any possibility that the
check_cr_write() is not triggered when emulating the cr operations?

CR moves usually don't go through the emulator (the main exception is
emulation of invalid guest state when the processor doesn't support
unrestricted_guest=1, but even that is unlikely to happen with
EFER.LMA=1).  This explains why you didn't see the failure.


Oh, right. It normally goes to handle_cr(). Thanks, Paolo.

Yu




Anyway, this should be a bug and thanks for pointing this out, and
I'll send out the fix later.

Thanks,

Paolo





Re: [PATCH v3 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-09-18 Thread Yu Zhang



On 9/16/2017 7:19 AM, Jim Mattson wrote:

On Thu, Aug 24, 2017 at 5:27 AM, Yu Zhang <yu.c.zh...@linux.intel.com> wrote:

Currently, KVM uses CR3_L_MODE_RESERVED_BITS to check the
reserved bits in CR3. Yet the length of reserved bits in
guest CR3 should be based on the physical address width
exposed to the VM. This patch changes CR3 check logic to
calculate the reserved bits at runtime.

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
  arch/x86/include/asm/kvm_host.h |  1 -
  arch/x86/kvm/emulate.c  | 14 --
  arch/x86/kvm/mmu.h  |  3 +++
  arch/x86/kvm/x86.c  |  8 
  4 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6db0ed9..e716228 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,7 +79,6 @@
   | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
   | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))

-#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
  #define CR3_PCID_INVD   BIT_64(63)
  #define CR4_RESERVED_BITS   \
 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 319d91f..a89b595 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -28,6 +28,7 @@

  #include "x86.h"
  #include "tss.h"
+#include "mmu.h"

  /*
   * Operand types
@@ -4097,8 +4098,17 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
 u64 rsvd = 0;

 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-   if (efer & EFER_LMA)
-   rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;
+   if (efer & EFER_LMA) {
+   u64 maxphyaddr;
+   u32 eax = 0x80000008;
+
+   if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL,
+NULL, false))

Passing NULL for the address of ecx looks problematic to me.

We have:

static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool
check_limit)
{
 return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
}

And:

bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool check_limit)
{
u32 function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *best;
bool entry_found = true;
...

Doesn't this immediately try to dereference a NULL pointer?  How much
testing have you done of this code?


Thanks Jim.
I have tested this code in a simulator to successfully boot a VM in
shadow mode.
Seems this code is not covered (but I am now still perplexed why this
is not covered). Any possibility that the check_cr_write() is not
triggered when emulating the cr operations?

Anyway, this should be a bug and thanks for pointing this out, and
I'll send out the fix later.

BR
Yu


Re: [PATCH v3 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-08-24 Thread Yu Zhang



On 8/25/2017 12:27 AM, Paolo Bonzini wrote:

On 24/08/2017 17:38, Yu Zhang wrote:


In practice, MAXPHYADDR will never be 59 even because the PKRU bits are
at bits 59..62.

Thanks, Paolo.
I see. I had made an assumption that MAXPHYADDR shall not exceed the
physical one,
which is 52 I believe. But I'm not sure there's any place to check this.
Maybe we should make sure the vcpu->arch.maxphyaddr will not be greater
than the value of the host?

That's a separate change anyway.  In any case, since currently the
MAXPHYADDR is not validated, your change to rsvd_bits makes sense.


Thanks, Paolo.
As to this patch series, is there any change I need to make?

BTW, I have written a patch for the kvm-unit-test access test, but the
test failed. Not sure if my patch is erroneous or if it is due to a
simulator error. I'll send out the test patch after it works. :-)

Yu

Paolo





Re: [PATCH v3 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-08-24 Thread Yu Zhang



On 8/24/2017 11:50 PM, Paolo Bonzini wrote:

On 24/08/2017 17:23, Yu Zhang wrote:

 static inline u64 rsvd_bits(int s, int e)
   {
+if (e < s)
+return 0;
+
   return ((1ULL << (e - s + 1)) - 1) << s;
   }

e = s - 1 is already supported; why do you need e <= s - 2?

Sorry? I do not quite understand. When will e = s - 1?

Is there any case where e < s?  I can see that MAXPHYADDR=63 gives
rsvd_bits(63, 62), but that works.

In practice, MAXPHYADDR will never be 59 even because the PKRU bits are
at bits 59..62.


Thanks, Paolo.
I see. I had made an assumption that MAXPHYADDR shall not exceed the
physical one, which is 52 I believe. But I'm not sure there's any place
to check this. Maybe we should make sure the vcpu->arch.maxphyaddr will
not be greater than the value of the host?

Yu


Paolo
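
A standalone illustration of the corner case discussed above (compiled
outside the kernel; the helper is copied from mmu.h as it is before this
series adds the e < s check): with e == s - 1, as in rsvd_bits(63, 62)
for MAXPHYADDR=63, the existing expression already evaluates to 0, so
only e <= s - 2 would need special handling, and a valid MAXPHYADDR
never produces that.

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t rsvd_bits(int s, int e)
  {
          return ((1ULL << (e - s + 1)) - 1) << s;
  }

  int main(void)
  {
          /* MAXPHYADDR = 46: bits 46..62 of a PTE are reserved. */
          printf("rsvd_bits(46, 62) = %#llx\n",
                 (unsigned long long)rsvd_bits(46, 62));
          /* e == s - 1: (1ULL << 0) - 1 == 0, so the mask is already 0. */
          printf("rsvd_bits(63, 62) = %#llx\n",
                 (unsigned long long)rsvd_bits(63, 62));
          return 0;
  }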





Re: [PATCH v3 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-08-24 Thread Yu Zhang



On 8/24/2017 9:40 PM, Paolo Bonzini wrote:

On 24/08/2017 14:27, Yu Zhang wrote:

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3ed6192..67e7ec2 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,9 @@
  
  static inline u64 rsvd_bits(int s, int e)

  {
+   if (e < s)
+   return 0;
+
return ((1ULL << (e - s + 1)) - 1) << s;
  }

e = s - 1 is already supported; why do you need e <= s - 2?


Sorry? I do not quite understand. When will e = s - 1?

Thanks
Yu

Paolo





[PATCH v3 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-08-24 Thread Yu Zhang
Currently, KVM uses CR3_L_MODE_RESERVED_BITS to check the
reserved bits in CR3. Yet the length of reserved bits in
guest CR3 should be based on the physical address width
exposed to the VM. This patch changes CR3 check logic to
calculate the reserved bits at runtime.
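
As a worked example of the runtime check (illustrative numbers, not
taken from the patch): a guest with MAXPHYADDR = 36 gets
rsvd = rsvd_bits(36, 62) = 0x7ffffff000000000, while a guest with
MAXPHYADDR = 46 gets rsvd_bits(46, 62) = 0x7fffc00000000000. A CR3
pointing at a page table at physical address 0x1000000000 (bit 36 set,
i.e. at 64 GiB) therefore takes #GP on the 36-bit guest but is accepted
on the 46-bit one, which no single fixed constant can express.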

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/emulate.c  | 14 --
 arch/x86/kvm/mmu.h  |  3 +++
 arch/x86/kvm/x86.c  |  8 
 4 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6db0ed9..e716228 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,7 +79,6 @@
  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 
-#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
 #define CR3_PCID_INVD   BIT_64(63)
 #define CR4_RESERVED_BITS   \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 319d91f..a89b595 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -28,6 +28,7 @@
 
 #include "x86.h"
 #include "tss.h"
+#include "mmu.h"
 
 /*
  * Operand types
@@ -4097,8 +4098,17 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
u64 rsvd = 0;
 
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-   if (efer & EFER_LMA)
-   rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;
+   if (efer & EFER_LMA) {
+   u64 maxphyaddr;
+   u32 eax = 0x80000008;
+
+   if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL,
+NULL, false))
+   maxphyaddr = eax & 0xff;
+   else
+   maxphyaddr = 36;
+   rsvd = rsvd_bits(maxphyaddr, 62);
+   }
 
if (new_val & rsvd)
return emulate_gp(ctxt, 0);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3ed6192..67e7ec2 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,9 @@
 
 static inline u64 rsvd_bits(int s, int e)
 {
+   if (e < s)
+   return 0;
+
return ((1ULL << (e - s + 1)) - 1) << s;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cc2c7e4..79f5889 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -813,10 +813,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
 
-   if (is_long_mode(vcpu)) {
-   if (cr3 & CR3_L_MODE_RESERVED_BITS)
-   return 1;
-   } else if (is_pae(vcpu) && is_paging(vcpu) &&
+   if (is_long_mode(vcpu) &&
+   (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62)))
+   return 1;
+   else if (is_pae(vcpu) && is_paging(vcpu) &&
   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
 
-- 
2.5.0



[PATCH v3 1/5] KVM: x86: Add return value to kvm_cpuid().

2017-08-24 Thread Yu Zhang
Return false in kvm_cpuid() when it fails to find the cpuid
entry. Also, this routine (and its callers) is optimized with
a new argument - check_limit, so that the check_cpuid_limit()
fallback can be avoided.

Signed-off-by: Yu Zhang 
---
 arch/x86/include/asm/kvm_emulate.h |  4 ++--
 arch/x86/kvm/cpuid.c   | 17 +
 arch/x86/kvm/cpuid.h   |  3 ++-
 arch/x86/kvm/emulate.c | 12 ++--
 arch/x86/kvm/svm.c |  2 +-
 arch/x86/kvm/trace.h   | 11 +++
 arch/x86/kvm/x86.c |  6 +++---
 7 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h 
b/arch/x86/include/asm/kvm_emulate.h
index fde36f1..fa2558e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -219,8 +219,8 @@ struct x86_emulate_ops {
 struct x86_instruction_info *info,
 enum x86_intercept_stage stage);
 
-   void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+   bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool check_limit);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
 
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59ca2ee..1450547 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -853,16 +853,24 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct 
kvm_vcpu *vcpu,
return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
 }
 
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+  u32 *ecx, u32 *edx, bool check_limit)
 {
u32 function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *best;
+   bool entry_found = true;
 
best = kvm_find_cpuid_entry(vcpu, function, index);
 
-   if (!best)
+   if (!best) {
+   entry_found = false;
+   if (!check_limit)
+   goto out;
+
best = check_cpuid_limit(vcpu, function, index);
+   }
 
+out:
if (best) {
*eax = best->eax;
*ebx = best->ebx;
@@ -870,7 +878,8 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, 
u32 *ecx, u32 *edx)
*edx = best->edx;
} else
*eax = *ebx = *ecx = *edx = 0;
-   trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx);
+   trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
+   return entry_found;
 }
 EXPORT_SYMBOL_GPL(kvm_cpuid);
 
@@ -883,7 +892,7 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 
eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-   kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
+   kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index ac15193..1ea3c0e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -21,7 +21,8 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
  struct kvm_cpuid2 *cpuid,
  struct kvm_cpuid_entry2 __user *entries);
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+  u32 *ecx, u32 *edx, bool check_limit);
 
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fb00559..319d91f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2333,7 +2333,7 @@ static int emulator_has_longmode(struct x86_emulate_ctxt 
*ctxt)
 
eax = 0x80000001;
ecx = 0;
-   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return edx & bit(X86_FEATURE_LM);
 }
 
@@ -2636,7 +2636,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
u32 eax, ebx, ecx, edx;
 
eax = ecx = 0;
-   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
@@ -2656,7 +2656,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt 
*ctxt)
 
eax = 0x00000000;
ecx = 0x00000000;
-   ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+   ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
/*
  

[PATCH v3 4/5] KVM: MMU: Add 5 level EPT & Shadow page table support.

2017-08-24 Thread Yu Zhang
Extends the shadow paging code, so that a 5 level shadow page
table can be constructed if the VM is running in 5 level paging
mode.

Also extends the EPT code, so that a 5 level EPT table can be
constructed if the VM's MAXPHYADDR exceeds 48 bits. Unlike the
shadow logic, KVM should still use a 4 level EPT table for a VM
whose physical address width does not exceed 48 bits, even when
the VM is running in 5 level paging mode.
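
A standalone sketch of that level-selection rule (the helpers below are
stand-ins for the real KVM/VMX capability and CPUID queries, not code
from this patch):

  #include <stdbool.h>
  #include <stdio.h>

  /* Assume the CPU advertises 5-level EPT page walks. */
  static bool cpu_has_ept_5levels(void) { return true; }

  static int ept_level_for(int guest_maxphyaddr)
  {
          /*
           * A guest whose MAXPHYADDR fits in 48 bits keeps a 4 level EPT
           * even when it runs with CR4.LA57 set; only a wider guest
           * physical address space needs the 5 level walk.
           */
          if (cpu_has_ept_5levels() && guest_maxphyaddr > 48)
                  return 5;
          return 4;
  }

  int main(void)
  {
          printf("MAXPHYADDR 46 -> %d-level EPT\n", ept_level_for(46));
          printf("MAXPHYADDR 52 -> %d-level EPT\n", ept_level_for(52));
          return 0;
  }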

Signed-off-by: Yu Zhang 
---
 arch/x86/include/asm/kvm_host.h | 10 +-
 arch/x86/include/asm/vmx.h  |  2 ++
 arch/x86/kvm/cpuid.c|  5 +
 arch/x86/kvm/mmu.c  | 43 +++--
 arch/x86/kvm/mmu.h  |  1 +
 arch/x86/kvm/mmu_audit.c|  4 ++--
 arch/x86/kvm/svm.c  |  4 ++--
 arch/x86/kvm/vmx.c  | 21 ++--
 arch/x86/kvm/x86.h  | 10 ++
 9 files changed, 71 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5907d46..bdef532 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -315,7 +315,7 @@ struct kvm_pio_request {
int size;
 };
 
-#define PT64_ROOT_MAX_LEVEL 4
+#define PT64_ROOT_MAX_LEVEL 5
 
 struct rsvd_bits_validate {
u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
@@ -323,9 +323,9 @@ struct rsvd_bits_validate {
 };
 
 /*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
- * mode.
+ * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
+ * and 2-level 32-bit).  The kvm_mmu structure abstracts the details of the
+ * current mmu mode.
  */
 struct kvm_mmu {
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
@@ -982,7 +982,7 @@ struct kvm_x86_ops {
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
-   int (*get_tdp_level)(void);
+   int (*get_tdp_level)(struct kvm_vcpu *vcpu);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 340007a..caec841 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -453,6 +453,7 @@ enum vmcs_field {
 
 #define VMX_EPT_EXECUTE_ONLY_BIT   (1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT(1ull << 6)
+#define VMX_EPT_PAGE_WALK_5_BIT(1ull << 7)
 #define VMX_EPTP_UC_BIT(1ull << 8)
 #define VMX_EPTP_WB_BIT(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT   (1ull << 16)
@@ -471,6 +472,7 @@ enum vmcs_field {
 #define VMX_EPT_MT_EPTE_SHIFT  3
 #define VMX_EPTP_PWL_MASK  0x38ull
 #define VMX_EPTP_PWL_4 0x18ull
+#define VMX_EPTP_PWL_5 0x20ull
 #define VMX_EPTP_AD_ENABLE_BIT (1ull << 6)
 #define VMX_EPTP_MT_MASK   0x7ull
 #define VMX_EPTP_MT_WB 0x6ull
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 1450547..83865a3 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -137,6 +137,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
+#ifdef CONFIG_X86_64
+   if (vcpu->arch.maxphyaddr > 48)
+   kvm_mmu_reset_context(vcpu);
+#endif
+
kvm_pmu_refresh(vcpu);
return 0;
 }
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3faa20c..f47ccca 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3322,8 +3322,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
-   (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+   if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
+   (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL ||
 vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
 
@@ -3375,13 +3375,14 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+   if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
if(make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->

[PATCH v3 5/5] KVM: MMU: Expose the LA57 feature to VM.

2017-08-24 Thread Yu Zhang
This patch exposes the 5 level page table feature to the VM.
At the same time, the canonical virtual address checking is
extended to support both 48-bit and 57-bit address widths.
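
A standalone sketch of the widened canonical check (compiled outside
the kernel; it mirrors the get_canonical() helper this patch extends
with a variable address width): an address is canonical iff
sign-extending it from the configured width reproduces it.

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t get_canonical(uint64_t la, unsigned vaddr_bits)
  {
          unsigned shift = 64 - vaddr_bits;

          return (uint64_t)((int64_t)(la << shift) >> shift);
  }

  int main(void)
  {
          /* Bits 47..55 set, bits 56..63 clear. */
          uint64_t la = 0x00ff800000000000ULL;

          /* Non-canonical with 48-bit linear addresses ... */
          printf("48-bit: %s\n",
                 get_canonical(la, 48) == la ? "canonical" : "non-canonical");
          /* ... but canonical once LA57 gives 57-bit linear addresses. */
          printf("57-bit: %s\n",
                 get_canonical(la, 57) == la ? "canonical" : "non-canonical");
          return 0;
  }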

Signed-off-by: Yu Zhang 
---
 arch/x86/include/asm/kvm_host.h | 18 ++
 arch/x86/kvm/cpuid.c| 16 ++--
 arch/x86/kvm/emulate.c  | 16 +---
 arch/x86/kvm/kvm_cache_regs.h   |  2 +-
 arch/x86/kvm/vmx.c  |  8 
 arch/x86/kvm/x86.c  |  7 +--
 arch/x86/kvm/x86.h  | 34 ++
 7 files changed, 65 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bdef532..b4d4f51 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,8 +85,8 @@
  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | 
X86_CR4_PCIDE \
  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
- | X86_CR4_PKE))
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
+ | X86_CR4_SMAP | X86_CR4_PKE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
@@ -1300,20 +1300,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, 
u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 }
 
-static inline u64 get_canonical(u64 la)
-{
-   return ((int64_t)la << 16) >> 16;
-}
-
-static inline bool is_noncanonical_address(u64 la)
-{
-#ifdef CONFIG_X86_64
-   return get_canonical(la) != la;
-#else
-   return false;
-#endif
-}
-
 #define TSS_IOPB_BASE_OFFSET 0x66
 #define TSS_BASE_SIZE 0x68
 #define TSS_IOPB_SIZE (65536 / 8)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 83865a3..8683811 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -126,13 +126,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
/*
-* The existing code assumes virtual address is 48-bit in the canonical
-* address checks; exit if it is ever changed.
+* The existing code assumes virtual address is 48-bit or 57-bit in the
+* canonical address checks; exit if it is ever changed.
 */
best = kvm_find_cpuid_entry(vcpu, 0x8008, 0);
-   if (best && ((best->eax & 0xff00) >> 8) != 48 &&
-   ((best->eax & 0xff00) >> 8) != 0)
-   return -EINVAL;
+   if (best) {
+   int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+   if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+   return -EINVAL;
+   }
 
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -388,7 +391,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
-   F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
+   F(AVX512VBMI) | F(LA57) | F(PKU) |
+   0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
 
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a89b595..16bf665 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -689,16 +689,18 @@ static __always_inline int __linearize(struct 
x86_emulate_ctxt *ctxt,
ulong la;
u32 lim;
u16 sel;
+   u8  va_bits;
 
la = seg_base(ctxt, addr.seg) + addr.ea;
*max_size = 0;
switch (mode) {
case X86EMUL_MODE_PROT64:
*linear = la;
-   if (is_noncanonical_address(la))
+   va_bits = ctxt_virt_addr_bits(ctxt);
+   if (get_canonical(la, va_bits) != la)
goto bad;
 
-   *max_size = min_t(u64, ~0u, (1ull << 48) - la);
+   *max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
if (size > *max_size)
goto bad;
break;
@@ -1749,8 +1751,8 @@ static int __load_segment_descriptor(struct 
x86_emulate_ctxt *ctxt,
sizeof(base3), &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
return ret;
-   if (is_noncanonical_address(get_desc_base(&seg_desc) |
-((u64)base3 << 32)))
+   if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+   ((u64)base3 << 32), ctxt))

[PATCH v3 3/5] KVM: MMU: Rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL.

2017-08-24 Thread Yu Zhang
Now that we have both 4 level and 5 level page tables in 64-bit
long mode, rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL so that
PT64_ROOT_5LEVEL can be used for the 5 level page table; this
helps make the code clearer.

Also, PT64_ROOT_MAX_LEVEL is defined as 4, so that we can simply
redefine it to 5 whenever 5 level paging needs it.

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +++-
 arch/x86/kvm/mmu.c  | 36 ++--
 arch/x86/kvm/mmu.h  |  2 +-
 arch/x86/kvm/mmu_audit.c|  4 ++--
 arch/x86/kvm/svm.c  |  2 +-
 5 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e716228..5907d46 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -315,8 +315,10 @@ struct kvm_pio_request {
int size;
 };
 
+#define PT64_ROOT_MAX_LEVEL 4
+
 struct rsvd_bits_validate {
-   u64 rsvd_bits_mask[2][4];
+   u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
u64 bad_mt_xwr;
 };
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2dafd36..3faa20c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2167,8 +2167,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t 
gfn,
 }
 
 struct mmu_page_path {
-   struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
-   unsigned int idx[PT64_ROOT_LEVEL];
+   struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+   unsigned int idx[PT64_ROOT_MAX_LEVEL];
 };
 
 #define for_each_sp(pvec, sp, parents, i)  \
@@ -2383,8 +2383,8 @@ static void shadow_walk_init(struct 
kvm_shadow_walk_iterator *iterator,
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;
 
-   if (iterator->level == PT64_ROOT_LEVEL &&
-   vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+   if (iterator->level == PT64_ROOT_4LEVEL &&
+   vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu.direct_map)
--iterator->level;
 
@@ -3322,8 +3322,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
-   (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
+   (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
 vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
 
@@ -3375,13 +3375,13 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
if(make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
return 1;
}
-   sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL);
+   sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3425,7 +3425,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * Do we shadow a long mode page table? If so we need to
 * write-protect the guests page table root.
 */
-   if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+   if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
 
MMU_WARN_ON(VALID_PAGE(root));
@@ -3435,7 +3435,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
return 1;
}
-   sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+   sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL,
  0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
@@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * the shadow page table may be a PAE or a long mode page table.
 */
pm_mask = PT_PRESENT_MASK;
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
for (i = 0; i < 4; ++i) {
@@ 

[PATCH v3 0/5] KVM: MMU: 5 level EPT/shadow support

2017-08-24 Thread Yu Zhang
Intel's existing processors limit the maximum linear address width to
48 bits and the maximum physical address width to 46 bits. Upcoming
processors will extend the maximum linear address width to 57 bits,
and the maximum physical address width can go up to 52 bits in practice.

With a linear address width greater than 48 bits, a new IA-32e paging
mode is introduced - 5-level paging (also known as LA57). To support
VMs with this feature, the KVM MMU code needs to be extended.

To achieve this, this patchset:
1> leverages 2 QEMU parameters, +la57 and phys-bits, to expose a wider
linear address width and physical address width to the VM;
2> extends the shadow paging logic to construct 5-level shadow page
tables for VMs running in LA57 mode;
3> extends the EPT logic to construct 5-level EPT tables for VMs whose
maximum physical address width exceeds 48 bits.
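
For reference, a small standalone sketch (not part of the patches) of how the
two guest CPUID fields involved are decoded - CPUID.(EAX=7,ECX=0):ECX[16]
advertises LA57, and CPUID.80000008H:EAX reports the address widths:

#include <stdint.h>

static inline int guest_has_la57(uint32_t leaf7_ecx)
{
	return (leaf7_ecx >> 16) & 1;		/* LA57 feature bit */
}

static inline int guest_phys_bits(uint32_t leaf_80000008_eax)
{
	return leaf_80000008_eax & 0xff;	/* e.g. 46, up to 52 */
}

static inline int guest_virt_bits(uint32_t leaf_80000008_eax)
{
	return (leaf_80000008_eax >> 8) & 0xff;	/* 48, or 57 with LA57 */
}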

Changes in v3: 
- Address comments from Paolo Bonzini: do not fall into check_cpuid_limit()
  in kvm_cpuid() for em_movbe() and check_fxsr();
- Address comments from Paolo Bonzini: change parameter 'check_limit' of
  kvm_cpuid() to bool type;
- Address comments from Paolo Bonzini: set maxphyaddr to 36, for guest cr3
  reserved bits check if cpuid.0x80000008 is not available;
- Address comments from Paolo Bonzini: replace the hardcoded value 48 as
  va_bits in __linearize();
- Rebase change: add new eptp definition VMX_EPTP_PWL_5, instead of using bit
  shifts (in line with previous commit bb97a01).

Changes in v2: 
- Address comments from Paolo Bonzini and Jim Mattson: add a new patch to let
  kvm_cpuid() return false when cpuid entry is not found; 
- Address comments from Paolo Bonzini: fix a typo in check_cr_write() and use
  62 as the upper limit when checking reserved bits for a physical address;
- Address comments from Paolo Bonzini: move definition of PT64_ROOT_MAX_LEVEL
  into kvm_host.h;
- Address comments from Paolo Bonzini: add checking for shadow_root_level in
  mmu_free_roots(); 
- Address comments from Paolo Bonzini: set root_level & shadow_root_level both
  to PT64_ROOT_4LEVEL for shadow ept situation.


Yu Zhang (5):
  KVM: x86: Add return value to kvm_cpuid().
  KVM: MMU: check guest CR3 reserved bits based on its physical address
width.
  KVM: MMU: Rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL.
  KVM: MMU: Add 5 level EPT & Shadow page table support.
  KVM: MMU: Expose the LA57 feature to VM.

 arch/x86/include/asm/kvm_emulate.h |  4 +--
 arch/x86/include/asm/kvm_host.h| 31 ++--
 arch/x86/include/asm/vmx.h |  2 ++
 arch/x86/kvm/cpuid.c   | 38 +---
 arch/x86/kvm/cpuid.h   |  3 +-
 arch/x86/kvm/emulate.c | 42 +--
 arch/x86/kvm/kvm_cache_regs.h  |  2 +-
 arch/x86/kvm/mmu.c | 59 --
 arch/x86/kvm/mmu.h |  6 +++-
 arch/x86/kvm/mmu_audit.c   |  4 +--
 arch/x86/kvm/svm.c |  8 +++---
 arch/x86/kvm/trace.h   | 11 ---
 arch/x86/kvm/vmx.c | 29 ---
 arch/x86/kvm/x86.c | 21 --
 arch/x86/kvm/x86.h | 44 
 15 files changed, 201 insertions(+), 103 deletions(-)

-- 
2.5.0



Re: [PATCH v1 4/4] KVM: MMU: Expose the LA57 feature to VM.

2017-08-21 Thread Yu Zhang



On 8/21/2017 6:12 PM, Paolo Bonzini wrote:

On 21/08/2017 09:27, Yu Zhang wrote:


On 8/18/2017 8:50 PM, Paolo Bonzini wrote:

On 18/08/2017 10:28, Yu Zhang wrote:

On 8/17/2017 10:29 PM, Paolo Bonzini wrote:

On 17/08/2017 13:53, Yu Zhang wrote:

On 8/17/2017 7:57 PM, Paolo Bonzini wrote:

On 12/08/2017 15:35, Yu Zhang wrote:

index a98b88a..50107ae 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -694,7 +694,7 @@ static __always_inline int __linearize(struct
x86_emulate_ctxt *ctxt,
 switch (mode) {
 case X86EMUL_MODE_PROT64:
 *linear = la;
-if (is_noncanonical_address(la))
+if (emul_is_noncanonical_address(la, ctxt))
 goto bad;
   *max_size = min_t(u64, ~0u, (1ull << 48) - la);

Oops, you missed one here.  Probably best to use ctxt_virt_addr_bits
and
then "inline" emul_is_noncanonical_address as "get_canonical(la,
va_bits) != la".

Sorry, I just sent out the v2 patch set without noticing this
reply. :-)

The emul_is_noncanonical() is defined in x86.h so that no
ctxt_virt_addr_bits needed in emulate.c, are you
suggesting to use ctx_virt_addr_bits in this file each time before
emul_is_noncanonical_address() is called?

No, only in this instance which uses "48" after the call to
emul_is_noncanonical_address.

Sorry, Paolo. I still do not quite get it.
Do you mean the
   *max_size = min_t(u64, ~0u, (1ull << 48) - la);
also need to be changed?

But I do not understand why this statement is used like this. My
understanding is that
for 64 bit scenario, the *max_size is calculated to guarantee la +
*max_size still falls in
the canonical address space.

And if above understanding is correct, I think it should be something
like below:
*max_size = min_t(u64, ~0u - la, (1ull << 48) - la);

The "~0u" part is simply because max_size has 32-bit size (it's an
unsigned int variable), while (1ull << 48) - la has 64-bit size.  It
protects from the overflow.

But what if value of "la" falls in between 0x and
0x? (1ull << 48) - la may result in something between
0x10001 and> 0x2, and the *max_size would be 4G - 1
in this scenario.  For instance, when "la" is 0xFFF0 (unlikely
in practice though), the *max_size we are expecting should be 15, instead
of 4G - 1.

No, it is possible to wrap a memory access from the top half of the
address space to the bottom half, so there's no limit at 0xFFF0.


Oh? So you mean it is allowed for one instruction to reside both in the 
top half of

the address space and in the bottom half?

If this is possible, I guess the code should be

*max_size = min_t(u64, ~0u, (1ull << va_bits) - la);

But I wonder, why should such scenario be allowed? :-)

Thanks
Yu






Paolo
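
For illustration, a small standalone program working through the two cases
discussed above, with the hard-coded 48 replaced by va_bits (values are only
examples):

#include <stdint.h>
#include <stdio.h>

/* *max_size = min_t(u64, ~0u, (1ull << va_bits) - la) */
static uint32_t max_size_for(uint64_t la, unsigned int va_bits)
{
	uint64_t span = (1ull << va_bits) - la;	/* wraps modulo 2^64 */

	return span > 0xffffffffull ? 0xffffffffu : (uint32_t)span;
}

int main(void)
{
	/* la just below the 48-bit canonical boundary: only 16 bytes remain. */
	printf("%u\n", max_size_for(0x0000fffffffffff0ull, 48));
	/* la in the top (negative) half: the access may wrap into the
	 * bottom half, so the limit is simply capped at ~0u (4G - 1). */
	printf("%u\n", max_size_for(0xfffffffffffffff0ull, 48));
	return 0;
}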





Re: [PATCH v1 4/4] KVM: MMU: Expose the LA57 feature to VM.

2017-08-21 Thread Yu Zhang



On 8/18/2017 8:50 PM, Paolo Bonzini wrote:

On 18/08/2017 10:28, Yu Zhang wrote:


On 8/17/2017 10:29 PM, Paolo Bonzini wrote:

On 17/08/2017 13:53, Yu Zhang wrote:

On 8/17/2017 7:57 PM, Paolo Bonzini wrote:

On 12/08/2017 15:35, Yu Zhang wrote:

index a98b88a..50107ae 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -694,7 +694,7 @@ static __always_inline int __linearize(struct
x86_emulate_ctxt *ctxt,
switch (mode) {
case X86EMUL_MODE_PROT64:
*linear = la;
-if (is_noncanonical_address(la))
+if (emul_is_noncanonical_address(la, ctxt))
goto bad;
  *max_size = min_t(u64, ~0u, (1ull << 48) - la);

Oops, you missed one here.  Probably best to use ctxt_virt_addr_bits
and
then "inline" emul_is_noncanonical_address as "get_canonical(la,
va_bits) != la".

Sorry, I just sent out the v2 patch set without noticing this reply. :-)

The emul_is_noncanonical() is defined in x86.h so that no
ctxt_virt_addr_bits needed in emulate.c, are you
suggesting to use ctx_virt_addr_bits in this file each time before
emul_is_noncanonical_address() is called?

No, only in this instance which uses "48" after the call to
emul_is_noncanonical_address.

Sorry, Paolo. I still do not quite get it.
Do you mean the
  *max_size = min_t(u64, ~0u, (1ull << 48) - la);
also need to be changed?

But I do not understand why this statement is used like this. My
understanding is that
for 64 bit scenario, the *max_size is calculated to guarantee la +
*max_size still falls in
the canonical address space.

And if above understanding is correct, I think it should be something
like below:
   *max_size = min_t(u64, ~0u - la, (1ull << 48) - la);

The "~0u" part is simply because max_size has 32-bit size (it's an
unsigned int variable), while (1ull << 48) - la has 64-bit size.  It
protects from the overflow.


Oh, right. "~0u" is only an unsigned int. Thanks for your clarification. :-)

But what if value of "la" falls in between 0x and 
0x?
(1ull << 48) - la may result in something between 0x10001 and 
0x2,

and the *max_size would be 4G - 1 in this scenario.
For instance, when "la" is 0xFFF0(unlikely in practice 
though), the *max_size

we are expecting should be 15, instead of 4G - 1.

If above understanding is correct, maybe we should change this code as 
below:
@@ -690,16 +690,21 @@ static __always_inline int __linearize(struct 
x86_emulate_ctxt *ctxt,

    ulong la;
    u32 lim;
    u16 sel;
+   u64 canonical_limit;
+   u8 va_bits;

    la = seg_base(ctxt, addr.seg) + addr.ea;
    *max_size = 0;
    switch (mode) {
    case X86EMUL_MODE_PROT64:
    *linear = la;
-   if (emul_is_noncanonical_address(la, ctxt))
+   va_bits = ctxt_virt_addr_bits(ctxt);
+   if (get_canonical(la, va_bits) != la)
    goto bad;

-   *max_size = min_t(u64, ~0u, (1ull << 48) - la);
+   canonical_limit = (la & (1 << va_bits)) ?
+ ~0ull : ((1 << va_bits) -1);
+   *max_size = min_t(u64, ~0u, canonical_limit - la + 1);

Does this sound reasonable?
BTW, I did not use min_t(u64, ~0ull - la + 1, (1 << va_bits) - la) here, 
because I still would like to
keep *max_size as an unsigned int, and my previous suggestion may cause 
the return value of

min_t be truncated.

Yu


And with LA57, may better be changed to:
   *max_size = min_t(u64, ~0u - la, (1ull << ctxt_virt_addr_bits(ctxt)) -
la);

And for the above
   if (emul_is_noncanonical_address(la, ctxt))
we may just leave it as it is.

Yes, exactly.  But since emul_is_noncanonical_address is already using
ctxt_virt_addr_bits(ctxt), it may make sense to compute
ctxt_virt_addr_bits(ctxt) once and then reuse it twice, once in
get_canonical(la, va_bits) != la and once in (1ull << va_bits) - la.

Paolo


Is this understanding correct? Or did I misunderstand your comments? :-)

Thanks
Yu

Paolo
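
One plausible shape for the helper mentioned here, as a sketch (assuming an
X86_CR4_LA57 bit definition is available): read CR4 once, derive the linear
address width from the LA57 bit, and reuse the result for both the canonical
check and the max_size computation.

static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
{
	return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
}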







Re: [PATCH v1 4/4] KVM: MMU: Expose the LA57 feature to VM.

2017-08-18 Thread Yu Zhang



On 8/17/2017 10:29 PM, Paolo Bonzini wrote:

On 17/08/2017 13:53, Yu Zhang wrote:


On 8/17/2017 7:57 PM, Paolo Bonzini wrote:

On 12/08/2017 15:35, Yu Zhang wrote:

index a98b88a..50107ae 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -694,7 +694,7 @@ static __always_inline int __linearize(struct
x86_emulate_ctxt *ctxt,
   switch (mode) {
   case X86EMUL_MODE_PROT64:
   *linear = la;
-if (is_noncanonical_address(la))
+if (emul_is_noncanonical_address(la, ctxt))
   goto bad;
 *max_size = min_t(u64, ~0u, (1ull << 48) - la);

Oops, you missed one here.  Probably best to use ctxt_virt_addr_bits and
then "inline" emul_is_noncanonical_address as "get_canonical(la,
va_bits) != la".

Sorry, I just sent out the v2 patch set without noticing this reply. :-)

The emul_is_noncanonical() is defined in x86.h so that no
ctxt_virt_addr_bits needed in emulate.c, are you
suggesting to use ctx_virt_addr_bits in this file each time before
emul_is_noncanonical_address() is called?

No, only in this instance which uses "48" after the call to
emul_is_noncanonical_address.


Sorry, Paolo. I still do not quite get it.
Do you mean the
 *max_size = min_t(u64, ~0u, (1ull << 48) - la);
also need to be changed?

But I do not understand why this statement is used like this. My 
understanding is that
for 64 bit scenario, the *max_size is calculated to guarantee la + 
*max_size still falls in

the canonical address space.

And if above understanding is correct, I think it should be something 
like below:

  *max_size = min_t(u64, ~0u - la, (1ull << 48) - la);

And with LA57, may better be changed to:
  *max_size = min_t(u64, ~0u - la, (1ull << ctxt_virt_addr_bits(ctxt)) 
- la);


And for the above
  if (emul_is_noncanonical_address(la, ctxt))
we may just leave it as it is.

Is this understanding correct? Or did I misunderstand your comments? :-)

Thanks
Yu

Paolo





Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().

2017-08-17 Thread Yu Zhang



On 8/17/2017 9:17 PM, Paolo Bonzini wrote:

On 17/08/2017 14:23, Yu Zhang wrote:


On 8/17/2017 8:29 PM, Paolo Bonzini wrote:

On 17/08/2017 21:52, Yu Zhang wrote:

diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index ac15193..3e759cf 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
   int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 struct kvm_cpuid2 *cpuid,
 struct kvm_cpuid_entry2 __user *entries);
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx,
u32 *edx);
+
+enum {
+NO_CHECK_LIMIT = 0,
+CHECK_LIMIT = 1,
+};

emulate.c should not include cpuid.h.  The argument can be simply a
bool, though.

Thanks, Paolo.
So we just use true/false in emulate.c & svm.c, is this OK?
BTW could you please


+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+   u32 *ecx, u32 *edx, int check_limit);
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
   diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fb00559..46daa37 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -28,6 +28,7 @@
 #include "x86.h"
   #include "tss.h"
+#include "cpuid.h"
 /*
* Operand types
@@ -2333,8 +2334,10 @@ static int emulator_has_longmode(struct
x86_emulate_ctxt *ctxt)
  eax = 0x80000001;
    ecx = 0;
-ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
-return edx & bit(X86_FEATURE_LM);
+if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx,
NO_CHECK_LIMIT))
+return edx & bit(X86_FEATURE_LM);
+else
+return 0;
   }
 #define GET_SMSTATE(type, smbase, offset)  \
@@ -2636,7 +2639,7 @@ static bool vendor_intel(struct
x86_emulate_ctxt *ctxt)
   u32 eax, ebx, ecx, edx;
 eax = ecx = 0;
-ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
   return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
   && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
   && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
@@ -2656,7 +2659,7 @@ static bool em_syscall_is_enabled(struct
x86_emulate_ctxt *ctxt)
  eax = 0x00000000;
    ecx = 0x00000000;
-ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
   /*
* Intel ("GenuineIntel")
* remark: Intel CPUs only support "syscall" in 64bit
@@ -3551,7 +3554,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
   /*
* Check MOVBE is set in the guest-visible CPUID leaf.
*/
-ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);

This should be NO_CHECK_LIMIT.

Otherwise okay!

Then I guess check_fxsr() should also use NO_CHECK_LIMIT('false' for a
bool argument), because it's also for eax=1?

Good point.


And what about svm_vcpu_reset()?

No, this one should be left as is, it's just writing a register and not
checking a feature.


Got it. Thanks.




I am not sure if leaf 1 is always available. And if the answer is yes, I
do not think any of these 3 places(em_movbe/check_fxsr/svm_vcpu_reset) will
need to fall back to check_cpuid_limit(),
nor do we need to check the return value of get_cpuid(). Do you agree? :-)

I think the answer is no, but you don't need to check the return value
because testing against 0 is okay (if best is NULL, get_cpuid returns 0
for eax/ebx/ecx/edx).


OK. And to return 0 for eax/ebx/ecx/edx if check_cpuid_limit() is also 
to be omitted,
I'd better refactor this patch and move the "out:" before the if 
statement. :-)


best = check_cpuid_limit(vcpu, function, index);
}

+out:
if (best) {
*eax = best->eax;
*ebx = best->ebx;
@@ -887,7 +888,6 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 
*ebx,

} else
*eax = *ebx = *ecx = *edx = 0;

-out:
trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
return entry_found;
 }

And for all get_cpuid() callers which is testing the existence of a 
feature, we do not need to
check the return value, just checking the flag in the register should be 
fine, correct?


Yu
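
A sketch of the caller pattern being agreed on here (guest_has_movbe() is a
hypothetical helper, not code from the series): because kvm_cpuid() zeroes
eax/ebx/ecx/edx when no entry is found, feature tests can simply check the
flag without looking at the return value.

static bool guest_has_movbe(struct x86_emulate_ctxt *ctxt)
{
	u32 eax = 1, ebx, ecx = 0, edx;

	/* 'false' = do not fall back to check_cpuid_limit() */
	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
	return ecx & (1u << 22);	/* CPUID.01H:ECX.MOVBE[bit 22] */
}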



Paolo


Yu


Paolo


   if (!(ecx & FFL(MOVBE)))
   return emulate_ud(ctxt);
   @@ -3865,7 +3868,7 @@ static int em_cpuid(struct x86_emulate_ctxt
*ctxt)
 eax = reg_read(ctxt, VCPU_REGS_RAX);
   ecx = reg_read(ctxt, VCPU_REGS_RCX);
-ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
   *reg_write(ctxt, VCPU_REGS_RAX) = eax;
   *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
   *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
@@ -3924,7 +3927,7 @@ static int check_fxsr(struct x86_emulate_ctxt
*ctxt)
   {
   u32 e

Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().

2017-08-17 Thread Yu Zhang



On 8/17/2017 8:23 PM, Yu Zhang wrote:



On 8/17/2017 8:29 PM, Paolo Bonzini wrote:

On 17/08/2017 21:52, Yu Zhang wrote:

diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index ac15193..3e759cf 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
  int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, 
u32 *edx);

+
+enum {
+NO_CHECK_LIMIT = 0,
+CHECK_LIMIT = 1,
+};

emulate.c should not include cpuid.h.  The argument can be simply a
bool, though.


Thanks, Paolo.
So we just use true/false in emulate.c & svm.c, is this OK?
BTW could you please
Sorry for the unfinished line. I was wondering, why can't emulate.c 
include cpuid.h?


Yu



Re: [PATCH v2 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-08-17 Thread Yu Zhang



On 8/17/2017 8:31 PM, Paolo Bonzini wrote:

On 17/08/2017 21:52, Yu Zhang wrote:

+   if (efer & EFER_LMA) {
+   u64 maxphyaddr;
+   u32 eax = 0x80000008;
+
+   if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL,
+NO_CHECK_LIMIT)) {
+   maxphyaddr = eax & 0xff;
+   rsvd = rsvd_bits(maxphyaddr, 62);
+   }

You should use 36 here if ctxt->ops->get_cpuid returns false, for
consistency with cpuid_query_maxphyaddr.


Oh, right. Thanks! :-)

Yu
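
A sketch of the check_cr_write() fragment with that suggestion folded in
(36 is the fallback used by cpuid_query_maxphyaddr() when the leaf is
absent):

	if (efer & EFER_LMA) {
		u64 maxphyaddr;
		u32 eax = 0x80000008;

		if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL,
					 NO_CHECK_LIMIT))
			maxphyaddr = eax & 0xff;
		else
			maxphyaddr = 36;
		rsvd = rsvd_bits(maxphyaddr, 62);
	}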


Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().

2017-08-17 Thread Yu Zhang



On 8/17/2017 8:29 PM, Paolo Bonzini wrote:

On 17/08/2017 21:52, Yu Zhang wrote:

diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index ac15193..3e759cf 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
  int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
  struct kvm_cpuid2 *cpuid,
  struct kvm_cpuid_entry2 __user *entries);
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+
+enum {
+   NO_CHECK_LIMIT = 0,
+   CHECK_LIMIT = 1,
+};

emulate.c should not include cpuid.h.  The argument can be simply a
bool, though.


Thanks, Paolo.
So we just use true/false in emulate.c & svm.c, is this OK?
BTW could you please


+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+  u32 *ecx, u32 *edx, int check_limit);
  
  int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
  
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c

index fb00559..46daa37 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -28,6 +28,7 @@
  
  #include "x86.h"

  #include "tss.h"
+#include "cpuid.h"
  
  /*

   * Operand types
@@ -2333,8 +2334,10 @@ static int emulator_has_longmode(struct x86_emulate_ctxt 
*ctxt)
  
  	eax = 0x80000001;

ecx = 0;
-   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
-   return edx & bit(X86_FEATURE_LM);
+   if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT))
+   return edx & bit(X86_FEATURE_LM);
+   else
+   return 0;
  }
  
  #define GET_SMSTATE(type, smbase, offset)  \

@@ -2636,7 +2639,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
u32 eax, ebx, ecx, edx;
  
  	eax = ecx = 0;

-   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
@@ -2656,7 +2659,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt 
*ctxt)
  
  	eax = 0x00000000;

ecx = 0x00000000;
-   ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+   ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
/*
 * Intel ("GenuineIntel")
 * remark: Intel CPUs only support "syscall" in 64bit
@@ -3551,7 +3554,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
/*
 * Check MOVBE is set in the guest-visible CPUID leaf.
 */
-   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);

This should be NO_CHECK_LIMIT.

Otherwise okay!


Then I guess check_fxsr() should also use NO_CHECK_LIMIT('false' for a 
bool argument),

because it's also for eax=1?

And what about svm_vcpu_reset()?

I am not sure if leaf 1 is always available. And if the answer is yes, I 
do not think any of these
3 places(em_movbe/check_fxsr/svm_vcpu_reset) will need to fall back to 
check_cpuid_limit(),

nor do we need to check the return value of get_cpuid(). Do you agree? :-)

Yu



Paolo


if (!(ecx & FFL(MOVBE)))
return emulate_ud(ctxt);
  
@@ -3865,7 +3868,7 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
  
  	eax = reg_read(ctxt, VCPU_REGS_RAX);

ecx = reg_read(ctxt, VCPU_REGS_RCX);
-   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
*reg_write(ctxt, VCPU_REGS_RAX) = eax;
*reg_write(ctxt, VCPU_REGS_RBX) = ebx;
*reg_write(ctxt, VCPU_REGS_RCX) = ecx;
@@ -3924,7 +3927,7 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
  {
u32 eax = 1, ebx, ecx = 0, edx;
  
-	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);

+   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
if (!(edx & FFL(FXSR)))
return emulate_ud(ctxt);
  
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c

index 1fa9ee5..9def4a8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1580,7 +1580,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool 
init_event)
}
init_vmcb(svm);
  
-	kvm_cpuid(vcpu, , , , );

+   kvm_cpuid(vcpu, , , , , CHECK_LIMIT);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
  
  	if (kvm_vcpu_apicv_active(vcpu) && !init_event)

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0a6cc67..8a202c4 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -151,8 +151,8 @@ TRACE_EVENT(kvm_fast_mmio,
   */
  TRACE_EVENT(kvm_cpuid,
TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
-unsigned long rcx, unsigned long rdx),
-   TP_ARGS(function, rax, rbx, rcx, rdx),
+unsigned long 

Re: [PATCH v1 4/4] KVM: MMU: Expose the LA57 feature to VM.

2017-08-17 Thread Yu Zhang



On 8/17/2017 7:57 PM, Paolo Bonzini wrote:

On 12/08/2017 15:35, Yu Zhang wrote:

index a98b88a..50107ae 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -694,7 +694,7 @@ static __always_inline int __linearize(struct 
x86_emulate_ctxt *ctxt,
switch (mode) {
case X86EMUL_MODE_PROT64:
*linear = la;
-   if (is_noncanonical_address(la))
+   if (emul_is_noncanonical_address(la, ctxt))
goto bad;
  
  		*max_size = min_t(u64, ~0u, (1ull << 48) - la);

Oops, you missed one here.  Probably best to use ctxt_virt_addr_bits and
then "inline" emul_is_noncanonical_address as "get_canonical(la,
va_bits) != la".


Sorry, I just sent out the v2 patch set without noticing this reply. :-)

The emul_is_noncanonical() is defined in x86.h so that no 
ctxt_virt_addr_bits needed in emulate.c, are you
suggesting to use ctx_virt_addr_bits in this file each time before 
emul_is_noncanonical_address() is called?


Yu


Paolo





[PATCH v2 0/5] KVM: MMU: 5 level EPT/shadow support

2017-08-17 Thread Yu Zhang
Intel's existing processors limit the maximum linear address width to
48 bits and the maximum physical address width to 46 bits. Upcoming
processors will extend the maximum linear address width to 57 bits,
and the maximum physical address width can go up to 52 bits in practice.

With a linear address width greater than 48 bits, a new IA-32e paging
mode is introduced - 5-level paging (also known as LA57). To support
VMs with this feature, the KVM MMU code needs to be extended.

To achieve this, this patchset:
1> leverages 2 QEMU parameters, +la57 and phys-bits, to expose a wider
linear address width and physical address width to the VM;
2> extends the shadow paging logic to construct 5-level shadow page
tables for VMs running in LA57 mode;
3> extends the EPT logic to construct 5-level EPT tables for VMs whose
maximum physical address width exceeds 48 bits.

Changes in v2:
- Address comments from Paolo Bonzini and Jim Mattson: add a new patch to let
  kvm_cpuid() return false when cpuid entry is not found; 
- Address comments from Paolo Bonzini: fix a typo in check_cr_write() and use
  62 as the upper limit when checking reserved bits for a physical address;
- Address comments from Paolo Bonzini: move definition of PT64_ROOT_MAX_LEVEL
  into kvm_host.h;
- Address comments from Paolo Bonzini: add checking for shadow_root_level in
  mmu_free_roots(); 
- Address comments from Paolo Bonzini: set root_level & shadow_root_level both
  to PT64_ROOT_4LEVEL for shadow ept situation.

Yu Zhang (5):
  KVM: x86: Add return value to kvm_cpuid().
  KVM: MMU: check guest CR3 reserved bits based on its physical address
width.
  KVM: MMU: Rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL.
  KVM: MMU: Add 5 level EPT & Shadow page table support.
  KVM: MMU: Expose the LA57 feature to VM.

 arch/x86/include/asm/kvm_emulate.h |  4 +--
 arch/x86/include/asm/kvm_host.h| 31 ++--
 arch/x86/include/asm/vmx.h |  1 +
 arch/x86/kvm/cpuid.c   | 39 ++---
 arch/x86/kvm/cpuid.h   |  9 +-
 arch/x86/kvm/emulate.c | 42 +--
 arch/x86/kvm/kvm_cache_regs.h  |  2 +-
 arch/x86/kvm/mmu.c | 59 --
 arch/x86/kvm/mmu.h |  6 +++-
 arch/x86/kvm/mmu_audit.c   |  4 +--
 arch/x86/kvm/svm.c |  8 +++---
 arch/x86/kvm/trace.h   | 11 ---
 arch/x86/kvm/vmx.c | 27 ++---
 arch/x86/kvm/x86.c | 21 --
 arch/x86/kvm/x86.h | 44 
 15 files changed, 205 insertions(+), 103 deletions(-)

-- 
2.5.0



[PATCH v2 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-08-17 Thread Yu Zhang
Currently, KVM uses CR3_L_MODE_RESERVED_BITS to check the
reserved bits in CR3. Yet the length of reserved bits in
guest CR3 should be based on the physical address width
exposed to the VM. This patch changes CR3 check logic to
calculate the reserved bits at runtime.
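
As a rough standalone illustration of that runtime calculation (rsvd_bits()
below mirrors the helper touched by this patch; the maxphyaddr constant is
only an example, in KVM it comes from the guest's CPUID):

#include <stdint.h>
#include <stdio.h>

/* Mask with bits s..e set, as in KVM's rsvd_bits() helper. */
static uint64_t rsvd_bits(int s, int e)
{
	if (e < s)
		return 0;
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	int maxphyaddr = 46;	/* example; CPUID 0x80000008 EAX[7:0] in KVM */

	/* Bit 63 is the PCID-invalidate bit, hence the upper limit of 62. */
	uint64_t cr3_rsvd = rsvd_bits(maxphyaddr, 62);

	printf("CR3 reserved-bit mask: 0x%016llx\n",
	       (unsigned long long)cr3_rsvd);
	return 0;
}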

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/emulate.c  | 13 +++--
 arch/x86/kvm/mmu.h  |  3 +++
 arch/x86/kvm/x86.c  |  8 
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e4862e..018300e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,7 +79,6 @@
  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 
-#define CR3_L_MODE_RESERVED_BITS 0xFF00ULL
 #define CR3_PCID_INVD   BIT_64(63)
 #define CR4_RESERVED_BITS   \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 46daa37..f3e534d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -29,6 +29,7 @@
 #include "x86.h"
 #include "tss.h"
 #include "cpuid.h"
+#include "mmu.h"
 
 /*
  * Operand types
@@ -4100,8 +4101,16 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
u64 rsvd = 0;
 
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-   if (efer & EFER_LMA)
-   rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;
+   if (efer & EFER_LMA) {
+   u64 maxphyaddr;
+   u32 eax = 0x80000008;
+
+   if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL,
+NO_CHECK_LIMIT)) {
+   maxphyaddr = eax & 0xff;
+   rsvd = rsvd_bits(maxphyaddr, 62);
+   }
+   }
 
if (new_val & rsvd)
return emulate_gp(ctxt, 0);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d7d248a..1cd0fcb 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,9 @@
 
 static inline u64 rsvd_bits(int s, int e)
 {
+   if (e < s)
+   return 0;
+
return ((1ULL << (e - s + 1)) - 1) << s;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ee99fc1..fa3041f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -813,10 +813,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
 
-   if (is_long_mode(vcpu)) {
-   if (cr3 & CR3_L_MODE_RESERVED_BITS)
-   return 1;
-   } else if (is_pae(vcpu) && is_paging(vcpu) &&
+   if (is_long_mode(vcpu) &&
+   (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62)))
+   return 1;
+   else if (is_pae(vcpu) && is_paging(vcpu) &&
   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
 
-- 
2.5.0



[PATCH v2 5/5] KVM: MMU: Expose the LA57 feature to VM.

2017-08-17 Thread Yu Zhang
This patch exposes the 5-level page table feature to the VM. At the
same time, the canonical virtual address checking is extended to
support both 48-bit and 57-bit address widths.
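
For reference, both widths involved here are reported by CPUID leaf 0x80000008:
EAX[7:0] is the physical-address width and EAX[15:8] the linear-address width
(48 without LA57, 57 with it). A small decoding sketch, with a hard-coded EAX
value standing in for the real CPUID result:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t eax = 0x00003934;	/* example: 0x39 = 57 VA bits, 0x34 = 52 PA bits */
	int phys_bits  = eax & 0xff;
	int vaddr_bits = (eax >> 8) & 0xff;

	printf("MAXPHYADDR = %d, linear address width = %d\n",
	       phys_bits, vaddr_bits);
	/* kvm_update_cpuid() below only accepts 0, 48 or 57 for vaddr_bits. */
	return 0;
}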

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h | 18 ++
 arch/x86/kvm/cpuid.c| 16 ++--
 arch/x86/kvm/emulate.c  | 12 ++--
 arch/x86/kvm/kvm_cache_regs.h   |  2 +-
 arch/x86/kvm/vmx.c  |  8 
 arch/x86/kvm/x86.c  |  7 +--
 arch/x86/kvm/x86.h  | 34 ++
 7 files changed, 62 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 34b0313..7a0f12b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,8 +85,8 @@
  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | 
X86_CR4_PCIDE \
  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
- | X86_CR4_PKE))
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
+ | X86_CR4_SMAP | X86_CR4_PKE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
@@ -1299,20 +1299,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, 
u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 }
 
-static inline u64 get_canonical(u64 la)
-{
-   return ((int64_t)la << 16) >> 16;
-}
-
-static inline bool is_noncanonical_address(u64 la)
-{
-#ifdef CONFIG_X86_64
-   return get_canonical(la) != la;
-#else
-   return false;
-#endif
-}
-
 #define TSS_IOPB_BASE_OFFSET 0x66
 #define TSS_BASE_SIZE 0x68
 #define TSS_IOPB_SIZE (65536 / 8)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 65da75d..126b726 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -126,13 +126,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
/*
-* The existing code assumes virtual address is 48-bit in the canonical
-* address checks; exit if it is ever changed.
+* The existing code assumes virtual address is 48-bit or 57-bit in the
+* canonical address checks; exit if it is ever changed.
 */
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
-   if (best && ((best->eax & 0xff00) >> 8) != 48 &&
-   ((best->eax & 0xff00) >> 8) != 0)
-   return -EINVAL;
+   if (best) {
+   int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+   if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+   return -EINVAL;
+   }
 
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -388,7 +391,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
-   F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
+   F(AVX512VBMI) | F(LA57) | F(PKU) |
+   0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
 
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f3e534d..03b462f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -696,7 +696,7 @@ static __always_inline int __linearize(struct 
x86_emulate_ctxt *ctxt,
switch (mode) {
case X86EMUL_MODE_PROT64:
*linear = la;
-   if (is_noncanonical_address(la))
+   if (emul_is_noncanonical_address(la, ctxt))
goto bad;
 
*max_size = min_t(u64, ~0u, (1ull << 48) - la);
@@ -1750,8 +1750,8 @@ static int __load_segment_descriptor(struct 
x86_emulate_ctxt *ctxt,
sizeof(base3), &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
return ret;
-   if (is_noncanonical_address(get_desc_base(&seg_desc) |
-((u64)base3 << 32)))
+   if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+   ((u64)base3 << 32), ctxt))
return emulate_gp(ctxt, 0);
}
 load:
@@ -2844,8 +2844,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
-   if (is_noncanonical_address(rcx) ||
-   is_noncanonical_address(rdx))
+   if (emul_is_noncano

[PATCH v2 4/5] KVM: MMU: Add 5 level EPT & Shadow page table support.

2017-08-17 Thread Yu Zhang
Extends the shadow paging code, so that a 5-level shadow page table
can be constructed if the VM is running in 5-level paging mode.

Also extends the EPT code, so that a 5-level EPT table can be
constructed if the VM's maximum physical address width exceeds 48
bits. Unlike the shadow logic, KVM should still use a 4-level EPT
table for a VM whose physical address width does not exceed 48 bits,
even when the VM is running in 5-level paging mode.
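
The EPT level selection described above boils down to a one-line policy; the
helper below only restates that rule for illustration and is not the in-tree
get_tdp_level() implementation:

/* Use 5-level EPT only when 4 levels cannot cover the guest's MAXPHYADDR. */
static int ept_page_walk_level(int guest_maxphyaddr)
{
	return guest_maxphyaddr > 48 ? 5 : 4;
}

With this rule a VM with, say, a 46-bit physical address width keeps a 4-level
EPT even when it runs in LA57 mode, matching the behaviour described above.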

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h | 10 +-
 arch/x86/include/asm/vmx.h  |  1 +
 arch/x86/kvm/cpuid.c|  5 +
 arch/x86/kvm/mmu.c  | 43 +++--
 arch/x86/kvm/mmu.h  |  1 +
 arch/x86/kvm/mmu_audit.c|  4 ++--
 arch/x86/kvm/svm.c  |  4 ++--
 arch/x86/kvm/vmx.c  | 19 --
 arch/x86/kvm/x86.h  | 10 ++
 9 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7f70b8a..34b0313 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -315,7 +315,7 @@ struct kvm_pio_request {
int size;
 };
 
-#define PT64_ROOT_MAX_LEVEL 4
+#define PT64_ROOT_MAX_LEVEL 5
 
 struct rsvd_bits_validate {
u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
@@ -323,9 +323,9 @@ struct rsvd_bits_validate {
 };
 
 /*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
- * mode.
+ * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
+ * and 2-level 32-bit).  The kvm_mmu structure abstracts the details of the
+ * current mmu mode.
  */
 struct kvm_mmu {
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
@@ -981,7 +981,7 @@ struct kvm_x86_ops {
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
-   int (*get_tdp_level)(void);
+   int (*get_tdp_level)(struct kvm_vcpu *vcpu);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 5f63a2e..a0fb025 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -453,6 +453,7 @@ enum vmcs_field {
 
 #define VMX_EPT_EXECUTE_ONLY_BIT   (1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT(1ull << 6)
+#define VMX_EPT_PAGE_WALK_5_BIT(1ull << 7)
 #define VMX_EPTP_UC_BIT(1ull << 8)
 #define VMX_EPTP_WB_BIT(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT   (1ull << 16)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 989ba4e..65da75d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -137,6 +137,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
+#ifdef CONFIG_X86_64
+   if (vcpu->arch.maxphyaddr > 48)
+   kvm_mmu_reset_context(vcpu);
+#endif
+
kvm_pmu_refresh(vcpu);
return 0;
 }
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cd4d2cc..c392ae7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3323,8 +3323,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
-   (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+   if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
+   (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL ||
 vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
 
@@ -3376,10 +3376,11 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+   if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
-   sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
+   sp = kvm_mmu_get_page(vcpu, 0, 0,
+   vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3420,15 +3421,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * Do we shadow a

[PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().

2017-08-17 Thread Yu Zhang
Return false in kvm_cpuid() when it fails to find the cpuid
entry. Also, this routine (and its callers) is optimized with
a new argument - check_limit, so that the check_cpuid_limit()
fallback can be avoided.
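
The control flow being introduced is essentially "look up the exact leaf,
optionally fall back to the highest basic leaf, and report whether the exact
leaf existed". A self-contained sketch of that pattern, using made-up types
and a toy table rather than the KVM structures:

#include <stdbool.h>
#include <stddef.h>

struct cpuid_entry { unsigned fn, idx, eax, ebx, ecx, edx; };

static struct cpuid_entry table[] = {
	{ 0x00000000, 0, 0x0000000d, 0, 0, 0 },	/* max basic leaf = 0xd */
	{ 0x0000000d, 0, 0, 0, 0, 0 },
};

static struct cpuid_entry *find_entry(unsigned fn, unsigned idx)
{
	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].fn == fn && table[i].idx == idx)
			return &table[i];
	return NULL;
}

/*
 * Mirrors the kvm_cpuid() change: the return value says whether the exact
 * leaf was found, and the old clamp-to-max-leaf fallback only runs when
 * check_limit is set.
 */
static bool lookup_cpuid(unsigned fn, unsigned idx, bool check_limit,
			 struct cpuid_entry **out)
{
	struct cpuid_entry *e = find_entry(fn, idx);
	bool found = (e != NULL);

	if (!e && check_limit)
		e = find_entry(table[0].eax, idx);	/* stand-in for check_cpuid_limit() */

	*out = e;
	return found;
}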

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_emulate.h |  4 ++--
 arch/x86/kvm/cpuid.c   | 18 ++
 arch/x86/kvm/cpuid.h   |  9 -
 arch/x86/kvm/emulate.c | 17 ++---
 arch/x86/kvm/svm.c |  2 +-
 arch/x86/kvm/trace.h   | 11 +++
 arch/x86/kvm/x86.c |  6 +++---
 7 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h 
b/arch/x86/include/asm/kvm_emulate.h
index fde36f1..0e51a07 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -219,8 +219,8 @@ struct x86_emulate_ops {
 struct x86_instruction_info *info,
 enum x86_intercept_stage stage);
 
-   void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+   bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, int check_limit);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
 
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59ca2ee..989ba4e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -853,15 +853,22 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct 
kvm_vcpu *vcpu,
return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
 }
 
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+  u32 *ecx, u32 *edx, int check_limit)
 {
u32 function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *best;
+   bool entry_found = true;
 
best = kvm_find_cpuid_entry(vcpu, function, index);
 
-   if (!best)
+   if (!best) {
+   entry_found = false;
+   if (!check_limit)
+   goto out;
+
best = check_cpuid_limit(vcpu, function, index);
+   }
 
if (best) {
*eax = best->eax;
@@ -870,7 +877,10 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, 
u32 *ecx, u32 *edx)
*edx = best->edx;
} else
*eax = *ebx = *ecx = *edx = 0;
-   trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx);
+
+out:
+   trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
+   return entry_found;
 }
 EXPORT_SYMBOL_GPL(kvm_cpuid);
 
@@ -883,7 +893,7 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 
eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-   kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
+   kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index ac15193..3e759cf 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
  struct kvm_cpuid2 *cpuid,
  struct kvm_cpuid_entry2 __user *entries);
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+
+enum {
+   NO_CHECK_LIMIT = 0,
+   CHECK_LIMIT = 1,
+};
+
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+  u32 *ecx, u32 *edx, int check_limit);
 
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fb00559..46daa37 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -28,6 +28,7 @@
 
 #include "x86.h"
 #include "tss.h"
+#include "cpuid.h"
 
 /*
  * Operand types
@@ -2333,8 +2334,10 @@ static int emulator_has_longmode(struct x86_emulate_ctxt 
*ctxt)
 
eax = 0x80000001;
ecx = 0;
-   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
-   return edx & bit(X86_FEATURE_LM);
+   if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT))
+   return edx & bit(X86_FEATURE_LM);
+   else
+   return 0;
 }
 
 #define GET_SMSTATE(type, smbase, offset)\
@@ -2636,7 +2639,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
u32 eax, ebx, ecx, edx;
 
eax = ecx = 0;
-   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
return ebx == X86EMUL_CPUID_VENDOR_Gen

[PATCH v2 3/5] KVM: MMU: Rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL.

2017-08-17 Thread Yu Zhang
Now that we have both 4-level and 5-level page tables in 64-bit long
mode, rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL, so that
PT64_ROOT_5LEVEL can later be used for the 5-level page table; this
helps make the code clearer.

Also define PT64_ROOT_MAX_LEVEL as 4, so that it can simply be
redefined to 5 when 5-level paging support is added.
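
One practical effect of the separate max constant is that arrays which need
one slot per paging level can be sized by it, so the later bump from 4 to 5
becomes a one-line change. A hypothetical example (the struct name is made up;
it only mimics how mmu_page_path and rsvd_bits_validate are dimensioned):

#define PT64_ROOT_4LEVEL    4
#define PT64_ROOT_MAX_LEVEL 4	/* bumped to 5 by the 5-level paging patch */

struct walk_state_example {
	void          *parent[PT64_ROOT_MAX_LEVEL];	/* one slot per level */
	unsigned int   idx[PT64_ROOT_MAX_LEVEL];
	unsigned long long rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
};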

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +++-
 arch/x86/kvm/mmu.c  | 36 ++--
 arch/x86/kvm/mmu.h  |  2 +-
 arch/x86/kvm/mmu_audit.c|  4 ++--
 arch/x86/kvm/svm.c  |  2 +-
 5 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 018300e..7f70b8a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -315,8 +315,10 @@ struct kvm_pio_request {
int size;
 };
 
+#define PT64_ROOT_MAX_LEVEL 4
+
 struct rsvd_bits_validate {
-   u64 rsvd_bits_mask[2][4];
+   u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
u64 bad_mt_xwr;
 };
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7ee21c0..cd4d2cc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2167,8 +2167,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t 
gfn,
 }
 
 struct mmu_page_path {
-   struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
-   unsigned int idx[PT64_ROOT_LEVEL];
+   struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+   unsigned int idx[PT64_ROOT_MAX_LEVEL];
 };
 
 #define for_each_sp(pvec, sp, parents, i)  \
@@ -2383,8 +2383,8 @@ static void shadow_walk_init(struct 
kvm_shadow_walk_iterator *iterator,
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;
 
-   if (iterator->level == PT64_ROOT_LEVEL &&
-   vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+   if (iterator->level == PT64_ROOT_4LEVEL &&
+   vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu.direct_map)
--iterator->level;
 
@@ -3323,8 +3323,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
-   (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
+   (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
 vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
 
@@ -3376,10 +3376,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
-   sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL);
+   sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3420,14 +3420,14 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * Do we shadow a long mode page table? If so we need to
 * write-protect the guests page table root.
 */
-   if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+   if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
 
MMU_WARN_ON(VALID_PAGE(root));
 
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
-   sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+   sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL,
  0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
@@ -3442,7 +3442,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * the shadow page table may be a PAE or a long mode page table.
 */
pm_mask = PT_PRESENT_MASK;
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
for (i = 0; i < 4; ++i) {
@@ -3475,7 +3475,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * If we shadow a 32 bit page table with a long mode page
 * table we enter this path.
 */
-   if (vcpu-

Re: [PATCH v1 1/4] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-08-15 Thread Yu Zhang



On 8/15/2017 12:40 AM, Paolo Bonzini wrote:

On 14/08/2017 18:13, Jim Mattson wrote:

 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-   if (efer & EFER_LMA)
-   rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;
+   if (efer & EFER_LMA) {
+   u64 maxphyaddr;
+   u32 eax = 0x80000008;
+
+   ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL);
+   maxphyaddr = eax * 0xff;

What if leaf 0x80000008 is not defined?

I noticed this too, but I thought it was mitigated by being under
EFER_LMA.  Unfortunately, kvm_set_efer doesn't check
guest_cpuid_has_longmode, so I guess you do have to test leaf 0x80000000
first.  Alternatively:

1) kvm_cpuid could return false if it's falling back to
check_cpuid_limit, and emulator_get_cpuid can then be changed to return bool

2) kvm_cpuid and emulator_get_cpuid could gain a new argument to disable
the check_cpuid_limit fallback.

Yu, would you like to implement the latter?


Thanks for pointing this out, Jim & Paolo. The latter choice sounds 
better to me. :-)

I'd like to implement this in a separate patch in the next version of the patch set.

Yu

Paolo
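
For completeness, "test leaf 0x80000000 first" amounts to checking the maximum
supported extended leaf before trusting 0x80000008. On the host side that can
be sketched with the compiler's cpuid helper (the 36-bit value is only an
example fallback for when the leaf is absent):

#include <stdio.h>
#include <cpuid.h>	/* GCC/Clang x86 helper */

int main(void)
{
	unsigned eax, ebx, ecx, edx;
	unsigned maxphyaddr = 36;	/* example fallback when the leaf is missing */

	/* Only trust leaf 0x80000008 if the max extended leaf covers it. */
	if (__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) &&
	    eax >= 0x80000008 &&
	    __get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		maxphyaddr = eax & 0xff;

	printf("MAXPHYADDR = %u\n", maxphyaddr);
	return 0;
}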





Re: [PATCH v1 3/4] KVM: MMU: Add 5 level EPT & Shadow page table support.

2017-08-14 Thread Yu Zhang



On 8/14/2017 11:02 PM, Paolo Bonzini wrote:

On 14/08/2017 16:32, Yu Zhang wrote:


On 8/14/2017 10:13 PM, Paolo Bonzini wrote:

On 14/08/2017 13:37, Yu Zhang wrote:

Thanks a lot for your comments, Paolo. :-)


On 8/14/2017 3:31 PM, Paolo Bonzini wrote:

On 12/08/2017 15:35, Yu Zhang wrote:

struct rsvd_bits_validate {
-u64 rsvd_bits_mask[2][4];
+u64 rsvd_bits_mask[2][5];
u64 bad_mt_xwr;
};

Can you change this 4 to PT64_ROOT_MAX_LEVEL in patch 2?

Well, I had tried, but failed to find a neat approach to do so. The
difficulty I have met is that PT64_ROOT_MAX_LEVEL is defined together
with PT64_ROOT_4LEVEL/PT32E_ROOT_LEVEL/PT32_ROOT_LEVEL in mmu.h, yet
the rsvd_bits_validate structure is defined in kvm_host.h, which are
included in quite a lot .c files that do not include mmu.h or include
the mmu.h after kvm_host.h.

I guess that's the reason why the magic number 4 instead of
PT64_ROOT_4LEVEL is used in the current definition of
rsvd_bits_validate. :-)

Yes, you're right.  I think the solution is to define
PT64_ROOT_MAX_LEVEL in kvm_host.h.

Thanks, Paolo. How about we also move the definition of PT64_ROOT_4LEVEL/
PT32E_ROOT_LEVEL/PT32_ROOT_LEVEL from mmu.h to kvm_host.h? Then we
can define PT64_ROOT_MAX_LEVEL as PT64_ROOT_4LEVEL instead of 4 in
kvm_host.h.

No, I think those are best left in mmu.h.  They are only used in mmu
files, except for two occurrences in svm.c.

kvm_host.h would have PT64_ROOT_MAX_LEVEL just because it is slightly
better than "4" or "5".


OK. I can define PT64_ROOT_MAX_LEVEL in kvm_host.h as 4 in patch 2, and
change it to 5 in patch 3. :-)

Thanks
Yu


Re: [PATCH v1 3/4] KVM: MMU: Add 5 level EPT & Shadow page table support.

2017-08-14 Thread Yu Zhang



On 8/14/2017 10:13 PM, Paolo Bonzini wrote:

On 14/08/2017 13:37, Yu Zhang wrote:

Thanks a lot for your comments, Paolo. :-)


On 8/14/2017 3:31 PM, Paolo Bonzini wrote:

On 12/08/2017 15:35, Yu Zhang wrote:

   struct rsvd_bits_validate {
-u64 rsvd_bits_mask[2][4];
+u64 rsvd_bits_mask[2][5];
   u64 bad_mt_xwr;
   };

Can you change this 4 to PT64_ROOT_MAX_LEVEL in patch 2?

Well, I had tried, but failed to find a neat approach to do so. The
difficulty I have met is that PT64_ROOT_MAX_LEVEL is defined together
with PT64_ROOT_4LEVEL/PT32E_ROOT_LEVEL/PT32_ROOT_LEVEL in mmu.h, yet
the rsvd_bits_validate structure is defined in kvm_host.h, which are
included in quite a lot .c files that do not include mmu.h or include
the mmu.h after kvm_host.h.

I guess that's the reason why the magic number 4 instead of
PT64_ROOT_4LEVEL is used in the current definition of rsvd_bits_validate. :-)

Yes, you're right.  I think the solution is to define
PT64_ROOT_MAX_LEVEL in kvm_host.h.


Thanks, Paolo. How about we also move the definition of PT64_ROOT_4LEVEL/
PT32E_ROOT_LEVEL/PT32_ROOT_LEVEL from mmu.h to kvm_host.h? Then we
can define PT64_ROOT_MAX_LEVEL as PT64_ROOT_4LEVEL instead of 4 in 
kvm_host.h.



@@ -,7 +4457,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu
*vcpu, bool execonly,
 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
   -context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
 context->nx = true;
   context->ept_ad = accessed_dirty;

Below, there is:

  context->root_level = context->shadow_root_level;

this should be forced to PT64_ROOT_4LEVEL until there is support for
nested EPT 5-level page tables.

So the context->shadow_root_level could be 5 or 4, and
context->root_level is always 4?

That was my idea, but setting both to 4 should be fine too as you
suggest below.


My understanding is that shadow ept level should be determined by
the width of ngpa, and that if L1 guest is not exposed with EPT5
feature, it shall only use 4 level ept for L2 guest, and the shadow
ept does not need a 5 level one. Is this understanding correct? And
how about we set both values to PT64_ROOT_4LEVEL for now?
Besides, if we wanna support nested EPT5, what do you think we need to
do besides exposing the EPT5 feature to L1 guest?

Nothing else, I think.


Thanks. I'll try to keep both values fixed to PT64_ROOT_4LEVEL then. :-)
For nested EPT5, we can enable it later(should be a quite simple patch, 
but need to
be verified in our simics environment, which I am not sure if nested 
scenario works).


B.R.
Yu


Re: [PATCH v1 1/4] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-08-14 Thread Yu Zhang



On 8/14/2017 3:36 PM, Paolo Bonzini wrote:

On 12/08/2017 15:35, Yu Zhang wrote:

+   ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL);
+   maxphyaddr = eax * 0xff;

This is "&", not "*".  You can also use rsvd_bits here.


Oops. Sorry for the typo. :-)


+   rsvd = (~((1UL << maxphyaddr) - 1)) &
+   ~CR3_PCID_INVD;
+   }
  
  		if (new_val & rsvd)

return emulate_gp(ctxt, 0);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e40a779..d9100c4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -813,10 +813,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
  
-	if (is_long_mode(vcpu)) {

-   if (cr3 & CR3_L_MODE_RESERVED_BITS)
-   return 1;
-   } else if (is_pae(vcpu) && is_paging(vcpu) &&
+   if (is_long_mode(vcpu) &&
+   (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
+   return 1;

62 is a little better, since 63 is the PCID invalidate bit.


Got it. Both kvm_set_cr3() and check_cr_write() should use 
rsvd_bits(maxphyaddr, 62) .


Thanks
Yu


Paolo


+   else if (is_pae(vcpu) && is_paging(vcpu) &&
   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
  






Re: [PATCH v1 3/4] KVM: MMU: Add 5 level EPT & Shadow page table support.

2017-08-14 Thread Yu Zhang

Thanks a lot for your comments, Paolo. :-)


On 8/14/2017 3:31 PM, Paolo Bonzini wrote:

On 12/08/2017 15:35, Yu Zhang wrote:

  struct rsvd_bits_validate {
-   u64 rsvd_bits_mask[2][4];
+   u64 rsvd_bits_mask[2][5];
u64 bad_mt_xwr;
  };


Can you change this 4 to PT64_ROOT_MAX_LEVEL in patch 2?


Well, I had tried, but failed to find a neat approach to do so.
The difficulty I have met is that PT64_ROOT_MAX_LEVEL is defined together
with PT64_ROOT_4LEVEL/PT32E_ROOT_LEVEL/PT32_ROOT_LEVEL in mmu.h, yet the
rsvd_bits_validate structure is defined in kvm_host.h, which is included
in quite a lot of .c files that do not include mmu.h, or include mmu.h
after kvm_host.h.


I guess that's the reason why the magic number 4 instead of PT64_ROOT_4LEVEL
is used in the current definition of rsvd_bits_validate. :-)




-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
-   (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
-vcpu->arch.mmu.direct_map)) {
+   if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL ||
+   vcpu->arch.mmu.direct_map) {
hpa_t root = vcpu->arch.mmu.root_hpa;

You should keep the check on shadow_root_level (changing it to >= of
course), otherwise you break the case where EPT is disabled, paging is
disabled (so vcpu->arch.mmu.direct_map is true) and the host kernel is
32-bit.  In that case shadow pages use PAE format, and entering this
branch is incorrect.


Oh, right. Thanks!




@@ -,7 +4457,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool 
execonly,
  
  	MMU_WARN_ON(VALID_PAGE(context->root_hpa));
  
-	context->shadow_root_level = kvm_x86_ops->get_tdp_level();

+   context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
  
  	context->nx = true;

context->ept_ad = accessed_dirty;

Below, there is:

 context->root_level = context->shadow_root_level;

this should be forced to PT64_ROOT_4LEVEL until there is support for
nested EPT 5-level page tables.


So the context->shadow_root_level could be 5 or 4, and 
context->root_level is always 4?


My understanding is that shadow ept level should be determined by the 
width of ngpa,
and that if L1 guest is not exposed with EPT5 feature, it shall only use 
4 level ept for L2
guest, and the shadow ept does not need a 5 level one. Is this 
understanding correct?

And how about we set both values to PT64_ROOT_4LEVEL for now?

Besides, if we wanna support nested EPT5, what do you think we need to
do besides exposing the EPT5 feature to the L1 guest?

Thanks
Yu



Thanks,

Paolo





[PATCH v1 1/4] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

2017-08-12 Thread Yu Zhang
Currently, KVM uses CR3_L_MODE_RESERVED_BITS to check the
reserved bits in CR3. Yet the length of reserved bits in
guest CR3 should be based on the physical address width
exposed to the VM. This patch changes CR3 check logic to
calculate the reserved bits at runtime.

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/emulate.c  | 12 ++--
 arch/x86/kvm/x86.c  |  8 
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e4862e..018300e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,7 +79,6 @@
  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 
-#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
 #define CR3_PCID_INVD   BIT_64(63)
 #define CR4_RESERVED_BITS   \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fb00559..a98b88a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4097,8 +4097,16 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
u64 rsvd = 0;
 
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-   if (efer & EFER_LMA)
-   rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;
+   if (efer & EFER_LMA) {
+   u64 maxphyaddr;
+   u32 eax = 0x80000008;
+
+   ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL);
+   maxphyaddr = eax * 0xff;
+
+   rsvd = (~((1UL << maxphyaddr) - 1)) &
+   ~CR3_PCID_INVD;
+   }
 
if (new_val & rsvd)
return emulate_gp(ctxt, 0);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e40a779..d9100c4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -813,10 +813,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
 
-   if (is_long_mode(vcpu)) {
-   if (cr3 & CR3_L_MODE_RESERVED_BITS)
-   return 1;
-   } else if (is_pae(vcpu) && is_paging(vcpu) &&
+   if (is_long_mode(vcpu) &&
+   (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
+   return 1;
+   else if (is_pae(vcpu) && is_paging(vcpu) &&
   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
 
-- 
2.5.0



[PATCH v1 0/4] KVM: MMU: 5 level EPT/shadow support

2017-08-12 Thread Yu Zhang
Intel's existing processors limit the maximum linear address width to
48 bits, and the maximum physical address width to 46 bits. Upcoming
processors will extend the maximum linear address width to 57 bits, and
the maximum physical address width can go up to 52 bits in practice.

With a linear address width greater than 48 bits, a new paging mode is
introduced in IA-32e - 5-level paging (also known as LA57). To support
VMs with this feature, the KVM MMU code needs to be extended.

To achieve this, this patchset:
1> leverages 2 qemu parameters, +la57 and phys-bits, to expose a wider
linear address width and physical address width to the VM;
2> extends the shadow paging logic to construct 5-level shadow page
tables for VMs running in LA57 mode;
3> extends the EPT logic to construct 5-level EPT tables for VMs whose
maximum physical address width exceeds 48 bits.

Yu Zhang (4):
  KVM: MMU: check guest CR3 reserved bits based on its physical address
width.
  KVM: MMU: Rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL.
  KVM: MMU: Add 5 level EPT & Shadow page table support.
  KVM: MMU: Expose the LA57 feature to VM.

 arch/x86/include/asm/kvm_host.h | 29 +
 arch/x86/include/asm/vmx.h  |  1 +
 arch/x86/kvm/cpuid.c| 21 ++-
 arch/x86/kvm/emulate.c  | 24 +++--
 arch/x86/kvm/kvm_cache_regs.h   |  2 +-
 arch/x86/kvm/mmu.c  | 58 +
 arch/x86/kvm/mmu.h  |  8 +-
 arch/x86/kvm/mmu_audit.c|  4 +--
 arch/x86/kvm/svm.c  |  6 ++---
 arch/x86/kvm/vmx.c  | 27 ---
 arch/x86/kvm/x86.c  | 15 ++-
 arch/x86/kvm/x86.h  | 44 +++
 12 files changed, 158 insertions(+), 81 deletions(-)

-- 
2.5.0



[PATCH v1 4/4] KVM: MMU: Expose the LA57 feature to VM.

2017-08-12 Thread Yu Zhang
This patch exposes the 5-level page table feature to the VM. At the
same time, the canonical virtual address checking is extended to
support both 48-bit and 57-bit address widths.
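
(For illustration, the generalized canonical check presumably boils down to
helpers along these lines in x86.h - the exact names, and in particular
vcpu_virt_addr_bits() returning 57 when CR4.LA57 is set and 48 otherwise,
are an assumption here, not a quote of the patch:)

static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
{
	return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
}

static inline u64 get_canonical(u64 la, u8 vaddr_bits)
{
	/* sign-extend from bit (vaddr_bits - 1) */
	return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
}

static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
	return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
#else
	return false;
#endif
}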

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h | 18 ++
 arch/x86/kvm/cpuid.c| 16 ++--
 arch/x86/kvm/emulate.c  | 12 ++--
 arch/x86/kvm/kvm_cache_regs.h   |  2 +-
 arch/x86/kvm/vmx.c  |  8 
 arch/x86/kvm/x86.c  |  7 +--
 arch/x86/kvm/x86.h  | 34 ++
 7 files changed, 62 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7e98a75..4bc7f11 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,8 +85,8 @@
  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | 
X86_CR4_PCIDE \
  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
- | X86_CR4_PKE))
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
+ | X86_CR4_SMAP | X86_CR4_PKE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
@@ -1297,20 +1297,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, 
u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 }
 
-static inline u64 get_canonical(u64 la)
-{
-   return ((int64_t)la << 16) >> 16;
-}
-
-static inline bool is_noncanonical_address(u64 la)
-{
-#ifdef CONFIG_X86_64
-   return get_canonical(la) != la;
-#else
-   return false;
-#endif
-}
-
 #define TSS_IOPB_BASE_OFFSET 0x66
 #define TSS_BASE_SIZE 0x68
 #define TSS_IOPB_SIZE (65536 / 8)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index aceacf8..2161b33 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -126,13 +126,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
/*
-* The existing code assumes virtual address is 48-bit in the canonical
-* address checks; exit if it is ever changed.
+* The existing code assumes virtual address is 48-bit or 57-bit in the
+* canonical address checks; exit if it is ever changed.
 */
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
-   if (best && ((best->eax & 0xff00) >> 8) != 48 &&
-   ((best->eax & 0xff00) >> 8) != 0)
-   return -EINVAL;
+   if (best) {
+   int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+   if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+   return -EINVAL;
+   }
 
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -388,7 +391,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
-   F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
+   F(AVX512VBMI) | F(LA57) | F(PKU) |
+   0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
 
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a98b88a..50107ae 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -694,7 +694,7 @@ static __always_inline int __linearize(struct 
x86_emulate_ctxt *ctxt,
switch (mode) {
case X86EMUL_MODE_PROT64:
*linear = la;
-   if (is_noncanonical_address(la))
+   if (emul_is_noncanonical_address(la, ctxt))
goto bad;
 
*max_size = min_t(u64, ~0u, (1ull << 48) - la);
@@ -1748,8 +1748,8 @@ static int __load_segment_descriptor(struct 
x86_emulate_ctxt *ctxt,
sizeof(base3), &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
return ret;
-   if (is_noncanonical_address(get_desc_base(&seg_desc) |
-((u64)base3 << 32)))
+   if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+   ((u64)base3 << 32), ctxt))
return emulate_gp(ctxt, 0);
}
 load:
@@ -2840,8 +2840,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
-   if (is_noncanonical_address(rcx) ||
-   is_noncanonical_address(rdx))
+   if (emul_is_noncano

[PATCH v1 2/4] KVM: MMU: Rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL.

2017-08-12 Thread Yu Zhang
Now that we have both 4-level and 5-level page tables in 64-bit long
mode, rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL, so that
PT64_ROOT_5LEVEL can be used for the 5-level page table later; this
helps make the code clearer.

Also define PT64_ROOT_MAX_LEVEL as PT64_ROOT_4LEVEL, so that it can
simply be redefined to PT64_ROOT_5LEVEL once 5-level paging support is
added.
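
(Not part of the patch text, just spelling out the resulting mmu.h
definitions:)

	#define PT64_ROOT_4LEVEL	4
	#define PT32E_ROOT_LEVEL	3
	#define PT32_ROOT_LEVEL		2

	/* bumped to PT64_ROOT_5LEVEL once 5-level paging support lands */
	#define PT64_ROOT_MAX_LEVEL	PT64_ROOT_4LEVEL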

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/kvm/mmu.c   | 36 ++--
 arch/x86/kvm/mmu.h   |  4 +++-
 arch/x86/kvm/mmu_audit.c |  4 ++--
 arch/x86/kvm/svm.c   |  2 +-
 4 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7ee21c0..cd4d2cc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2167,8 +2167,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t 
gfn,
 }
 
 struct mmu_page_path {
-   struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
-   unsigned int idx[PT64_ROOT_LEVEL];
+   struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+   unsigned int idx[PT64_ROOT_MAX_LEVEL];
 };
 
 #define for_each_sp(pvec, sp, parents, i)  \
@@ -2383,8 +2383,8 @@ static void shadow_walk_init(struct 
kvm_shadow_walk_iterator *iterator,
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;
 
-   if (iterator->level == PT64_ROOT_LEVEL &&
-   vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+   if (iterator->level == PT64_ROOT_4LEVEL &&
+   vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu.direct_map)
--iterator->level;
 
@@ -3323,8 +3323,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
-   (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
+   (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
 vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
 
@@ -3376,10 +3376,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
-   sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL);
+   sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3420,14 +3420,14 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * Do we shadow a long mode page table? If so we need to
 * write-protect the guests page table root.
 */
-   if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+   if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
 
MMU_WARN_ON(VALID_PAGE(root));
 
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
-   sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+   sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL,
  0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
@@ -3442,7 +3442,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * the shadow page table may be a PAE or a long mode page table.
 */
pm_mask = PT_PRESENT_MASK;
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
for (i = 0; i < 4; ++i) {
@@ -3475,7 +3475,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * If we shadow a 32 bit page table with a long mode page
 * table we enter this path.
 */
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
if (vcpu->arch.mmu.lm_root == NULL) {
/*
 * The additional page necessary for this is only
@@ -3520,7 +3520,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
-   if (vcpu->arch.mmu.root_

[PATCH v1 3/4] KVM: MMU: Add 5 level EPT & Shadow page table support.

2017-08-12 Thread Yu Zhang
Extend the shadow paging code, so that a 5-level shadow page table
can be constructed if the VM is running in 5-level paging mode.

Also extend the EPT code, so that a 5-level EPT table can be
constructed if the VM's maxphyaddr exceeds 48 bits. Unlike the shadow
logic, KVM should still use a 4-level EPT table for a VM whose
physical address width does not exceed 48 bits, even when the VM is
running in 5-level paging mode.
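
(A sketch of the resulting level selection on the VMX side - the function
and helper names, cpu_has_vmx_ept_5levels() and cpuid_maxphyaddr(), are
assumptions here; the point is that the EPT walk length follows the guest's
physical address width rather than its paging mode:)

static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
{
	/* Use 5-level EPT only if guest physical addresses can exceed 48 bits. */
	if (cpu_has_vmx_ept_5levels() && cpuid_maxphyaddr(vcpu) > 48)
		return 5;

	return 4;
}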

Signed-off-by: Yu Zhang <yu.c.zh...@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h | 10 +-
 arch/x86/include/asm/vmx.h  |  1 +
 arch/x86/kvm/cpuid.c|  5 +
 arch/x86/kvm/mmu.c  | 42 +++--
 arch/x86/kvm/mmu.h  |  6 +-
 arch/x86/kvm/mmu_audit.c|  4 ++--
 arch/x86/kvm/svm.c  |  4 ++--
 arch/x86/kvm/vmx.c  | 19 +--
 arch/x86/kvm/x86.h  | 10 ++
 9 files changed, 71 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 018300e..7e98a75 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -316,14 +316,14 @@ struct kvm_pio_request {
 };
 
 struct rsvd_bits_validate {
-   u64 rsvd_bits_mask[2][4];
+   u64 rsvd_bits_mask[2][5];
u64 bad_mt_xwr;
 };
 
 /*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
- * mode.
+ * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
+ * and 2-level 32-bit).  The kvm_mmu structure abstracts the details of the
+ * current mmu mode.
  */
 struct kvm_mmu {
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
@@ -979,7 +979,7 @@ struct kvm_x86_ops {
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
-   int (*get_tdp_level)(void);
+   int (*get_tdp_level)(struct kvm_vcpu *vcpu);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 5f63a2e..a0fb025 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -453,6 +453,7 @@ enum vmcs_field {
 
 #define VMX_EPT_EXECUTE_ONLY_BIT   (1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT(1ull << 6)
+#define VMX_EPT_PAGE_WALK_5_BIT(1ull << 7)
 #define VMX_EPTP_UC_BIT(1ull << 8)
 #define VMX_EPTP_WB_BIT(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT   (1ull << 16)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59ca2ee..aceacf8 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -137,6 +137,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
+#ifdef CONFIG_X86_64
+   if (vcpu->arch.maxphyaddr > 48)
+   kvm_mmu_reset_context(vcpu);
+#endif
+
kvm_pmu_refresh(vcpu);
return 0;
 }
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cd4d2cc..298d840 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3323,9 +3323,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
-   (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
-vcpu->arch.mmu.direct_map)) {
+   if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL ||
+   vcpu->arch.mmu.direct_map) {
hpa_t root = vcpu->arch.mmu.root_hpa;
 
spin_lock(&vcpu->kvm->mmu_lock);
@@ -3376,10 +3375,11 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;
 
-   if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+   if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
-   sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
+   sp = kvm_mmu_get_page(vcpu, 0, 0,
+   vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3420,15 +3420,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 * Do we shadow a long mode page table? If so we need to
 * write-protect t
