Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-06-18 Thread Gleb Natapov
On Tue, Jun 18, 2013 at 08:51:25PM +0800, Xiao Guangrong wrote:
> On 06/18/2013 06:57 PM, Gleb Natapov wrote:
> > On Mon, Jun 17, 2013 at 08:11:03PM +0800, Xiao Guangrong wrote:
> >> On 06/11/2013 07:32 PM, Gleb Natapov wrote:
> >>> On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote:
>  On 05/19/2013 12:52 PM, Jun Nakajima wrote:
> > From: Nadav Har'El 
> >
> > This is the first patch in a series which adds nested EPT support to 
> > KVM's
> > nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 
> > can use
> > EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 
> > guest
> > to set its own cr3 and take its own page faults without either of L0 or 
> > L1
> > getting involved. This often significantly improves L2's performance 
> > over the
> > previous two alternatives (shadow page tables over EPT, and shadow page
> > tables over shadow page tables).
> >
> > This patch adds EPT support to paging_tmpl.h.
> >
> > paging_tmpl.h contains the code for reading and writing page tables. 
> > The code
> > for 32-bit and 64-bit tables is very similar, but not identical, so
> > paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and 
> > once
> > with PTTYPE=64, and this generates the two sets of similar functions.
> >
> > There are subtle but important differences between the format of EPT 
> > tables
> > and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> > third set of functions to read the guest EPT table and to write the 
> > shadow
> > EPT table.
> >
> > So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions 
> > (prefixed
> > with "EPT") which correctly read and write EPT tables.
> >
> > Signed-off-by: Nadav Har'El 
> > Signed-off-by: Jun Nakajima 
> > Signed-off-by: Xinhao Xu 
> > ---
> >  arch/x86/kvm/mmu.c |  5 +
> >  arch/x86/kvm/paging_tmpl.h | 43 
> > +--
> >  2 files changed, 46 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 117233f..6c1670f 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu 
> > *mmu, unsigned level, unsigned gp
> > return mmu->last_pte_bitmap & (1 << index);
> >  }
> >
> > +#define PTTYPE_EPT 18 /* arbitrary */
> > +#define PTTYPE PTTYPE_EPT
> > +#include "paging_tmpl.h"
> > +#undef PTTYPE
> > +
> >  #define PTTYPE 64
> >  #include "paging_tmpl.h"
> >  #undef PTTYPE
> > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> > index df34d4a..4c45654 100644
> > --- a/arch/x86/kvm/paging_tmpl.h
> > +++ b/arch/x86/kvm/paging_tmpl.h
> > @@ -50,6 +50,22 @@
> > #define PT_LEVEL_BITS PT32_LEVEL_BITS
> > #define PT_MAX_FULL_LEVELS 2
> > #define CMPXCHG cmpxchg
> > +#elif PTTYPE == PTTYPE_EPT
> > +   #define pt_element_t u64
> > +   #define guest_walker guest_walkerEPT
> > +   #define FNAME(name) EPT_##name
> > +   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> > +   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> > +   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> > +   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> > +   #define PT_LEVEL_BITS PT64_LEVEL_BITS
> > +   #ifdef CONFIG_X86_64
> > +   #define PT_MAX_FULL_LEVELS 4
> > +   #define CMPXCHG cmpxchg
> > +   #else
> > +   #define CMPXCHG cmpxchg64
> 
>  CMPXCHG is only used in FNAME(cmpxchg_gpte), but you commented it later.
>  Do we really need it?
> 
> > +   #define PT_MAX_FULL_LEVELS 2
> 
>  And the SDM says:
> 
>  "It uses a page-walk length of 4, meaning that at most 4 EPT 
>  paging-structure
>  entries are accessed to translate a guest-physical address.", Is My SDM 
>  obsolete?
>  Which kind of process supports page-walk length = 2?
> 
>  It seems your patch is not able to handle the case that the guest uses 
>  walk-length = 2
>  which is running on the host with walk-length = 4.
>  (please refer to how to handle sp->role.quadrant in 
>  FNAME(get_level1_sp_gpa) in
>  the current code.)
> 
> >>> But since EPT always has 4 levels on all existing cpus it is not an issue 
> >>> and the only case
> >>> that we should worry about is guest walk-length == host walk-length == 4, 
> >>> or have I
> >>
> >> Yes. I totally agree with you, but...
> >>
> >>> misunderstood what you mean here?
> >>
> >> What confused me is that this patch defines "#define PT_MAX_FULL_LEVELS 
> >> 2", so i asked the
> >> question

Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-06-18 Thread Xiao Guangrong
On 06/18/2013 06:57 PM, Gleb Natapov wrote:
> On Mon, Jun 17, 2013 at 08:11:03PM +0800, Xiao Guangrong wrote:
>> On 06/11/2013 07:32 PM, Gleb Natapov wrote:
>>> On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote:
 On 05/19/2013 12:52 PM, Jun Nakajima wrote:
> From: Nadav Har'El 
>
> This is the first patch in a series which adds nested EPT support to KVM's
> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can 
> use
> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 
> guest
> to set its own cr3 and take its own page faults without either of L0 or L1
> getting involved. This often significantly improves L2's performance over 
> the
> previous two alternatives (shadow page tables over EPT, and shadow page
> tables over shadow page tables).
>
> This patch adds EPT support to paging_tmpl.h.
>
> paging_tmpl.h contains the code for reading and writing page tables. The 
> code
> for 32-bit and 64-bit tables is very similar, but not identical, so
> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> with PTTYPE=64, and this generates the two sets of similar functions.
>
> There are subtle but important differences between the format of EPT 
> tables
> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> third set of functions to read the guest EPT table and to write the shadow
> EPT table.
>
> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions 
> (prefixed
> with "EPT") which correctly read and write EPT tables.
>
> Signed-off-by: Nadav Har'El 
> Signed-off-by: Jun Nakajima 
> Signed-off-by: Xinhao Xu 
> ---
>  arch/x86/kvm/mmu.c |  5 +
>  arch/x86/kvm/paging_tmpl.h | 43 
> +--
>  2 files changed, 46 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 117233f..6c1670f 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu 
> *mmu, unsigned level, unsigned gp
>   return mmu->last_pte_bitmap & (1 << index);
>  }
>
> +#define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE PTTYPE_EPT
> +#include "paging_tmpl.h"
> +#undef PTTYPE
> +
>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index df34d4a..4c45654 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -50,6 +50,22 @@
>   #define PT_LEVEL_BITS PT32_LEVEL_BITS
>   #define PT_MAX_FULL_LEVELS 2
>   #define CMPXCHG cmpxchg
> +#elif PTTYPE == PTTYPE_EPT
> + #define pt_element_t u64
> + #define guest_walker guest_walkerEPT
> + #define FNAME(name) EPT_##name
> + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> + #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> + #define PT_LEVEL_BITS PT64_LEVEL_BITS
> + #ifdef CONFIG_X86_64
> + #define PT_MAX_FULL_LEVELS 4
> + #define CMPXCHG cmpxchg
> + #else
> + #define CMPXCHG cmpxchg64

 CMPXCHG is only used in FNAME(cmpxchg_gpte), but you commented it later.
 Do we really need it?

> + #define PT_MAX_FULL_LEVELS 2

 And the SDM says:

 "It uses a page-walk length of 4, meaning that at most 4 EPT 
 paging-structure
 entries are accessed to translate a guest-physical address.", Is My SDM 
 obsolete?
 Which kind of process supports page-walk length = 2?

 It seems your patch is not able to handle the case that the guest uses 
 walk-length = 2
 which is running on the host with walk-length = 4.
 (please refer to how to handle sp->role.quadrant in 
 FNAME(get_level1_sp_gpa) in
 the current code.)

>>> But since EPT always has 4 levels on all existing cpus it is not an issue 
>>> and the only case
>>> that we should worry about is guest walk-length == host walk-length == 4, 
>>> or have I
>>
>> Yes. I totally agree with you, but...
>>
>>> misunderstood what you mean here?
>>
>> What confused me is that this patch defines "#define PT_MAX_FULL_LEVELS 2", 
>> so i asked the
>> question: "Which kind of process supports page-walk length = 2".
>> Sorry, there is a typo in my original comments. "process" should be 
>> "processor" or "CPU".
>>
> That is how I understood it, but then the discussion moved to dropping
> of nEPT support on 32-bit host. What's the connection? Even on 32bit

If EPT supports "walk-level = 2" on 32bit host (maybe it is not true), i 
thought dropping
32bit support to reduce the complexity is worthwhile, 

Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-06-18 Thread Gleb Natapov
On Mon, Jun 17, 2013 at 08:11:03PM +0800, Xiao Guangrong wrote:
> On 06/11/2013 07:32 PM, Gleb Natapov wrote:
> > On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote:
> >> On 05/19/2013 12:52 PM, Jun Nakajima wrote:
> >>> From: Nadav Har'El 
> >>>
> >>> This is the first patch in a series which adds nested EPT support to KVM's
> >>> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can 
> >>> use
> >>> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 
> >>> guest
> >>> to set its own cr3 and take its own page faults without either of L0 or L1
> >>> getting involved. This often significantly improves L2's performance over 
> >>> the
> >>> previous two alternatives (shadow page tables over EPT, and shadow page
> >>> tables over shadow page tables).
> >>>
> >>> This patch adds EPT support to paging_tmpl.h.
> >>>
> >>> paging_tmpl.h contains the code for reading and writing page tables. The 
> >>> code
> >>> for 32-bit and 64-bit tables is very similar, but not identical, so
> >>> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> >>> with PTTYPE=64, and this generates the two sets of similar functions.
> >>>
> >>> There are subtle but important differences between the format of EPT 
> >>> tables
> >>> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> >>> third set of functions to read the guest EPT table and to write the shadow
> >>> EPT table.
> >>>
> >>> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions 
> >>> (prefixed
> >>> with "EPT") which correctly read and write EPT tables.
> >>>
> >>> Signed-off-by: Nadav Har'El 
> >>> Signed-off-by: Jun Nakajima 
> >>> Signed-off-by: Xinhao Xu 
> >>> ---
> >>>  arch/x86/kvm/mmu.c |  5 +
> >>>  arch/x86/kvm/paging_tmpl.h | 43 
> >>> +--
> >>>  2 files changed, 46 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >>> index 117233f..6c1670f 100644
> >>> --- a/arch/x86/kvm/mmu.c
> >>> +++ b/arch/x86/kvm/mmu.c
> >>> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu 
> >>> *mmu, unsigned level, unsigned gp
> >>>   return mmu->last_pte_bitmap & (1 << index);
> >>>  }
> >>>
> >>> +#define PTTYPE_EPT 18 /* arbitrary */
> >>> +#define PTTYPE PTTYPE_EPT
> >>> +#include "paging_tmpl.h"
> >>> +#undef PTTYPE
> >>> +
> >>>  #define PTTYPE 64
> >>>  #include "paging_tmpl.h"
> >>>  #undef PTTYPE
> >>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> >>> index df34d4a..4c45654 100644
> >>> --- a/arch/x86/kvm/paging_tmpl.h
> >>> +++ b/arch/x86/kvm/paging_tmpl.h
> >>> @@ -50,6 +50,22 @@
> >>>   #define PT_LEVEL_BITS PT32_LEVEL_BITS
> >>>   #define PT_MAX_FULL_LEVELS 2
> >>>   #define CMPXCHG cmpxchg
> >>> +#elif PTTYPE == PTTYPE_EPT
> >>> + #define pt_element_t u64
> >>> + #define guest_walker guest_walkerEPT
> >>> + #define FNAME(name) EPT_##name
> >>> + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> >>> + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> >>> + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> >>> + #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> >>> + #define PT_LEVEL_BITS PT64_LEVEL_BITS
> >>> + #ifdef CONFIG_X86_64
> >>> + #define PT_MAX_FULL_LEVELS 4
> >>> + #define CMPXCHG cmpxchg
> >>> + #else
> >>> + #define CMPXCHG cmpxchg64
> >>
> >> CMPXCHG is only used in FNAME(cmpxchg_gpte), but you commented it later.
> >> Do we really need it?
> >>
> >>> + #define PT_MAX_FULL_LEVELS 2
> >>
> >> And the SDM says:
> >>
> >> "It uses a page-walk length of 4, meaning that at most 4 EPT 
> >> paging-structure
> >> entries are accessed to translate a guest-physical address.", Is My SDM 
> >> obsolete?
> >> Which kind of process supports page-walk length = 2?
> >>
> >> It seems your patch is not able to handle the case that the guest uses 
> >> walk-length = 2
> >> which is running on the host with walk-length = 4.
> >> (please refer to how to handle sp->role.quadrant in 
> >> FNAME(get_level1_sp_gpa) in
> >> the current code.)
> >>
> > But since EPT always has 4 levels on all existing cpus it is not an issue 
> > and the only case
> > that we should worry about is guest walk-length == host walk-length == 4, 
> > or have I
> 
> Yes. I totally agree with you, but...
> 
> > misunderstood what you mean here?
> 
> What confused me is that this patch defines "#define PT_MAX_FULL_LEVELS 2", 
> so i asked the
> question: "Which kind of process supports page-walk length = 2".
> Sorry, there is a typo in my original comments. "process" should be "processor" 
> or "CPU".
> 
That is how I understood it, but then the discussion moved to dropping
of nEPT support on 32-bit host. What's the connection? Even on 32bit
host the walk is 4 levels. Doesn't shadow page code support 4 levels on
32bit host?

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a

Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-06-17 Thread Xiao Guangrong
On 06/11/2013 07:32 PM, Gleb Natapov wrote:
> On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote:
>> On 05/19/2013 12:52 PM, Jun Nakajima wrote:
>>> From: Nadav Har'El 
>>>
>>> This is the first patch in a series which adds nested EPT support to KVM's
>>> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can 
>>> use
>>> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
>>> to set its own cr3 and take its own page faults without either of L0 or L1
>>> getting involved. This often significantly improves L2's performance over 
>>> the
>>> previous two alternatives (shadow page tables over EPT, and shadow page
>>> tables over shadow page tables).
>>>
>>> This patch adds EPT support to paging_tmpl.h.
>>>
>>> paging_tmpl.h contains the code for reading and writing page tables. The 
>>> code
>>> for 32-bit and 64-bit tables is very similar, but not identical, so
>>> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
>>> with PTTYPE=64, and this generates the two sets of similar functions.
>>>
>>> There are subtle but important differences between the format of EPT tables
>>> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
>>> third set of functions to read the guest EPT table and to write the shadow
>>> EPT table.
>>>
>>> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions 
>>> (prefixed
>>> with "EPT") which correctly read and write EPT tables.
>>>
>>> Signed-off-by: Nadav Har'El 
>>> Signed-off-by: Jun Nakajima 
>>> Signed-off-by: Xinhao Xu 
>>> ---
>>>  arch/x86/kvm/mmu.c |  5 +
>>>  arch/x86/kvm/paging_tmpl.h | 43 +--
>>>  2 files changed, 46 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index 117233f..6c1670f 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
>>> unsigned level, unsigned gp
>>> return mmu->last_pte_bitmap & (1 << index);
>>>  }
>>>
>>> +#define PTTYPE_EPT 18 /* arbitrary */
>>> +#define PTTYPE PTTYPE_EPT
>>> +#include "paging_tmpl.h"
>>> +#undef PTTYPE
>>> +
>>>  #define PTTYPE 64
>>>  #include "paging_tmpl.h"
>>>  #undef PTTYPE
>>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
>>> index df34d4a..4c45654 100644
>>> --- a/arch/x86/kvm/paging_tmpl.h
>>> +++ b/arch/x86/kvm/paging_tmpl.h
>>> @@ -50,6 +50,22 @@
>>> #define PT_LEVEL_BITS PT32_LEVEL_BITS
>>> #define PT_MAX_FULL_LEVELS 2
>>> #define CMPXCHG cmpxchg
>>> +#elif PTTYPE == PTTYPE_EPT
>>> +   #define pt_element_t u64
>>> +   #define guest_walker guest_walkerEPT
>>> +   #define FNAME(name) EPT_##name
>>> +   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
>>> +   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
>>> +   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
>>> +   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
>>> +   #define PT_LEVEL_BITS PT64_LEVEL_BITS
>>> +   #ifdef CONFIG_X86_64
>>> +   #define PT_MAX_FULL_LEVELS 4
>>> +   #define CMPXCHG cmpxchg
>>> +   #else
>>> +   #define CMPXCHG cmpxchg64
>>
>> CMPXCHG is only used in FNAME(cmpxchg_gpte), but you commented it later.
>> Do we really need it?
>>
>>> +   #define PT_MAX_FULL_LEVELS 2
>>
>> And the SDM says:
>>
>> "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure
>> entries are accessed to translate a guest-physical address.", Is My SDM 
>> obsolete?
>> Which kind of process supports page-walk length = 2?
>>
>> It seems your patch is not able to handle the case that the guest uses 
>> walk-length = 2
>> which is running on the host with walk-length = 4.
>> (please refer to how to handle sp->role.quadrant in 
>> FNAME(get_level1_sp_gpa) in
>> the current code.)
>>
> But since EPT always has 4 levels on all existing cpus it is not an issue and 
> the only case
> that we should worry about is guest walk-length == host walk-length == 4, or 
> have I

Yes. I totally agree with you, but...

> misunderstood what you mean here?

What confused me is that this patch defines "#define PT_MAX_FULL_LEVELS 2", so 
i asked the
question: "Which kind of process supports page-walk length = 2".
Sorry, there is a typo in my original comments. "process" should be "processor" 
or "CPU".




--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-06-11 Thread Gleb Natapov
On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote:
> On 05/19/2013 12:52 PM, Jun Nakajima wrote:
> > From: Nadav Har'El 
> > 
> > This is the first patch in a series which adds nested EPT support to KVM's
> > nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can 
> > use
> > EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> > to set its own cr3 and take its own page faults without either of L0 or L1
> > getting involved. This often significantly improves L2's performance over 
> > the
> > previous two alternatives (shadow page tables over EPT, and shadow page
> > tables over shadow page tables).
> > 
> > This patch adds EPT support to paging_tmpl.h.
> > 
> > paging_tmpl.h contains the code for reading and writing page tables. The 
> > code
> > for 32-bit and 64-bit tables is very similar, but not identical, so
> > paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> > with PTTYPE=64, and this generates the two sets of similar functions.
> > 
> > There are subtle but important differences between the format of EPT tables
> > and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> > third set of functions to read the guest EPT table and to write the shadow
> > EPT table.
> > 
> > So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions 
> > (prefixed
> > with "EPT") which correctly read and write EPT tables.
> > 
> > Signed-off-by: Nadav Har'El 
> > Signed-off-by: Jun Nakajima 
> > Signed-off-by: Xinhao Xu 
> > ---
> >  arch/x86/kvm/mmu.c |  5 +
> >  arch/x86/kvm/paging_tmpl.h | 43 +--
> >  2 files changed, 46 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 117233f..6c1670f 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
> > unsigned level, unsigned gp
> > return mmu->last_pte_bitmap & (1 << index);
> >  }
> > 
> > +#define PTTYPE_EPT 18 /* arbitrary */
> > +#define PTTYPE PTTYPE_EPT
> > +#include "paging_tmpl.h"
> > +#undef PTTYPE
> > +
> >  #define PTTYPE 64
> >  #include "paging_tmpl.h"
> >  #undef PTTYPE
> > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> > index df34d4a..4c45654 100644
> > --- a/arch/x86/kvm/paging_tmpl.h
> > +++ b/arch/x86/kvm/paging_tmpl.h
> > @@ -50,6 +50,22 @@
> > #define PT_LEVEL_BITS PT32_LEVEL_BITS
> > #define PT_MAX_FULL_LEVELS 2
> > #define CMPXCHG cmpxchg
> > +#elif PTTYPE == PTTYPE_EPT
> > +   #define pt_element_t u64
> > +   #define guest_walker guest_walkerEPT
> > +   #define FNAME(name) EPT_##name
> > +   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> > +   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> > +   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> > +   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> > +   #define PT_LEVEL_BITS PT64_LEVEL_BITS
> > +   #ifdef CONFIG_X86_64
> > +   #define PT_MAX_FULL_LEVELS 4
> > +   #define CMPXCHG cmpxchg
> > +   #else
> > +   #define CMPXCHG cmpxchg64
> 
> CMPXCHG is only used in FNAME(cmpxchg_gpte), but you commented it later.
> Do we really need it?
> 
> > +   #define PT_MAX_FULL_LEVELS 2
> 
> And the SDM says:
> 
> "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure
> entries are accessed to translate a guest-physical address.", Is My SDM 
> obsolete?
> Which kind of process supports page-walk length = 2?
> 
> It seems your patch is not able to handle the case that the guest uses 
> walk-length = 2
> which is running on the host with walk-length = 4.
> (please refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) 
> in
> the current code.)
> 
But since EPT always has 4 levels on all existing cpus it is not an issue and 
the only case
that we should worry about is guest walk-length == host walk-length == 4, or 
have I
misunderstood what you mean here?

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-21 Thread Gleb Natapov
On Tue, May 21, 2013 at 03:26:18PM -0700, Nakajima, Jun wrote:
> On Tue, May 21, 2013 at 4:05 AM, Xiao Guangrong
>  wrote:
> > On 05/21/2013 05:01 PM, Gleb Natapov wrote:
> >> On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote:
> > @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct 
> > kvm_vcpu *vcpu, gva_t vaddr,
> >
> >return gpa;
> >  }
> > +#endif
> 
>  Strange!
> 
>  Why does nested ept not need these functions? How to emulate the 
>  instruction faulted on L2?
> >>>
> >>> Sorry, i misunderstood it. Have found the reason out.
> >>>
> >> You can write it down here for future reviewers :)
> >
> > Okay.
> >
> > The functions used to translate L2's gva to L1's gpa are 
> > paging32_gva_to_gpa_nested
> > and paging64_gva_to_gpa_nested which are created by PTTYPE == 32 and PTTYPE 
> > == 64.
> >
> >
> 
> Back to your comments on PT_MAX_FULL_LEVELS:
> > + #ifdef CONFIG_X86_64
> > + #define PT_MAX_FULL_LEVELS 4
> > + #define CMPXCHG cmpxchg
> > + #else
> > + #define CMPXCHG cmpxchg64
> > +#define PT_MAX_FULL_LEVELS 2
> I don't think we need to support nEPT on 32-bit hosts.  So, I plan to
> remove such code. What do you think?
> 
Why shouldn't we support nEPT on 32-bit hosts?

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-21 Thread Xiao Guangrong
On 05/22/2013 06:26 AM, Nakajima, Jun wrote:
> On Tue, May 21, 2013 at 4:05 AM, Xiao Guangrong
>  wrote:
>> On 05/21/2013 05:01 PM, Gleb Natapov wrote:
>>> On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote:
>> @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct 
>> kvm_vcpu *vcpu, gva_t vaddr,
>>
>>return gpa;
>>  }
>> +#endif
>
> Strange!
>
> Why does nested ept not need these functions? How to emulate the 
> instruction faulted on L2?

 Sorry, i misunderstood it. Have found the reason out.

>>> You can write it down here for future reviewers :)
>>
>> Okay.
>>
>> The functions used to translate L2's gva to L1's gpa are 
>> paging32_gva_to_gpa_nested
>> and paging64_gva_to_gpa_nested which are created by PTTYPE == 32 and PTTYPE 
>> == 64.
>>
>>
> 
> Back to your comments on PT_MAX_FULL_LEVELS:
>> + #ifdef CONFIG_X86_64
>> + #define PT_MAX_FULL_LEVELS 4
>> + #define CMPXCHG cmpxchg
>> + #else
>> + #define CMPXCHG cmpxchg64
>> +#define PT_MAX_FULL_LEVELS 2
> I don't think we need to support nEPT on 32-bit hosts.  So, I plan to
> remove such code. What do you think?

Good to me. :)


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-21 Thread Nakajima, Jun
On Tue, May 21, 2013 at 4:05 AM, Xiao Guangrong
 wrote:
> On 05/21/2013 05:01 PM, Gleb Natapov wrote:
>> On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote:
> @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu 
> *vcpu, gva_t vaddr,
>
>return gpa;
>  }
> +#endif

 Strange!

 Why does nested ept not need these functions? How to emulate the 
 instruction faulted on L2?
>>>
>>> Sorry, i misunderstood it. Have found the reason out.
>>>
>> You can write it down here for future reviewers :)
>
> Okay.
>
> The functions used to translate L2's gva to L1's gpa are 
> paging32_gva_to_gpa_nested
> and paging64_gva_to_gpa_nested which are created by PTTYPE == 32 and PTTYPE 
> == 64.
>
>

Back to your comments on PT_MAX_FULL_LEVELS:
> + #ifdef CONFIG_X86_64
> + #define PT_MAX_FULL_LEVELS 4
> + #define CMPXCHG cmpxchg
> + #else
> + #define CMPXCHG cmpxchg64
> +#define PT_MAX_FULL_LEVELS 2
I don't think we need to support nEPT on 32-bit hosts.  So, I plan to
remove such code. What do you think?

--
Jun
Intel Open Source Technology Center
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-21 Thread Xiao Guangrong
On 05/21/2013 05:01 PM, Gleb Natapov wrote:
> On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote:
 @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu 
 *vcpu, gva_t vaddr,

return gpa;
  }
 +#endif
>>>
>>> Strange!
>>>
>>> Why does nested ept not need these functions? How to emulate the 
>>> instruction faulted on L2?
>>
>> Sorry, i misunderstood it. Have found the reason out.
>>
> You can write it down here for future reviewers :)

Okay.

The functions used to translate L2's gva to L1's gpa are 
paging32_gva_to_gpa_nested
and paging64_gva_to_gpa_nested which are created by PTTYPE == 32 and PTTYPE == 
64.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-21 Thread Gleb Natapov
On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote:
> >> @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu 
> >> *vcpu, gva_t vaddr,
> >>
> >>return gpa;
> >>  }
> >> +#endif
> > 
> > Strange!
> > 
> > Why does nested ept not need these functions? How to emulate the 
> > instruction faulted on L2?
> 
> Sorry, i misunderstood it. Have found the reason out.
> 
You can write it down here for future reviewers :)

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-21 Thread Xiao Guangrong
On 05/21/2013 03:52 PM, Xiao Guangrong wrote:
> On 05/19/2013 12:52 PM, Jun Nakajima wrote:
>> From: Nadav Har'El 
>>
>> This is the first patch in a series which adds nested EPT support to KVM's
>> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
>> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
>> to set its own cr3 and take its own page faults without either of L0 or L1
>> getting involved. This often significantly improves L2's performance over the
>> previous two alternatives (shadow page tables over EPT, and shadow page
>> tables over shadow page tables).
>>
>> This patch adds EPT support to paging_tmpl.h.
>>
>> paging_tmpl.h contains the code for reading and writing page tables. The code
>> for 32-bit and 64-bit tables is very similar, but not identical, so
>> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
>> with PTTYPE=64, and this generates the two sets of similar functions.
>>
>> There are subtle but important differences between the format of EPT tables
>> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
>> third set of functions to read the guest EPT table and to write the shadow
>> EPT table.
>>
>> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions 
>> (prefixed
>> with "EPT") which correctly read and write EPT tables.
>>
>> Signed-off-by: Nadav Har'El 
>> Signed-off-by: Jun Nakajima 
>> Signed-off-by: Xinhao Xu 
>> ---
>>  arch/x86/kvm/mmu.c |  5 +
>>  arch/x86/kvm/paging_tmpl.h | 43 +--
>>  2 files changed, 46 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 117233f..6c1670f 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
>> unsigned level, unsigned gp
>>  return mmu->last_pte_bitmap & (1 << index);
>>  }
>>
>> +#define PTTYPE_EPT 18 /* arbitrary */
>> +#define PTTYPE PTTYPE_EPT
>> +#include "paging_tmpl.h"
>> +#undef PTTYPE
>> +
>>  #define PTTYPE 64
>>  #include "paging_tmpl.h"
>>  #undef PTTYPE
>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
>> index df34d4a..4c45654 100644
>> --- a/arch/x86/kvm/paging_tmpl.h
>> +++ b/arch/x86/kvm/paging_tmpl.h
>> @@ -50,6 +50,22 @@
>>  #define PT_LEVEL_BITS PT32_LEVEL_BITS
>>  #define PT_MAX_FULL_LEVELS 2
>>  #define CMPXCHG cmpxchg
>> +#elif PTTYPE == PTTYPE_EPT
>> +#define pt_element_t u64
>> +#define guest_walker guest_walkerEPT
>> +#define FNAME(name) EPT_##name
>> +#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
>> +#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
>> +#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
>> +#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
>> +#define PT_LEVEL_BITS PT64_LEVEL_BITS
>> +#ifdef CONFIG_X86_64
>> +#define PT_MAX_FULL_LEVELS 4
>> +#define CMPXCHG cmpxchg
>> +#else
>> +#define CMPXCHG cmpxchg64
> 
> CMPXCHG is only used in FNAME(cmpxchg_gpte), but you commented it later.
> Do we really need it?
> 
>> +#define PT_MAX_FULL_LEVELS 2
> 
> And the SDM says:
> 
> "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure
> entries are accessed to translate a guest-physical address.", Is my SDM 
> obsolete?
> Which kind of processor supports page-walk length = 2?
> 
> It seems your patch is not able to handle the case that the guest uses 
> walk-length = 2
> which is running on the host with walk-length = 4.
> (please refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) 
> in
> the current code.)
> 
>> +#endif
>>  #else
>>  #error Invalid PTTYPE value
>>  #endif
>> @@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
>>  return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
>>  }
>>
>> +#if PTTYPE != PTTYPE_EPT
>> +/*
>> + *  Comment out this for EPT because update_accessed_dirty_bits() is not 
>> used.
>> + */
>>  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>> pt_element_t __user *ptep_user, unsigned index,
>> pt_element_t orig_pte, pt_element_t new_pte)
>> @@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
>> struct kvm_mmu *mmu,
>>
>>  return (ret != orig_pte);
>>  }
>> +#endif
>>
>>  static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
>>struct kvm_mmu_page *sp, u64 *spte,
>> @@ -126,13 +147,21 @@ no_present:
>>  static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
>>  {
>>  unsigned access;
>> -
>> +#if PTTYPE == PTTYPE_EPT
>> +access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
>> +  VMX_EPT_EXECUTABLE_MASK));
> 
> It seems wrong. The ACC_XXX definition:
> 
> #define ACC_EXEC_MASK1
> #define ACC_

Re: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-21 Thread Xiao Guangrong
On 05/19/2013 12:52 PM, Jun Nakajima wrote:
> From: Nadav Har'El 
> 
> This is the first patch in a series which adds nested EPT support to KVM's
> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> to set its own cr3 and take its own page faults without either of L0 or L1
> getting involved. This often significantly improves L2's performance over the
> previous two alternatives (shadow page tables over EPT, and shadow page
> tables over shadow page tables).
> 
> This patch adds EPT support to paging_tmpl.h.
> 
> paging_tmpl.h contains the code for reading and writing page tables. The code
> for 32-bit and 64-bit tables is very similar, but not identical, so
> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> with PTTYPE=64, and this generates the two sets of similar functions.
> 
> There are subtle but important differences between the format of EPT tables
> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> third set of functions to read the guest EPT table and to write the shadow
> EPT table.
> 
> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
> with "EPT") which correctly read and write EPT tables.
> 
> Signed-off-by: Nadav Har'El 
> Signed-off-by: Jun Nakajima 
> Signed-off-by: Xinhao Xu 
> ---
>  arch/x86/kvm/mmu.c |  5 +
>  arch/x86/kvm/paging_tmpl.h | 43 +--
>  2 files changed, 46 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 117233f..6c1670f 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
> unsigned level, unsigned gp
>   return mmu->last_pte_bitmap & (1 << index);
>  }
> 
> +#define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE PTTYPE_EPT
> +#include "paging_tmpl.h"
> +#undef PTTYPE
> +
>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index df34d4a..4c45654 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -50,6 +50,22 @@
>   #define PT_LEVEL_BITS PT32_LEVEL_BITS
>   #define PT_MAX_FULL_LEVELS 2
>   #define CMPXCHG cmpxchg
> +#elif PTTYPE == PTTYPE_EPT
> + #define pt_element_t u64
> + #define guest_walker guest_walkerEPT
> + #define FNAME(name) EPT_##name
> + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> + #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> + #define PT_LEVEL_BITS PT64_LEVEL_BITS
> + #ifdef CONFIG_X86_64
> + #define PT_MAX_FULL_LEVELS 4
> + #define CMPXCHG cmpxchg
> + #else
> + #define CMPXCHG cmpxchg64

CMPXCHG is only used in FNAME(cmpxchg_gpte), but you commented it later.
Do we really need it?

> + #define PT_MAX_FULL_LEVELS 2

And the SDM says:

"It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure
entries are accessed to translate a guest-physical address.", Is my SDM obsolete?
Which kind of processor supports page-walk length = 2?

It seems your patch is not able to handle the case that the guest uses 
walk-length = 2
which is running on the host with walk-length = 4.
(please refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) in
the current code.)

> + #endif
>  #else
>   #error Invalid PTTYPE value
>  #endif
> @@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
>   return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
>  }
> 
> +#if PTTYPE != PTTYPE_EPT
> +/*
> + *  Comment out this for EPT because update_accessed_dirty_bits() is not 
> used.
> + */
>  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  pt_element_t __user *ptep_user, unsigned index,
>  pt_element_t orig_pte, pt_element_t new_pte)
> @@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
> struct kvm_mmu *mmu,
> 
>   return (ret != orig_pte);
>  }
> +#endif
> 
>  static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
> struct kvm_mmu_page *sp, u64 *spte,
> @@ -126,13 +147,21 @@ no_present:
>  static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
>  {
>   unsigned access;
> -
> +#if PTTYPE == PTTYPE_EPT
> + access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
> +   VMX_EPT_EXECUTABLE_MASK));

It seems wrong. The ACC_XXX definition:

#define ACC_EXEC_MASK1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASKPT_USER_MASK
#define ACC_ALL  (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

The bits are different wit

[PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  5 +
 arch/x86/kvm/paging_tmpl.h | 43 +--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 117233f..6c1670f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df34d4a..4c45654 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+   #define pt_element_t u64
+   #define guest_walker guest_walkerEPT
+   #define FNAME(name) EPT_##name
+   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+   #define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #ifdef CONFIG_X86_64
+   #define PT_MAX_FULL_LEVELS 4
+   #define CMPXCHG cmpxchg
+   #else
+   #define CMPXCHG cmpxchg64
+   #define PT_MAX_FULL_LEVELS 2
+   #endif
 #else
#error Invalid PTTYPE value
 #endif
@@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ *  Comment out this for EPT because update_accessed_dirty_bits() is not used.
+ */
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   pt_element_t __user *ptep_user, unsigned index,
   pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
return (ret != orig_pte);
 }
+#endif
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
@@ -126,13 +147,21 @@ no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+   access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_EXECUTABLE_MASK));
+#else
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
 
return access;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ * EPT A/D bit support is not implemented.
+ */
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct 
kvm_vcpu *vcpu,
}
return 0;
 }
+#endif
 
 /*
  * Fetch a guest pte for a guest virtual address
@@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker 
*walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t addr, u32 access)
 {
-   int ret;
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
@@ -192,7 +221,9 @@ st

[PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-08 Thread Jun Nakajima
This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  5 +
 arch/x86/kvm/paging_tmpl.h | 43 +--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 117233f..6c1670f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df34d4a..4c45654 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+   #define pt_element_t u64
+   #define guest_walker guest_walkerEPT
+   #define FNAME(name) EPT_##name
+   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+   #define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #ifdef CONFIG_X86_64
+   #define PT_MAX_FULL_LEVELS 4
+   #define CMPXCHG cmpxchg
+   #else
+   #define CMPXCHG cmpxchg64
+   #define PT_MAX_FULL_LEVELS 2
+   #endif
 #else
#error Invalid PTTYPE value
 #endif
@@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ *  Comment out this for EPT because update_accessed_dirty_bits() is not used.
+ */
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   pt_element_t __user *ptep_user, unsigned index,
   pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
return (ret != orig_pte);
 }
+#endif
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
@@ -126,13 +147,21 @@ no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+   access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_EXECUTABLE_MASK));
+#else
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
 
return access;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ * EPT A/D bit support is not implemented.
+ */
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct 
kvm_vcpu *vcpu,
}
return 0;
 }
+#endif
 
 /*
  * Fetch a guest pte for a guest virtual address
@@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker 
*walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t addr, u32 access)
 {
-   int ret;
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
@@ -192,7 +221,9 @@ static int FNAME(walk_a