Re: [Xen-devel] [PATCH] vmx/hap: optimize CR4 trapping

2018-02-16 Thread Razvan Cojocaru
On 02/16/2018 01:25 PM, Roger Pau Monné wrote:
> On Thu, Feb 15, 2018 at 09:32:00PM +0200, Razvan Cojocaru wrote:
>> On 02/15/2018 08:57 PM, Andrew Cooper wrote:
>>> On 15/02/18 16:25, Roger Pau Monne wrote:
 There are a bunch of bits in CR4 that should be allowed to be set directly
 by the guest without requiring Xen intervention; currently this is
 already done by passing through guest writes into the CR4 used when
 running in non-root mode, but taking an expensive vmexit in order to
 do so.

 xenalyze reports the following when running a PV guest in shim mode:

  CR_ACCESS 3885950  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
cr4  3885940  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
cr31  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
  *[  0]1  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
cr07  0.00s  0.00%  7112 cyc { 3248| 5960|17480}
clts2  0.00s  0.00%  4588 cyc { 3456| 5720| 5720}

 After this change this turns into:

  CR_ACCESS  12  0.00s  0.00%  9972 cyc { 3680|11024|24032}
cr42  0.00s  0.00% 17528 cyc {11024|24032|24032}
cr31  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
  *[  0]1  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
cr07  0.00s  0.00%  9209 cyc { 4184| 7848|17488}
clts2  0.00s  0.00%  8232 cyc { 5352|2|2}

 Note that this optimized trapping is currently only applied to guests
 running with HAP on Intel hardware. If using shadow paging more CR4
 bits need to be unconditionally trapped, which makes this approach
 unlikely to yield any important performance improvements.

 Reported-by: Andrew Cooper 
 Signed-off-by: Roger Pau Monné 
 ---
 Cc: Jun Nakajima 
 Cc: Kevin Tian 
 Cc: Jan Beulich 
 Cc: Andrew Cooper 
 Cc: Razvan Cojocaru 
 Cc: Tamas K Lengyel 
 ---
  xen/arch/x86/hvm/vmx/vmx.c  | 41 +
  xen/arch/x86/hvm/vmx/vvmx.c |  5 -
  xen/arch/x86/monitor.c  |  5 +++--
  3 files changed, 48 insertions(+), 3 deletions(-)

 diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
 index d35cf55982..9747b2a398 100644
 --- a/xen/arch/x86/hvm/vmx/vmx.c
 +++ b/xen/arch/x86/hvm/vmx/vmx.c
 @@ -1684,6 +1684,35 @@ static void vmx_update_guest_cr(struct vcpu *v, 
 unsigned int cr)
  }
  
  __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
 +
 +if ( (v->domain->arch.monitor.write_ctrlreg_enabled &
 +  monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ||
 + !paging_mode_hap(v->domain) )
 +/*
 + * If requested by introspection or running in shadow mode 
 trap all
 + * accesses to CR4.
>>>
>>> The monitor write_ctrlreg_onchangeonly feature was purposefully
>>> introduced to avoid sending PGE updates to the introspection agent.  It
>>> would be ideal to include that in the mask calculation so introspection
>>> cases don't vmexit for PGE changes.
>>>
>>> Also, AMD has similar capabilities, and (as of today) has gained CR
>>> monitoring.
>>
>> I believe the patch Andrew is referring to is this one:
>>
>> https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=59aad28cfac09640e2272f1e87951406233c3192
>>
>> We added that specifically so that no PGE-only exits end up needing
>> (pointless) processing by the introspection agent.
> 
> I've been looking at that change and I think the logic is wrong, the
> following chunk:
> 
>  if ( (ad->monitor.write_ctrlreg_enabled & ctrlreg_bitmask) &&
>   (!(ad->monitor.write_ctrlreg_onchangeonly & ctrlreg_bitmask) ||
> -  value != old) )
> +  value != old) &&
> + (!((value ^ old) & ad->monitor.write_ctrlreg_mask[index])) )
>  {
>  bool_t sync = !!(ad->monitor.write_ctrlreg_sync & ctrlreg_bitmask);
> 
> Seems wrong. Imagine for example the case where (value ^ old) ==
> PGE|PSE, and mask == PGE:
> 
> !((PGE|PSE) & PGE) will yield false, and the monitor won't be notified.
> 
> I think what you want is:
> 
> ((value ^ old) & ~ad->monitor.write_ctrlreg_mask[index])
> 
> But maybe I'm just confused.

No, I think you're quite right. Thanks for pointing that out!
We'll submit a fix patch shortly.


Thanks,
Razvan

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH] vmx/hap: optimize CR4 trapping

2018-02-16 Thread Roger Pau Monné
On Thu, Feb 15, 2018 at 09:32:00PM +0200, Razvan Cojocaru wrote:
> On 02/15/2018 08:57 PM, Andrew Cooper wrote:
> > On 15/02/18 16:25, Roger Pau Monne wrote:
> >> There are a bunch of bits in CR4 that should be allowed to be set directly
> >> by the guest without requiring Xen intervention; currently this is
> >> already done by passing through guest writes into the CR4 used when
> >> running in non-root mode, but taking an expensive vmexit in order to
> >> do so.
> >>
> >> xenalyze reports the following when running a PV guest in shim mode:
> >>
> >>  CR_ACCESS 3885950  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
> >>cr4  3885940  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
> >>cr31  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
> >>  *[  0]1  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
> >>cr07  0.00s  0.00%  7112 cyc { 3248| 5960|17480}
> >>clts2  0.00s  0.00%  4588 cyc { 3456| 5720| 5720}
> >>
> >> After this change this turns into:
> >>
> >>  CR_ACCESS  12  0.00s  0.00%  9972 cyc { 3680|11024|24032}
> >>cr42  0.00s  0.00% 17528 cyc {11024|24032|24032}
> >>cr31  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
> >>  *[  0]1  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
> >>cr07  0.00s  0.00%  9209 cyc { 4184| 7848|17488}
> >>clts2  0.00s  0.00%  8232 cyc { 5352|2|2}
> >>
> >> Note that this optimized trapping is currently only applied to guests
> >> running with HAP on Intel hardware. If using shadow paging more CR4
> >> bits need to be unconditionally trapped, which makes this approach
> >> unlikely to yield any important performance improvements.
> >>
> >> Reported-by: Andrew Cooper 
> >> Signed-off-by: Roger Pau Monné 
> >> ---
> >> Cc: Jun Nakajima 
> >> Cc: Kevin Tian 
> >> Cc: Jan Beulich 
> >> Cc: Andrew Cooper 
> >> Cc: Razvan Cojocaru 
> >> Cc: Tamas K Lengyel 
> >> ---
> >>  xen/arch/x86/hvm/vmx/vmx.c  | 41 +
> >>  xen/arch/x86/hvm/vmx/vvmx.c |  5 -
> >>  xen/arch/x86/monitor.c  |  5 +++--
> >>  3 files changed, 48 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> >> index d35cf55982..9747b2a398 100644
> >> --- a/xen/arch/x86/hvm/vmx/vmx.c
> >> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> >> @@ -1684,6 +1684,35 @@ static void vmx_update_guest_cr(struct vcpu *v, 
> >> unsigned int cr)
> >>  }
> >>  
> >>  __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
> >> +
> >> +if ( (v->domain->arch.monitor.write_ctrlreg_enabled &
> >> +  monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ||
> >> + !paging_mode_hap(v->domain) )
> >> +/*
> >> + * If requested by introspection or running in shadow mode 
> >> trap all
> >> + * accesses to CR4.
> > 
> > The monitor write_ctrlreg_onchangeonly feature was purposefully
> > introduced to avoid sending PGE updates to the introspection agent.  It
> > would be ideal to include that in the mask calculation so introspection
> > cases don't vmexit for PGE changes.
> > 
> > Also, AMD has similar capabilities, and (as of today) has gained CR
> > monitoring.
> 
> I believe the patch Andrew is referring to is this one:
> 
> https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=59aad28cfac09640e2272f1e87951406233c3192
> 
> We added that specifically so that no PGE-only exits end up needing
> (pointless) processing by the introspection agent.

I've been looking at that change and I think the logic is wrong, the
following chunk:

 if ( (ad->monitor.write_ctrlreg_enabled & ctrlreg_bitmask) &&
  (!(ad->monitor.write_ctrlreg_onchangeonly & ctrlreg_bitmask) ||
-  value != old) )
+  value != old) &&
+ (!((value ^ old) & ad->monitor.write_ctrlreg_mask[index])) )
 {
 bool_t sync = !!(ad->monitor.write_ctrlreg_sync & ctrlreg_bitmask);

Seems wrong. Imagine for example the case where (value ^ old) ==
PGE|PSE, and mask == PGE:

!((PGE|PSE) & PGE) will yield false, and the monitor won't be notified.

I think what you want is:

((value ^ old) & ~ad->monitor.write_ctrlreg_mask[index])

But maybe I'm just confused.

Roger.

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH] vmx/hap: optimize CR4 trapping

2018-02-16 Thread Roger Pau Monné
On Thu, Feb 15, 2018 at 06:57:46PM +, Andrew Cooper wrote:
> On 15/02/18 16:25, Roger Pau Monne wrote:
> > There are a bunch of bits in CR4 that should be allowed to be set directly
> > by the guest without requiring Xen intervention; currently this is
> > already done by passing through guest writes into the CR4 used when
> > running in non-root mode, but taking an expensive vmexit in order to
> > do so.
> >
> > xenalyze reports the following when running a PV guest in shim mode:
> >
> >  CR_ACCESS 3885950  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
> >cr4  3885940  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
> >cr31  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
> >  *[  0]1  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
> >cr07  0.00s  0.00%  7112 cyc { 3248| 5960|17480}
> >clts2  0.00s  0.00%  4588 cyc { 3456| 5720| 5720}
> >
> > After this change this turns into:
> >
> >  CR_ACCESS  12  0.00s  0.00%  9972 cyc { 3680|11024|24032}
> >cr42  0.00s  0.00% 17528 cyc {11024|24032|24032}
> >cr31  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
> >  *[  0]1  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
> >cr07  0.00s  0.00%  9209 cyc { 4184| 7848|17488}
> >clts2  0.00s  0.00%  8232 cyc { 5352|2|2}
> >
> > Note that this optimized trapping is currently only applied to guests
> > running with HAP on Intel hardware. If using shadow paging more CR4
> > bits need to be unconditionally trapped, which makes this approach
> > unlikely to yield any important performance improvements.
> >
> > Reported-by: Andrew Cooper 
> > Signed-off-by: Roger Pau Monné 
> > ---
> > Cc: Jun Nakajima 
> > Cc: Kevin Tian 
> > Cc: Jan Beulich 
> > Cc: Andrew Cooper 
> > Cc: Razvan Cojocaru 
> > Cc: Tamas K Lengyel 
> > ---
> >  xen/arch/x86/hvm/vmx/vmx.c  | 41 +
> >  xen/arch/x86/hvm/vmx/vvmx.c |  5 -
> >  xen/arch/x86/monitor.c  |  5 +++--
> >  3 files changed, 48 insertions(+), 3 deletions(-)
> >
> > diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> > index d35cf55982..9747b2a398 100644
> > --- a/xen/arch/x86/hvm/vmx/vmx.c
> > +++ b/xen/arch/x86/hvm/vmx/vmx.c
> > @@ -1684,6 +1684,35 @@ static void vmx_update_guest_cr(struct vcpu *v, 
> > unsigned int cr)
> >  }
> >  
> >  __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
> > +
> > +if ( (v->domain->arch.monitor.write_ctrlreg_enabled &
> > +  monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ||
> > + !paging_mode_hap(v->domain) )
> > +/*
> > + * If requested by introspection or running in shadow mode 
> > trap all
> > + * accesses to CR4.
> 
> The monitor write_ctrlreg_onchangeonly feature was purposefully
> introduced to avoid sending PGE updates to the introspection agent.  It
> would be ideal to include that in the mask calculation so introspection
> cases don't vmexit for PGE changes.

Oh, I hadn't realized that; I will include it.

> Also, AMD has similar capabilities, and (as of today) has gained CR
> monitoring.

OK, will have to find some AMD hardware and prepare a similar patch,
but that's certainly for later.

> > + *
> > + * NB: shadow path has not been optimized because it requires
> > + * unconditionally trapping more CR4 bits, at which point the
> > + * performance benefit of doing this is quite dubious.
> > + */
> > +__vmwrite(CR4_GUEST_HOST_MASK, ~0UL);
> 
> It would be better to stash the calculated mask, rather than reading it
> back in the vmexit handler.

Right, I guess for coherency I should introduce a mask_cr[4] array to
hvm_vcpu. I could do that in a pre-patch if it sounds fine.

Thanks, Roger.

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH] vmx/hap: optimize CR4 trapping

2018-02-15 Thread Razvan Cojocaru
On 02/15/2018 08:57 PM, Andrew Cooper wrote:
> On 15/02/18 16:25, Roger Pau Monne wrote:
>> There are a bunch of bits in CR4 that should be allowed to be set directly
>> by the guest without requiring Xen intervention; currently this is
>> already done by passing through guest writes into the CR4 used when
>> running in non-root mode, but taking an expensive vmexit in order to
>> do so.
>>
>> xenalyze reports the following when running a PV guest in shim mode:
>>
>>  CR_ACCESS 3885950  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
>>cr4  3885940  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
>>cr31  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
>>  *[  0]1  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
>>cr07  0.00s  0.00%  7112 cyc { 3248| 5960|17480}
>>clts2  0.00s  0.00%  4588 cyc { 3456| 5720| 5720}
>>
>> After this change this turns into:
>>
>>  CR_ACCESS  12  0.00s  0.00%  9972 cyc { 3680|11024|24032}
>>cr42  0.00s  0.00% 17528 cyc {11024|24032|24032}
>>cr31  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
>>  *[  0]1  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
>>cr07  0.00s  0.00%  9209 cyc { 4184| 7848|17488}
>>clts2  0.00s  0.00%  8232 cyc { 5352|2|2}
>>
>> Note that this optimized trapping is currently only applied to guests
>> running with HAP on Intel hardware. If using shadow paging more CR4
>> bits need to be unconditionally trapped, which makes this approach
>> unlikely to yield any important performance improvements.
>>
>> Reported-by: Andrew Cooper 
>> Signed-off-by: Roger Pau Monné 
>> ---
>> Cc: Jun Nakajima 
>> Cc: Kevin Tian 
>> Cc: Jan Beulich 
>> Cc: Andrew Cooper 
>> Cc: Razvan Cojocaru 
>> Cc: Tamas K Lengyel 
>> ---
>>  xen/arch/x86/hvm/vmx/vmx.c  | 41 +
>>  xen/arch/x86/hvm/vmx/vvmx.c |  5 -
>>  xen/arch/x86/monitor.c  |  5 +++--
>>  3 files changed, 48 insertions(+), 3 deletions(-)
>>
>> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
>> index d35cf55982..9747b2a398 100644
>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -1684,6 +1684,35 @@ static void vmx_update_guest_cr(struct vcpu *v, 
>> unsigned int cr)
>>  }
>>  
>>  __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
>> +
>> +if ( (v->domain->arch.monitor.write_ctrlreg_enabled &
>> +  monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ||
>> + !paging_mode_hap(v->domain) )
>> +/*
>> + * If requested by introspection or running in shadow mode trap 
>> all
>> + * accesses to CR4.
> 
> The monitor write_ctrlreg_onchangeonly feature was purposefully
> introduced to avoid sending PGE updates to the introspection agent.  It
> would be ideal to include that in the mask calculation so introspection
> cases don't vmexit for PGE changes.
> 
> Also, AMD has similar capabilities, and (as of today) has gained CR
> monitoring.

I believe the patch Andrew is referring to is this one:

https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=59aad28cfac09640e2272f1e87951406233c3192

We added that specifically so that no PGE-only exits end up needing
(pointless) processing by the introspection agent.


Thanks,
Razvan

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH] vmx/hap: optimize CR4 trapping

2018-02-15 Thread Andrew Cooper
On 15/02/18 16:25, Roger Pau Monne wrote:
> There are a bunch of bits in CR4 that should be allowed to be set directly
> by the guest without requiring Xen intervention; currently this is
> already done by passing through guest writes into the CR4 used when
> running in non-root mode, but taking an expensive vmexit in order to
> do so.
>
> xenalyze reports the following when running a PV guest in shim mode:
>
>  CR_ACCESS 3885950  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
>cr4  3885940  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
>cr31  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
>  *[  0]1  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
>cr07  0.00s  0.00%  7112 cyc { 3248| 5960|17480}
>clts2  0.00s  0.00%  4588 cyc { 3456| 5720| 5720}
>
> After this change this turns into:
>
>  CR_ACCESS  12  0.00s  0.00%  9972 cyc { 3680|11024|24032}
>cr42  0.00s  0.00% 17528 cyc {11024|24032|24032}
>cr31  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
>  *[  0]1  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
>cr07  0.00s  0.00%  9209 cyc { 4184| 7848|17488}
>clts2  0.00s  0.00%  8232 cyc { 5352|2|2}
>
> Note that this optimized trapping is currently only applied to guests
> running with HAP on Intel hardware. If using shadow paging more CR4
> bits need to be unconditionally trapped, which makes this approach
> unlikely to yield any important performance improvements.
>
> Reported-by: Andrew Cooper 
> Signed-off-by: Roger Pau Monné 
> ---
> Cc: Jun Nakajima 
> Cc: Kevin Tian 
> Cc: Jan Beulich 
> Cc: Andrew Cooper 
> Cc: Razvan Cojocaru 
> Cc: Tamas K Lengyel 
> ---
>  xen/arch/x86/hvm/vmx/vmx.c  | 41 +
>  xen/arch/x86/hvm/vmx/vvmx.c |  5 -
>  xen/arch/x86/monitor.c  |  5 +++--
>  3 files changed, 48 insertions(+), 3 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index d35cf55982..9747b2a398 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -1684,6 +1684,35 @@ static void vmx_update_guest_cr(struct vcpu *v, 
> unsigned int cr)
>  }
>  
>  __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
> +
> +if ( (v->domain->arch.monitor.write_ctrlreg_enabled &
> +  monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ||
> + !paging_mode_hap(v->domain) )
> +/*
> + * If requested by introspection or running in shadow mode trap 
> all
> + * accesses to CR4.

The monitor write_ctrlreg_onchangeonly feature was purposefully
introduced to avoid sending PGE updates to the introspection agent.  It
would be ideal to include that in the mask calculation so introspection
cases don't vmexit for PGE changes.

Also, AMD has similar capabilities, and (as of today) has gained CR
monitoring.

> + *
> + * NB: shadow path has not been optimized because it requires
> + * unconditionally trapping more CR4 bits, at which point the
> + * performance benefit of doing this is quite dubious.
> + */
> +__vmwrite(CR4_GUEST_HOST_MASK, ~0UL);

It would be better to stash the calculated mask, rather than reading it
back in the vmexit handler.

~Andrew

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH] vmx/hap: optimize CR4 trapping

2018-02-15 Thread Roger Pau Monne
There are a bunch of bits in CR4 that should be allowed to be set directly
by the guest without requiring Xen intervention; currently this is
already done by passing through guest writes into the CR4 used when
running in non-root mode, but taking an expensive vmexit in order to
do so.

xenalyze reports the following when running a PV guest in shim mode:

 CR_ACCESS 3885950  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
   cr4  3885940  6.41s 17.04%  3957 cyc { 2361| 3378| 7920}
   cr31  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
 *[  0]1  0.00s  0.00%  3480 cyc { 3480| 3480| 3480}
   cr07  0.00s  0.00%  7112 cyc { 3248| 5960|17480}
   clts2  0.00s  0.00%  4588 cyc { 3456| 5720| 5720}

After this change this turns into:

 CR_ACCESS  12  0.00s  0.00%  9972 cyc { 3680|11024|24032}
   cr42  0.00s  0.00% 17528 cyc {11024|24032|24032}
   cr31  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
 *[  0]1  0.00s  0.00%  3680 cyc { 3680| 3680| 3680}
   cr07  0.00s  0.00%  9209 cyc { 4184| 7848|17488}
   clts2  0.00s  0.00%  8232 cyc { 5352|2|2}

Note that this optimized trapping is currently only applied to guests
running with HAP on Intel hardware. If using shadow paging more CR4
bits need to be unconditionally trapped, which makes this approach
unlikely to yield any important performance improvements.

Reported-by: Andrew Cooper 
Signed-off-by: Roger Pau Monné 
---
Cc: Jun Nakajima 
Cc: Kevin Tian 
Cc: Jan Beulich 
Cc: Andrew Cooper 
Cc: Razvan Cojocaru 
Cc: Tamas K Lengyel 
---
 xen/arch/x86/hvm/vmx/vmx.c  | 41 +
 xen/arch/x86/hvm/vmx/vvmx.c |  5 -
 xen/arch/x86/monitor.c  |  5 +++--
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index d35cf55982..9747b2a398 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1684,6 +1684,35 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned 
int cr)
 }
 
 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+
+if ( (v->domain->arch.monitor.write_ctrlreg_enabled &
+  monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ||
+ !paging_mode_hap(v->domain) )
+/*
+ * If requested by introspection or running in shadow mode trap all
+ * accesses to CR4.
+ *
+ * NB: shadow path has not been optimized because it requires
+ * unconditionally trapping more CR4 bits, at which point the
+ * performance benefit of doing this is quite dubious.
+ */
+__vmwrite(CR4_GUEST_HOST_MASK, ~0UL);
+else
+{
+uint64_t mask = HVM_CR4_HOST_MASK | X86_CR4_PKE |
+~hvm_cr4_guest_valid_bits(v, 0);
+
+/*
+ * Update CR4 host mask to only trap when the guest tries to set
+ * bits that are controlled by the hypervisor.
+ */
+mask |= v->arch.hvm_vmx.vmx_realmode ? X86_CR4_VME : 0;
+mask |= !hvm_paging_enabled(v) ? (X86_CR4_PSE | X86_CR4_SMEP |
+  X86_CR4_SMAP)
+   : 0;
+__vmwrite(CR4_GUEST_HOST_MASK, mask);
+}
+
 break;
 
 case 2:
@@ -3512,6 +3541,18 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 
 if ( paging_mode_hap(v->domain) )
 {
+uint64_t cr4, mask;
+
+/*
+ * Xen allows the guest to modify some CR4 bits directly, update cached
+ * values to match.
+ */
+__vmread(GUEST_CR4, &cr4);
+__vmread(CR4_GUEST_HOST_MASK, &mask);
+v->arch.hvm_vcpu.hw_cr[4] = cr4;
+v->arch.hvm_vcpu.guest_cr[4] &= mask;
+v->arch.hvm_vcpu.guest_cr[4] |= cr4 & ~mask;
+
 __vmread(GUEST_CR3, &v->arch.hvm_vcpu.hw_cr[3]);
 if ( vmx_unrestricted_guest(v) || hvm_paging_enabled(v) )
 v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3];
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index dfe97b9705..65f2629118 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1043,7 +1043,7 @@ static void load_shadow_guest_state(struct vcpu *v)
 {
 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
 u32 control;
-u64 cr_gh_mask, cr_read_shadow;
+uint64_t cr_gh_mask, cr_mask, cr_read_shadow;
 int rc;
 
 static const u16 vmentry_fields[] = {
@@ -1100,6 +1100,9 @@ static void load_shadow_guest_state(struct vcpu *v)
 cr_read_shadow = (get_vvmcs(v, GUEST_CR4) & ~cr_gh_mask) |
  (get_vvmcs(v, CR4_READ_SHADOW) & cr_gh_mask);
 __vmwrite(CR4_READ_SHADOW, cr_read_shadow);
+/* Add the nested host mask to the one set by vmx_update_guest_cr. */
+__vmread(CR4_GUEST_HOST_MASK, &cr_mask);
+__vmwrite(CR4_GUEST_HOST_M