Re: [patch V2 00/46] x86, PCI, XEN, genirq ...: Prepare for device MSI

2020-09-01 Thread Boqun Feng
Hi Thomas,

On Wed, Aug 26, 2020 at 01:16:28PM +0200, Thomas Gleixner wrote:
[...]
> 
> The whole lot is also available from git:
> 
>git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git device-msi
> 
> This has been tested on Intel/AMD/KVM but lacks testing on:
> 
> - HYPERV (-ENODEV)

FWIW, I did a build and boot test in a Hyper-V guest with your
development branch; the latest commit is 71cbf478eb6f ("irqchip: Add
IMS (Interrupt Message Storm) driver - NOT FOR MERGING"). And everything
seemed to work fine.

If you want me to set/unset a particular CONFIG option or run some
command for testing purposes, please let me know ;-)

Regards,
Boqun

> - VMD enabled systems (-ENODEV)
> - XEN (-ENOCLUE)
> - IMS (-ENODEV)
> 
> - Any non-X86 code which might depend on the broken compose MSI message
>   logic. Marc expects not much fallout, but agrees that we need to fix
>   it anyway.
> 
> #1 - #3 should be applied unconditionally for obvious reasons
> #4 - #6 are worthwhile cleanups which should be done independent of device MSI
> 
> #7 - #8 look promising to cleanup the platform MSI implementation
>   independent of #8, but I neither had cycles nor the stomach to
>   tackle that.
> 
> #9 is obviously just for the folks interested in IMS
> 
> Thanks,
> 
>   tglx



Re: [patch RFC 10/38] x86/ioapic: Consolidate IOAPIC allocation

2020-08-26 Thread Boqun Feng
Hi Thomas,

I hit a compiler error while I was trying to compile this patchset:

arch/x86/kernel/devicetree.c: In function ‘dt_irqdomain_alloc’:
arch/x86/kernel/devicetree.c:232:6: error: ‘struct irq_alloc_info’ has no member named ‘ioapic_id’; did you mean ‘ioapic’?
  232 |  tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
      |      ^~~~~~~~~
      |      ioapic
arch/x86/kernel/devicetree.c:233:6: error: ‘struct irq_alloc_info’ has no member named ‘ioapic_pin’; did you mean ‘ioapic’?
  233 |  tmp.ioapic_pin = fwspec->param[0];
      |      ^~~~~~~~~~
      |      ioapic

with CONFIG_OF=y. IIUC, the following changes need to be folded into
this patch. (At least I can continue to compile the kernel with this
change.)

diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index a0e8fc7d85f1..ddffd80f5c52 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -229,8 +229,8 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, 
unsigned int virq,
 
it = &of_ioapic_type[type_index];
ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity);
-   tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
-   tmp.ioapic_pin = fwspec->param[0];
+   tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+   tmp.ioapic.pin = fwspec->param[0];
 
return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp);
 }

Regards,
Boqun

On Fri, Aug 21, 2020 at 02:24:34AM +0200, Thomas Gleixner wrote:
> Move the IOAPIC specific fields into their own struct and reuse the common
> devid. Get rid of the #ifdeffery as it does not matter at all whether the
> alloc info is a couple of bytes longer or not.
> 
> Signed-off-by: Thomas Gleixner 
> Cc: Wei Liu 
> Cc: "K. Y. Srinivasan" 
> Cc: Stephen Hemminger 
> Cc: Joerg Roedel 
> Cc: linux-hyp...@vger.kernel.org
> Cc: io...@lists.linux-foundation.org
> Cc: Haiyang Zhang 
> Cc: Jon Derrick 
> Cc: Lu Baolu 
> ---
>  arch/x86/include/asm/hw_irq.h       |   23 ++-
>  arch/x86/kernel/apic/io_apic.c      |   70 ++--
>  drivers/iommu/amd/iommu.c           |   14 +++
>  drivers/iommu/hyperv-iommu.c        |    2 -
>  drivers/iommu/intel/irq_remapping.c |   18 -
>  5 files changed, 64 insertions(+), 63 deletions(-)
> 
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -44,6 +44,15 @@ enum irq_alloc_type {
>   X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT,
>  };
>  
> +struct ioapic_alloc_info {
> + int pin;
> + int node;
> + u32 trigger : 1;
> + u32 polarity : 1;
> + u32 valid : 1;
> + struct IO_APIC_route_entry  *entry;
> +};
> +
>  /**
>   * irq_alloc_info - X86 specific interrupt allocation info
>   * @type:X86 specific allocation type
> @@ -53,6 +62,8 @@ enum irq_alloc_type {
>   * @mask:CPU mask for vector allocation
>   * @desc:Pointer to msi descriptor
>   * @data:Allocation specific data
> + *
> + * @ioapic:  IOAPIC specific allocation data
>   */
>  struct irq_alloc_info {
>   enum irq_alloc_type type;
> @@ -64,6 +75,7 @@ struct irq_alloc_info {
>   void*data;
>  
>   union {
> + struct ioapic_alloc_infoioapic;
>   int unused;
>  #ifdef   CONFIG_PCI_MSI
>   struct {
> @@ -71,17 +83,6 @@ struct irq_alloc_info {
>   irq_hw_number_t msi_hwirq;
>   };
>  #endif
> -#ifdef   CONFIG_X86_IO_APIC
> - struct {
> - int ioapic_id;
> - int ioapic_pin;
> - int ioapic_node;
> - u32 ioapic_trigger : 1;
> - u32 ioapic_polarity : 1;
> - u32 ioapic_valid : 1;
> - struct IO_APIC_route_entry *ioapic_entry;
> - };
> -#endif
>  #ifdef   CONFIG_DMAR_TABLE
>   struct {
>   int dmar_id;
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -860,10 +860,10 @@ void ioapic_set_alloc_attr(struct irq_al
>  {
>   init_irq_alloc_info(info, NULL);
>   info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
> - info->ioapic_node = node;
> - info->ioapic_trigger = trigger;
> - info->ioapic_polarity = polarity;
> - info->ioapic_valid = 1;
> + info->ioapic.node = node;
> + info->ioapic.trigger = trigger;
> + info->ioapic.polarity = polarity;
> + info->ioapic.valid = 1;
>  }
>  
>  #ifndef CONFIG_ACPI
> @@ -878,32 +878,32 @@ static void ioapic_copy_alloc_attr(struc
>  
>   copy_irq_alloc_info(dst, src);
>   dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
> - dst->ioapic_id = mpc_ioapi

Re: [Xen-devel] [RFC 0/6] vDSO support for Hyper-V guest on ARM64

2020-01-27 Thread Boqun Feng
On Fri, Jan 24, 2020 at 10:24:44AM +, Vincenzo Frascino wrote:
> Hi Boqun Feng,
> 
> On 24/01/2020 06:32, Boqun Feng wrote:
> > Hi Vincenzo,
> > 
> 
> [...]
> 
> >>
> >> I had a look at your patches and overall, I could not understand why we
> >> can't use the arch_timer to do the same things you are doing with the one
> >> you introduced in this series. What confuses me is that KVM works just
> >> fine with the arch_timer which was designed with virtualization in mind.
> >> Why do we need another one? Could you please explain?
> >>
> > 
> > Please note that the guest VM on Hyper-V for ARM64 doesn't use
> > arch_timer as the clocksource. See:
> > 
> > 
> > https://lore.kernel.org/linux-arm-kernel/1570129355-16005-7-git-send-email-mikel...@microsoft.com/
> > 
> > , where ACPI_SIG_GTDT is used for setting up the Hyper-V synthetic
> > clocksource and other initialization work.
> >
> 
> I had a look at it and my question stands: why do we need another timer
> on arm64?
> 

Sorry for the late response. It's the weekend and Chinese New Year, so I got
to spend some time making (and mostly eating) dumplings ;-)

After discussing with Michael, here is some explanation of why we need
another timer:

The synthetic clocks that Hyper-V presents in a guest VM were originally
created for the x86 architecture. They provide a level of abstraction
that solves problems like continuity across live migrations where the
hardware clock (i.e., TSC in the case of x86) frequency may be different
across the migration. When Hyper-V was brought to ARM64, this
abstraction was maintained to provide consistency across the x86 and
ARM64 architectures, and for both Windows and Linux guest VMs. The
core Linux code for the Hyper-V clocks (in
drivers/clocksource/hyperv_timer.c) is architecture neutral and works on
both x86 and ARM64. As you can see, this part is done in Michael's
patchset.

Arguably, Hyper-V for ARM64 should have optimized for consistency with
the ARM64 community rather than with the existing x86 implementation and
existing guest code in Windows. But at this point, it is what it is,
and the Hyper-V clocks do solve problems like migration that aren’t
addressed in ARM64 until v8.4 of the architecture with the addition of
the counter hardware scaling feature. Hyper-V doesn’t currently map the
ARM arch timer interrupts into guest VMs, so we need to use the existing
Hyper-V clocks and the common code that already exists.
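
To make the abstraction concrete: the reference time is derived from
the raw hardware counter via a scale/offset pair that the hypervisor
republishes (bumping a sequence number) across events such as live
migration. A simplified sketch of the read side, modeled on
hv_read_tsc_page() in drivers/clocksource/hyperv_timer.c (the real
code also falls back to another clock source when the sequence is
invalid):

	static u64 read_hv_clock(const struct ms_hyperv_tsc_page *tsc_pg)
	{
		u64 seq, scale, offset, cur;

		do {
			seq = READ_ONCE(tsc_pg->tsc_sequence);
			scale = READ_ONCE(tsc_pg->tsc_scale);
			offset = READ_ONCE(tsc_pg->tsc_offset);
			/* raw counter: rdtsc on x86, cntvct on ARM64 */
			cur = hv_get_raw_timer();
		} while (READ_ONCE(tsc_pg->tsc_sequence) != seq);

		/* 64x64 -> 128-bit multiply, keep the high 64 bits */
		return mul_u64_u64_shr(cur, scale, 64) + offset;
	}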


Does the above answer your question?

Regards,
Boqun

> > So just to be clear, your suggestion is
> > 
> > 1) Hyper-V guest on ARM64 should use arch_timer as clocksource and vDSO
> > will just work.
> > 
> > or
> > 
> > 2) Even though arch_timer is not used as the clocksource, we can still
> > use it for vDSO.
> > 
> > ?
> > 
> 
> Option #1 would be the preferred solution, unless there is a good reason 
> against.
> 
> > Regards,
> > Boqun
> > 
> 
> -- 
> Regards,
> Vincenzo




Re: [Xen-devel] [RFC 0/6] vDSO support for Hyper-V guest on ARM64

2020-01-23 Thread Boqun Feng
Hi Vincenzo,

On Thu, Jan 23, 2020 at 10:48:07AM +, Vincenzo Frascino wrote:
> Hi Boqun Feng,
> 
> sorry for the late reply.
> 

That's OK, thanks for your review ;-)

> On 16/12/2019 00:19, Boqun Feng wrote:
> > Hi,
> > 
> > This is the RFC patchset for vDSO support in ARM64 Hyper-V guest. To
> > test it, Michael's ARM64 support patchset:
> > 
> > 
> > https://lore.kernel.org/linux-arm-kernel/1570129355-16005-1-git-send-email-mikel...@microsoft.com/
> > 
> > is needed.
> > 
Similar to x86, Hyper-V on ARM64 uses a TSC page for guests to read
the virtualized hardware timer. This TSC page is read-only for the
guests, so it can be used as a vDSO data page, and the vDSO (userspace)
code can use the same timer-reading code as the kernel, since
they read the same TSC page.
> > 
> 
> I had a look at your patches and overall, I could not understand why we can't
> use the arch_timer to do the same things you are doing with the one you
> introduced in this series. What confuses me is that KVM works just fine with
> the arch_timer which was designed with virtualization in mind. Why do we need
> another one? Could you please explain?
> 

Please note that the guest VM on Hyper-V for ARM64 doesn't use
arch_timer as the clocksource. See:


https://lore.kernel.org/linux-arm-kernel/1570129355-16005-7-git-send-email-mikel...@microsoft.com/

, where ACPI_SIG_GTDT is used for setting up the Hyper-V synthetic
clocksource and other initialization work.

So just to be clear, your suggestion is

1) Hyper-V guest on ARM64 should use arch_timer as clocksource and vDSO
will just work.

or

2) Even though arch_timer is not used as the clocksource, we can still
use it for vDSO.

?

Regards,
Boqun

> > This patchset therefore extends ARM64's __vdso_init() to allow multiple
> > data pages and introduces the vclock_mode concept similar to x86 to
> > allow different platforms (bare-metal, Hyper-V, etc.) to switch to
> > different __arch_get_hw_counter() implementations. The rest of this
> > patchset does the necessary setup for Hyper-V guests: mapping tsc page,
> > enabling userspace to read cntvct, etc. to enable vDSO.
> > 
> > This patchset consists of 6 patches:
> > 
> > patch #1 allows hv_get_raw_timer() definition to be overridden for
> > userspace and kernel to share the same hv_read_tsc_page() definition.
> > 
> > patch #2 extends ARM64 to support multiple vDSO data pages.
> > 
> > patch #3 introduces vclock_mode similar to x86 to allow different
> > __arch_get_hw_counter() implementations for different clocksources.
> > 
> > patch #4 maps Hyper-V TSC page into vDSO data page.
> > 
> > patch #5 allows userspace to read cntvct, so that userspace can
> > efficiently read the clocksource.
> > 
> > patch #6 enables the vDSO for ARM64 Hyper-V guest.
> > 
> > The whole patchset is based on v5.5-rc1 plus Michael's ARM64 support
> > patchset, and I've done a few tests with:
> > 
> > https://github.com/nlynch-mentor/vdsotest
> > 
> > Comments and suggestions are welcome!
> > 
> > Regards,
> > Boqun
> > 
> > 
> 
> -- 
> Regards,
> Vincenzo




Re: [Xen-devel] [RFC 6/6] arm64: hyperv: Enable vDSO

2019-12-17 Thread Boqun Feng
On Tue, Dec 17, 2019 at 03:10:16PM +0100, Vitaly Kuznetsov wrote:
> Boqun Feng  writes:
> 
> > Similar to x86, add a new vclock_mode VCLOCK_HVCLOCK, and reuse the
> > hv_read_tsc_page() for userspace to read tsc page clocksource.
> >
> > Signed-off-by: Boqun Feng (Microsoft) 
> > ---
> >  arch/arm64/include/asm/clocksource.h   |  3 ++-
> >  arch/arm64/include/asm/mshyperv.h  |  2 +-
> >  arch/arm64/include/asm/vdso/gettimeofday.h | 19 +++
> >  3 files changed, 22 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/clocksource.h 
> > b/arch/arm64/include/asm/clocksource.h
> > index fbe80057468c..c6acd45fe748 100644
> > --- a/arch/arm64/include/asm/clocksource.h
> > +++ b/arch/arm64/include/asm/clocksource.h
> > @@ -4,7 +4,8 @@
> >  
> >  #define VCLOCK_NONE0   /* No vDSO clock available. 
> > */
> >  #define VCLOCK_CNTVCT  1   /* vDSO should use cntvcnt  
> > */
> > -#define VCLOCK_MAX 1
> > +#define VCLOCK_HVCLOCK 2   /* vDSO should use vread_hvclock()  
> > */
> > +#define VCLOCK_MAX 2
> >  
> >  struct arch_clocksource_data {
> > int vclock_mode;
> > diff --git a/arch/arm64/include/asm/mshyperv.h 
> > b/arch/arm64/include/asm/mshyperv.h
> > index 0afb00e3501d..7c85dd816dca 100644
> > --- a/arch/arm64/include/asm/mshyperv.h
> > +++ b/arch/arm64/include/asm/mshyperv.h
> > @@ -90,7 +90,7 @@ extern void hv_get_vpreg_128(u32 reg, struct 
> > hv_get_vp_register_output *result);
> >  #define hv_set_reference_tsc(val) \
> > hv_set_vpreg(HV_REGISTER_REFERENCE_TSC, val)
> >  #define hv_set_clocksource_vdso(val) \
> > -   ((val).archdata.vclock_mode = VCLOCK_NONE)
> > +   ((val).archdata.vclock_mode = VCLOCK_HVCLOCK)
> >  
> >  #if IS_ENABLED(CONFIG_HYPERV)
> >  #define hv_enable_stimer0_percpu_irq(irq)  enable_percpu_irq(irq, 0)
> > diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h 
> > b/arch/arm64/include/asm/vdso/gettimeofday.h
> > index e6e3fe0488c7..7e689b903f4d 100644
> > --- a/arch/arm64/include/asm/vdso/gettimeofday.h
> > +++ b/arch/arm64/include/asm/vdso/gettimeofday.h
> > @@ -67,6 +67,20 @@ int clock_getres_fallback(clockid_t _clkid, struct 
> > __kernel_timespec *_ts)
> > return ret;
> >  }
> >  
> > +#ifdef CONFIG_HYPERV_TIMER
> > +/* This will override the default hv_get_raw_timer() */
> > +#define hv_get_raw_timer() __arch_counter_get_cntvct()
> > +#include 
> > +
> > +extern struct ms_hyperv_tsc_page
> > +_hvclock_page __attribute__((visibility("hidden")));
> > +
> > +static u64 vread_hvclock(void)
> > +{
> > +   return hv_read_tsc_page(&_hvclock_page);
> > +}
> > +#endif
> 
> The function is almost the same on x86 (&_hvclock_page ->
> &hvclock_page), would it maybe make sense to move this to arch neutral
> clocksource/hyperv_timer.h?
> 

I'm not sure whether the underscore matters in the vDSO data symbol, so
I followed the architecture's naming convention. If the leading underscore
doesn't have a special purpose, I'm happy to move this to an arch-neutral
header file.

> > +
> >  static __always_inline u64 __arch_get_hw_counter(s32 clock_mode)
> >  {
> > u64 res;
> > @@ -78,6 +92,11 @@ static __always_inline u64 __arch_get_hw_counter(s32 
> > clock_mode)
> > if (clock_mode == VCLOCK_NONE)
> > return __VDSO_USE_SYSCALL;
> >  
> > +#ifdef CONFIG_HYPERV_TIMER
> > +   if (likely(clock_mode == VCLOCK_HVCLOCK))
> > +   return vread_hvclock();
> 
> I'm not sure likely() is justified here: it'll make ALL builds which
> enable CONFIG_HYPERV_TIMER (e.g. distro kernels) prefer
> VCLOCK_HVCLOCK, even if the kernel is not running on Hyper-V.
> 

Makes sense. Thanks for pointing this out! I will change it in the next
version.

Regards,
Boqun

> > +#endif
> > +
> > /*
> >  * This isb() is required to prevent that the counter value
> >  * is speculated.
> 
> -- 
> Vitaly
> 


[Xen-devel] [RFC 5/6] arm64: hyperv: Enable userspace to read cntvct

2019-12-15 Thread Boqun Feng
Since reading the hyperv-timer clocksource requires reading cntvct,
userspace should be allowed to read it; otherwise reading cntvct will
trap, which makes a vsyscall cost about as much as a syscall.

So enable it on every cpu when a Hyper-V guest boots up.
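
With ARCH_TIMER_USR_VCT_ACCESS_EN set in CNTKCTL_EL1, EL0 can then read
the counter directly without trapping; a minimal illustrative sketch
(not part of this patch):

	/* EL0 read of the virtual counter; no trap once access is enabled */
	static inline u64 read_cntvct_el0(void)
	{
		u64 val;

		/* The isb orders the counter read against earlier instructions */
		asm volatile("isb; mrs %0, cntvct_el0" : "=r" (val));
		return val;
	}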

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/hyperv/hv_init.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm64/hyperv/hv_init.c b/arch/arm64/hyperv/hv_init.c
index 86e4621d5885..1ea97ecfb143 100644
--- a/arch/arm64/hyperv/hv_init.c
+++ b/arch/arm64/hyperv/hv_init.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -45,6 +46,7 @@ EXPORT_SYMBOL_GPL(hv_max_vp_index);
 static int hv_cpu_init(unsigned int cpu)
 {
u64 msr_vp_index;
+   u32 cntkctl;
 
hv_get_vp_index(msr_vp_index);
 
@@ -53,6 +55,11 @@ static int hv_cpu_init(unsigned int cpu)
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
 
+   /* Enable EL0 to access cntvct */
+   cntkctl = arch_timer_get_cntkctl();
+   cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+   arch_timer_set_cntkctl(cntkctl);
+
return 0;
 }
 
-- 
2.24.0



[Xen-devel] [RFC 0/6] vDSO support for Hyper-V guest on ARM64

2019-12-15 Thread Boqun Feng
Hi,

This is the RFC patchset for vDSO support in ARM64 Hyper-V guest. To
test it, Michael's ARM64 support patchset:


https://lore.kernel.org/linux-arm-kernel/1570129355-16005-1-git-send-email-mikel...@microsoft.com/

is needed.

Similar to x86, Hyper-V on ARM64 uses a TSC page for guests to read
the virtualized hardware timer. This TSC page is read-only for the
guests, so it can be used as a vDSO data page, and the vDSO (userspace)
code can use the same timer-reading code as the kernel, since
they read the same TSC page.

This patchset therefore extends ARM64's __vdso_init() to allow multiple
data pages and introduces the vclock_mode concept similar to x86 to
allow different platforms (bare-metal, Hyper-V, etc.) to switch to
different __arch_get_hw_counter() implementations. The rest of this
patchset does the necessary setup for Hyper-V guests: mapping tsc page,
enabling userspace to read cntvct, etc. to enable vDSO.
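
With patches #2 and #4 applied, the guest's vDSO mapping ends up with
two data pages in front of the code pages, roughly:

	/*
	 * vDSO layout sketch (per patches #2 and #4):
	 *
	 *   +----------------+------------------+---------------------+
	 *   | vdso_data page | Hyper-V TSC page | vDSO code pages ... |
	 *   +----------------+------------------+---------------------+
	 *   ^ _vdso_data     ^ _hvclock_page    ^ mm->context.vdso
	 *
	 * The data pages are mapped read-only; userspace reaches the TSC
	 * page through vread_hvclock() in __arch_get_hw_counter().
	 */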

This patchset consists of 6 patches:

patch #1 allows hv_get_raw_timer() definition to be overridden for
userspace and kernel to share the same hv_read_tsc_page() definition.

patch #2 extends ARM64 to support multiple vDSO data pages.

patch #3 introduces vclock_mode similar to x86 to allow different
__arch_get_hw_counter() implementations for different clocksources.

patch #4 maps Hyper-V TSC page into vDSO data page.

patch #5 allows userspace to read cntvct, so that userspace can
efficiently read the clocksource.

patch #6 enables the vDSO for ARM64 Hyper-V guest.

The whole patchset is based on v5.5-rc1 plus Michael's ARM64 support
patchset, and I've done a few tests with:

https://github.com/nlynch-mentor/vdsotest

Comments and suggestions are welcome!

Regards,
Boqun


[Xen-devel] [RFC 6/6] arm64: hyperv: Enable vDSO

2019-12-15 Thread Boqun Feng
Similar to x86, add a new vclock_mode VCLOCK_HVCLOCK, and reuse the
hv_read_tsc_page() for userspace to read tsc page clocksource.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/include/asm/clocksource.h   |  3 ++-
 arch/arm64/include/asm/mshyperv.h  |  2 +-
 arch/arm64/include/asm/vdso/gettimeofday.h | 19 +++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/clocksource.h 
b/arch/arm64/include/asm/clocksource.h
index fbe80057468c..c6acd45fe748 100644
--- a/arch/arm64/include/asm/clocksource.h
+++ b/arch/arm64/include/asm/clocksource.h
@@ -4,7 +4,8 @@
 
 #define VCLOCK_NONE0   /* No vDSO clock available. */
 #define VCLOCK_CNTVCT  1   /* vDSO should use cntvcnt  */
-#define VCLOCK_MAX 1
+#define VCLOCK_HVCLOCK 2   /* vDSO should use vread_hvclock()  */
+#define VCLOCK_MAX 2
 
 struct arch_clocksource_data {
int vclock_mode;
diff --git a/arch/arm64/include/asm/mshyperv.h 
b/arch/arm64/include/asm/mshyperv.h
index 0afb00e3501d..7c85dd816dca 100644
--- a/arch/arm64/include/asm/mshyperv.h
+++ b/arch/arm64/include/asm/mshyperv.h
@@ -90,7 +90,7 @@ extern void hv_get_vpreg_128(u32 reg, struct 
hv_get_vp_register_output *result);
 #define hv_set_reference_tsc(val) \
hv_set_vpreg(HV_REGISTER_REFERENCE_TSC, val)
 #define hv_set_clocksource_vdso(val) \
-   ((val).archdata.vclock_mode = VCLOCK_NONE)
+   ((val).archdata.vclock_mode = VCLOCK_HVCLOCK)
 
 #if IS_ENABLED(CONFIG_HYPERV)
 #define hv_enable_stimer0_percpu_irq(irq)  enable_percpu_irq(irq, 0)
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h 
b/arch/arm64/include/asm/vdso/gettimeofday.h
index e6e3fe0488c7..7e689b903f4d 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -67,6 +67,20 @@ int clock_getres_fallback(clockid_t _clkid, struct 
__kernel_timespec *_ts)
return ret;
 }
 
+#ifdef CONFIG_HYPERV_TIMER
+/* This will override the default hv_get_raw_timer() */
+#define hv_get_raw_timer() __arch_counter_get_cntvct()
+#include 
+
+extern struct ms_hyperv_tsc_page
+_hvclock_page __attribute__((visibility("hidden")));
+
+static u64 vread_hvclock(void)
+{
+   return hv_read_tsc_page(&_hvclock_page);
+}
+#endif
+
 static __always_inline u64 __arch_get_hw_counter(s32 clock_mode)
 {
u64 res;
@@ -78,6 +92,11 @@ static __always_inline u64 __arch_get_hw_counter(s32 
clock_mode)
if (clock_mode == VCLOCK_NONE)
return __VDSO_USE_SYSCALL;
 
+#ifdef CONFIG_HYPERV_TIMER
+   if (likely(clock_mode == VCLOCK_HVCLOCK))
+   return vread_hvclock();
+#endif
+
/*
 * This isb() is required to prevent that the counter value
 * is speculated.
-- 
2.24.0



[Xen-devel] [RFC 2/6] arm64: vdso: Add support for multiple vDSO data pages

2019-12-15 Thread Boqun Feng
Split __vdso_abi::vdso_pages into nr_vdso_{data,code}_pages, so that
__setup_additional_pages() can work with multiple vDSO data pages set
up by __vdso_init().

Multiple vDSO data pages are required when running in a virtualized
environment, where the cycles read from cntvct in userspace need to be
adjusted with data from a page maintained by the hypervisor, e.g. the
TSC page on Hyper-V.

This is a prerequisite for vDSO support in ARM64 on Hyper-V.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/kernel/vdso.c | 43 
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 354b11e27c07..b9b5ec7a3084 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -50,7 +50,8 @@ struct __vdso_abi {
const char *name;
const char *vdso_code_start;
const char *vdso_code_end;
-   unsigned long vdso_pages;
+   unsigned long nr_vdso_data_pages;
+   unsigned long nr_vdso_code_pages;
/* Data Mapping */
struct vm_special_mapping *dm;
/* Code Mapping */
@@ -101,6 +102,8 @@ static int __vdso_init(enum arch_vdso_type arch_index)
 {
int i;
struct page **vdso_pagelist;
+   struct page **vdso_code_pagelist;
+   unsigned long nr_vdso_pages;
unsigned long pfn;
 
if (memcmp(vdso_lookup[arch_index].vdso_code_start, "\177ELF", 4)) {
@@ -108,14 +111,18 @@ static int __vdso_init(enum arch_vdso_type arch_index)
return -EINVAL;
}
 
-   vdso_lookup[arch_index].vdso_pages = (
+   vdso_lookup[arch_index].nr_vdso_data_pages = 1;
+
+   vdso_lookup[arch_index].nr_vdso_code_pages = (
vdso_lookup[arch_index].vdso_code_end -
vdso_lookup[arch_index].vdso_code_start) >>
PAGE_SHIFT;
 
-   /* Allocate the vDSO pagelist, plus a page for the data. */
-   vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages + 1,
-   sizeof(struct page *),
+   nr_vdso_pages = vdso_lookup[arch_index].nr_vdso_data_pages +
+   vdso_lookup[arch_index].nr_vdso_code_pages;
+
+   /* Allocate the vDSO pagelist. */
+   vdso_pagelist = kcalloc(nr_vdso_pages, sizeof(struct page *),
GFP_KERNEL);
if (vdso_pagelist == NULL)
return -ENOMEM;
@@ -123,15 +130,17 @@ static int __vdso_init(enum arch_vdso_type arch_index)
/* Grab the vDSO data page. */
vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data));
 
-
/* Grab the vDSO code pages. */
pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start);
 
-   for (i = 0; i < vdso_lookup[arch_index].vdso_pages; i++)
-   vdso_pagelist[i + 1] = pfn_to_page(pfn + i);
+   vdso_code_pagelist = vdso_pagelist +
+vdso_lookup[arch_index].nr_vdso_data_pages;
+
+   for (i = 0; i < vdso_lookup[arch_index].nr_vdso_code_pages; i++)
+   vdso_code_pagelist[i] = pfn_to_page(pfn + i);
 
-   vdso_lookup[arch_index].dm->pages = &vdso_pagelist[0];
-   vdso_lookup[arch_index].cm->pages = &vdso_pagelist[1];
+   vdso_lookup[arch_index].dm->pages = vdso_pagelist;
+   vdso_lookup[arch_index].cm->pages = vdso_code_pagelist;
 
return 0;
 }
@@ -141,26 +150,26 @@ static int __setup_additional_pages(enum arch_vdso_type 
arch_index,
struct linux_binprm *bprm,
int uses_interp)
 {
-   unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
+   unsigned long vdso_base, vdso_text_len, vdso_data_len;
void *ret;
 
-   vdso_text_len = vdso_lookup[arch_index].vdso_pages << PAGE_SHIFT;
-   /* Be sure to map the data page */
-   vdso_mapping_len = vdso_text_len + PAGE_SIZE;
+   vdso_data_len = vdso_lookup[arch_index].nr_vdso_data_pages << 
PAGE_SHIFT;
+   vdso_text_len = vdso_lookup[arch_index].nr_vdso_code_pages << 
PAGE_SHIFT;
 
-   vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
+   vdso_base = get_unmapped_area(NULL, 0,
+ vdso_data_len + vdso_text_len, 0, 0);
if (IS_ERR_VALUE(vdso_base)) {
ret = ERR_PTR(vdso_base);
goto up_fail;
}
 
-   ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE,
+   ret = _install_special_mapping(mm, vdso_base, vdso_data_len,
   VM_READ|VM_MAYREAD,
   vdso_lookup[arch_index].dm);
if (IS_ERR(ret))
goto up_fail;
 
-   vdso_base += PAGE_SIZE;
+   vdso_base += vdso_data_len;
mm->context.vdso = (void *)vdso_base;
ret = _install_special_mapping(mm, vdso_base

[Xen-devel] [RFC 3/6] arm/arm64: clocksource: Introduce vclock_mode

2019-12-15 Thread Boqun Feng
Similar to x86, use a vclock_mode in arch_clocksource_data to distinguish
clocksources that use different read functions in the vDSO.

No functional change, only preparation for supporting vDSO on ARM64
Hyper-V guests.

Note: the changes for arm are only because arm and arm64 share the same
code in the arch timer driver and require arch_clocksource_data to have
the same field.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm/include/asm/clocksource.h| 6 +-
 arch/arm/kernel/vdso.c| 1 -
 arch/arm64/include/asm/clocksource.h  | 6 +-
 arch/arm64/include/asm/mshyperv.h | 2 +-
 arch/arm64/include/asm/vdso/compat_gettimeofday.h | 5 +++--
 arch/arm64/include/asm/vdso/gettimeofday.h| 5 +++--
 arch/arm64/include/asm/vdso/vsyscall.h| 4 +---
 drivers/clocksource/arm_arch_timer.c  | 8 
 8 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/arch/arm/include/asm/clocksource.h 
b/arch/arm/include/asm/clocksource.h
index 0b350a7e26f3..017c5ab6e587 100644
--- a/arch/arm/include/asm/clocksource.h
+++ b/arch/arm/include/asm/clocksource.h
@@ -1,8 +1,12 @@
 #ifndef _ASM_CLOCKSOURCE_H
 #define _ASM_CLOCKSOURCE_H
 
+#define VCLOCK_NONE0   /* No vDSO clock available. */
+#define VCLOCK_CNTVCT  1   /* vDSO should use cntvcnt  */
+#define VCLOCK_MAX 1
+
 struct arch_clocksource_data {
-   bool vdso_direct;   /* Usable for direct VDSO access? */
+   int vclock_mode;
 };
 
 #endif
diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c
index c89ac1b9d28b..09e46ec420fe 100644
--- a/arch/arm/kernel/vdso.c
+++ b/arch/arm/kernel/vdso.c
@@ -263,4 +263,3 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long 
addr)
if (!IS_ERR(vma))
mm->context.vdso = addr;
 }
-
diff --git a/arch/arm64/include/asm/clocksource.h 
b/arch/arm64/include/asm/clocksource.h
index 0ece64a26c8c..fbe80057468c 100644
--- a/arch/arm64/include/asm/clocksource.h
+++ b/arch/arm64/include/asm/clocksource.h
@@ -2,8 +2,12 @@
 #ifndef _ASM_CLOCKSOURCE_H
 #define _ASM_CLOCKSOURCE_H
 
+#define VCLOCK_NONE0   /* No vDSO clock available. */
+#define VCLOCK_CNTVCT  1   /* vDSO should use cntvcnt  */
+#define VCLOCK_MAX 1
+
 struct arch_clocksource_data {
-   bool vdso_direct;   /* Usable for direct VDSO access? */
+   int vclock_mode;
 };
 
 #endif
diff --git a/arch/arm64/include/asm/mshyperv.h 
b/arch/arm64/include/asm/mshyperv.h
index 9cc4aeddf2d0..0afb00e3501d 100644
--- a/arch/arm64/include/asm/mshyperv.h
+++ b/arch/arm64/include/asm/mshyperv.h
@@ -90,7 +90,7 @@ extern void hv_get_vpreg_128(u32 reg, struct 
hv_get_vp_register_output *result);
 #define hv_set_reference_tsc(val) \
hv_set_vpreg(HV_REGISTER_REFERENCE_TSC, val)
 #define hv_set_clocksource_vdso(val) \
-   ((val).archdata.vdso_direct = false)
+   ((val).archdata.vclock_mode = VCLOCK_NONE)
 
 #if IS_ENABLED(CONFIG_HYPERV)
 #define hv_enable_stimer0_percpu_irq(irq)  enable_percpu_irq(irq, 0)
diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h 
b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
index c50ee1b7d5cd..630d04c3c92e 100644
--- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
@@ -8,6 +8,7 @@
 #ifndef __ASSEMBLY__
 
 #include 
+#include 
 #include 
 
 #include 
@@ -117,10 +118,10 @@ static __always_inline u64 __arch_get_hw_counter(s32 
clock_mode)
u64 res;
 
/*
-* clock_mode == 0 implies that vDSO are enabled otherwise
+* clock_mode == VCLOCK_NONE implies that vDSO are disabled so
 * fallback on syscall.
 */
-   if (clock_mode)
+   if (clock_mode == VCLOCK_NONE)
return __VDSO_USE_SYSCALL;
 
/*
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h 
b/arch/arm64/include/asm/vdso/gettimeofday.h
index b08f476b72b4..e6e3fe0488c7 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -8,6 +8,7 @@
 #ifndef __ASSEMBLY__
 
 #include 
+#include 
 #include 
 
 #define __VDSO_USE_SYSCALL ULLONG_MAX
@@ -71,10 +72,10 @@ static __always_inline u64 __arch_get_hw_counter(s32 
clock_mode)
u64 res;
 
/*
-* clock_mode == 0 implies that vDSO are enabled otherwise
+* clock_mode == VCLOCK_NONE implies that vDSO are disabled so
 * fallback on syscall.
 */
-   if (clock_mode)
+   if (clock_mode == VCLOCK_NONE)
return __VDSO_USE_SYSCALL;
 
/*
diff --git a/arch/arm64/include/asm/vdso/vsyscall.h 
b/arch/arm64/include/asm/vdso/vsyscall.h
index 0c20a7c1bee5..07f78b0da498 100644
--- a/arch/arm64/include/asm/vdso/vsyscall.h
+++ b/arch/arm64/include/asm/vdso/vsyscall.h
@@ -24,9 +24,7 @@ struct vdso_data *__arm64_get_k_vdso_data(void)
 static __always_inl

[Xen-devel] [RFC 1/6] arm64: hyperv: Allow hv_get_raw_timer() definition to be overridden

2019-12-15 Thread Boqun Feng
In order to support vDSO, hv_read_tsc_page() should be callable from
userspace once the tsc page is mapped. As a result, hv_get_raw_timer(),
called by hv_read_tsc_page(), needs to be callable from both the kernel
and the vDSO. Currently, it is defined as arch_timer_read_counter(),
which is a function pointer initialized (with a kernel address) by the
arch timer driver, and therefore not usable in the vDSO.

Fix this by allowing a previous definition to override the default one,
so that vDSO code can define it as a function callable from userspace.
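
In vDSO code (patch #6), the override then becomes a direct counter
read, e.g. (header name assumed to be clocksource/hyperv_timer.h):

	/* Define before including the Hyper-V timer header */
	#define hv_get_raw_timer() __arch_counter_get_cntvct()
	#include <clocksource/hyperv_timer.h>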

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/include/asm/mshyperv.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/mshyperv.h 
b/arch/arm64/include/asm/mshyperv.h
index a8468a611912..9cc4aeddf2d0 100644
--- a/arch/arm64/include/asm/mshyperv.h
+++ b/arch/arm64/include/asm/mshyperv.h
@@ -97,8 +97,15 @@ extern void hv_get_vpreg_128(u32 reg, struct 
hv_get_vp_register_output *result);
 #define hv_disable_stimer0_percpu_irq(irq) disable_percpu_irq(irq)
 #endif
 
-/* ARM64 specific code to read the hardware clock */
+/*
+ * ARM64 specific code to read the hardware clock.
+ *
+ * This could be used in both kernel space and userspace (vDSO), so make it
+ * possible for a previous definition to override the default one.
+ */
+#ifndef hv_get_raw_timer
 #define hv_get_raw_timer() arch_timer_read_counter()
+#endif
 
 #include 
 
-- 
2.24.0



[Xen-devel] [RFC 4/6] arm64: vdso: hyperv: Map tsc page into vDSO if enabled

2019-12-15 Thread Boqun Feng
On Hyper-V, a tsc page has the data for adjusting cntvct numbers to
clocksource cycles, and that's how the Hyper-V guest kernel reads the
clocksource. In order to allow userspace to read the same clocksource
directly, the tsc page has to be mapped into userspace via the vDSO.

Use the framework for vDSO set-up in __vdso_init() to do this.

Note: if HYPERV_TIMER=y but the kernel is using another clocksource or
doesn't have the hyperv timer clocksource, the tsc page will still be
mapped into userspace.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/kernel/vdso.c  | 12 
 arch/arm64/kernel/vdso/vdso.lds.S | 12 +++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index b9b5ec7a3084..18a634987bdc 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -9,6 +9,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -105,14 +106,22 @@ static int __vdso_init(enum arch_vdso_type arch_index)
struct page **vdso_code_pagelist;
unsigned long nr_vdso_pages;
unsigned long pfn;
+   struct ms_hyperv_tsc_page *tsc_page;
+   int tsc_page_idx;
 
if (memcmp(vdso_lookup[arch_index].vdso_code_start, "\177ELF", 4)) {
pr_err("vDSO is not a valid ELF object!\n");
return -EINVAL;
}
 
+   /* One vDSO data page */
vdso_lookup[arch_index].nr_vdso_data_pages = 1;
 
+   /* Grab the Hyper-V tsc page, if enabled, add one more page */
+   tsc_page = hv_get_tsc_page();
+   if (tsc_page)
+   tsc_page_idx = vdso_lookup[arch_index].nr_vdso_data_pages++;
+
vdso_lookup[arch_index].nr_vdso_code_pages = (
vdso_lookup[arch_index].vdso_code_end -
vdso_lookup[arch_index].vdso_code_start) >>
@@ -130,6 +139,9 @@ static int __vdso_init(enum arch_vdso_type arch_index)
/* Grab the vDSO data page. */
vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data));
 
+   if (tsc_page)
+   vdso_pagelist[tsc_page_idx] = phys_to_page(__pa(tsc_page));
+
/* Grab the vDSO code pages. */
pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start);
 
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S 
b/arch/arm64/kernel/vdso/vdso.lds.S
index 7ad2d3a0cd48..e40a1f5a6d30 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -17,7 +17,17 @@ OUTPUT_ARCH(aarch64)
 
 SECTIONS
 {
-   PROVIDE(_vdso_data = . - PAGE_SIZE);
+   /*
+* vdso data pages:
+*   vdso data (1 page)
+*   hv tsc page (1 page if enabled)
+*/
+   PROVIDE(_vdso_data = _hvclock_page - PAGE_SIZE);
+#ifdef CONFIG_HYPERV_TIMER
+   PROVIDE(_hvclock_page = . - PAGE_SIZE);
+#else
+   PROVIDE(_hvclock_page = .);
+#endif
. = VDSO_LBASE + SIZEOF_HEADERS;
 
.hash   : { *(.hash) }  :text
-- 
2.24.0



Re: [Xen-devel] [RFC PATCH v2 00/17] RFC: SGX Virtualization design and draft patches

2017-12-24 Thread Boqun Feng
On Mon, Dec 04, 2017 at 08:15:11AM +0800, Boqun Feng wrote:
> Hi all,
> 
> This is the v2 of RFC SGX Virtualization design and draft patches, you

Ping ;-)

Any comments?

Regards,
Boqun

> can find v1 at:
> 
> https://lists.gt.net/xen/devel/483404
> 
> In the new version, I fixed a few things according to the feedback on
> the previous version (mostly cleanups and code movement).
> 
> Besides, Kai and I redesign the SGX MSRs setting up part and introduce
> new XL parameter 'lehash' and 'lewr'.
> 
> Another big change is that I modify the EPC management to fit EPC pages
> in 'struct page_info', and in patch #6 and #7, unscrubbable pages,
> 'PGC_epc', 'MEMF_epc' and 'XENZONE_EPC' are introduced, so that EPC
> management is fully integrated into Xen's existing memory management.
> This might be the controversial bit, so patches 6~8 simply show the
> idea and drive deeper discussion.
> 
> Detailed changes since v1: (modifications tagged "[New]" are totally
> new in this series; reviews and comments are highly welcome for those
> parts)
> 
> *   Make SGX related mostly common for x86 by: 1) moving sgx.[ch] to
> arch/x86/ and include/asm-x86/ and 2) renaming EPC related functions
> with domain_* prefix.
> 
> *   Rename ioremap_cache() to ioremap_wb() and make it x86-specific as
> suggested by Jan Beulich.
> 
> *   Remove percpu sgx_cpudata; during bootup secondary CPUs now check
> whether they read a different value than the boot CPU, and if so SGX
> is disabled.
> 
> *   Remove domain_has_sgx_{,launch_control}, and make sure we can
> rely on domain's arch.cpuid->feat.sgx{_lc} for setting checks.
> 
> *   Cleanup the code for CPUID handling as suggested by Andrew Cooper.
> 
> *   Adjust to msr_policy framework for SGX MSRs handling, and remove
> unnecessary fields like 'readable' and 'writable'
> 
> *   Use 'page_info' to maintain EPC pages, and [NEW] add a draft
> implementation for employing xenheap for EPC page management. Please
> see patch 6~8
> 
> *   [New] Modify the XL parameter for SGX, please see section 2.1.1 in
> the updated design doc. 
> 
> *   [New] Use the XEN_DOMCTL_set_vcpu_msrs hypercall in the toolstack to
> set the SGX related MSRs. Please see patch #17.
> 
> *   ACPI related tool changes are temporarily dropped in this patchset,
> as I need more time to resolve the comments and do related tests.
> 
> And the updated design doc is as follows. As in the previous version,
> there are some particular points in the design where we don't know
> which implementation is better; for those, a question mark (?) is added
> at the right of the menu. As for SGX live migration, thanks to Wei Liu
> for commenting in the previous version's review that it would be nice
> to support if we can, but we'd like to hear more from you, so we still
> put a question mark for this item. Your comments on those "question
> mark (?)" parts (and other comments as well, of course) are highly
> appreciated.
> 
> ===
> 1. SGX Introduction
> 1.1 Overview
> 1.1.1 Enclave
> 1.1.2 EPC (Enclave Page Cache)
> 1.1.3 ENCLS and ENCLU
> 1.2 Discovering SGX Capability
> 1.2.1 Enumerate SGX via CPUID
> 1.2.2 Intel SGX Opt-in Configuration
> 1.3 Enclave Life Cycle
> 1.3.1 Constructing & Destroying Enclave
> 1.3.2 Enclave Entry and Exit
> 1.3.2.1 Synchronous Entry and Exit
> 1.3.2.2 Asynchronous Enclave Exit
> 1.3.3 EPC Eviction and Reload
> 1.4 SGX Launch Control
> 1.5 SGX Interaction with IA32 and IA64 Architecture
> 2. SGX Virtualization Design
> 2.1 High Level Toolstack Changes
> 2.1.1 New 'sgx' XL configure file parameter
> 2.1.2 New XL commands (?)
> 2.1.3 Notify domain's virtual EPC base and size to Xen
> 2.2 High Level Hypervisor Changes
> 2.2.1 EPC Management
> 2.2.2 EPC Virtualization
> 2.2.3 Populate EPC for Guest
> 2.2.4 Launch Control Support
> 2.2.5 CPUID Emulation
> 2.2.6 EPT Violation & ENCLS Trapping Handling
> 2.2.7 Guest Suspend & Resume
> 2.2.8 Destroying Domain
> 2.3 Additional Point: Live Migration, Snapshot Support (?)
> 3. Reference
> 
> 1. SGX Introduction
> 
> 1.1 Overview
> 
> 1.1.1 Enclave
> 
> Intel Software Guard Extensions (SGX) is a set of instructions and mechanisms
> for memory accesses in or

Re: [Xen-devel] [PATCH v2 01/17] xen: x86: expose SGX to HVM domain in CPU featureset

2017-12-04 Thread Boqun Feng
On Mon, Dec 04, 2017 at 07:13:52AM -0700, Jan Beulich wrote:
> >>> On 04.12.17 at 14:10,  wrote:
> > On Mon, Dec 04, 2017 at 11:13:45AM +, Julien Grall wrote:
> >> I am not sure I understand why I am being CCed. But it looks like you CC
> >> everyone on each patch... Please CC only relevant person on each patch.
> >> 
> > 
> > Apologies...  I thought the whole patchset would provide more context for
> > the reviewers. Will drop you from irrelevant patches in the next version.
> > And I guess it's OK for me to drop you from replies on irrelevant patches
> > of this version too?
> 
> You shouldn't do this for just Julien - Cc lists of patches should
> generally be composed per patch. Most people are subscribed to
> the list anyway, and hence receive a copy of the other patches.
> In the worst case people can either tell you to always be Cc-ed
> on an entire patch set, or go to the list archives. Yet when you
> Cc everyone on everything, it is quite difficult for an individual to
> tell which parts to actually pay special attention to.
> 

Good point ;-) I will compose the Cc lists per patch in next version.

Regards,
Boqun

> Jan
> 


Re: [Xen-devel] [PATCH v2 01/17] xen: x86: expose SGX to HVM domain in CPU featureset

2017-12-04 Thread Boqun Feng
On Mon, Dec 04, 2017 at 11:13:45AM +, Julien Grall wrote:
> Hello,
> 

Hi Julien,

> I am not sure I understand why I am being CCed. But it looks like you CC
> everyone on each patch... Please CC only relevant person on each patch.
> 

Apologies...  I thought the whole patchset would provide more context for
the reviewers. Will drop you from irrelevant patches in the next version.
And I guess it's OK for me to drop you from replies on irrelevant patches
of this version too?

Regards,
Boqun

> Cheers,
> 
> On 04/12/17 00:15, Boqun Feng wrote:
> > From: Kai Huang 
> > 
> > Expose SGX in the CPU featureset for HVM domains. SGX will not be supported
> > for PV domains, as ENCLS (which the SGX driver in the guest essentially
> > runs) must run in ring 0, while the PV kernel runs in ring 3. Theoretically
> > we could support SGX in PV domains by either emulating the #GP caused by
> > ENCLS running in ring 3, or by PV ENCLS, but it is really not necessary at
> > this stage.
> > 
> > SGX Launch Control is also exposed in CPU featureset for HVM domain. SGX
> > Launch Control depends on SGX.
> > 
> > Signed-off-by: Kai Huang 
> > Signed-off-by: Boqun Feng 
> > ---
> >   xen/include/public/arch-x86/cpufeatureset.h | 3 ++-
> >   xen/tools/gen-cpuid.py  | 3 +++
> >   2 files changed, 5 insertions(+), 1 deletion(-)
> > 
> > diff --git a/xen/include/public/arch-x86/cpufeatureset.h 
> > b/xen/include/public/arch-x86/cpufeatureset.h
> > index be6da8eaf17c..1f8510eebb1d 100644
> > --- a/xen/include/public/arch-x86/cpufeatureset.h
> > +++ b/xen/include/public/arch-x86/cpufeatureset.h
> > @@ -193,7 +193,7 @@ XEN_CPUFEATURE(XSAVES,4*32+ 3) /*S  
> > XSAVES/XRSTORS instructions */
> >   /* Intel-defined CPU features, CPUID level 0x0007:0.ebx, word 5 */
> >   XEN_CPUFEATURE(FSGSBASE,  5*32+ 0) /*A  {RD,WR}{FS,GS}BASE 
> > instructions */
> >   XEN_CPUFEATURE(TSC_ADJUST,5*32+ 1) /*S  TSC_ADJUST MSR available */
> > -XEN_CPUFEATURE(SGX,   5*32+ 2) /*   Software Guard extensions */
> > +XEN_CPUFEATURE(SGX,   5*32+ 2) /*H  Intel Software Guard 
> > extensions */
> >   XEN_CPUFEATURE(BMI1,  5*32+ 3) /*A  1st bit manipulation 
> > extensions */
> >   XEN_CPUFEATURE(HLE,   5*32+ 4) /*A  Hardware Lock Elision */
> >   XEN_CPUFEATURE(AVX2,  5*32+ 5) /*A  AVX2 instructions */
> > @@ -230,6 +230,7 @@ XEN_CPUFEATURE(PKU,   6*32+ 3) /*H  Protection 
> > Keys for Userspace */
> >   XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*!  OS Protection Keys Enable */
> >   XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of 
> > DW/QW */
> >   XEN_CPUFEATURE(RDPID, 6*32+22) /*A  RDPID instruction */
> > +XEN_CPUFEATURE(SGX_LC,6*32+30) /*H Intel SGX Launch Control */
> >   /* AMD-defined CPU features, CPUID level 0x8007.edx, word 7 */
> >   XEN_CPUFEATURE(ITSC,  7*32+ 8) /*   Invariant TSC */
> > diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py
> > index 9ec4486f2b4b..4fef21203086 100755
> > --- a/xen/tools/gen-cpuid.py
> > +++ b/xen/tools/gen-cpuid.py
> > @@ -256,6 +256,9 @@ def crunch_numbers(state):
> >   AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD,
> > AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW,
> > AVX512_4FMAPS, AVX512_VPOPCNTDQ],
> > +
> > +# SGX Launch Control depends on SGX
> > +SGX: [SGX_LC],
> >   }
> >   deep_features = tuple(sorted(deps.keys()))
> > 
> 
> -- 
> Julien Grall


[Xen-devel] [PATCH v2 16/17] xen: tools: add SGX to applying CPUID policy

2017-12-03 Thread Boqun Feng
From: Kai Huang 

In libxc, a new structure 'xc_cpuid_policy_build_info_t' is added to carry
the domain's EPC base and size info from libxl. libxl_cpuid_apply_policy is
also changed to take 'libxl_domain_build_info_t' as a parameter, from which
the domain's EPC base and size can be obtained and passed to
xc_cpuid_apply_policy. xc_cpuid_apply_policy is extended to support the SGX
CPUID leaf. If the hypervisor doesn't report the SGX feature in the host
featureset, then using the 'epc' parameter results in domain creation
failure, as SGX cannot be supported.

Signed-off-by: Kai Huang 
---
 tools/libxc/include/xenctrl.h   | 14 
 tools/libxc/xc_cpuid_x86.c  | 68 ++---
 tools/libxl/libxl.h |  3 +-
 tools/libxl/libxl_cpuid.c   | 15 ++--
 tools/libxl/libxl_dom.c |  6 +++-
 tools/libxl/libxl_nocpuid.c |  4 ++-
 tools/ocaml/libs/xc/xenctrl_stubs.c | 11 +-
 tools/python/xen/lowlevel/xc/xc.c   | 11 +-
 8 files changed, 121 insertions(+), 11 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 666db0b9193e..ad4429ca5ffd 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -1827,6 +1827,19 @@ int xc_domain_debug_control(xc_interface *xch,
 uint32_t vcpu);
 
 #if defined(__i386__) || defined(__x86_64__)
+typedef struct xc_cpuid_policy_build_info_sgx {
+uint64_t epc_base;
+uint64_t epc_size;
+} xc_cpuid_policy_build_info_sgx_t;
+
+typedef struct xc_cpuid_policy_build_info {
+xc_cpuid_policy_build_info_sgx_t sgx;
+} xc_cpuid_policy_build_info_t;
+
+int xc_cpuid_check(xc_interface *xch,
+   const unsigned int *input,
+   const char **config,
+   char **config_transformed);
 int xc_cpuid_set(xc_interface *xch,
  uint32_t domid,
  const unsigned int *input,
@@ -1834,6 +1847,7 @@ int xc_cpuid_set(xc_interface *xch,
  char **config_transformed);
 int xc_cpuid_apply_policy(xc_interface *xch,
   uint32_t domid,
+  xc_cpuid_policy_build_info_t *b_info,
   uint32_t *featureset,
   unsigned int nr_features);
 void xc_cpuid_to_str(const unsigned int *regs,
diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index 25b922ea2184..a778acf79a64 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -38,7 +38,7 @@ enum {
 #define clear_feature(idx, dst) ((dst) &= ~bitmaskof(idx))
 #define set_feature(idx, dst)   ((dst) |=  bitmaskof(idx))
 
-#define DEF_MAX_BASE 0x000du
+#define DEF_MAX_BASE 0x0012u
 #define DEF_MAX_INTELEXT  0x8008u
 #define DEF_MAX_AMDEXT0x801cu
 
@@ -178,6 +178,8 @@ struct cpuid_domain_info
 /* HVM-only information. */
 bool pae;
 bool nestedhvm;
+
+xc_cpuid_policy_build_info_t *b_info;
 };
 
 static void cpuid(const unsigned int *input, unsigned int *regs)
@@ -369,6 +371,12 @@ static void intel_xc_cpuid_policy(xc_interface *xch,
   const struct cpuid_domain_info *info,
   const unsigned int *input, unsigned int 
*regs)
 {
+xc_cpuid_policy_build_info_t *b_info = info->b_info;
+xc_cpuid_policy_build_info_sgx_t *sgx = NULL;
+
+if ( b_info )
+sgx = &b_info->sgx;
+
 switch ( input[0] )
 {
 case 0x0004:
@@ -381,6 +389,30 @@ static void intel_xc_cpuid_policy(xc_interface *xch,
 regs[3] &= 0x3ffu;
 break;
 
+case 0x0012:
+if ( !sgx ) {
+regs[0] = regs[1] = regs[2] = regs[3] = 0;
+break;
+}
+
+if ( !sgx->epc_base || !sgx->epc_size ) {
+regs[0] = regs[1] = regs[2] = regs[3] = 0;
+break;
+}
+
+if ( input[1] == 2 ) {
+/*
+ * FIX EPC base and size for SGX CPUID leaf 2. Xen hypervisor is
+ * depending on XEN_DOMCTL_set_cpuid to know domain's EPC base
+ * and size.
+ */
+regs[0] = (uint32_t)(sgx->epc_base & 0xf000) | 0x1;
+regs[1] = (uint32_t)(sgx->epc_base >> 32);
+regs[2] = (uint32_t)(sgx->epc_size & 0xf000) | 0x1;
+regs[3] = (uint32_t)(sgx->epc_size >> 32);
+}
+break;
+
 case 0x8000:
 if ( regs[0] > DEF_MAX_INTELEXT )
 regs[0] = DEF_MAX_INTELEXT;
@@ -444,6 +476,10 @@ static void xc_cpuid_hvm_policy(xc_interface *xch,
 regs[1] = regs[2] = regs[3] = 0;
 break;
 
+case 0x0012:
+/* Intel SGX. Passthrough to Intel function */
+break;
+
 case 0x8000:
 /* Passthrough to cpu vendor specific functions */
 break;
@@ -649,12 +685,13 @@ void xc_cpuid_to_str(const unsigned int *regs, char 
**strs)
 }
 }
 
-static void sanitise_featureset(struct cpuid_domain_info *info)
+sta

[Xen-devel] [PATCH v2 17/17] xen: tools: add SGX to applying MSR policy

2017-12-03 Thread Boqun Feng
In libxc, a new function 'xc_msr_sgx_set' is added; it applies the
SGX-related MSR policy to the target domain. This function takes the
values of 'lewr' and 'lehash*' in 'libxl_sgx_buildinfo', and sets the
proper MSRs in all vcpus via the 'XEN_DOMCTL_set_vcpu_msrs' hypercall.

If the physical IA32_SGXLEPUBKEYHASHn MSRs are writable:

* The domain's IA32_FEATURE_CONTROL_SGX_LE_WR bit depends on 'lewr'
  (default: false).

* If 'lehash' is unset, do nothing, as we already set the proper value
  in sgx_domain_msr_init().

* If 'lehash' is set, set the domain's virtual IA32_SGXLEPUBKEYHASHn
  to its value, and later on the vcpus' virtual IA32_SGXLEPUBKEYHASHn
  will be set to the same value.

If the physical IA32_SGXLEPUBKEYHASHn MSRs are not writable, using the
'lehash' or 'lewr' parameter results in domain creation failure.
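
For illustration, a guest config fragment using these parameters might
look like the following. This is a hypothetical sketch; the
authoritative syntax is what section 2.1.1 of the design doc and the
xl_parse.c changes in this patch define:

	# hypothetical xl config for an SGX-enabled HVM guest
	epc = 64            # virtual EPC size (unit assumed to be MB)
	lewr = 1            # expose writable IA32_SGXLEPUBKEYHASHn
	lehash = "0x..."    # 256-bit launch enclave pubkey hash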

Signed-off-by: Boqun Feng 
---
 tools/libxc/Makefile  |  1 +
 tools/libxc/include/xenctrl.h |  2 ++
 tools/libxc/xc_msr_x86.h  | 10 ++
 tools/libxc/xc_sgx.c  | 82 +++
 tools/libxl/libxl_dom.c   | 29 +++
 tools/xl/xl_parse.c   | 10 ++
 6 files changed, 134 insertions(+)
 create mode 100644 tools/libxc/xc_sgx.c

diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
index 9a019e8dfed5..428430a15c40 100644
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -41,6 +41,7 @@ CTRL_SRCS-y   += xc_foreign_memory.c
 CTRL_SRCS-y   += xc_kexec.c
 CTRL_SRCS-y   += xc_resource.c
 CTRL_SRCS-$(CONFIG_X86) += xc_psr.c
+CTRL_SRCS-$(CONFIG_X86) += xc_sgx.c
 CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
 CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c
 CTRL_SRCS-$(CONFIG_FreeBSD) += xc_freebsd.c
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index ad4429ca5ffd..abc9f711141a 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -1855,6 +1855,8 @@ void xc_cpuid_to_str(const unsigned int *regs,
 int xc_mca_op(xc_interface *xch, struct xen_mc *mc);
 int xc_mca_op_inject_v2(xc_interface *xch, unsigned int flags,
 xc_cpumap_t cpumap, unsigned int nr_cpus);
+int xc_msr_sgx_set(xc_interface *xch, uint32_t domid, bool lewr,
+   uint64_t *lehash, int max_vcpu);
 #endif
 
 struct xc_px_val {
diff --git a/tools/libxc/xc_msr_x86.h b/tools/libxc/xc_msr_x86.h
index 7f100e71a7a1..54eaa4de8945 100644
--- a/tools/libxc/xc_msr_x86.h
+++ b/tools/libxc/xc_msr_x86.h
@@ -24,6 +24,16 @@
 #define MSR_IA32_CMT_EVTSEL 0x0c8d
 #define MSR_IA32_CMT_CTR0x0c8e
 
+#define MSR_IA32_FEATURE_CONTROL   0x003a
+#define IA32_FEATURE_CONTROL_LOCK 0x0001
+#define IA32_FEATURE_CONTROL_SGX_ENABLE   0x4
+#define IA32_FEATURE_CONTROL_SGX_LE_WR0x2
+
+#define MSR_IA32_SGXLEPUBKEYHASH0   0x008c
+#define MSR_IA32_SGXLEPUBKEYHASH1   0x008d
+#define MSR_IA32_SGXLEPUBKEYHASH2   0x008e
+#define MSR_IA32_SGXLEPUBKEYHASH3   0x008f
+
 #endif
 
 /*
diff --git a/tools/libxc/xc_sgx.c b/tools/libxc/xc_sgx.c
new file mode 100644
index ..8f97ca0042e0
--- /dev/null
+++ b/tools/libxc/xc_sgx.c
@@ -0,0 +1,82 @@
+/*
+ * xc_sgx.c
+ *
+ * SGX related MSR setup
+ *
+ * Copyright (C) 2017  Intel Corporation
+ * Author Boqun Feng 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include 
+#include "xc_private.h"
+#include "xc_msr_x86.h"
+
+int xc_msr_sgx_set(xc_interface *xch, uint32_t domid, bool lewr,
+   uint64_t *lehash, int max_vcpu)
+{
+int rc, i, nr_msrs;
+DECLARE_DOMCTL;
+xen_domctl_vcpu_msr_t sgx_msrs[5];
+DECLARE_HYPERCALL_BUFFER(void, buffer);
+
+if ( !lehash && !lewr )
+return 0;
+
+sgx_msrs[0].index = MSR_IA32_FEATURE_CONTROL;
+sgx_msrs[0].reserved = 0;
+sgx_msrs[0].value = IA32_FEATURE_CONTROL_LOCK |
+IA32_FEATURE_CONTROL_SGX_ENABLE |
+(lewr ? IA32_FEATURE_CONTROL_SGX_LE_WR : 0);
+
+if ( !lehash )
+nr_msrs = 1;
+else
+{
+nr_msrs = 5;
+
+for ( i = 0; i < 4; i++ )
+{
+sgx_msrs[i+1].index = MSR_IA32_SGXLEPUBKEYHASH0 + i;
+sgx_msrs[i+1].reserved = 0;
+sgx_msrs[i+1].value = lehash[i];
+}
+}
+
+buffer = xc_hypercall_buffer_alloc(xc

[Xen-devel] [PATCH v2 03/17] xen: vmx: detect ENCLS VMEXIT

2017-12-03 Thread Boqun Feng
From: Kai Huang 

If ENCLS VMEXIT is not present then we cannot support SGX virtualization.
This patch detects the presence of ENCLS VMEXIT, and disables SGX if
ENCLS VMEXIT is not present.

Signed-off-by: Kai Huang 
Signed-off-by: Boqun Feng 
---
 xen/arch/x86/hvm/vmx/vmcs.c| 16 +++-
 xen/include/asm-x86/hvm/vmx/vmcs.h |  3 +++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index b5100b50215a..dfcecc4fd1b0 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static bool_t __read_mostly opt_vpid_enabled = 1;
 boolean_param("vpid", opt_vpid_enabled);
@@ -143,6 +144,7 @@ static void __init vmx_display_features(void)
 P(cpu_has_vmx_virt_exceptions, "Virtualisation Exceptions");
 P(cpu_has_vmx_pml, "Page Modification Logging");
 P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
+P(cpu_has_vmx_encls, "SGX ENCLS Exiting");
 #undef P
 
 if ( !printed )
@@ -238,7 +240,8 @@ static int vmx_init_vmcs_config(void)
SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS |
SECONDARY_EXEC_XSAVES |
-   SECONDARY_EXEC_TSC_SCALING);
+   SECONDARY_EXEC_TSC_SCALING |
+   SECONDARY_EXEC_ENABLE_ENCLS);
 rdmsrl(MSR_IA32_VMX_MISC, _vmx_misc_cap);
 if ( _vmx_misc_cap & VMX_MISC_VMWRITE_ALL )
 opt |= SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
@@ -341,6 +344,14 @@ static int vmx_init_vmcs_config(void)
 _vmx_secondary_exec_control &= ~ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
+/*
+ * Turn off SGX if ENCLS VMEXIT is not present. Actually on real machine,
+ * if SGX CPUID is present (CPUID.0x7.0x0:EBX.SGX = 1), then ENCLS VMEXIT
+ * will always be present. We do the check anyway here.
+ */
+if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_ENCLS) )
+disable_sgx();
+
 min = VM_EXIT_ACK_INTR_ON_EXIT;
 opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT |
   VM_EXIT_CLEAR_BNDCFGS;
@@ -1136,6 +1147,9 @@ static int construct_vmcs(struct vcpu *v)
 /* Disable PML anyway here as it will only be enabled in log dirty mode */
 v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 
+/* Disable ENCLS VMEXIT. It will only be turned on when needed. */
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_ENCLS;
+
 /* Host data selectors. */
 __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
 __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 8fb9e3ceee4e..d0293b1a3620 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -245,6 +245,7 @@ extern u32 vmx_vmentry_control;
 #define SECONDARY_EXEC_ENABLE_INVPCID   0x1000
 #define SECONDARY_EXEC_ENABLE_VM_FUNCTIONS  0x2000
 #define SECONDARY_EXEC_ENABLE_VMCS_SHADOWING0x4000
+#define SECONDARY_EXEC_ENABLE_ENCLS 0x8000
 #define SECONDARY_EXEC_ENABLE_PML   0x0002
 #define SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS   0x0004
 #define SECONDARY_EXEC_XSAVES   0x0010
@@ -325,6 +326,8 @@ extern u64 vmx_ept_vpid_cap;
 (vmx_secondary_exec_control & SECONDARY_EXEC_XSAVES)
 #define cpu_has_vmx_tsc_scaling \
 (vmx_secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+#define cpu_has_vmx_encls \
+(vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_ENCLS)
 
 #define VMCS_RID_TYPE_MASK  0x80000000
 
-- 
2.15.0



[Xen-devel] [PATCH v2 11/17] xen: vmx: handle SGX related MSRs

2017-12-03 Thread Boqun Feng
From: Kai Huang 

This patch handles IA32_FEATURE_CONTROL and IA32_SGXLEPUBKEYHASHn MSRs.

For IA32_FEATURE_CONTROL, if SGX is exposed to the domain, then the
SGX_ENABLE bit is always set. The SGX_LE_WR bit defaults to 0, unless 1)
SGX launch control is exposed to the domain and 2) the XL parameter 'lewr'
is true (the handling of this parameter is in a later patch, so for this
patch the SGX_LE_WR bit is always 0). Writes to IA32_FEATURE_CONTROL will
fault.

For IA32_SGXLEPUBKEYHASHn, the vcpu's virtual ia32_sgxlepubkeyhash[0-3]
are added to the 'sgx' field of 'struct msr_vcpu_policy'.

When a vcpu is initialized, its virtual ia32_sgxlepubkeyhash values are
also initialized. The default values are the physical MSR values of the
machine. Later on, we may reset those values with the content of the XL
parameter 'lehash'. Besides, if 'lewr' is true and no 'lehash' is
provided, we reset those values to Intel's default value, as on physical
machines those MSRs hold Intel's default value.

For IA32_SGXLEPUBKEYHASHn MSR reads from the guest: if SGX launch control
is not exposed to the domain, the guest is not allowed to read either;
otherwise the vcpu's virtual MSR value is returned.

For IA32_SGXLEPUBKEYHASHn MSR writes from the guest, we allow the guest to
write only if 'lewr' is set (so for this patch, writes will fault).

To make EINIT run successfully in the guest, the vcpu's virtual
IA32_SGXLEPUBKEYHASHn values are written to the physical MSRs when the
vcpu is scheduled in. Moreover, we cache the most recent
IA32_SGXLEPUBKEYHASHn values in a percpu variable, so that the wrmsr can
be skipped when the value has not changed.
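
For illustration, the lazy-update idea could look roughly like the sketch
below (not the patch's exact code; the percpu cache name and the msr
policy field name are assumptions):

    static DEFINE_PER_CPU(uint64_t, sgx_lepubkeyhash_cache[4]);

    void sgx_ctxt_switch_to(struct vcpu *v)
    {
        unsigned int i;
        /* Field name assumed; the hashes live in v->arch.msr->sgx. */
        const uint64_t *hash = v->arch.msr->sgx.lepubkeyhash;

        for ( i = 0; i < 4; i++ )
            if ( this_cpu(sgx_lepubkeyhash_cache)[i] != hash[i] )
            {
                /* Only touch the MSR when the cached value differs. */
                wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, hash[i]);
                this_cpu(sgx_lepubkeyhash_cache)[i] = hash[i];
            }
    }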

Signed-off-by: Kai Huang 
Signed-off-by: Boqun Feng 
---
 xen/arch/x86/domctl.c|  28 -
 xen/arch/x86/hvm/vmx/vmx.c   |  19 ++
 xen/arch/x86/msr.c   |   6 +-
 xen/arch/x86/sgx.c   | 123 +++
 xen/include/asm-x86/cpufeature.h |   3 +
 xen/include/asm-x86/msr-index.h  |   5 ++
 xen/include/asm-x86/msr.h|   5 ++
 xen/include/asm-x86/sgx.h|   9 +++
 8 files changed, 196 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 0ee9fb6458ec..eb5d4b346313 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1352,13 +1352,16 @@ long arch_do_domctl(
 
 ret = -EINVAL;
 if ( (v == curr) || /* no vcpu_pause() */
- !is_pv_domain(d) )
+ (!is_pv_domain(d) && !d->arch.cpuid->feat.sgx_lc) )
 break;
 
 /* Count maximum number of optional msrs. */
 if ( boot_cpu_has(X86_FEATURE_DBEXT) )
 nr_msrs += 4;
 
+if ( d->arch.cpuid->feat.sgx_lc )
+nr_msrs += 5;
+
 if ( domctl->cmd == XEN_DOMCTL_get_vcpu_msrs )
 {
 ret = 0; copyback = true;
@@ -1447,6 +1450,29 @@ long arch_do_domctl(
 msr.index -= MSR_AMD64_DR1_ADDRESS_MASK - 1;
 v->arch.pv_vcpu.dr_mask[msr.index] = msr.value;
 continue;
+case MSR_IA32_FEATURE_CONTROL:
+if ( msr.value & IA32_FEATURE_CONTROL_SGX_LE_WR )
+{
+if ( d->arch.cpuid->feat.sgx_lc && sgx_lewr() )
+{
+v->arch.msr->sgx.lewr = true;
+continue;
+}
+else /* Try to set LE_WR while not supported */
+break;
+}
+   continue;
+case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+if ( d->arch.cpuid->feat.sgx_lc && sgx_lewr() )
+{
+sgx_set_vcpu_sgxlepubkeyhash(v,
+msr.index - MSR_IA32_SGXLEPUBKEYHASH0,
+msr.value);
+continue;
+}
+else
+break;
+   continue;
 }
 break;
 }
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 92fb85b13a0c..ce1c95f69062 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1049,6 +1049,9 @@ static void vmx_ctxt_switch_to(struct vcpu *v)
 
 if ( v->domain->arch.hvm_domain.pi_ops.switch_to )
 v->domain->arch.hvm_domain.pi_ops.switch_to(v);
+
+if ( v->domain->arch.cpuid->feat.sgx_lc && sgx_lewr() )
+sgx_ctxt_switch_to(v);
 }
 
 
@@ -2892,6 +2895,8 @@ static int is_last_branch_msr(u32 ecx)
 static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
 {
 const struct vcpu *curr = current;
+const struct msr_vcpu_policy *vp = curr->arch.msr;
+ 

[Xen-devel] [PATCH v2 01/17] xen: x86: expose SGX to HVM domain in CPU featureset

2017-12-03 Thread Boqun Feng
From: Kai Huang 

Expose SGX in the CPU featureset for HVM domains. SGX will not be supported
for PV domains, as ENCLS (which the SGX driver in the guest essentially
runs) must run in ring 0, while a PV kernel runs in ring 3. Theoretically we
could support SGX in PV domains, either by emulating the #GP caused by ENCLS
running in ring 3, or by a PV ENCLS, but that is really not necessary at
this stage.

SGX Launch Control is also exposed in the CPU featureset for HVM domains.
SGX Launch Control depends on SGX.

Signed-off-by: Kai Huang 
Signed-off-by: Boqun Feng 
---
 xen/include/public/arch-x86/cpufeatureset.h | 3 ++-
 xen/tools/gen-cpuid.py  | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
index be6da8eaf17c..1f8510eebb1d 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -193,7 +193,7 @@ XEN_CPUFEATURE(XSAVES,4*32+ 3) /*S  XSAVES/XRSTORS instructions */
 /* Intel-defined CPU features, CPUID level 0x0007:0.ebx, word 5 */
 XEN_CPUFEATURE(FSGSBASE,  5*32+ 0) /*A  {RD,WR}{FS,GS}BASE instructions */
 XEN_CPUFEATURE(TSC_ADJUST,5*32+ 1) /*S  TSC_ADJUST MSR available */
-XEN_CPUFEATURE(SGX,   5*32+ 2) /*   Software Guard extensions */
+XEN_CPUFEATURE(SGX,   5*32+ 2) /*H  Intel Software Guard extensions */
 XEN_CPUFEATURE(BMI1,  5*32+ 3) /*A  1st bit manipulation extensions */
 XEN_CPUFEATURE(HLE,   5*32+ 4) /*A  Hardware Lock Elision */
 XEN_CPUFEATURE(AVX2,  5*32+ 5) /*A  AVX2 instructions */
@@ -230,6 +230,7 @@ XEN_CPUFEATURE(PKU,   6*32+ 3) /*H  Protection Keys for Userspace */
 XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*!  OS Protection Keys Enable */
 XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
 XEN_CPUFEATURE(RDPID, 6*32+22) /*A  RDPID instruction */
+XEN_CPUFEATURE(SGX_LC,6*32+30) /*H Intel SGX Launch Control */
 
 /* AMD-defined CPU features, CPUID level 0x8007.edx, word 7 */
 XEN_CPUFEATURE(ITSC,  7*32+ 8) /*   Invariant TSC */
diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py
index 9ec4486f2b4b..4fef21203086 100755
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -256,6 +256,9 @@ def crunch_numbers(state):
 AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD,
   AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW,
   AVX512_4FMAPS, AVX512_VPOPCNTDQ],
+
+# SGX Launch Control depends on SGX
+SGX: [SGX_LC],
 }
 
 deep_features = tuple(sorted(deps.keys()))
-- 
2.15.0



[Xen-devel] [PATCH v2 04/17] xen: x86/mm: introduce ioremap_wb()

2017-12-03 Thread Boqun Feng
From: Kai Huang 

Currently Xen only has a non-cacheable version of ioremap for x86.
Although EPC is reported as reserved memory in e820, it can be mapped as
cacheable. This patch introduces ioremap_wb() (ioremap for cacheable,
write-back memory).
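
A later patch in this series uses it to map the whole EPC range
write-back, roughly:

    /* Map the EPC cacheable so ENCLS leaves can operate on it. */
    epc_base_vaddr = ioremap_wb(epc_base_maddr,
                                total_epc_npages << PAGE_SHIFT);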

Signed-off-by: Kai Huang 
Signed-off-by: Boqun Feng 
---
 xen/arch/x86/mm.c| 9 +++--
 xen/include/asm-x86/mm.h | 7 +++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 886a5ee327df..db1d1f40 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5207,7 +5207,7 @@ void *__init arch_vmap_virt_end(void)
 return (void *)fix_to_virt(__end_of_fixed_addresses);
 }
 
-void __iomem *ioremap(paddr_t pa, size_t len)
+void __iomem *__ioremap(paddr_t pa, size_t len, unsigned int flags)
 {
 mfn_t mfn = _mfn(PFN_DOWN(pa));
 void *va;
@@ -5222,12 +5222,17 @@ void __iomem *ioremap(paddr_t pa, size_t len)
 unsigned int offs = pa & (PAGE_SIZE - 1);
 unsigned int nr = PFN_UP(offs + len);
 
-va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_UCMINUS, VMAP_DEFAULT) + offs;
+va = __vmap(&mfn, nr, 1, 1, flags, VMAP_DEFAULT) + offs;
 }
 
 return (void __force __iomem *)va;
 }
 
+void __iomem *ioremap(paddr_t pa, size_t len)
+{
+return __ioremap(pa, len, PAGE_HYPERVISOR_UCMINUS);
+}
+
 int create_perdomain_mapping(struct domain *d, unsigned long va,
  unsigned int nr, l1_pgentry_t **pl1tab,
  struct page_info **ppg)
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 83626085e0a6..77e3c3ba68d1 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -629,4 +629,11 @@ static inline bool arch_mfn_in_directmap(unsigned long mfn)
 return mfn <= (virt_to_mfn(eva - 1) + 1);
 }
 
+extern void __iomem *__ioremap(paddr_t, size_t, unsigned int);
+
+static inline void __iomem *ioremap_wb(paddr_t pa, size_t len)
+{
+return __ioremap(pa, len, PAGE_HYPERVISOR);
+}
+
 #endif /* __ASM_X86_MM_H__ */
-- 
2.15.0



[Xen-devel] [PATCH v2 06/17] xen: mm: introduce non-scrubbable pages

2017-12-03 Thread Boqun Feng
We are about to use the existing heap allocator for EPC page management,
and we need to prevent EPC pages from being scrubbed or merged with
normal memory pages, because EPC pages cannot be accessed outside
enclaves.

To do so, we use one bit in 'page_info::u::free' to record whether a
page may be scrubbed. 'page_scrubbable' is also introduced to test this
bit; however, it will always return 'true' on architectures that have no
unscrubbable pages such as EPC for now (i.e. ARM).

Besides, during the page merging stage, we cannot allow scrubbable pages
and unscrubbable pages to be merged, therefore 'page_mergeable' is
introduced; it simply tests whether two pages have the same scrubbable
attribute.

In 'scrub_one_page', scrubbing is aborted once the page is found to be
unscrubbable.

Signed-off-by: Boqun Feng 
---
 xen/common/page_alloc.c  | 10 +++---
 xen/include/asm-arm/mm.h |  7 +++
 xen/include/asm-x86/mm.h |  7 +++
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 5616a8226376..220d7d91c62b 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1364,6 +1364,8 @@ static void free_heap_pages(
 if ( pg[i].u.free.need_tlbflush )
 page_set_tlbflush_timestamp(&pg[i]);
 
+pg[i].u.free.scrubbable = true;
+
 /* This page is not a guest frame any more. */
 page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
 set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
@@ -1402,7 +1404,8 @@ static void free_heap_pages(
 if ( !mfn_valid(_mfn(page_to_mfn(predecessor))) ||
  !page_state_is(predecessor, free) ||
  (PFN_ORDER(predecessor) != order) ||
- (phys_to_nid(page_to_maddr(predecessor)) != node) )
+ (phys_to_nid(page_to_maddr(predecessor)) != node) ||
+ !page_mergeable(predecessor, pg) )
 break;
 
 check_and_stop_scrub(predecessor);
@@ -1425,7 +1428,8 @@ static void free_heap_pages(
 if ( !mfn_valid(_mfn(page_to_mfn(successor))) ||
  !page_state_is(successor, free) ||
  (PFN_ORDER(successor) != order) ||
- (phys_to_nid(page_to_maddr(successor)) != node) )
+ (phys_to_nid(page_to_maddr(successor)) != node) ||
+ !page_mergeable(successor, pg) )
 break;
 
 check_and_stop_scrub(successor);
@@ -2379,7 +2383,7 @@ __initcall(pagealloc_keyhandler_init);
 
 void scrub_one_page(struct page_info *pg)
 {
-if ( unlikely(pg->count_info & PGC_broken) )
+if ( !page_scrubbable(pg) || unlikely(pg->count_info & PGC_broken) )
 return;
 
 #ifndef NDEBUG
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index ad2f2a43dcbc..c715e2290510 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -55,6 +55,9 @@ struct page_info
 /* Do TLBs need flushing for safety before next page use? */
 bool need_tlbflush:1;
 
+/* Could this page be scrubbed when it's free? */
+bool scrubbable:1;
+
 #define BUDDY_NOT_SCRUBBING0
 #define BUDDY_SCRUBBING1
 #define BUDDY_SCRUB_ABORT  2
@@ -150,6 +153,10 @@ extern vaddr_t xenheap_virt_start;
 (mfn_valid(_mfn(mfn)) && is_xen_heap_page(__mfn_to_page(mfn)))
 #endif
 
+#define page_scrubbable(_p) true
+
+#define page_mergeable(_p1, _p2)true
+
 #define is_xen_fixed_mfn(mfn)   \
 ((pfn_to_paddr(mfn) >= virt_to_maddr(&_start)) &&   \
  (pfn_to_paddr(mfn) <= virt_to_maddr(&_end)))
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 77e3c3ba68d1..b0f0ea0a8b5d 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -98,6 +98,8 @@ struct page_info
 
 /* Do TLBs need flushing for safety before next page use? */
 bool need_tlbflush;
+/* Could this page be scrubbed when it's free? */
+bool scrubbable;
 
 #define BUDDY_NOT_SCRUBBING0
 #define BUDDY_SCRUBBING1
@@ -283,6 +285,11 @@ struct page_info
 /* OOS fixup entries */
 #define SHADOW_OOS_FIXUPS 2
 
+#define page_scrubbable(_p) ((_p)->u.free.scrubbable)
+
+#define page_mergeable(_p1, _p2)\
+(page_scrubbable(_p1) == page_scrubbable(_p2))
+
 #define page_get_owner(_p)  \
 ((struct domain *)((_p)->v.inuse._domain ?  \
pdx_to_virt((_p)->v.inuse._domain) : NULL))
-- 
2.15.0



[Xen-devel] [PATCH v2 14/17] xen: x86: reset EPC when guest got suspended.

2017-12-03 Thread Boqun Feng
From: Kai Huang 

EPC is destroyed when the power state goes to S3-S5. Emulate this behavior.

A new function s3_suspend is added to hvm_function_table for this purpose.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/hvm.c| 3 +++
 xen/arch/x86/hvm/vmx/vmx.c| 7 +++
 xen/include/asm-x86/hvm/hvm.h | 3 +++
 3 files changed, 13 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index c5e8467f3219..053c15afc46a 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3952,6 +3952,9 @@ static void hvm_s3_suspend(struct domain *d)
 
 hvm_vcpu_reset_state(d->vcpu[0], 0xf000, 0xfff0);
 
+if ( hvm_funcs.s3_suspend )
+hvm_funcs.s3_suspend(d);
+
 domain_unlock(d);
 }
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 280fc82ca1ff..17190b06a421 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2307,6 +2307,12 @@ static bool vmx_get_pending_event(struct vcpu *v, struct 
x86_event *info)
 return true;
 }
 
+static void vmx_s3_suspend(struct domain *d)
+{
+if ( d->arch.cpuid->feat.sgx )
+domain_reset_epc(d, false);
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
 .name = "VMX",
 .cpu_up_prepare   = vmx_cpu_up_prepare,
@@ -2378,6 +2384,7 @@ static struct hvm_function_table __initdata 
vmx_function_table = {
 .max_ratio = VMX_TSC_MULTIPLIER_MAX,
 .setup = vmx_setup_tsc_scaling,
 },
+.s3_suspend = vmx_s3_suspend,
 };
 
 /* Handle VT-d posted-interrupt when VCPU is blocked. */
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 6ecad3331695..d9ff98a1b0ed 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -227,6 +227,9 @@ struct hvm_function_table {
 /* Architecture function to setup TSC scaling ratio */
 void (*setup)(struct vcpu *v);
 } tsc_scaling;
+
+/* Domain S3 suspend */
+void (*s3_suspend)(struct domain *d);
 };
 
 extern struct hvm_function_table hvm_funcs;
-- 
2.15.0



[Xen-devel] [PATCH v2 15/17] xen: tools: add new 'sgx' parameter support

2017-12-03 Thread Boqun Feng
From: Kai Huang 

In order to be able to configure a domain's SGX related attributes (EPC
size, Launch Enclave hash key, etc.), a new parameter 'sgx' is added to
the XL configuration file. The parameter should be in the following format:

sgx = 'epc=<size>,lehash=<..>,lewr=<0|1>'

, in which 'lehash=<..>' and 'lewr=<0|1>' are optional.
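
For example, a minimal guest config fragment might look like this (the EPC
size value is purely illustrative; per the description above, with 'lewr=1'
and no 'lehash' the vcpus start with Intel's default hash, as on physical
machines):

    sgx = 'epc=128,lewr=1'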

A new 'libxl_sgx_buildinfo', which contains the EPC base and size, the
Launch Enclave hash key and its writable permission, is also added to
libxl_domain_build_info. EPC base and size are also added to
'xc_dom_image' in order to add EPC to the e820 table. The EPC base is
calculated internally.

Signed-off-by: Kai Huang 
Signed-off-by: Boqun Feng 
---
 tools/libxc/include/xc_dom.h |  4 +++
 tools/libxl/libxl_create.c   | 10 ++
 tools/libxl/libxl_dom.c  | 30 +
 tools/libxl/libxl_internal.h |  2 ++
 tools/libxl/libxl_types.idl  | 11 +++
 tools/libxl/libxl_x86.c  | 12 +++
 tools/xl/xl_parse.c  | 76 
 tools/xl/xl_parse.h  |  1 +
 8 files changed, 146 insertions(+)

diff --git a/tools/libxc/include/xc_dom.h b/tools/libxc/include/xc_dom.h
index cdcdd07d2bc2..8440532d0e9d 100644
--- a/tools/libxc/include/xc_dom.h
+++ b/tools/libxc/include/xc_dom.h
@@ -203,6 +203,10 @@ struct xc_dom_image {
 xen_paddr_t lowmem_end;
 xen_paddr_t highmem_end;
 xen_pfn_t vga_hole_size;
+#if defined(__i386__) || defined(__x86_64__)
+xen_paddr_t epc_base;
+xen_paddr_t epc_size;
+#endif
 
 /* If unset disables the setup of the IOREQ pages. */
 bool device_model;
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index f15fb215c24b..6a5863cd9637 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -59,6 +59,14 @@ void libxl__rdm_setdefault(libxl__gc *gc, libxl_domain_build_info *b_info)
 LIBXL_RDM_MEM_BOUNDARY_MEMKB_DEFAULT;
 }
 
+void libxl__sgx_setdefault(libxl__gc *gc, libxl_domain_build_info *b_info)
+{
+if (b_info->u.hvm.sgx.epckb == LIBXL_MEMKB_DEFAULT)
+b_info->u.hvm.sgx.epckb = 0;
+b_info->u.hvm.sgx.epcbase = 0;
+libxl_defbool_setdefault(&b_info->u.hvm.sgx.lewr, false);
+}
+
 int libxl__domain_build_info_setdefault(libxl__gc *gc,
 libxl_domain_build_info *b_info)
 {
@@ -359,6 +367,8 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
 libxl_defbool_setdefault(&b_info->u.hvm.gfx_passthru, false);
 
 libxl__rdm_setdefault(gc, b_info);
+
+libxl__sgx_setdefault(gc, b_info);
 break;
 case LIBXL_DOMAIN_TYPE_PV:
 libxl_defbool_setdefault(&b_info->u.pv.e820_host, false);
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index ef834e652d65..bbdba7e6e292 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -1213,6 +1213,36 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
 highmem_end = (1ull << 32) + (lowmem_end - mmio_start);
 lowmem_end = mmio_start;
 }
+#if defined(__i386__) || defined(__x86_64__)
+if (info->u.hvm.sgx.epckb) {
+/*
+ * FIXME:
+ *
+ * Currently EPC base is put at highmem_end + 8G, which should be
+ * safe in most cases.
+ *
+ * I am not quite sure which is the best way to calculate EPC base.
+ * IMO we can either:
+ * 1) put EPC between lowmem_end and mmio_start, but this brings
+ * additional logic to handle, e.g., lowmem_end may become too small
+ * if EPC is large (shall we limit a domain's EPC size?), and hvmloader
+ * will try to enlarge the MMIO space up to lowmem_end, or even relocate
+ * lowmem -- all of which makes things complicated, so putting EPC in
+ * the hole between lowmem_end and mmio_start is probably not good.
+ * 2) put EPC after highmem_end, but hvmloader may also relocate MMIO
+ * resources to the space after highmem_end. Maybe the ideal way is to
+ * put EPC right after highmem_end, and change hvmloader to detect EPC
+ * and put high MMIO resources after EPC. I've done this but I found a
+ * strange bug where the EPT mappings of EPC (at least part of them)
+ * get removed by something I still cannot find.
+ * Currently EPC base is put at highmem_end + 8G, and hvmloader code
+ * is not changed to handle EPC, but this should be safe for most cases.
+ */
+info->u.hvm.sgx.epcbase = highmem_end + (2ULL << 32);
+}
+dom->epc_size = (info->u.hvm.sgx.epckb << 10);
+dom->epc_base = info->u.hvm.sgx.epcbase;
+#endif
 dom->lowmem_end = lowmem_end;
 dom->highmem_end = highmem_end;
 dom->mmio_start = mmio_start;
diff --git a/tools/libxl/libxl_internal.h 

[Xen-devel] [PATCH v2 12/17] xen: vmx: handle ENCLS VMEXIT

2017-12-03 Thread Boqun Feng
From: Kai Huang 

Currently EPC is statically allocated and mapped to the guest, so we don't
have to trap ENCLS, as it runs perfectly in VMX non-root mode. But exposing
SGX to the guest means we also expose the ENABLE_ENCLS bit to an L1
hypervisor, therefore we cannot stop L1 from enabling ENCLS VMEXIT. An
ENCLS VMEXIT from an L2 guest is simply injected to L1; otherwise an ENCLS
VMEXIT is unexpected in L0 and we simply crash the domain.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 10 ++
 xen/arch/x86/hvm/vmx/vvmx.c| 11 +++
 xen/include/asm-x86/hvm/vmx/vmcs.h |  1 +
 xen/include/asm-x86/hvm/vmx/vmx.h  |  1 +
 4 files changed, 23 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index ce1c95f69062..c48c44565fc5 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4118,6 +4118,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 vmx_handle_apic_write();
 break;
 
+case EXIT_REASON_ENCLS:
+/*
+ * Currently L0 doesn't turn on ENCLS VMEXIT, but L0 cannot stop L1
+ * from enabling ENCLS VMEXIT. An ENCLS VMEXIT from an L2 guest has
+ * already been handled, so reaching here is a BUG. We simply crash the
+ * domain.
+ */
+domain_crash(v->domain);
+break;
+
 case EXIT_REASON_PML_FULL:
 vmx_vcpu_flush_pml_buffer(v);
 break;
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index dde02c076b9f..9c6123dc35ee 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -2094,6 +2094,12 @@ int nvmx_msr_read_intercept(unsigned int msr, u64 
*msr_content)
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_ENABLE_EPT;
+/*
+ * If SGX is exposed to guest, then ENABLE_ENCLS bit must also be
+ * exposed to guest.
+ */
+if ( d->arch.cpuid->feat.sgx )
+data |= SECONDARY_EXEC_ENABLE_ENCLS;
 data = gen_vmx_msr(data, 0, host_data);
 break;
 case MSR_IA32_VMX_EXIT_CTLS:
@@ -2316,6 +2322,11 @@ int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
 case EXIT_REASON_VMXON:
 case EXIT_REASON_INVEPT:
 case EXIT_REASON_XSETBV:
+/*
+ * L0 doesn't turn on ENCLS VMEXIT now, so an ENCLS VMEXIT must come from
+ * the L2 guest, caused by L1 having turned on ENCLS VMEXIT.
+ */
+case EXIT_REASON_ENCLS:
 /* inject to L1 */
 nvcpu->nv_vmexit_pending = 1;
 break;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 44ff4f0a113f..f68f3d0f6801 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -407,6 +407,7 @@ enum vmcs_field {
 VIRT_EXCEPTION_INFO = 0x202a,
 XSS_EXIT_BITMAP = 0x202c,
 TSC_MULTIPLIER  = 0x2032,
+ENCLS_EXITING_BITMAP= 0x202e,
 GUEST_PHYSICAL_ADDRESS  = 0x2400,
 VMCS_LINK_POINTER   = 0x2800,
 GUEST_IA32_DEBUGCTL = 0x2802,
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 7341cb191ef2..8547de9168eb 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -215,6 +215,7 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
 #define EXIT_REASON_APIC_WRITE  56
 #define EXIT_REASON_INVPCID 58
 #define EXIT_REASON_VMFUNC  59
+#define EXIT_REASON_ENCLS   60
 #define EXIT_REASON_PML_FULL62
 #define EXIT_REASON_XSAVES  63
 #define EXIT_REASON_XRSTORS 64
-- 
2.15.0



[Xen-devel] [PATCH v2 08/17] xen: x86/mm: add SGX EPC management

2017-12-03 Thread Boqun Feng
Now that the heap allocator supports EPC pages, managing EPC pages simply
means putting them into the heap at boot if SGX is supported and the EPC
section is reported consistently. Allocation and reclamation are just heap
allocation and reclamation with MEMF_epc.

One more thing we need to do is to populate the EPC portion of the
'frame_table' and set up the mappings properly.

SGX will be disabled if EPC initialization finds any problem.
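
Allocation and freeing then reduce to the heap interface; a usage sketch
(using the helpers added below):

    struct page_info *epg = alloc_epc_page();

    if ( epg )
    {
        /* ... map the page and hand it to a guest ... */
        free_epc_page(epg);
    }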

Signed-off-by: Boqun Feng 
---
 xen/arch/x86/sgx.c| 161 ++
 xen/include/asm-x86/sgx.h |   3 +
 2 files changed, 164 insertions(+)

diff --git a/xen/arch/x86/sgx.c b/xen/arch/x86/sgx.c
index ead917543f3e..9409b041e4f7 100644
--- a/xen/arch/x86/sgx.c
+++ b/xen/arch/x86/sgx.c
@@ -22,6 +22,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 struct sgx_cpuinfo __read_mostly boot_sgx_cpudata;
@@ -29,6 +31,13 @@ struct sgx_cpuinfo __read_mostly boot_sgx_cpudata;
 static bool __read_mostly opt_sgx_enabled = false;
 boolean_param("sgx", opt_sgx_enabled);
 
+#define total_epc_npages (boot_sgx_cpudata.epc_size >> PAGE_SHIFT)
+#define epc_base_mfn (boot_sgx_cpudata.epc_base >> PAGE_SHIFT)
+#define epc_base_maddr (boot_sgx_cpudata.epc_base)
+#define epc_end_maddr (epc_base_maddr + boot_sgx_cpudata.epc_size)
+
+static void *epc_base_vaddr = NULL;
+
 static void __detect_sgx(struct sgx_cpuinfo *sgxinfo)
 {
 u32 eax, ebx, ecx, edx;
@@ -166,11 +175,163 @@ static void __init print_sgx_cpuinfo(struct sgx_cpuinfo *sgxinfo)
boot_sgx_cpudata.epc_base + boot_sgx_cpudata.epc_size);
 }
 
+struct ft_page {
+struct page_info *pg;
+unsigned int order;
+unsigned long idx;
+struct list_head list;
+};
+
+static int extend_epc_frametable(unsigned long smfn, unsigned long emfn)
+{
+unsigned long idx;
+LIST_HEAD(ft_pages);
+struct ft_page *ftp, *nftp;
+int rc = 0;
+
+for ( ; smfn < emfn; smfn += PDX_GROUP_COUNT )
+{
+idx = pfn_to_pdx(smfn) / PDX_GROUP_COUNT;
+
+if (!test_bit(idx, pdx_group_valid))
+{
+unsigned long s = (unsigned long)pdx_to_page(idx * PDX_GROUP_COUNT);
+struct page_info *pg;
+
+ftp = xzalloc(struct ft_page);
+
+if ( !ftp )
+{
+rc = -ENOMEM;
+goto out;
+}
+
+pg = alloc_domheap_pages(NULL, PDX_GROUP_SHIFT - PAGE_SHIFT, 0);
+
+if ( !pg )
+{
+xfree(ftp);
+rc = -ENOMEM;
+goto out;
+}
+
+ftp->order = PDX_GROUP_SHIFT - PAGE_SHIFT;
+ftp->pg = pg;
+ftp->idx = idx;
+
+list_add_tail(&ftp->list, &ft_pages);
+
+map_pages_to_xen(s, page_to_mfn(pg),
+ 1UL << (PDX_GROUP_SHIFT - PAGE_SHIFT),
+ PAGE_HYPERVISOR);
+memset((void *)s, 0, sizeof(struct page_info) * PDX_GROUP_COUNT);
+}
+}
+
+out:
+list_for_each_entry_safe(ftp, nftp, &ft_pages, list)
+{
+if ( rc )
+{
+unsigned long s = (unsigned long)pdx_to_page(ftp->idx * PDX_GROUP_COUNT);
+
+destroy_xen_mappings(s, s + (1UL << PDX_GROUP_SHIFT));
+free_domheap_pages(ftp->pg, ftp->order);
+}
+list_del(&ftp->list);
+xfree(ftp);
+}
+
+if ( !rc )
+set_pdx_range(smfn, emfn);
+
+return rc;
+}
+
+static int __init init_epc_frametable(unsigned long mfn, unsigned long npages)
+{
+return extend_epc_frametable(mfn, mfn + npages);
+}
+
+static int __init init_epc_heap(void)
+{
+struct page_info *pg;
+unsigned long nrpages = total_epc_npages;
+unsigned long i;
+int rc = 0;
+
+rc = init_epc_frametable(epc_base_mfn, nrpages);
+
+if ( rc )
+return rc;
+
+for ( i = 0; i < nrpages; i++ )
+{
+pg = mfn_to_page(epc_base_mfn + i);
+pg->count_info |= PGC_epc;
+}
+
+init_domheap_pages(epc_base_maddr, epc_end_maddr);
+
+return rc;
+}
+
+struct page_info *alloc_epc_page(void)
+{
+struct page_info *pg = alloc_domheap_page(NULL, MEMF_epc);
+
+if ( !pg )
+return NULL;
+
+/*
+ * PGC_epc will be cleared in free_heap_pages(), so we add it back at
+ * allocation time, so that is_epc_page() will return true, when this page
+ * gets freed.
+ */
+pg->count_info |= PGC_epc;
+
+return pg;
+}
+
+void free_epc_page(struct page_info *epg)
+{
+free_domheap_page(epg);
+}
+
+
+static int __init sgx_init_epc(void)
+{
+int rc = 0;
+
+epc_base_vaddr = ioremap_wb(epc_base_maddr,
+total_epc_npages << PAGE_SHIFT);
+
+if ( !epc_base_vaddr )
+{
+printk("Failed to ioremap_wb EPC range. Disable SGX.\n");
+
+

[Xen-devel] [PATCH v2 10/17] xen: x86: add SGX cpuid handling support.

2017-12-03 Thread Boqun Feng
From: Kai Huang 

This patch adds SGX support to the cpuid handling. The SGX feature bit is
reported in raw_policy and passed along to a guest, but in
recalculate_cpuid_policy() we clear it if someone disabled SGX for some
reason. For EPC info, the physical values are reported in raw_policy and
recalculated for *_policy. For a particular domain, its EPC base and size
info will be filled in by the toolstack. Before a domain's EPC base and
size are properly configured, the guest's SGX cpuid should report invalid
EPC, which is also consistent with HW behavior.

Currently all EPC pages are fully populated for the domain when it is
created. Xen gets the domain's EPC base and size from the toolstack via
XEN_DOMCTL_set_cpuid, so the domain's EPC pages are also populated in
XEN_DOMCTL_set_cpuid, after receiving a valid EPC base and size. Failure
to populate EPC (e.g. there are not enough free EPC pages) results in
domain creation failure by making XEN_DOMCTL_set_cpuid return an error.
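
For reference, a guest's EPC section ends up encoded in subleaf 2 of CPUID
leaf 0x12. A sketch of that encoding per the SDM (illustrative only, not
the patch's exact code; 'struct cpuid_leaf' is the a/b/c/d leaf type used
by the policy code):

    static void encode_epc_subleaf(struct cpuid_leaf *l,
                                   uint64_t base, uint64_t size)
    {
        l->a = 0x1 | (uint32_t)(base & 0xfffff000); /* valid EPC + base[31:12] */
        l->b = (uint32_t)(base >> 32) & 0xfffff;    /* base[51:32] */
        l->c = 0x1 | (uint32_t)(size & 0xfffff000); /* properties + size[31:12] */
        l->d = (uint32_t)(size >> 32) & 0xfffff;    /* size[51:32] */
    }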

Signed-off-by: Kai Huang 
Signed-off-by: Boqun Feng 
---
 xen/arch/x86/cpuid.c| 62 -
 xen/arch/x86/domctl.c   | 59 +-
 xen/include/asm-x86/cpuid.h | 29 -
 3 files changed, 147 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index 5ee82d39d7cd..fcffbdec6bbe 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 const uint32_t known_features[] = INIT_KNOWN_FEATURES;
 const uint32_t special_features[] = INIT_SPECIAL_FEATURES;
@@ -152,6 +153,33 @@ static void recalculate_xstate(struct cpuid_policy *p)
 }
 }
 
+static void recalculate_sgx(struct cpuid_policy *p)
+{
+if ( !p->feat.sgx || !p->sgx.sgx1 )
+{
+memset(&p->sgx, 0, sizeof (p->sgx));
+return;
+}
+
+/*
+ * SDM 42.7.2.1 SECS.ATTRIBUTE.XFRM:
+ *
+ * Legal values for SECS.ATTRIBUTE.XFRM conform to these requirements:
+ *  - XFRM[1:0] must be set to 0x3;
+ *  - If the processor does not support XSAVE, or if the system software
+ *    has not enabled XSAVE, then XFRM[63:2] must be 0.
+ *  - If the processor does support XSAVE, XFRM must contain a value that
+ *would be legal if loaded into XCR0.
+ */
+p->sgx.xfrm_low = 0x3;
+p->sgx.xfrm_high = 0;
+if ( p->basic.xsave )
+{
+p->sgx.xfrm_low |= p->xstate.xcr0_low;
+p->sgx.xfrm_high |= p->xstate.xcr0_high;
+}
+}
+
 /*
  * Misc adjustments to the policy.  Mostly clobbering reserved fields and
  * duplicating shared fields.  Intentionally hidden fields are annotated.
@@ -233,7 +261,7 @@ static void __init calculate_raw_policy(void)
 {
 switch ( i )
 {
-case 0x4: case 0x7: case 0xd:
+case 0x4: case 0x7: case 0xd: case 0x12:
 /* Multi-invocation leaves.  Deferred. */
 continue;
 }
@@ -293,6 +321,19 @@ static void __init calculate_raw_policy(void)
 }
 }
 
+if ( p->basic.max_leaf >= SGX_CPUID )
+{
+/*
+ * For raw policy we just report native CPUID. For EPC on native it's
+ * possible that we will have multiple EPC sections (meaning subleaf 3,
+ * 4, ... may also be valid), but as the policy is for a guest, we only
+ * need one EPC section (subleaf 2).
+ */
+cpuid_count_leaf(SGX_CPUID, 0, &p->sgx.raw[0]);
+cpuid_count_leaf(SGX_CPUID, 1, &p->sgx.raw[1]);
+cpuid_count_leaf(SGX_CPUID, 2, &p->sgx.raw[2]);
+}
+
 /* Extended leaves. */
 cpuid_leaf(0x80000000, &p->extd.raw[0]);
 for ( i = 1; i < min(ARRAY_SIZE(p->extd.raw),
@@ -318,6 +359,7 @@ static void __init calculate_host_policy(void)
 cpuid_featureset_to_policy(boot_cpu_data.x86_capability, p);
 recalculate_xstate(p);
 recalculate_misc(p);
+recalculate_sgx(p);
 
 if ( p->extd.svm )
 {
@@ -351,6 +393,7 @@ static void __init calculate_pv_max_policy(void)
 sanitise_featureset(pv_featureset);
 cpuid_featureset_to_policy(pv_featureset, p);
 recalculate_xstate(p);
+recalculate_sgx(p);
 
 p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
 }
@@ -408,6 +451,7 @@ static void __init calculate_hvm_max_policy(void)
 sanitise_featureset(hvm_featureset);
 cpuid_featureset_to_policy(hvm_featureset, p);
 recalculate_xstate(p);
+recalculate_sgx(p);
 }
 
 void __init init_guest_cpuid(void)
@@ -523,6 +567,14 @@ void recalculate_cpuid_policy(struct domain *d)
 if ( p->basic.max_leaf < XSTATE_CPUID )
 __clear_bit(X86_FEATURE_XSAVE, fs);
 
+/*
+ * We check cpu_has_sgx here because during boot SGX may have been disabled
+ * via disable_sgx(), e.g. when the BIOS disables SGX by setting
+ * IA32_FEATURE_CONTROL_SGX_ENABLE=0
+ */
+

[Xen-devel] [PATCH v2 13/17] xen: vmx: handle VMEXIT from SGX enclave

2017-12-03 Thread Boqun Feng
From: Kai Huang 

VMX adds a new bit to both exit_reason and GUEST_INTERRUPT_STATE to
indicate whether a VMEXIT happens in enclave mode. Several instructions are
also invalid or behave differently inside an enclave according to the SDM.
This patch handles those cases.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 29 +
 xen/include/asm-x86/hvm/vmx/vmcs.h |  2 ++
 xen/include/asm-x86/hvm/vmx/vmx.h  |  2 ++
 3 files changed, 33 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index c48c44565fc5..280fc82ca1ff 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static bool_t __initdata opt_force_ept;
 boolean_param("force-ept", opt_force_ept);
@@ -3536,6 +3537,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0;
 unsigned int vector = 0, mode;
 struct vcpu *v = current;
+bool_t exit_from_sgx_enclave;
 
 __vmread(GUEST_RIP,®s->rip);
 __vmread(GUEST_RSP,®s->rsp);
@@ -3561,6 +3563,11 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 
 perfc_incra(vmexits, exit_reason);
 
+/* We need to handle several VMEXITs differently if the VMEXIT is from an
+ * enclave. Also clear bit 27 as it is of no further use. */
+exit_from_sgx_enclave = !!(exit_reason & VMX_EXIT_REASONS_FROM_ENCLAVE);
+exit_reason &= ~VMX_EXIT_REASONS_FROM_ENCLAVE;
+
 /* Handle the interrupt we missed before allowing any more in. */
 switch ( (uint16_t)exit_reason )
 {
@@ -4062,6 +4069,18 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 break;
 
 case EXIT_REASON_INVD:
+/*
+ * SDM 39.6.5 INVD Handling when Enclaves Are Enabled
+ *
+ * INVD causes #GP if EPC is enabled.
+ * FIXME: WBINVD??
+ */
+if ( exit_from_sgx_enclave )
+{
+hvm_inject_hw_exception(TRAP_gp_fault, 0);
+break;
+}
+/* Otherwise fall through */
 case EXIT_REASON_WBINVD:
 {
 update_guest_eip(); /* Safe: INVD, WBINVD */
@@ -4073,6 +4092,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
 paddr_t gpa;
 
+/*
+ * Currently an EPT violation from an enclave is not possible, as all EPC
+ * pages are statically allocated to the guest when the guest is created.
+ * We simply crash the guest in this case.
+ */
+if ( exit_from_sgx_enclave )
+{
+domain_crash(v->domain);
+break;
+}
 __vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
 __vmread(EXIT_QUALIFICATION, &exit_qualification);
 ept_handle_violation(exit_qualification, gpa);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index f68f3d0f6801..52f137437b97 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -338,6 +338,8 @@ extern u64 vmx_ept_vpid_cap;
 #define VMX_INTR_SHADOW_MOV_SS  0x00000002
 #define VMX_INTR_SHADOW_SMI 0x00000004
 #define VMX_INTR_SHADOW_NMI 0x00000008
+#define VMX_INTR_ENCLAVE_INTR   0x00000010  /* VMEXIT was incident to
+   enclave mode */
 
 #define VMX_BASIC_REVISION_MASK 0x7fffffff
 #define VMX_BASIC_VMCS_SIZE_MASK(0x1fffULL << 32)
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 8547de9168eb..88d0dd600500 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -158,6 +158,8 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
  * Exit Reasons
  */
 #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+/* Bit 27 is also set if VMEXIT is from SGX enclave mode */
+#define VMX_EXIT_REASONS_FROM_ENCLAVE   0x08000000
 
 #define EXIT_REASON_EXCEPTION_NMI   0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
-- 
2.15.0



[Xen-devel] [PATCH v2 05/17] xen: p2m: new 'p2m_epc' type for EPC mapping

2017-12-03 Thread Boqun Feng
From: Kai Huang 

A new 'p2m_epc' type is added for EPC mappings. Two wrapper functions,
set_epc_p2m_entry and clear_epc_p2m_entry, are also added for further use.
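
A sketch of how a later EPC-population patch is expected to use the
wrapper (the function and the 'epc_pg' array here are illustrative only):

    static int populate_epc_sketch(struct domain *d,
                                   unsigned long epc_base_pfn,
                                   struct page_info **epc_pg,
                                   unsigned long epc_npages)
    {
        unsigned long i;
        int ret = 0;

        for ( i = 0; i < epc_npages; i++ )
        {
            /* Map each EPC page into the guest p2m with p2m_epc type. */
            ret = set_epc_p2m_entry(d, epc_base_pfn + i,
                                    _mfn(page_to_mfn(epc_pg[i])));
            if ( ret )
                break;
        }

        return ret;
    }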

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/p2m-ept.c |  3 +++
 xen/arch/x86/mm/p2m.c | 41 +
 xen/include/asm-x86/p2m.h | 12 ++--
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index b4996ce658ac..34c2e2f8ac1c 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -182,6 +182,9 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, 
ept_entry_t *entry,
 entry->a = !!cpu_has_vmx_ept_ad;
 entry->d = 0;
 break;
+case p2m_epc:
+entry->r = entry->w = entry->x = 1;
+break;
 }
 
 
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index c72a3cdebb81..8eeafe4b250c 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1192,6 +1192,12 @@ int set_identity_p2m_entry(struct domain *d, unsigned long gfn_l,
 return ret;
 }
 
+int set_epc_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_epc,
+p2m_get_hostp2m(d)->default_access);
+}
+
 /*
  * Returns:
  *0for success
@@ -1278,6 +1284,41 @@ int clear_identity_p2m_entry(struct domain *d, unsigned long gfn_l)
 return ret;
 }
 
+int clear_epc_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+mfn_t omfn;
+p2m_type_t ot;
+p2m_access_t oa;
+int ret = 0;
+
+gfn_lock(p2m, gfn, 0);
+
+omfn = p2m->get_entry(p2m, _gfn(gfn), &ot, &oa, 0, NULL, NULL);
+if ( mfn_eq(omfn, INVALID_MFN) || !p2m_is_epc(ot) )
+{
+printk(XENLOG_G_WARNING
+"d%d: invalid EPC map to clear: gfn 0x%lx, type %d.\n",
+d->domain_id, gfn, ot);
+goto out;
+}
+if ( !mfn_eq(mfn, omfn) )
+{
+printk(XENLOG_G_WARNING
+"d%d: mistaken EPC mfn to clear: gfn 0x%lx, "
+"omfn 0x%lx, mfn 0x%lx.\n",
+d->domain_id, gfn, mfn_x(omfn), mfn_x(mfn));
+}
+
+ret = p2m_set_entry(p2m, _gfn(gfn), INVALID_MFN, PAGE_ORDER_4K, p2m_invalid,
+p2m->default_access);
+
+out:
+gfn_unlock(p2m, gfn, 0);
+
+return ret;
+}
+
 /* Returns: 0 for success, -errno for failure */
 int set_shared_p2m_entry(struct domain *d, unsigned long gfn_l, mfn_t mfn)
 {
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 17b1d0c8d326..40a40dd54380 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -72,6 +72,7 @@ typedef enum {
 p2m_ram_broken = 13,  /* Broken page, access cause domain crash */
 p2m_map_foreign  = 14,/* ram pages from foreign domain */
 p2m_ioreq_server = 15,
+p2m_epc = 16, /* EPC */
 } p2m_type_t;
 
 /* Modifiers to the query */
@@ -142,10 +143,13 @@ typedef unsigned int p2m_query_t;
 | p2m_to_mask(p2m_ram_logdirty) )
 #define P2M_SHARED_TYPES   (p2m_to_mask(p2m_ram_shared))
 
+#define P2M_EPC_TYPES   (p2m_to_mask(p2m_epc))
+
 /* Valid types not necessarily associated with a (valid) MFN. */
 #define P2M_INVALID_MFN_TYPES (P2M_POD_TYPES  \
| p2m_to_mask(p2m_mmio_direct) \
-   | P2M_PAGING_TYPES)
+   | P2M_PAGING_TYPES \
+   | P2M_EPC_TYPES)
 
 /* Broken type: the frame backing this pfn has failed in hardware
  * and must not be touched. */
@@ -153,6 +157,7 @@ typedef unsigned int p2m_query_t;
 
 /* Useful predicates */
 #define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES)
+#define p2m_is_epc(_t) (p2m_to_mask(_t) & P2M_EPC_TYPES)
 #define p2m_is_hole(_t) (p2m_to_mask(_t) & P2M_HOLE_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
@@ -163,7 +168,7 @@ typedef unsigned int p2m_query_t;
 /* Grant types are *not* considered valid, because they can be
unmapped at any time and, unless you happen to be the shadow or p2m
implementations, there's no way of synchronising against that. */
-#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
+#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES | P2M_EPC_TYPES))
 #define p2m_has_emt(_t)  (p2m_to_mask(_t) & (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
 #define p2m_is_pageable(_t) (p2m_to_mask(_t) & P2M_PAGEABLE_TYPES)
 #define p2m_is_paging(_t)   (p2m_to_mask(_t) & P2M_PAGING_TYPES)
@@ -635,6 +640,9 @@ int clear_identity_p2m_entry(struct domain *d, unsigned long gfn);
 int p2m_add_foreign(struct domain *tdom, unsigned long fgfn,
 unsigned long gpfn, domid

[Xen-devel] [PATCH v2 07/17] xen: mm: manage EPC pages in Xen heaps

2017-12-03 Thread Boqun Feng
EPC is a limited resource reserved by the BIOS, and is reported as
reserved memory in e820, not normal memory. EPC must be managed in 4K
pages, and cannot be accessed outside enclaves.

Using the existing memory allocation API (i.e. the heaps) allows us to
manage EPC pages in an efficient way, and may benefit an EPC ballooning
implementation in the future.

In order to use the existing heap mechanism to manage EPC pages, a
dedicated MEMZONE is required, because we need to avoid mixing EPC pages
and normal pages in one zone. And for page_to_zone() to return the proper
zone number, similar to 'PGC_xen_heap' and 'is_xen_heap_page', 'PGC_epc'
and 'is_epc_page' are introduced.

In 'free_heap_pages', 'need_scrub' is reset if the page is found to be an
EPC page, because EPC pages cannot be scrubbed. And there is no entry for
EPC pages in the m2p table, as it is not used, so the related setup is
skipped.

Besides, a 'MEMF_epc' memflag is introduced to tell the allocator to get
EPC pages rather than normal memory.
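
A caller then asks for EPC simply via the memflag, which is exactly what
the later alloc_epc_page() helper in this series does:

    struct page_info *pg = alloc_domheap_page(NULL, MEMF_epc);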

Signed-off-by: Boqun Feng 
---
 xen/common/page_alloc.c  | 31 +--
 xen/include/asm-arm/mm.h |  2 ++
 xen/include/asm-x86/mm.h |  5 -
 xen/include/xen/mm.h |  2 ++
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 220d7d91c62b..3b9d2c1a534f 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -377,12 +377,14 @@ mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
  * BINARY BUDDY ALLOCATOR
  */
 
-#define MEMZONE_XEN 0
+#define MEMZONE_EPC 0
+#define MEMZONE_XEN 1
 #define NR_ZONES(PADDR_BITS - PAGE_SHIFT + 1)
 
 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT))
-#define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN :  \
-  (flsl(page_to_mfn(pg)) ? : 1))
+#define page_to_zone(pg) (is_epc_page(pg) ? MEMZONE_EPC :  \
+  is_xen_heap_page(pg) ? MEMZONE_XEN :  \
+  (flsl(page_to_mfn(pg)) ? : MEMZONE_XEN + 1))
 
 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
@@ -921,7 +923,12 @@ static struct page_info *alloc_heap_pages(
 }
 
 node = phys_to_nid(page_to_maddr(pg));
-zone = page_to_zone(pg);
+
+if ( memflags & MEMF_epc )
+zone = MEMZONE_EPC;
+else
+zone = page_to_zone(pg);
+
 buddy_order = PFN_ORDER(pg);
 
 first_dirty = pg->u.free.first_dirty;
@@ -1332,10 +1339,14 @@ static void free_heap_pages(
 unsigned long mask, mfn = page_to_mfn(pg);
 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
 unsigned int zone = page_to_zone(pg);
+bool is_epc = false;
 
 ASSERT(order <= MAX_ORDER);
 ASSERT(node >= 0);
 
+is_epc = is_epc_page(pg);
+need_scrub = need_scrub && !is_epc;
+
 spin_lock(&heap_lock);
 
 for ( i = 0; i < (1 << order); i++ )
@@ -1364,11 +1375,13 @@ static void free_heap_pages(
 if ( pg[i].u.free.need_tlbflush )
 page_set_tlbflush_timestamp(&pg[i]);
 
-pg[i].u.free.scrubbable = true;
+pg[i].u.free.scrubbable = !is_epc;
 
 /* This page is not a guest frame any more. */
 page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
-set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
+
+if ( !is_epc )
+set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
 
 if ( need_scrub )
 {
@@ -2232,6 +2245,12 @@ struct page_info *alloc_domheap_pages(
 if ( memflags & MEMF_no_owner )
 memflags |= MEMF_no_refcount;
 
+/* MEMF_epc implies MEMF_no_scrub */
+if ((memflags & MEMF_epc) &&
+!(pg = alloc_heap_pages(MEMZONE_EPC, MEMZONE_EPC, order,
+memflags | MEMF_no_scrub, d)))
+return NULL;
+
 if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
 pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
 
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index c715e2290510..bca26f027402 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -153,6 +153,8 @@ extern vaddr_t xenheap_virt_start;
 (mfn_valid(_mfn(mfn)) && is_xen_heap_page(__mfn_to_page(mfn)))
 #endif
 
+#define is_epc_page(page)   false
+
 #define page_scrubbable(_p) true
 
 #define page_mergeable(_p1, _p2)true
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index b0f0ea0a8b5d..1dedb8099801 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -259,8 +259,10 @@ struct page_info
 #define PGC_state_freePG_mask(3, 9)
 #define page_state_is(pg, st) (((pg)->count_info&PGC

[Xen-devel] [RFC PATCH v2 00/17] RFC: SGX Virtualization design and draft patches

2017-12-03 Thread Boqun Feng
only
Linux and Windows);

For snapshot, we can support snapshotting an SGX guest by either:

- Suspending the guest before the snapshot (S3-S5). This works for all
  guests but requires the user to manually suspend the guest.
- Issuing a hypercall to destroy the guest's EPC in save_vm. This only
  works for Linux and Windows but doesn't require user intervention.

What are your comments?

3. Reference

- Intel SGX Homepage
https://software.intel.com/en-us/sgx

- Linux SGX SDK
https://01.org/intel-software-guard-extensions

- Linux SGX driver for upstreaming
https://github.com/01org/linux-sgx

- Intel SGX Specification (SDM Vol 3D)

https://software.intel.com/sites/default/files/managed/7c/f1/332831-sdm-vol-3d.pdf

- Paper: Intel SGX Explained
https://eprint.iacr.org/2016/086.pdf

- ISCA 2015 tutorial slides for Intel® SGX - Intel® Software
https://software.intel.com/sites/default/files/332680-002.pdf

Boqun Feng (5):
  xen: mm: introduce non-scrubbable pages
  xen: mm: manage EPC pages in Xen heaps
  xen: x86/mm: add SGX EPC management
  xen: x86: add functions to populate and destroy EPC for domain
  xen: tools: add SGX to applying MSR policy

Kai Huang (12):
  xen: x86: expose SGX to HVM domain in CPU featureset
  xen: x86: add early stage SGX feature detection
  xen: vmx: detect ENCLS VMEXIT
  xen: x86/mm: introduce ioremap_wb()
  xen: p2m: new 'p2m_epc' type for EPC mapping
  xen: x86: add SGX cpuid handling support.
  xen: vmx: handle SGX related MSRs
  xen: vmx: handle ENCLS VMEXIT
  xen: vmx: handle VMEXIT from SGX enclave
  xen: x86: reset EPC when guest got suspended.
  xen: tools: add new 'sgx' parameter support
  xen: tools: add SGX to applying CPUID policy

 docs/misc/xen-command-line.markdown |   8 +
 tools/libxc/Makefile|   1 +
 tools/libxc/include/xc_dom.h|   4 +
 tools/libxc/include/xenctrl.h   |  16 +
 tools/libxc/xc_cpuid_x86.c  |  68 ++-
 tools/libxc/xc_msr_x86.h|  10 +
 tools/libxc/xc_sgx.c|  82 +++
 tools/libxl/libxl.h |   3 +-
 tools/libxl/libxl_cpuid.c   |  15 +-
 tools/libxl/libxl_create.c  |  10 +
 tools/libxl/libxl_dom.c |  65 ++-
 tools/libxl/libxl_internal.h|   2 +
 tools/libxl/libxl_nocpuid.c |   4 +-
 tools/libxl/libxl_types.idl |  11 +
 tools/libxl/libxl_x86.c |  12 +
 tools/ocaml/libs/xc/xenctrl_stubs.c |  11 +-
 tools/python/xen/lowlevel/xc/xc.c   |  11 +-
 tools/xl/xl_parse.c |  86 +++
 tools/xl/xl_parse.h |   1 +
 xen/arch/x86/Makefile   |   1 +
 xen/arch/x86/cpu/common.c   |  15 +
 xen/arch/x86/cpuid.c|  62 ++-
 xen/arch/x86/domctl.c   |  87 ++-
 xen/arch/x86/hvm/hvm.c  |   3 +
 xen/arch/x86/hvm/vmx/vmcs.c |  16 +-
 xen/arch/x86/hvm/vmx/vmx.c  |  68 +++
 xen/arch/x86/hvm/vmx/vvmx.c |  11 +
 xen/arch/x86/mm.c   |   9 +-
 xen/arch/x86/mm/p2m-ept.c   |   3 +
 xen/arch/x86/mm/p2m.c   |  41 ++
 xen/arch/x86/msr.c  |   6 +-
 xen/arch/x86/sgx.c  | 815 
 xen/common/page_alloc.c |  39 +-
 xen/include/asm-arm/mm.h|   9 +
 xen/include/asm-x86/cpufeature.h|   4 +
 xen/include/asm-x86/cpuid.h |  29 +-
 xen/include/asm-x86/hvm/hvm.h   |   3 +
 xen/include/asm-x86/hvm/vmx/vmcs.h  |   8 +
 xen/include/asm-x86/hvm/vmx/vmx.h   |   3 +
 xen/include/asm-x86/mm.h|  19 +-
 xen/include/asm-x86/msr-index.h |   6 +
 xen/include/asm-x86/msr.h   |   5 +
 xen/include/asm-x86/p2m.h   |  12 +-
 xen/include/asm-x86/sgx.h   |  86 +++
 xen/include/public/arch-x86/cpufeatureset.h |   3 +-
 xen/include/xen/mm.h|   2 +
 xen/tools/gen-cpuid.py  |   3 +
 47 files changed, 1757 insertions(+), 31 deletions(-)
 create mode 100644 tools/libxc/xc_sgx.c
 create mode 100644 xen/arch/x86/sgx.c
 create mode 100644 xen/include/asm-x86/sgx.h

-- 
2.15.0



[Xen-devel] [PATCH v2 09/17] xen: x86: add functions to populate and destroy EPC for domain

2017-12-03 Thread Boqun Feng
Add a per-domain structure to store SGX per-domain info. Currently only the
domain's EPC base and size are stored. Also add new functions for further
use:
- domain_populate_epc  # populate EPC when EPC base & size are notified.
- domain_reset_epc # reset domain's EPC to be invalid. Used when the domain
  goes to S3-S5, or is being destroyed.
- domain_destroy_epc   # destroy and free domain's EPC.

For now, these functions only work for HVM domains, and will return
-EFAULT when called for a non-HVM domain.
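
A sketch of the intended call sites, as used elsewhere in this series:

    /* S3 entry: EREMOVE the contents but keep the EPC memory backing it. */
    if ( domain_epc_populated(d) )
        domain_reset_epc(d, false);

    /* Domain destruction: reset and also free the EPC pages. */
    if ( domain_epc_populated(d) )
        domain_destroy_epc(d);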

Signed-off-by: Kai Huang 
Signed-off-by: Boqun Feng 
---
 xen/arch/x86/hvm/vmx/vmx.c |   3 +
 xen/arch/x86/sgx.c | 340 +
 xen/include/asm-x86/hvm/vmx/vmcs.h |   2 +
 xen/include/asm-x86/sgx.h  |  13 ++
 4 files changed, 358 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index b18cceab55b2..92fb85b13a0c 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -417,6 +417,9 @@ static int vmx_domain_initialise(struct domain *d)
 
 static void vmx_domain_destroy(struct domain *d)
 {
+if ( domain_epc_populated(d) )
+domain_destroy_epc(d);
+
 if ( !has_vlapic(d) )
 return;
 
diff --git a/xen/arch/x86/sgx.c b/xen/arch/x86/sgx.c
index 9409b041e4f7..0c898c3086cb 100644
--- a/xen/arch/x86/sgx.c
+++ b/xen/arch/x86/sgx.c
@@ -25,6 +25,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 struct sgx_cpuinfo __read_mostly boot_sgx_cpudata;
 
@@ -38,6 +40,344 @@ boolean_param("sgx", opt_sgx_enabled);
 
 static void *epc_base_vaddr = NULL;
 
+static void *map_epc_page_to_xen(struct page_info *pg)
+{
+BUG_ON(!epc_base_vaddr);
+
+return (void *)((unsigned long)epc_base_vaddr +
+((page_to_mfn(pg) - epc_base_mfn) << PAGE_SHIFT));
+}
+
+/* ENCLS opcode */
+#define ENCLS   .byte 0x0f, 0x01, 0xcf
+
+/*
+ * ENCLS leaf functions
+ *
+ * However, currently we only need EREMOVE.
+ */
+enum {
+ECREATE = 0x0,
+EADD= 0x1,
+EINIT   = 0x2,
+EREMOVE = 0x3,
+EDGBRD  = 0x4,
+EDGBWR  = 0x5,
+EEXTEND = 0x6,
+ELDU= 0x8,
+EBLOCK  = 0x9,
+EPA = 0xA,
+EWB = 0xB,
+ETRACK  = 0xC,
+EAUG= 0xD,
+EMODPR  = 0xE,
+EMODT   = 0xF,
+};
+
+/*
+ * ENCLS error code
+ *
+ * Currently we only need SGX_CHILD_PRESENT
+ */
+#define SGX_CHILD_PRESENT   13
+
+static inline int __encls(unsigned long rax, unsigned long rbx,
+  unsigned long rcx, unsigned long rdx)
+{
+int ret;
+
+asm volatile ( "ENCLS;\n\t"
+: "=a" (ret)
+: "a" (rax), "b" (rbx), "c" (rcx), "d" (rdx)
+: "memory", "cc");
+
+return ret;
+}
+
+static inline int __eremove(void *epc)
+{
+unsigned long rbx = 0, rdx = 0;
+
+return __encls(EREMOVE, rbx, (unsigned long)epc, rdx);
+}
+
+static int sgx_eremove(struct page_info *epg)
+{
+void *addr = map_epc_page_to_xen(epg);
+int ret;
+
+BUG_ON(!addr);
+
+ret = __eremove(addr);
+
+return ret;
+}
+
+struct sgx_domain *to_sgx(struct domain *d)
+{
+if (!is_hvm_domain(d))
+return NULL;
+else
+return &d->arch.hvm_domain.vmx.sgx;
+}
+
+bool domain_epc_populated(struct domain *d)
+{
+BUG_ON(!to_sgx(d));
+
+return !!to_sgx(d)->epc_base_pfn;
+}
+
+/*
+ * Reset domain's EPC with EREMOVE. free_epc indicates whether to free EPC
+ * pages during reset. This will be called when domain goes into S3-S5 state
+ * (with free_epc being false), and when domain is destroyed (with free_epc
+ * being true).
+ *
+ * It is possible that EREMOVE will be called for SECS when it still has
+ * children present, in which case SGX_CHILD_PRESENT will be returned. In this
+ * case, the SECS page is kept on a tmp list, and after EREMOVE has been
+ * called for all EPC pages, we call EREMOVE for all the SECS pages again; this
+ * time SGX_CHILD_PRESENT should never occur as all children should have been
+ * removed.
+ *
+ * If an unexpected error is returned by EREMOVE, the EPC page has become
+ * abnormal, so it will not be freed even if free_epc is true, as further use
+ * of this EPC page can cause unexpected errors, potentially damaging other
+ * domains.
+ */
+static int __domain_reset_epc(struct domain *d, unsigned long epc_base_pfn,
+unsigned long epc_npages, bool free_epc)
+{
+struct page_list_head secs_list;
+struct page_info *epg, *tmp;
+unsigned long i;
+int ret = 0;
+
+INIT_PAGE_LIST_HEAD(&secs_list);
+
+for ( i = 0; i < epc_npages; i++ )
+{
+unsigned long gfn;
+mfn_t mfn;
+p2m_type_t t;
+int r;
+
+gfn = i + epc_base_pfn;
+mfn = get_gfn_query(d, gfn, &t);
+if ( unlikely(mfn_eq(mfn, INVALID_MFN)) )
+{
+printk("Domain %d

[Xen-devel] [PATCH v2 02/17] xen: x86: add early stage SGX feature detection

2017-12-03 Thread Boqun Feng
From: Kai Huang 

This patch adds early stage SGX feature detection via SGX CPUID 0x12.
Function detect_sgx is added to detect SGX info on each CPU (called from
identify_cpu). The SDM says the SGX info returned by CPUID is per-thread,
and we cannot assume all threads will return the same SGX info, so we have
to detect SGX on each CPU. For simplicity, currently SGX is only supported
when all CPUs report the same SGX info.

Besides, a boot parameter 'sgx' is added to allow the sysadmin to control
whether SGX support is exposed to guests.
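
For example, SGX support for guests could be enabled by booting Xen with
something like the following (illustrative GRUB fragment; other options
omitted):

    multiboot2 /boot/xen.gz sgx=1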

The SDM also says it's possible to have multiple EPC sections, but this is
only for multi-socket servers, which we don't support now (there are other
things that need to be done, e.g. NUMA EPC, scheduling, etc., as well), so
currently only one EPC section is supported.

The detection result is in the X86_FEATURE_SGX bit of 'boot_cpu_data',
and 'cpu_has_sgx' should be the only way to query whether SGX support is
enabled in the whole system.

Dedicated files sgx.c and sgx.h are added for the bulk of the above SGX
detection code, and for further SGX code as well.

Signed-off-by: Kai Huang 
Signed-off-by: Boqun Feng 
---
 docs/misc/xen-command-line.markdown |   8 ++
 xen/arch/x86/Makefile   |   1 +
 xen/arch/x86/cpu/common.c   |  15 +++
 xen/arch/x86/sgx.c  | 191 
 xen/include/asm-x86/cpufeature.h|   1 +
 xen/include/asm-x86/msr-index.h |   1 +
 xen/include/asm-x86/sgx.h   |  61 
 7 files changed, 278 insertions(+)
 create mode 100644 xen/arch/x86/sgx.c
 create mode 100644 xen/include/asm-x86/sgx.h

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 781110d4b2a5..81f9936face2 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1601,6 +1601,14 @@ hypervisors handle SErrors:
   All SErrors will crash the whole system. This option will avoid all overhead
   of the dsb/isb pairs.
 
+### sgx (Intel)
+> = 
+
+> Default: false
+
+Flag to enable Software Guard Extensions support for guests.
+
 ### smap
> `= <boolean> | hvm`
 
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index d5d58a205ec8..c8a843fef540 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -54,6 +54,7 @@ obj-y += platform_hypercall.o x86_64/platform_hypercall.o
 obj-y += psr.o
 obj-y += setup.o
 obj-y += shutdown.o
+obj-y += sgx.o
 obj-y += smp.o
 obj-y += smpboot.o
 obj-y += srat.o
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 6cf362849e85..0a93d5759a76 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include  /* for XEN_INVALID_{SOCKET,CORE}_ID */
 
 #include "cpu.h"
@@ -430,14 +431,28 @@ void identify_cpu(struct cpuinfo_x86 *c)
 * executed, c == &boot_cpu_data.
 */
if ( c != &boot_cpu_data ) {
+   struct sgx_cpuinfo tmp;
/* AND the already accumulated flags with these */
for ( i = 0 ; i < NCAPINTS ; i++ )
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
 
mcheck_init(c, false);
+   /*
+    * Check SGX CPUID info on all CPUs, and only support SGX when all
+    * CPUs report the same SGX info. SDM (37.7.2 Intel SGX Resource
+    * Enumeration Leaves) says "software should not assume that if Intel
+    * SGX instructions are supported on one hardware thread, they are also
+    * supported elsewhere.". For simplicity, we only support SGX when all
+    * CPUs report consistent SGX info.
+    */
+   detect_sgx(&tmp);
+   if ( memcmp(&tmp, &boot_sgx_cpudata, sizeof(tmp)) )
+   disable_sgx();
} else {
mcheck_init(c, true);
 
+   detect_sgx(&boot_sgx_cpudata);
+
mtrr_bp_init();
}
 }
diff --git a/xen/arch/x86/sgx.c b/xen/arch/x86/sgx.c
new file mode 100644
index ..ead917543f3e
--- /dev/null
+++ b/xen/arch/x86/sgx.c
@@ -0,0 +1,191 @@
+/*
+ * Intel Software Guard Extensions support
+ *
+ * Copyright (c) 2017,  Intel Corporation
+ *
+ * Author: Kai Huang 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see