[Xen-devel] [PATCH 15/15] xen: tools: expose EPC in ACPI table

2017-07-09 Thread Kai Huang
On physical machines EPC is exposed in the ACPI tables via device "INT0E0C".
Although EPC can be discovered via CPUID, the Windows SGX driver requires EPC
to be exposed in the ACPI tables as well. This patch exposes EPC in the
guest's ACPI tables.

Signed-off-by: Kai Huang 
---
 tools/firmware/hvmloader/util.c  | 23 +++
 tools/firmware/hvmloader/util.h  |  3 +++
 tools/libacpi/build.c|  3 +++
 tools/libacpi/dsdt.asl   | 49 
 tools/libacpi/dsdt_acpi_info.asl |  6 +++--
 tools/libacpi/libacpi.h  |  1 +
 tools/libxl/libxl_x86_acpi.c |  3 +++
 7 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/tools/firmware/hvmloader/util.c b/tools/firmware/hvmloader/util.c
index db5f240bb9..4a1da2d63a 100644
--- a/tools/firmware/hvmloader/util.c
+++ b/tools/firmware/hvmloader/util.c
@@ -330,6 +330,15 @@ cpuid(uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
 : "0" (idx) );
 }
 
+void cpuid_count(uint32_t idx, uint32_t count, uint32_t *eax,
+ uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+asm volatile (
+"cpuid"
+: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
+: "0" (idx), "c" (count) );
+}
+
 static const char hex_digits[] = "0123456789abcdef";
 
 /* Write a two-character hex representation of 'byte' to digits[].
@@ -888,6 +897,18 @@ static uint8_t acpi_lapic_id(unsigned cpu)
 return LAPIC_ID(cpu);
 }
 
+static void get_epc_info(struct acpi_config *config)
+{
+uint32_t eax, ebx, ecx, edx;
+
+cpuid_count(0x12, 0x2, &eax, &ebx, &ecx, &edx);
+
+config->epc_base = (((uint64_t)(ebx & 0xfffff)) << 32) |
+   (uint64_t)(eax & 0xfffff000);
+config->epc_size = (((uint64_t)(edx & 0xfffff)) << 32) |
+   (uint64_t)(ecx & 0xfffff000);
+}
+
 void hvmloader_acpi_build_tables(struct acpi_config *config,
  unsigned int physical)
 {
@@ -920,6 +941,8 @@ void hvmloader_acpi_build_tables(struct acpi_config *config,
 config->pci_hi_len = pci_hi_mem_end - pci_hi_mem_start;
 }
 
+get_epc_info(config);
+
 s = xenstore_read("platform/generation-id", "0:0");
 if ( s )
 {
diff --git a/tools/firmware/hvmloader/util.h b/tools/firmware/hvmloader/util.h
index 6062f0b8cf..deac0abb86 100644
--- a/tools/firmware/hvmloader/util.h
+++ b/tools/firmware/hvmloader/util.h
@@ -112,6 +112,9 @@ int hpet_exists(unsigned long hpet_base);
 void cpuid(uint32_t idx, uint32_t *eax, uint32_t *ebx,
uint32_t *ecx, uint32_t *edx);
 
+void cpuid_count(uint32_t idx, uint32_t count, uint32_t *eax,
+ uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+
 /* Read the TSC register. */
 static inline uint64_t rdtsc(void)
 {
diff --git a/tools/libacpi/build.c b/tools/libacpi/build.c
index f9881c9604..9d64856e26 100644
--- a/tools/libacpi/build.c
+++ b/tools/libacpi/build.c
@@ -54,6 +54,7 @@ struct acpi_info {
 uint32_t madt_lapic0_addr;  /* 16   - Address of first MADT LAPIC struct */
 uint32_t vm_gid_addr;   /* 20   - Address of VM generation id buffer */
 uint64_t pci_hi_min, pci_hi_len; /* 24, 32 - PCI I/O hole boundaries */
+uint64_t epc_min, epc_len;  /* 40, 48 - EPC region */
 };
 
 static void set_checksum(
@@ -535,6 +536,8 @@ int acpi_build_tables(struct acpi_ctxt *ctxt, struct acpi_config *config)
 acpi_info->pci_hi_min = config->pci_hi_start;
 acpi_info->pci_hi_len = config->pci_hi_len;
 }
+acpi_info->epc_min = config->epc_base;
+acpi_info->epc_len = config->epc_size;
 
 /*
  * Fill in high-memory data structures, starting at @buf.
diff --git a/tools/libacpi/dsdt.asl b/tools/libacpi/dsdt.asl
index fa8ff317b2..25ce196028 100644
--- a/tools/libacpi/dsdt.asl
+++ b/tools/libacpi/dsdt.asl
@@ -441,6 +441,55 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
 }
 }
 }
+
+Device (EPC)
+{
+Name (_HID, EisaId ("INT0E0C"))
+Name (_STR, Unicode ("Enclave Page Cache 1.5"))
+Name (_MLS, Package (0x01)
+{
+Package (0x02)
+{
+"en",
+Unicode ("Enclave Page Cache 1.5")
+}
+})
+Name (RBUF, ResourceTemplate ()
+{
+QWordMemory (ResourceConsumer, PosDecode, MinFixed, MaxFixed,
+Cacheable, ReadWrite,
+0x0000000000000000, // Granularity
+0x0000000000000000, // Range Minimum
+0x0000000000000000, // Range Maximum
+ 

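As an aside, the EPC range that hvmloader writes into the ACPI info block can
be cross-checked from inside a guest with plain CPUID. A minimal user-space
sketch (hypothetical, not part of the patch) using the same masks as
get_epc_info() above:

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        uint32_t eax, ebx, ecx, edx;

        /* CPUID leaf 0x12, subleaf 2 enumerates the (virtual) EPC section. */
        __cpuid_count(0x12, 2, eax, ebx, ecx, edx);

        /* EAX[31:12]/EBX[19:0] hold EPC base bits 31:12/51:32; ECX/EDX
         * encode the size analogously. */
        uint64_t base = ((uint64_t)(ebx & 0xfffff) << 32) | (eax & 0xfffff000);
        uint64_t size = ((uint64_t)(edx & 0xfffff) << 32) | (ecx & 0xfffff000);

        printf("EPC base 0x%" PRIx64 ", size 0x%" PRIx64 "\n", base, size);
        return 0;
    }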
[Xen-devel] [PATCH 14/15] xen: tools: add SGX to applying CPUID policy

2017-07-09 Thread Kai Huang
In libxc, a new structure 'xc_cpuid_policy_build_info_t' is added to carry the
domain's EPC base and size from libxl. libxl_cpuid_apply_policy is also
changed to take 'libxl_domain_build_info_t' as a parameter, from which the
domain's EPC base and size can be retrieved and passed to
xc_cpuid_apply_policy. xc_cpuid_apply_policy is extended to support the SGX
CPUID leaf. If the hypervisor doesn't report the SGX feature in the host
featureset, then using the 'epc' parameter results in domain creation failure,
as SGX cannot be supported.

Signed-off-by: Kai Huang 
---
 tools/libxc/include/xenctrl.h   | 10 ++
 tools/libxc/xc_cpuid_x86.c  | 68 ++---
 tools/libxl/libxl.h |  3 +-
 tools/libxl/libxl_cpuid.c   | 15 ++--
 tools/libxl/libxl_dom.c |  6 +++-
 tools/libxl/libxl_nocpuid.c |  4 ++-
 tools/ocaml/libs/xc/xenctrl_stubs.c | 11 +-
 tools/python/xen/lowlevel/xc/xc.c   | 11 +-
 8 files changed, 117 insertions(+), 11 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 1629f412dd..b621b35dea 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -1783,6 +1783,15 @@ int xc_domain_debug_control(xc_interface *xch,
 uint32_t vcpu);
 
 #if defined(__i386__) || defined(__x86_64__)
+typedef struct xc_cpuid_policy_build_info_sgx {
+uint64_t epc_base;
+uint64_t epc_size;
+} xc_cpuid_policy_build_info_sgx_t;
+
+typedef struct xc_cpuid_policy_build_info {
+xc_cpuid_policy_build_info_sgx_t sgx;
+} xc_cpuid_policy_build_info_t;
+
 int xc_cpuid_check(xc_interface *xch,
const unsigned int *input,
const char **config,
@@ -1794,6 +1803,7 @@ int xc_cpuid_set(xc_interface *xch,
  char **config_transformed);
 int xc_cpuid_apply_policy(xc_interface *xch,
   domid_t domid,
+  xc_cpuid_policy_build_info_t *b_info,
   uint32_t *featureset,
   unsigned int nr_features);
 void xc_cpuid_to_str(const unsigned int *regs,
diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index 1bedf050b8..b7eb652db9 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -38,7 +38,7 @@ enum {
 #define clear_feature(idx, dst) ((dst) &= ~bitmaskof(idx))
 #define set_feature(idx, dst)   ((dst) |=  bitmaskof(idx))
 
-#define DEF_MAX_BASE 0x0000000du
+#define DEF_MAX_BASE 0x00000012u
 #define DEF_MAX_INTELEXT  0x80000008u
 #define DEF_MAX_AMDEXT    0x8000001cu
 
@@ -178,6 +178,8 @@ struct cpuid_domain_info
 /* HVM-only information. */
 bool pae;
 bool nestedhvm;
+
+xc_cpuid_policy_build_info_t *b_info;
 };
 
 static void cpuid(const unsigned int *input, unsigned int *regs)
@@ -369,6 +371,12 @@ static void intel_xc_cpuid_policy(xc_interface *xch,
   const struct cpuid_domain_info *info,
  const unsigned int *input, unsigned int *regs)
 {
+xc_cpuid_policy_build_info_t *b_info = info->b_info;
+xc_cpuid_policy_build_info_sgx_t *sgx = NULL;
+
+if ( b_info )
+sgx = &b_info->sgx;
+
 switch ( input[0] )
 {
 case 0x00000004:
@@ -381,6 +389,30 @@ static void intel_xc_cpuid_policy(xc_interface *xch,
 regs[3] &= 0x3ffu;
 break;
 
+case 0x00000012:
+if ( !sgx ) {
+regs[0] = regs[1] = regs[2] = regs[3] = 0;
+break;
+}
+
+if ( !sgx->epc_base || !sgx->epc_size ) {
+regs[0] = regs[1] = regs[2] = regs[3] = 0;
+break;
+}
+
+if ( input[1] == 2 ) {
+/*
+ * Fix up EPC base and size for SGX CPUID subleaf 2. The Xen
+ * hypervisor depends on XEN_DOMCTL_set_cpuid to learn the domain's
+ * EPC base and size.
+ */
+regs[0] = (uint32_t)(sgx->epc_base & 0xfffff000) | 0x1;
+regs[1] = (uint32_t)(sgx->epc_base >> 32);
+regs[2] = (uint32_t)(sgx->epc_size & 0xfffff000) | 0x1;
+regs[3] = (uint32_t)(sgx->epc_size >> 32);
+}
+break;
+
 case 0x80000000:
 if ( regs[0] > DEF_MAX_INTELEXT )
 regs[0] = DEF_MAX_INTELEXT;
@@ -444,6 +476,10 @@ static void xc_cpuid_hvm_policy(xc_interface *xch,
 regs[1] = regs[2] = regs[3] = 0;
 break;
 
+case 0x00000012:
+/* Intel SGX. Passthrough to Intel function */
+break;
+
 case 0x80000000:
 /* Passthrough to cpu vendor specific functions */
 break;
@@ -649,12 +685,13 @@ void xc_cpuid_to_str(const unsigned int *regs, char **strs)
 }
 }
 
-static void sanitise_featureset(struct cpuid_domain_info *info)
+static int sanitise_featureset(struct cpuid_domain_info *info)
 {
 const uint32_t

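The hunk above is cut off by the archive. To make the plumbing concrete, here
is a caller-side sketch of the flow this patch describes (hypothetical code;
the exact libxl call site is not shown here, but the field names mirror the
types added in this patch and in patch 13):

    #include <libxl.h>
    #include <xenctrl.h>

    /* Hypothetical sketch: hand the domain's EPC geometry to libxc when
     * applying the CPUID policy. */
    static void apply_policy_with_epc(xc_interface *xch, uint32_t domid,
                                      const libxl_domain_build_info *info,
                                      uint32_t *featureset, unsigned int nr)
    {
        xc_cpuid_policy_build_info_t b_info = {
            .sgx.epc_base = info->u.hvm.sgx.epcbase,
            .sgx.epc_size = info->u.hvm.sgx.epckb << 10, /* KB -> bytes */
        };

        xc_cpuid_apply_policy(xch, domid, &b_info, featureset, nr);
    }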
[Xen-devel] [PATCH 13/15] xen: tools: add new 'epc' parameter support

2017-07-09 Thread Kai Huang
In order to be able to configure a domain's EPC size at creation time, a new
'epc' parameter is added to the XL configuration file. Like 'memory', it
indicates the EPC size in MB. A new 'libxl_sgx_buildinfo', which contains EPC
base and size, is also added to libxl_domain_build_info. EPC base and size are
also added to 'xc_dom_image' in order to add EPC to the e820 table. The EPC
base is calculated internally.

Signed-off-by: Kai Huang 
---
 tools/libxc/include/xc_dom.h |  4 
 tools/libxl/libxl_create.c   |  9 +
 tools/libxl/libxl_dom.c  | 30 ++
 tools/libxl/libxl_internal.h |  2 ++
 tools/libxl/libxl_types.idl  |  6 ++
 tools/libxl/libxl_x86.c  | 12 
 tools/xl/xl_parse.c  |  5 +
 7 files changed, 68 insertions(+)

diff --git a/tools/libxc/include/xc_dom.h b/tools/libxc/include/xc_dom.h
index ce47058c41..be10af7002 100644
--- a/tools/libxc/include/xc_dom.h
+++ b/tools/libxc/include/xc_dom.h
@@ -203,6 +203,10 @@ struct xc_dom_image {
 xen_paddr_t lowmem_end;
 xen_paddr_t highmem_end;
 xen_pfn_t vga_hole_size;
+#if defined(__i386__) || defined(__x86_64__)
+xen_paddr_t epc_base;
+xen_paddr_t epc_size;
+#endif
 
 /* If unset disables the setup of the IOREQ pages. */
 bool device_model;
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index bffbc456c1..8710e53ffd 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -59,6 +59,13 @@ void libxl__rdm_setdefault(libxl__gc *gc, libxl_domain_build_info *b_info)
 LIBXL_RDM_MEM_BOUNDARY_MEMKB_DEFAULT;
 }
 
+void libxl__sgx_setdefault(libxl__gc *gc, libxl_domain_build_info *b_info)
+{
+if (b_info->u.hvm.sgx.epckb == LIBXL_MEMKB_DEFAULT)
+b_info->u.hvm.sgx.epckb = 0;
+b_info->u.hvm.sgx.epcbase = 0;
+}
+
 int libxl__domain_build_info_setdefault(libxl__gc *gc,
 libxl_domain_build_info *b_info)
 {
@@ -372,6 +379,8 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
 libxl_defbool_setdefault(&b_info->u.hvm.gfx_passthru, false);
 
 libxl__rdm_setdefault(gc, b_info);
+
+libxl__sgx_setdefault(gc, b_info);
 break;
 case LIBXL_DOMAIN_TYPE_PV:
 libxl_defbool_setdefault(&b_info->u.pv.e820_host, false);
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 5d914a59ee..6d1d51d35d 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -1124,6 +1124,36 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
 highmem_end = (1ull << 32) + (lowmem_end - mmio_start);
 lowmem_end = mmio_start;
 }
+#if defined(__i386__) || defined(__x86_64__)
+if (info->u.hvm.sgx.epckb) {
+/*
+ * FIXME:
+ *
+ * Currently the EPC base is put at highmem_end + 8G, which should be
+ * safe in most cases.
+ *
+ * I am not quite sure which is the best way to calculate the EPC base.
+ * IMO we can either:
+ * 1) put EPC between lowmem_end and mmio_start, but this brings
+ * additional logic to handle, e.g. lowmem_end may become too small if
+ * EPC is large (shall we limit the domain's EPC size?), and hvmloader
+ * will try to enlarge the MMIO space up to lowmem_end, or even relocate
+ * lowmem -- all of which complicates things, so putting EPC in the hole
+ * between lowmem_end and mmio_start is probably not a good idea;
+ * 2) put EPC after highmem_end, but hvmloader may also relocate MMIO
+ * resources to after highmem_end. Maybe the ideal way is to put EPC
+ * right after highmem_end, and change hvmloader to detect EPC and put
+ * high MMIO resources after it. I have tried this, but hit a strange
+ * bug where (at least part of) the EPT mappings of EPC were removed by
+ * code I have not yet identified.
+ * Currently the EPC base is put at highmem_end + 8G, and hvmloader is
+ * not changed to handle EPC, but this should be safe in most cases.
+ */
+info->u.hvm.sgx.epcbase = highmem_end + (2ULL << 32);
+}
+dom->epc_size = (info->u.hvm.sgx.epckb << 10);
+dom->epc_base = info->u.hvm.sgx.epcbase;
+#endif
 dom->lowmem_end = lowmem_end;
 dom->highmem_end = highmem_end;
 dom->mmio_start = mmio_start;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index afe6652847..9a1d309dac 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -1252,6 +1252,8 @@ _hidden int libxl__device_vkb_setdefault(libxl__gc *gc, libxl_device_vkb *vkb);
 _hidden int libxl__device_pci_setdefault(libxl__gc *gc, libxl_device_pci *pci);
 _hidden void libxl__rdm_setdefault(libxl__gc *gc,
libxl_domain_build_info *b_info);
+_hidden void li

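The diffstat shows xl_parse.c gaining a few lines, which the archive
truncates. A sketch of what the parsing amounts to (hypothetical body; 'epc'
is given in MB in the domain config and stored in KB, mirroring how 'memory'
is handled):

    /* Hypothetical sketch of the xl_parse.c hunk. */
    static void parse_epc(XLU_Config *config, libxl_domain_build_info *b_info)
    {
        long epc_mb;

        if (!xlu_cfg_get_long(config, "epc", &epc_mb, 0))
            b_info->u.hvm.sgx.epckb = (uint64_t)epc_mb * 1024;
    }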
[Xen-devel] [PATCH 05/15] xen: p2m: new 'p2m_epc' type for EPC mapping

2017-07-09 Thread Kai Huang
A new p2m type, 'p2m_epc', is added for EPC mappings. Two wrapper functions,
set_epc_p2m_entry and clear_epc_p2m_entry, are also added for later use.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/p2m-ept.c |  3 +++
 xen/arch/x86/mm/p2m.c | 41 +
 xen/include/asm-x86/p2m.h | 12 ++--
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index ecab56fbec..95929868dc 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -182,6 +182,9 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
 entry->a = !!cpu_has_vmx_ept_ad;
 entry->d = 0;
 break;
+case p2m_epc:
+entry->r = entry->w = entry->x = 1;
+break;
 }
 
 
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index bee733dc46..29f42cb96d 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1176,6 +1176,12 @@ int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
 return ret;
 }
 
+int set_epc_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_epc,
+p2m_get_hostp2m(d)->default_access);
+}
+
 /*
  * Returns:
  *0for success
@@ -1260,6 +1266,41 @@ int clear_identity_p2m_entry(struct domain *d, unsigned long gfn)
 return ret;
 }
 
+int clear_epc_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+mfn_t omfn;
+p2m_type_t ot;
+p2m_access_t oa;
+int ret = 0;
+
+gfn_lock(p2m, gfn, 0);
+
+omfn = p2m->get_entry(p2m, gfn, &ot, &oa, 0, NULL, NULL);
+if ( mfn_eq(omfn, INVALID_MFN) || !p2m_is_epc(ot) )
+{
+printk(XENLOG_G_WARNING
+"d%d: invalid EPC map to clear: gfn 0x%lx, type %d.\n",
+d->domain_id, gfn, ot);
+goto out;
+}
+if ( !mfn_eq(mfn, omfn) )
+{
+printk(XENLOG_G_WARNING
+"d%d: mistaken EPC mfn to clear: gfn 0x%lx, "
+"omfn 0x%lx, mfn 0x%lx.\n",
+d->domain_id, gfn, mfn_x(omfn), mfn_x(mfn));
+}
+
+ret = p2m_set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_4K, p2m_invalid,
+p2m->default_access);
+
+out:
+gfn_unlock(p2m, gfn, 0);
+
+return ret;
+}
+
 /* Returns: 0 for success, -errno for failure */
 int set_shared_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
 {
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index e736609241..a9e330dd3c 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -72,6 +72,7 @@ typedef enum {
 p2m_ram_broken = 13,  /* Broken page, access cause domain crash */
 p2m_map_foreign  = 14,/* ram pages from foreign domain */
 p2m_ioreq_server = 15,
+p2m_epc = 16, /* EPC */
 } p2m_type_t;
 
 /* Modifiers to the query */
@@ -142,10 +143,13 @@ typedef unsigned int p2m_query_t;
 | p2m_to_mask(p2m_ram_logdirty) )
 #define P2M_SHARED_TYPES   (p2m_to_mask(p2m_ram_shared))
 
+#define P2M_EPC_TYPES   (p2m_to_mask(p2m_epc))
+
 /* Valid types not necessarily associated with a (valid) MFN. */
 #define P2M_INVALID_MFN_TYPES (P2M_POD_TYPES  \
| p2m_to_mask(p2m_mmio_direct) \
-   | P2M_PAGING_TYPES)
+   | P2M_PAGING_TYPES \
+   | P2M_EPC_TYPES)
 
 /* Broken type: the frame backing this pfn has failed in hardware
  * and must not be touched. */
@@ -153,6 +157,7 @@ typedef unsigned int p2m_query_t;
 
 /* Useful predicates */
 #define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES)
+#define p2m_is_epc(_t) (p2m_to_mask(_t) & P2M_EPC_TYPES)
 #define p2m_is_hole(_t) (p2m_to_mask(_t) & P2M_HOLE_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
@@ -163,7 +168,7 @@ typedef unsigned int p2m_query_t;
 /* Grant types are *not* considered valid, because they can be
unmapped at any time and, unless you happen to be the shadow or p2m
implementations, there's no way of synchronising against that. */
-#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
+#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES | P2M_EPC_TYPES))
 #define p2m_has_emt(_t)  (p2m_to_mask(_t) & (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
 #define p2m_is_pageable(_t) (p2m_to_mask(_t) & P2M_PAGEABLE_TYPES)
 #define p2m_is_paging(_t)   (p2m_to_mask(_t) & P2M_PAGING_TYPES)
@@ -634,6 +639,9 @@ int clear_identity_p2m_entry(struct domain *d, unsigned long gfn);
 int p2m_add_foreign(struct 

[Xen-devel] [PATCH 04/15] xen: mm: add ioremap_cache

2017-07-09 Thread Kai Huang
Currently Xen only has a non-cacheable version of ioremap. Although EPC is
reported as reserved memory in e820, it can be mapped as cacheable. This patch
adds ioremap_cache (a cacheable version of ioremap).

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm.c  | 15 +--
 xen/include/xen/vmap.h |  1 +
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 101ab33193..d0b6b3a247 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -6284,9 +6284,10 @@ void *__init arch_vmap_virt_end(void)
 return (void *)fix_to_virt(__end_of_fixed_addresses);
 }
 
-void __iomem *ioremap(paddr_t pa, size_t len)
+static void __iomem *__ioremap(paddr_t pa, size_t len, bool_t cache)
 {
 mfn_t mfn = _mfn(PFN_DOWN(pa));
+unsigned int flags = cache ? PAGE_HYPERVISOR : PAGE_HYPERVISOR_NOCACHE;
 void *va;
 
 WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL));
@@ -6299,12 +6300,22 @@ void __iomem *ioremap(paddr_t pa, size_t len)
 unsigned int offs = pa & (PAGE_SIZE - 1);
 unsigned int nr = PFN_UP(offs + len);
 
-va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_NOCACHE, VMAP_DEFAULT) + offs;
+va = __vmap(&mfn, nr, 1, 1, flags, VMAP_DEFAULT) + offs;
 }
 
 return (void __force __iomem *)va;
 }
 
+void __iomem *ioremap(paddr_t pa, size_t len)
+{
+return __ioremap(pa, len, false);
+}
+
+void __iomem *ioremap_cache(paddr_t pa, size_t len)
+{
+return __ioremap(pa, len, true);
+}
+
 int create_perdomain_mapping(struct domain *d, unsigned long va,
  unsigned int nr, l1_pgentry_t **pl1tab,
  struct page_info **ppg)
diff --git a/xen/include/xen/vmap.h b/xen/include/xen/vmap.h
index 369560e620..f6037e368c 100644
--- a/xen/include/xen/vmap.h
+++ b/xen/include/xen/vmap.h
@@ -24,6 +24,7 @@ void *vzalloc(size_t size);
 void vfree(void *va);
 
 void __iomem *ioremap(paddr_t, size_t);
+void __iomem *ioremap_cache(paddr_t, size_t);
 
 static inline void iounmap(void __iomem *va)
 {
-- 
2.11.0


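A usage sketch (an assumption, not part of this patch): later in the series
the whole EPC range is mapped cacheably at SGX initialization time, which is
what ioremap_cache() is introduced for:

    /* Hypothetical caller inside Xen's SGX init code: map the EPC range
     * with WB cacheability instead of the UC default of ioremap(). */
    static void *map_epc_range(paddr_t base, size_t size)
    {
        void *va = (void __force *)ioremap_cache(base, size);

        if ( !va )
            printk(XENLOG_ERR "Failed to map EPC at 0x%"PRIpaddr"\n", base);

        return va;
    }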


[Xen-devel] [PATCH 08/15] xen: x86: add SGX cpuid handling support.

2017-07-09 Thread Kai Huang
This patch adds SGX CPUID handling support. In init_guest_cpuid, physical EPC
info is reported for raw_policy and host_policy, but EPC is hidden for
pv_max_policy and hvm_max_policy, as a particular domain's EPC base and size
come from the toolstack, so it is meaningless for those policies to contain
physical EPC info. Before a domain's EPC base and size are properly
configured, the guest's SGX CPUID reports invalid EPC, which is also
consistent with hardware behavior.

Currently all EPC pages are fully populated for the domain when it is created.
Xen gets the domain's EPC base and size from the toolstack via
XEN_DOMCTL_set_cpuid, so the domain's EPC pages are populated in
XEN_DOMCTL_set_cpuid after a valid EPC base and size have been received.
Failure to populate EPC (for example, when there are not enough free EPC
pages) results in domain creation failure, by making XEN_DOMCTL_set_cpuid
return an error.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/cpuid.c| 87 -
 xen/arch/x86/domctl.c   | 47 +++-
 xen/include/asm-x86/cpuid.h | 26 +-
 3 files changed, 157 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index d359e090f3..db896be2e8 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 const uint32_t known_features[] = INIT_KNOWN_FEATURES;
 const uint32_t special_features[] = INIT_SPECIAL_FEATURES;
@@ -158,6 +159,44 @@ static void recalculate_xstate(struct cpuid_policy *p)
 }
 }
 
+static void recalculate_sgx(struct cpuid_policy *p, bool_t hide_epc)
+{
+if ( !p->feat.sgx )
+{
+memset(&p->sgx, 0, sizeof (p->sgx));
+return;
+}
+
+if ( !p->sgx.sgx1 )
+{
+memset(&p->sgx, 0, sizeof (p->sgx));
+return;
+}
+
+/*
+ * SDM 42.7.2.1 SECS.ATTRIBUTE.XFRM:
+ *
+ * Legal values for SECS.ATTRIBUTES.XFRM conform to these requirements:
+ *  - XFRM[1:0] must be set to 0x3;
+ *  - If the processor does not support XSAVE, or if the system software
+ *has not enabled XSAVE, then XFRM[63:2] must be 0.
+ *  - If the processor does support XSAVE, XFRM must contain a value that
+ *would be legal if loaded into XCR0.
+ */
+p->sgx.xfrm_low = 0x3;
+p->sgx.xfrm_high = 0;
+if ( p->basic.xsave )
+{
+p->sgx.xfrm_low |= p->xstate.xcr0_low;
+p->sgx.xfrm_high |= p->xstate.xcr0_high;
+}
+
+if ( hide_epc )
+{
+memset(&p->sgx.raw[0x2], 0, sizeof (struct cpuid_leaf));
+}
+}
+
 /*
  * Misc adjustments to the policy.  Mostly clobbering reserved fields and
  * duplicating shared fields.  Intentionally hidden fields are annotated.
@@ -239,7 +278,7 @@ static void __init calculate_raw_policy(void)
 {
 switch ( i )
 {
-case 0x4: case 0x7: case 0xd:
+case 0x4: case 0x7: case 0xd: case 0x12:
 /* Multi-invocation leaves.  Deferred. */
 continue;
 }
@@ -299,6 +338,19 @@ static void __init calculate_raw_policy(void)
 }
 }
 
+if ( p->basic.max_leaf >= SGX_CPUID )
+{
+/*
+ * For the raw policy we just report native CPUID. For EPC, on native it
+ * is possible to have multiple EPC sections (meaning subleaves 3, 4, ...
+ * may also be valid), but as the policy is for a guest, we only need one
+ * EPC section (subleaf 2).
+ */
+cpuid_count_leaf(SGX_CPUID, 0, &p->sgx.raw[0]);
+cpuid_count_leaf(SGX_CPUID, 1, &p->sgx.raw[1]);
+cpuid_count_leaf(SGX_CPUID, 2, &p->sgx.raw[2]);
+}
+
 /* Extended leaves. */
 cpuid_leaf(0x80000000, &p->extd.raw[0]);
 for ( i = 1; i < min(ARRAY_SIZE(p->extd.raw),
@@ -324,6 +376,8 @@ static void __init calculate_host_policy(void)
 cpuid_featureset_to_policy(boot_cpu_data.x86_capability, p);
 recalculate_xstate(p);
 recalculate_misc(p);
+/* For host policy we report physical EPC */
+recalculate_sgx(p, 0);
 
 if ( p->extd.svm )
 {
@@ -357,6 +411,11 @@ static void __init calculate_pv_max_policy(void)
 sanitise_featureset(pv_featureset);
 cpuid_featureset_to_policy(pv_featureset, p);
 recalculate_xstate(p);
+/*
+ * For PV policy we don't report physical EPC. Actually for PV policy
+ * currently SGX will be disabled.
+ */
+recalculate_sgx(p, 1);
 
 p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
 }
@@ -413,6 +472,13 @@ static void __init calculate_hvm_max_policy(void)
 sanitise_featureset(hvm_featureset);
 cpuid_featureset_to_policy(hvm_featureset, p);
 recalculate_xstate(p);
+/*
+ * For HVM policy we don't report physical EPC. Actually cpuid policy
+ * should report VM's virtual EPC base and si

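The diff is truncated here. For orientation, a sketch of the
XEN_DOMCTL_set_cpuid side described in the commit message (the function name
and the 'struct cpuid_leaf' plumbing are assumptions; hvm_populate_epc is
added in patch 07):

    /* Hypothetical sketch: when the toolstack sets SGX CPUID leaf 0x12
     * subleaf 2, extract the EPC geometry and populate the EPC. An error
     * here fails XEN_DOMCTL_set_cpuid and hence domain creation. */
    static int sgx_set_cpuid_subleaf2(struct domain *d,
                                      const struct cpuid_leaf *leaf)
    {
        uint64_t base = (leaf->a & 0xfffff000) |
                        ((uint64_t)(leaf->b & 0xfffff) << 32);
        uint64_t size = (leaf->c & 0xfffff000) |
                        ((uint64_t)(leaf->d & 0xfffff) << 32);

        if ( !base || !size )
            return -EINVAL;

        return hvm_populate_epc(d, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
    }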
[Xen-devel] [PATCH 07/15] xen: x86: add functions to populate and destroy EPC for domain

2017-07-09 Thread Kai Huang
Add a per-domain structure to store SGX per-domain info. Currently only the
domain's EPC base and size are stored. Also add new functions for later use:
- hvm_populate_epc  # populate EPC when EPC base & size are notified.
- hvm_reset_epc # reset the domain's EPC to be invalid; used when the domain
  goes to S3-S5, or is being destroyed.
- hvm_destroy_epc   # destroy and free the domain's EPC.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/sgx.c | 315 +
 xen/arch/x86/hvm/vmx/vmx.c |   3 +
 xen/include/asm-x86/hvm/vmx/sgx.h  |  14 ++
 xen/include/asm-x86/hvm/vmx/vmcs.h |   2 +
 4 files changed, 334 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/sgx.c b/xen/arch/x86/hvm/vmx/sgx.c
index f4c9b2f933..14379151e8 100644
--- a/xen/arch/x86/hvm/vmx/sgx.c
+++ b/xen/arch/x86/hvm/vmx/sgx.c
@@ -9,6 +9,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 
@@ -90,6 +92,319 @@ void unmap_epc_page(void *addr)
 /* Nothing */
 }
 
+/* ENCLS opcode, as a string so it can be pasted into inline asm */
+#define ENCLS   ".byte 0x0f, 0x01, 0xcf"
+
+/*
+ * ENCLS leaf functions
+ *
+ * Currently we only need EREMOVE.
+ */
+enum {
+ECREATE = 0x0,
+EADD= 0x1,
+EINIT   = 0x2,
+EREMOVE = 0x3,
+EDGBRD  = 0x4,
+EDGBWR  = 0x5,
+EEXTEND = 0x6,
+ELDU= 0x8,
+EBLOCK  = 0x9,
+EPA = 0xA,
+EWB = 0xB,
+ETRACK  = 0xC,
+EAUG= 0xD,
+EMODPR  = 0xE,
+EMODT   = 0xF,
+};
+
+/*
+ * ENCLS error code
+ *
+ * Currently we only need SGX_CHILD_PRESENT
+ */
+#define SGX_CHILD_PRESENT   13
+
+static inline int __encls(unsigned long rax, unsigned long rbx,
+  unsigned long rcx, unsigned long rdx)
+{
+int ret;
+
+asm volatile ( "ENCLS;\n\t"
+: "=a" (ret)
+: "a" (rax), "b" (rbx), "c" (rcx), "d" (rdx)
+: "memory", "cc");
+
+return ret;
+}
+
+static inline int __eremove(void *epc)
+{
+unsigned long rbx = 0, rdx = 0;
+
+return __encls(EREMOVE, rbx, (unsigned long)epc, rdx);
+}
+
+static int sgx_eremove(struct epc_page *epg)
+{
+void *addr = map_epc_page_to_xen(epg);
+int ret;
+
+BUG_ON(!addr);
+
+ret =  __eremove(addr);
+
+unmap_epc_page(addr);
+
+return ret;
+}
+
+/*
+ * Reset the domain's EPC with EREMOVE. free_epc indicates whether to
+ * free the EPC pages during the reset. This is called when the domain
+ * goes into an S3-S5 state (with free_epc being false), and when the
+ * domain is destroyed (with free_epc being true).
+ *
+ * It is possible that EREMOVE is called for a SECS page while it still
+ * has children present, in which case SGX_CHILD_PRESENT is returned. In
+ * that case the SECS page is kept on a temporary list, and after EREMOVE
+ * has been called for all other EPC pages, we call EREMOVE again for all
+ * the SECS pages; this time SGX_CHILD_PRESENT should never occur, as all
+ * children have been removed.
+ *
+ * If EREMOVE returns an unexpected error, the EPC page is in an abnormal
+ * state and is not freed even if free_epc is true, as further use of
+ * this EPC page could cause unexpected errors and potentially damage
+ * other domains.
+ */
+static int __hvm_reset_epc(struct domain *d, unsigned long epc_base_pfn,
+unsigned long epc_npages, bool_t free_epc)
+{
+struct list_head secs_list;
+struct list_head *p, *tmp;
+unsigned long i;
+int ret = 0;
+
+INIT_LIST_HEAD(&secs_list);
+
+for ( i = 0; i < epc_npages; i++ )
+{
+struct epc_page *epg;
+unsigned long gfn;
+mfn_t mfn;
+p2m_type_t t;
+int r;
+
+gfn = i + epc_base_pfn;
+mfn = get_gfn_query(d, gfn, &t);
+if ( unlikely(mfn_eq(mfn, INVALID_MFN)) )
+{
+printk("Domain %d: Reset EPC error: invalid MFN for gfn 0x%lx\n",
+d->domain_id, gfn);
+put_gfn(d, gfn);
+ret = -EFAULT;
+continue;
+}
+
+if ( unlikely(!p2m_is_epc(t)) )
+{
+printk("Domain %d: Reset EPC error: (gfn 0x%lx, mfn 0x%lx): " 
+"is not p2m_epc.\n", d->domain_id, gfn, mfn_x(mfn));
+put_gfn(d, gfn);
+ret = -EFAULT;
+continue;
+}
+
+put_gfn(d, gfn);
+
+epg = epc_mfn_to_page(mfn_x(mfn));
+
+/* EREMOVE the EPC page to make it invalid */
+r = sgx_eremove(epg);
+if ( r == SGX_CHILD_PRESENT )
+{
+list_add_tail(&epg->list, &secs_list);
+continue;
+}
+
+if ( r )
+{
+printk("Domain %d: Reset EPC error: (gfn 0x%lx, mfn 0x%lx): "
+"EREMOVE returns %d\n", d->domain_id, gfn, mfn_x(mfn), r);
+ret = r;
+if ( free_epc )
+

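The diff is truncated above. Based on the description, hvm_populate_epc()
plausibly looks like the following (a sketch, not the exact patch body; it
uses alloc_epc_page/free_epc_page from patch 06 and set_epc_p2m_entry from
patch 05):

    int hvm_populate_epc(struct domain *d, unsigned long epc_base_pfn,
                         unsigned long epc_npages)
    {
        unsigned long i;
        int ret;

        for ( i = 0; i < epc_npages; i++ )
        {
            /* Take a free EPC page off the global free list. */
            struct epc_page *epg = alloc_epc_page();

            if ( !epg )
                return -ENOMEM; /* not enough EPC: domain creation fails */

            /* Map it into the guest's p2m with the p2m_epc type. */
            ret = set_epc_p2m_entry(d, epc_base_pfn + i,
                                    _mfn(epc_page_to_mfn(epg)));
            if ( ret )
            {
                free_epc_page(epg);
                return ret;
            }
        }

        return 0;
    }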
[Xen-devel] [PATCH 10/15] xen: vmx: handle ENCLS VMEXIT

2017-07-09 Thread Kai Huang
Currently EPC is statically allocated and mapped into the guest, so we don't
have to trap ENCLS, which runs perfectly in VMX non-root mode. But exposing
SGX to a guest means we also expose the ENABLE_ENCLS bit to the L1
hypervisor, so we cannot stop L1 from enabling ENCLS VMEXIT. An ENCLS VMEXIT
from an L2 guest is simply injected to L1; otherwise an ENCLS VMEXIT is
unexpected in L0 and we simply crash the domain.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 10 ++
 xen/arch/x86/hvm/vmx/vvmx.c| 11 +++
 xen/include/asm-x86/hvm/vmx/vmcs.h |  1 +
 xen/include/asm-x86/hvm/vmx/vmx.h  |  1 +
 4 files changed, 23 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 7ee5515bdc..ea3d468bb0 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4126,6 +4126,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 vmx_handle_apic_write();
 break;
 
+case EXIT_REASON_ENCLS:
+/*
+ * Currently L0 doesn't turn on ENCLS VMEXIT, but L0 cannot stop L1
+ * from enabling ENCLS VMEXIT. An ENCLS VMEXIT from an L2 guest has
+ * already been handled, so reaching here is a bug; we simply crash
+ * the domain.
+ */
+domain_crash(v->domain);
+break;
+
 case EXIT_REASON_PML_FULL:
 vmx_vcpu_flush_pml_buffer(v);
 break;
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index 3560faec6d..7eb10738d9 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -2059,6 +2059,12 @@ int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content)
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_ENABLE_EPT;
+/*
+ * If SGX is exposed to guest, then ENABLE_ENCLS bit must also be
+ * exposed to guest.
+ */
+if ( domain_has_sgx(d) )
+data |= SECONDARY_EXEC_ENABLE_ENCLS;
 data = gen_vmx_msr(data, 0, host_data);
 break;
 case MSR_IA32_VMX_EXIT_CTLS:
@@ -2291,6 +2297,11 @@ int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
 case EXIT_REASON_VMXON:
 case EXIT_REASON_INVEPT:
 case EXIT_REASON_XSETBV:
+/*
+ * L0 doesn't turn on ENCLS VMEXIT for now, so an ENCLS VMEXIT must come
+ * from an L2 guest, because ENCLS VMEXIT was enabled by L1.
+ */
+case EXIT_REASON_ENCLS:
 /* inject to L1 */
 nvcpu->nv_vmexit_pending = 1;
 break;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index fc0b9d85fd..1350b7bc81 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -396,6 +396,7 @@ enum vmcs_field {
 VIRT_EXCEPTION_INFO = 0x202a,
 XSS_EXIT_BITMAP = 0x202c,
 TSC_MULTIPLIER  = 0x2032,
+ENCLS_EXITING_BITMAP= 0x202E,
 GUEST_PHYSICAL_ADDRESS  = 0x2400,
 VMCS_LINK_POINTER   = 0x2800,
 GUEST_IA32_DEBUGCTL = 0x2802,
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 4889a64255..211f5c8058 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -210,6 +210,7 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
 #define EXIT_REASON_APIC_WRITE  56
 #define EXIT_REASON_INVPCID 58
 #define EXIT_REASON_VMFUNC  59
+#define EXIT_REASON_ENCLS   60
 #define EXIT_REASON_PML_FULL62
 #define EXIT_REASON_XSAVES  63
 #define EXIT_REASON_XRSTORS 64
-- 
2.11.0




[Xen-devel] [PATCH 11/15] xen: vmx: handle VMEXIT from SGX enclave

2017-07-09 Thread Kai Huang
VMX adds a new bit to both the exit reason and the guest interruptibility
state to indicate whether the VMEXIT happened inside an enclave. Several
instructions are also invalid, or behave differently, inside an enclave
according to the SDM. This patch handles those cases.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 29 +
 xen/include/asm-x86/hvm/vmx/vmcs.h |  2 ++
 xen/include/asm-x86/hvm/vmx/vmx.h  |  2 ++
 3 files changed, 33 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index ea3d468bb0..d0c43ea0c8 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -57,6 +57,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static bool_t __initdata opt_force_ept;
 boolean_param("force-ept", opt_force_ept);
@@ -3544,6 +3545,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0;
 unsigned int vector = 0, mode;
 struct vcpu *v = current;
+bool_t exit_from_sgx_enclave;
 
 __vmread(GUEST_RIP,®s->rip);
 __vmread(GUEST_RSP,®s->rsp);
@@ -3569,6 +3571,11 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 
 perfc_incra(vmexits, exit_reason);
 
+/*
+ * We need to handle several VMEXITs differently if the VMEXIT is from an
+ * enclave. Also clear bit 27, as it is of no further use.
+ */
+exit_from_sgx_enclave = !!(exit_reason & VMX_EXIT_REASONS_FROM_ENCLAVE);
+exit_reason &= ~VMX_EXIT_REASONS_FROM_ENCLAVE;
+
 /* Handle the interrupt we missed before allowing any more in. */
 switch ( (uint16_t)exit_reason )
 {
@@ -4070,6 +4077,18 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 break;
 
 case EXIT_REASON_INVD:
+/*
+ * SDM 39.6.5 INVD Handling when Enclaves Are Enabled
+ *
+ * INVD causes #GP if executed inside an enclave.
+ * FIXME: WBINVD??
+ */
+if ( exit_from_sgx_enclave )
+{
+hvm_inject_hw_exception(TRAP_gp_fault, 0);
+break;
+}
+/* Otherwise passthrough */
 case EXIT_REASON_WBINVD:
 {
 update_guest_eip(); /* Safe: INVD, WBINVD */
@@ -4081,6 +4100,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
 paddr_t gpa;
 
+/*
+ * Currently an EPT violation from an enclave is not possible, as all EPC
+ * pages are statically allocated to the guest when it is created. We
+ * simply crash the guest in this case.
+ */
+if ( exit_from_sgx_enclave )
+{
+domain_crash(v->domain);
+break;
+}
 __vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
 __vmread(EXIT_QUALIFICATION, &exit_qualification);
 ept_handle_violation(exit_qualification, gpa);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 1350b7bc81..bbbc3d0d78 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -327,6 +327,8 @@ extern u64 vmx_ept_vpid_cap;
 #define VMX_INTR_SHADOW_MOV_SS  0x00000002
 #define VMX_INTR_SHADOW_SMI 0x00000004
 #define VMX_INTR_SHADOW_NMI 0x00000008
+#define VMX_INTR_ENCLAVE_INTR   0x00000010  /* VMEXIT was incident to
+   enclave mode */
 
 #define VMX_BASIC_REVISION_MASK 0x7fffffff
 #define VMX_BASIC_VMCS_SIZE_MASK(0x1fffULL << 32)
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 211f5c8058..2184d35246 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -153,6 +153,8 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
  * Exit Reasons
  */
 #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+/* Bit 27 is also set if the VMEXIT is from SGX enclave mode */
+#define VMX_EXIT_REASONS_FROM_ENCLAVE   0x08000000
 
 #define EXIT_REASON_EXCEPTION_NMI   0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
-- 
2.11.0




[Xen-devel] [PATCH 02/15] xen: vmx: detect ENCLS VMEXIT

2017-07-09 Thread Kai Huang
If the ENCLS VMEXIT control is not present then we cannot support SGX
virtualization. This patch detects the presence of the ENCLS VMEXIT control.
A Xen boot boolean parameter 'sgx' is also added to manually enable/disable
SGX.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmcs.c| 17 +
 xen/include/asm-x86/hvm/vmx/vmcs.h |  3 +++
 2 files changed, 20 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 8103b20d29..ae7e6f9321 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -98,6 +98,9 @@ static void __init parse_ept_param(char *s)
 }
 custom_param("ept", parse_ept_param);
 
+static bool_t __read_mostly opt_sgx_enabled = 1;
+boolean_param("sgx", opt_sgx_enabled);
+
 /* Dynamic (run-time adjusted) execution control flags. */
 u32 vmx_pin_based_exec_control __read_mostly;
 u32 vmx_cpu_based_exec_control __read_mostly;
@@ -138,6 +141,7 @@ static void __init vmx_display_features(void)
 P(cpu_has_vmx_virt_exceptions, "Virtualisation Exceptions");
 P(cpu_has_vmx_pml, "Page Modification Logging");
 P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
+P(cpu_has_vmx_encls, "SGX ENCLS Exiting");
 #undef P
 
 if ( !printed )
@@ -243,6 +247,8 @@ static int vmx_init_vmcs_config(void)
 opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
 if ( opt_pml_enabled )
 opt |= SECONDARY_EXEC_ENABLE_PML;
+if ( opt_sgx_enabled )
+opt |= SECONDARY_EXEC_ENABLE_ENCLS;
 
 /*
  * "APIC Register Virtualization" and "Virtual Interrupt Delivery"
@@ -336,6 +342,14 @@ static int vmx_init_vmcs_config(void)
 _vmx_secondary_exec_control &= ~ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
+/*
+ * Turn off SGX if ENCLS VMEXIT is not present. On real hardware, if the
+ * SGX CPUID feature is present (CPUID.0x7.0x0:EBX.SGX = 1), then ENCLS
+ * VMEXIT will always be present, but we do the check anyway.
+ */
+if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_ENCLS) )
+opt_sgx_enabled = 0;
+
 min = VM_EXIT_ACK_INTR_ON_EXIT;
 opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT |
   VM_EXIT_CLEAR_BNDCFGS;
@@ -1146,6 +1160,9 @@ static int construct_vmcs(struct vcpu *v)
 /* Disable PML anyway here as it will only be enabled in log dirty mode */
 v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 
+/* Disable ENCLS VMEXIT. It will only be turned on when needed. */
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_ENCLS;
+
 /* Host data selectors. */
 __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
 __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index e3cdfdf576..889091da42 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -232,6 +232,7 @@ extern u32 vmx_vmentry_control;
 #define SECONDARY_EXEC_ENABLE_INVPCID   0x00001000
 #define SECONDARY_EXEC_ENABLE_VM_FUNCTIONS  0x00002000
 #define SECONDARY_EXEC_ENABLE_VMCS_SHADOWING0x00004000
+#define SECONDARY_EXEC_ENABLE_ENCLS 0x00008000
 #define SECONDARY_EXEC_ENABLE_PML   0x00020000
 #define SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS   0x00040000
 #define SECONDARY_EXEC_XSAVES   0x00100000
@@ -312,6 +313,8 @@ extern u64 vmx_ept_vpid_cap;
 (vmx_secondary_exec_control & SECONDARY_EXEC_XSAVES)
 #define cpu_has_vmx_tsc_scaling \
 (vmx_secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+#define cpu_has_vmx_encls \
+(vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_ENCLS)
 
 #define VMCS_RID_TYPE_MASK  0x80000000
 
-- 
2.11.0




[Xen-devel] [PATCH 12/15] xen: x86: reset EPC when guest got suspended.

2017-07-09 Thread Kai Huang
EPC is destroyed when the power state goes to S3-S5. Emulate this behavior.

A new function s3_suspend is added to hvm_function_table for this purpose.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/hvm.c| 3 +++
 xen/arch/x86/hvm/vmx/vmx.c| 7 +++
 xen/include/asm-x86/hvm/hvm.h | 3 +++
 3 files changed, 13 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 70ddc81d44..1021cd7307 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3858,6 +3858,9 @@ static void hvm_s3_suspend(struct domain *d)
 
 hvm_vcpu_reset_state(d->vcpu[0], 0xf000, 0xfff0);
 
+if ( hvm_funcs.s3_suspend )
+hvm_funcs.s3_suspend(d);
+
 domain_unlock(d);
 }
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index d0c43ea0c8..98c346178e 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2290,6 +2290,12 @@ static bool vmx_get_pending_event(struct vcpu *v, struct x86_event *info)
 return true;
 }
 
+static void vmx_s3_suspend(struct domain *d)
+{
+if ( domain_has_sgx(d) )
+hvm_reset_epc(d, false);
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
 .name = "VMX",
 .cpu_up_prepare   = vmx_cpu_up_prepare,
@@ -2360,6 +2366,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
 .max_ratio = VMX_TSC_MULTIPLIER_MAX,
 .setup = vmx_setup_tsc_scaling,
 },
+.s3_suspend = vmx_s3_suspend,
 };
 
 /* Handle VT-d posted-interrupt when VCPU is blocked. */
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index b687e03dce..244b6566f2 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -226,6 +226,9 @@ struct hvm_function_table {
 /* Architecture function to setup TSC scaling ratio */
 void (*setup)(struct vcpu *v);
 } tsc_scaling;
+
+/* Domain S3 suspend */
+void (*s3_suspend)(struct domain *d);
 };
 
 extern struct hvm_function_table hvm_funcs;
-- 
2.11.0




[Xen-devel] [PATCH 03/15] xen: x86: add early stage SGX feature detection

2017-07-09 Thread Kai Huang
This patch adds early-stage SGX feature detection via SGX CPUID leaf 0x12. A
function detect_sgx is added to detect SGX info on each CPU (called from
vmx_cpu_up). The SDM says the SGX info returned by CPUID is per-thread, and we
cannot assume all threads return the same SGX info, so we have to detect SGX
on each CPU. For simplicity, SGX is currently only supported when all CPUs
report the same SGX info.

The SDM also says it is possible to have multiple EPC sections, but this only
occurs on multi-socket servers, which we don't support now (other things would
need to be done as well, e.g. NUMA EPC, scheduling, etc.), so currently only
one EPC section is supported.

Dedicated files sgx.c and sgx.h are added (under the vmx directory, as SGX is
Intel-specific) for the bulk of the SGX detection code above, and for further
SGX code as well.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/Makefile |   1 +
 xen/arch/x86/hvm/vmx/sgx.c| 208 ++
 xen/arch/x86/hvm/vmx/vmcs.c   |   4 +
 xen/include/asm-x86/cpufeature.h  |   1 +
 xen/include/asm-x86/hvm/vmx/sgx.h |  45 +
 5 files changed, 259 insertions(+)
 create mode 100644 xen/arch/x86/hvm/vmx/sgx.c
 create mode 100644 xen/include/asm-x86/hvm/vmx/sgx.h

diff --git a/xen/arch/x86/hvm/vmx/Makefile b/xen/arch/x86/hvm/vmx/Makefile
index 04a29ce59d..f6bcf0d143 100644
--- a/xen/arch/x86/hvm/vmx/Makefile
+++ b/xen/arch/x86/hvm/vmx/Makefile
@@ -4,3 +4,4 @@ obj-y += realmode.o
 obj-y += vmcs.o
 obj-y += vmx.o
 obj-y += vvmx.o
+obj-y += sgx.o
diff --git a/xen/arch/x86/hvm/vmx/sgx.c b/xen/arch/x86/hvm/vmx/sgx.c
new file mode 100644
index 00..6b41469371
--- /dev/null
+++ b/xen/arch/x86/hvm/vmx/sgx.c
@@ -0,0 +1,208 @@
+/*
+ * Intel Software Guard Extensions support
+ *
+ * Author: Kai Huang 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static struct sgx_cpuinfo __read_mostly sgx_cpudata[NR_CPUS];
+static struct sgx_cpuinfo __read_mostly boot_sgx_cpudata;
+
+static bool_t sgx_enabled_in_bios(void)
+{
+uint64_t val, sgx_enabled = IA32_FEATURE_CONTROL_SGX_ENABLE |
+IA32_FEATURE_CONTROL_LOCK;
+
+rdmsrl(MSR_IA32_FEATURE_CONTROL, val);
+
+return (val & sgx_enabled) == sgx_enabled;
+}
+
+static void __detect_sgx(int cpu)
+{
+struct sgx_cpuinfo *sgxinfo = &sgx_cpudata[cpu];
+u32 eax, ebx, ecx, edx;
+
+memset(sgxinfo, 0, sizeof(*sgxinfo));
+
+/*
+ * In reality if SGX is not enabled in BIOS, SGX CPUID should report
+ * invalid SGX info, but we do the check anyway to make sure.
+ */
+if ( !sgx_enabled_in_bios() )
+{
+printk("CPU%d: SGX disabled in BIOS.\n", cpu);
+goto not_supported;
+}
+
+/*
+ * CPUID.0x12.0x0:
+ *
+ *  EAX [0]:whether SGX1 is supported.
+ *  [1]:whether SGX2 is supported.
+ *  EBX [31:0]: miscselect
+ *  ECX [31:0]: reserved
+ *  EDX [7:0]:  MaxEnclaveSize_Not64
+ *  [15:8]: MaxEnclaveSize_64
+ */
+cpuid_count(SGX_CPUID, 0x0, &eax, &ebx, &ecx, &edx);
+sgxinfo->cap = eax & (SGX_CAP_SGX1 | SGX_CAP_SGX2);
+sgxinfo->miscselect = ebx;
+sgxinfo->max_enclave_size32 = edx & 0xff;
+sgxinfo->max_enclave_size64 = (edx & 0xff00) >> 8;
+
+if ( !(eax & SGX_CAP_SGX1) )
+{
+/* We may reach here if BIOS doesn't enable SGX */
+printk("CPU%d: CPUID.0x12.0x0 reports not SGX support.\n", cpu);
+goto not_supported;
+}
+
+/*
+ * CPUID.0x12.0x1:
+ *
+ *  EAX [31:0]: bitmask of 1-setting of SECS.ATTRIBUTES[31:0]
+ *  EBX [31:0]: bitmask of 1-setting of SECS.ATTRIBUTES[63:32]
+ *  ECX [31:0]: bitmask of 1-setting of SECS.ATTRIBUTES[95:64]
+ *  EDX [31:0]: bitmask of 1-setting of SECS.ATTRIBUTES[127:96]
+ */
+cpuid_count(SGX_CPUID, 0x1, &eax, &ebx, &ecx, &edx);
+sgxinfo->secs_attr_bitmask[0] = eax;
+sgxinfo->secs_attr_bitmask[1] = ebx;
+sgxinfo->secs_attr_bitmask[2] = ecx;
+sgxinfo->secs_attr_bitmask[3] = edx;
+
+/*
+ * CPUID.0x12.0x2:
+ *
+ *  EAX [3:0]:  0000: this sub-leaf is invalid
+ *  0001: this sub-leaf enumerates EPC resource
+ *  [11:4]: reserved
+ *  [31:12]:bits 31:12 of physical address of EPC base (when
+ *  EAX[3:0] is 0001, which applies to following)
+ *  EBX [19:0]: bits 51:32 of physical address of EPC base
+ *  [31:20]:reserved
+ *  ECX [3:0]:  0000: EDX:ECX are 0
+ *  0001: this is an EPC section.
+ *  [11:4]: reserved
+ *  [31:12]:bits 31:12 of EPC size
+ *  EDX [19:0]: bits 51:32 of EPC size
+ *  [31:20]:reserved
+ *
+ *  TODO: So far assume there's only one EPC resource.
+ */
+cpuid_count(SGX_CPUID, 0x2, &eax, &ebx, &ecx,

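The detection code is truncated here. The "all CPUs must report the same SGX
info" rule described above boils down to a comparison like the sketch below
(check_sgx_consistency() is referenced elsewhere in the series, but this body
is an assumption):

    static bool_t __init check_sgx_consistency(void)
    {
        unsigned int cpu;

        for_each_online_cpu ( cpu )
        {
            /* SGX is only kept enabled if every CPU reported the same
             * SGX info as the boot CPU. */
            if ( memcmp(&sgx_cpudata[cpu], &boot_sgx_cpudata,
                        sizeof(boot_sgx_cpudata)) )
            {
                printk("CPU%u: SGX info inconsistent with boot CPU.\n", cpu);
                return false;
            }
        }

        return true;
    }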
[Xen-devel] [PATCH 06/15] xen: x86: add SGX basic EPC management

2017-07-09 Thread Kai Huang
EPC is a limited resource reserved by BIOS. Typically the EPC size ranges from
dozens of MB to more than a hundred MB. EPC is reported as reserved memory in
e820, not as normal memory, and it must be managed in 4K pages.

From the implementation's point of view, we can choose either to manage EPC
separately, or to extend the existing memory management code to support EPC.
The latter has the advantage of reusing the existing memory management
algorithms but is more complicated to implement (thus more risky), while the
former is simpler but needs its own EPC management algorithm. Currently we
choose the former. Given that the EPC size is small, we simply put all EPC
pages into a single list, so allocation and freeing are very straightforward.

Just as there is one 'struct page_info' for each memory page, a 'struct
epc_page' is added to represent the status of each EPC page, and all 'struct
epc_page' live in an array allocated during SGX initialization. The entire EPC
is also mapped into Xen's virtual address space, so that each EPC page's
virtual address can be computed as base virtual address + offset.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/sgx.c| 154 ++
 xen/include/asm-x86/hvm/vmx/sgx.h |  19 +
 2 files changed, 173 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/sgx.c b/xen/arch/x86/hvm/vmx/sgx.c
index 6b41469371..f4c9b2f933 100644
--- a/xen/arch/x86/hvm/vmx/sgx.c
+++ b/xen/arch/x86/hvm/vmx/sgx.c
@@ -7,12 +7,89 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 
 static struct sgx_cpuinfo __read_mostly sgx_cpudata[NR_CPUS];
 static struct sgx_cpuinfo __read_mostly boot_sgx_cpudata;
 
+/*
+ * epc_frametable keeps an array of struct epc_page, one for every EPC page,
+ * so that epc_page_to_mfn() and epc_mfn_to_page() work straightforwardly.
+ * The array is allocated dynamically according to the machine's EPC size.
+ */
+static struct epc_page *epc_frametable = NULL;
+/*
+ * EPC is mapped to Xen's virtual address at once, so that each EPC page's
+ * virtual address is epc_base_vaddr + offset.
+ */
+static void *epc_base_vaddr = NULL;
+
+/* Global free EPC pages list. */
+static struct list_head free_epc_list;
+static spinlock_t epc_lock;
+
+#define total_epc_npages (boot_sgx_cpudata.epc_size >> PAGE_SHIFT)
+#define epc_base_mfn (boot_sgx_cpudata.epc_base >> PAGE_SHIFT)
+
+/* Current number of free EPC pages in free_epc_list */
+static unsigned long free_epc_npages = 0;
+
+unsigned long epc_page_to_mfn(struct epc_page *epg)
+{
+BUG_ON(!epc_frametable);
+BUG_ON(!epc_base_mfn);
+
+return epc_base_mfn + (epg - epc_frametable);
+}
+
+struct epc_page *epc_mfn_to_page(unsigned long mfn)
+{
+BUG_ON(!epc_frametable);
+BUG_ON(!epc_base_mfn);
+
+return epc_frametable + (mfn - epc_base_mfn);
+}
+
+struct epc_page *alloc_epc_page(void)
+{
+struct epc_page *epg;
+
+spin_lock(&epc_lock);
+epg = list_first_entry_or_null(&free_epc_list, struct epc_page, list);
+if ( epg ) {
+list_del(&epg->list);
+free_epc_npages--;
+}
+spin_unlock(&epc_lock);
+
+return epg;
+}
+
+void free_epc_page(struct epc_page *epg)
+{
+spin_lock(&epc_lock);
+list_add_tail(&epg->list, &free_epc_list);
+free_epc_npages++;
+spin_unlock(&epc_lock);
+}
+
+void *map_epc_page_to_xen(struct epc_page *epg)
+{
+BUG_ON(!epc_base_vaddr);
+BUG_ON(!epc_frametable);
+
+return (void *)(((unsigned long)(epc_base_vaddr)) +
+((epg - epc_frametable) << PAGE_SHIFT));
+}
+
+void unmap_epc_page(void *addr)
+{
+/* Nothing */
+}
+
 static bool_t sgx_enabled_in_bios(void)
 {
 uint64_t val, sgx_enabled = IA32_FEATURE_CONTROL_SGX_ENABLE |
@@ -177,6 +254,80 @@ static bool_t __init check_sgx_consistency(void)
 return true;
 }
 
+static inline int npages_to_order(unsigned long npages)
+{
+int order = 0;
+
+while ( (1 << order) < npages )
+order++;
+
+return order;
+}
+
+static int __init init_epc_frametable(unsigned long npages)
+{
+unsigned long i, order;
+
+/* Round up, so a partial trailing page of frame table is not lost. */
+order = PFN_UP(npages * sizeof(struct epc_page));
+order = npages_to_order(order);
+
+epc_frametable = alloc_xenheap_pages(order, 0);
+if ( !epc_frametable )
+return -ENOMEM;
+
+for ( i = 0; i < npages; i++ )
+{
+struct epc_page *epg = epc_frametable + i;
+
+list_add_tail(&epg->list, &free_epc_list);
+}
+
+return 0;
+}
+
+static void destroy_epc_frametable(unsigned long npages)
+{
+unsigned long order;
+
+if ( !epc_frametable )
+return;
+
+/* Must match the rounded-up computation in init_epc_frametable(). */
+order = PFN_UP(npages * sizeof(struct epc_page));
+order = npages_to_order(order);
+
+free_xenheap_pages(epc_frametable, order);
+}
+
+static int __init sgx_init_epc(void)
+{
+int r;
+
+INIT_LIST_HEAD(&free_epc_list)

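The archive cuts sgx_init_epc() short. Following the description (a frame
table plus one flat cacheable mapping of the whole EPC), the remainder
plausibly looks like this sketch (an assumption; ioremap_cache() comes from
patch 04):

    static int __init sgx_init_epc(void)
    {
        int r;

        INIT_LIST_HEAD(&free_epc_list);
        spin_lock_init(&epc_lock);

        /* One struct epc_page per EPC page, all starting on the free list. */
        r = init_epc_frametable(total_epc_npages);
        if ( r )
            return r;

        /* Map the whole EPC cacheably so that a page's virtual address is
         * epc_base_vaddr + offset. */
        epc_base_vaddr = ioremap_cache(boot_sgx_cpudata.epc_base,
                                       boot_sgx_cpudata.epc_size);
        if ( !epc_base_vaddr )
        {
            destroy_epc_frametable(total_epc_npages);
            return -ENOMEM;
        }

        free_epc_npages = total_epc_npages;

        return 0;
    }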
[Xen-devel] [PATCH 09/15] xen: vmx: handle SGX related MSRs

2017-07-09 Thread Kai Huang
This patch handles the IA32_FEATURE_CONTROL and IA32_SGXLEPUBKEYHASHn MSRs.

For IA32_FEATURE_CONTROL, if SGX is exposed to the domain, then the SGX_ENABLE
bit is always set. If SGX launch control is also exposed to the domain, and
the physical IA32_SGXLEPUBKEYHASHn are writable, then the
SGX_LAUNCH_CONTROL_ENABLE bit is also always set. Writes to
IA32_FEATURE_CONTROL are ignored.

For IA32_SGXLEPUBKEYHASHn, a new 'struct sgx_vcpu' is added for per-vcpu SGX
state; currently it holds the vcpu's virtual ia32_sgxlepubkeyhash[0-3]. Two
booleans, 'readable' and 'writable', are also added to indicate whether the
virtual IA32_SGXLEPUBKEYHASHn are readable and writable.

When a vcpu is initialized, its virtual ia32_sgxlepubkeyhash values are also
initialized. If the physical IA32_SGXLEPUBKEYHASHn are writable, then
ia32_sgxlepubkeyhash are set to Intel's default value, as on a physical
machine those MSRs hold Intel's default value after reset. If the physical
MSRs are not writable (they are *locked* by BIOS before being handed over to
Xen), then we try to read those MSRs and use the physical values as the
default values for the virtual MSRs. Note that rdmsr_safe is used: although
the SDM says that if SGX is present, IA32_SGXLEPUBKEYHASHn are available for
read, in reality Skylake client parts (at least some, depending on BIOS) do
not have those MSRs available, so we use rdmsr_safe and set 'readable' to
false if it returns an error.

For an IA32_SGXLEPUBKEYHASHn read from the guest: if the physical MSRs are
not readable, the guest is not allowed to read them either; otherwise the
vcpu's virtual MSR value is returned.

For an IA32_SGXLEPUBKEYHASHn write from the guest: the write is allowed if
both the physical MSRs are writable and SGX launch control is exposed to the
domain; otherwise an error is injected.

To make EINIT run successfully in the guest, the vcpu's virtual
IA32_SGXLEPUBKEYHASHn are written to the physical MSRs when the vcpu is
scheduled in.
Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/sgx.c | 194 +
 xen/arch/x86/hvm/vmx/vmx.c |  24 +
 xen/include/asm-x86/cpufeature.h   |   3 +
 xen/include/asm-x86/hvm/vmx/sgx.h  |  22 +
 xen/include/asm-x86/hvm/vmx/vmcs.h |   2 +
 xen/include/asm-x86/msr-index.h|   6 ++
 6 files changed, 251 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/sgx.c b/xen/arch/x86/hvm/vmx/sgx.c
index 14379151e8..4944e57aef 100644
--- a/xen/arch/x86/hvm/vmx/sgx.c
+++ b/xen/arch/x86/hvm/vmx/sgx.c
@@ -405,6 +405,200 @@ void hvm_destroy_epc(struct domain *d)
 hvm_reset_epc(d, true);
 }
 
+/* Whether IA32_SGXLEPUBKEYHASHn are physically *unlocked* by BIOS */
+bool_t sgx_ia32_sgxlepubkeyhash_writable(void)
+{
+uint64_t sgx_lc_enabled = IA32_FEATURE_CONTROL_SGX_ENABLE |
+  IA32_FEATURE_CONTROL_SGX_LAUNCH_CONTROL_ENABLE |
+  IA32_FEATURE_CONTROL_LOCK;
+uint64_t val;
+
+rdmsrl(MSR_IA32_FEATURE_CONTROL, val);
+
+return (val & sgx_lc_enabled) == sgx_lc_enabled;
+}
+
+bool_t domain_has_sgx(struct domain *d)
+{
+/* hvm_epc_populated(d) implies CPUID has SGX */
+return hvm_epc_populated(d);
+}
+
+bool_t domain_has_sgx_launch_control(struct domain *d)
+{
+struct cpuid_policy *p = d->arch.cpuid;
+
+if ( !domain_has_sgx(d) )
+return false;
+
+/* Unnecessary but check anyway */
+if ( !cpu_has_sgx_launch_control )
+return false;
+
+return !!p->feat.sgx_launch_control;
+}
+
+/* Digest of Intel signing key. MSR's default value after reset. */
+#define SGX_INTEL_DEFAULT_LEPUBKEYHASH0 0xa6053e051270b7ac
+#define SGX_INTEL_DEFAULT_LEPUBKEYHASH1 0x6cfbe8ba8b3b413d
+#define SGX_INTEL_DEFAULT_LEPUBKEYHASH2 0xc4916d99f2b3735d
+#define SGX_INTEL_DEFAULT_LEPUBKEYHASH3 0xd4f8c05909f9bb3b
+
+void sgx_vcpu_init(struct vcpu *v)
+{
+struct sgx_vcpu *sgxv = to_sgx_vcpu(v);
+
+memset(sgxv, 0, sizeof (*sgxv));
+
+if ( sgx_ia32_sgxlepubkeyhash_writable() )
+{
+/*
+ * If physical MSRs are writable, set vcpu's default value to Intel's
+ * default value. For real machine, after reset, MSRs contain Intel's
+ * default value.
+ */
+sgxv->ia32_sgxlepubkeyhash[0] = SGX_INTEL_DEFAULT_LEPUBKEYHASH0;
+sgxv->ia32_sgxlepubkeyhash[1] = SGX_INTEL_DEFAULT_LEPUBKEYHASH1;
+sgxv->ia32_sgxlepubkeyhash[2] = SGX_INTEL_DEFAULT_LEPUBKEYHASH2;
+sgxv->ia32_sgxlepubkeyhash[3] = SGX_INTEL_DEFAULT_LEPUBKEYHASH3;
+
+sgxv->readable = 1;
+sgxv->writable = domain_has_sgx_launch_control(v->domain);
+}
+else
+{
+uint64_t v;
+/*
+ * Although the SDM says that if SGX is present, IA32_SGXLEPUBKEYHASHn
+ * are available for read, in reality on Skylake client machines those
+ * MSRs may not be available even though SGX is present, so we cannot
+ * rely on cpu_has_sgx to determine whether we are able to read the
+ * MSRs; instead, we

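The rest of the diff is truncated. The read path the commit message describes
amounts to something like the sketch below (the MSR index names are
assumptions based on the msr-index.h additions in this patch):

    int sgx_msr_read_intercept(struct vcpu *v, unsigned int msr,
                               uint64_t *val)
    {
        struct sgx_vcpu *sgxv = to_sgx_vcpu(v);

        if ( msr < MSR_IA32_SGXLEPUBKEYHASH0 ||
             msr > MSR_IA32_SGXLEPUBKEYHASH3 )
            return X86EMUL_UNHANDLEABLE;

        /* Reads are only allowed if the physical MSRs were readable. */
        if ( !sgxv->readable )
            return X86EMUL_EXCEPTION; /* caller injects #GP */

        *val = sgxv->ia32_sgxlepubkeyhash[msr - MSR_IA32_SGXLEPUBKEYHASH0];

        return X86EMUL_OKAY;
    }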
[Xen-devel] [PATCH 01/15] xen: x86: expose SGX to HVM domain in CPU featureset

2017-07-09 Thread Kai Huang
Expose SGX in the CPU featureset for HVM domains. SGX will not be supported
for PV domains, as ENCLS (which the SGX driver in the guest essentially
executes) must run in ring 0, while a PV kernel runs in ring 3. Theoretically
we could support SGX in PV domains, either by emulating the #GP caused by
ENCLS running in ring 3, or via a PV ENCLS interface, but that is really not
necessary at this stage. Currently SGX is only exposed to HAP HVM domains
(shadow support can be added in the future).

SGX Launch Control is also exposed in the CPU featureset for HVM domains. SGX
Launch Control depends on SGX.

Signed-off-by: Kai Huang 
---
 xen/include/public/arch-x86/cpufeatureset.h | 3 ++-
 xen/tools/gen-cpuid.py  | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
index 97dd3534c5..b6c54e654e 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -193,7 +193,7 @@ XEN_CPUFEATURE(XSAVES,4*32+ 3) /*S  XSAVES/XRSTORS instructions */
 /* Intel-defined CPU features, CPUID level 0x0007:0.ebx, word 5 */
 XEN_CPUFEATURE(FSGSBASE,  5*32+ 0) /*A  {RD,WR}{FS,GS}BASE instructions */
 XEN_CPUFEATURE(TSC_ADJUST,5*32+ 1) /*S  TSC_ADJUST MSR available */
-XEN_CPUFEATURE(SGX,   5*32+ 2) /*   Software Guard extensions */
+XEN_CPUFEATURE(SGX,   5*32+ 2) /*H  Intel Software Guard extensions */
 XEN_CPUFEATURE(BMI1,  5*32+ 3) /*A  1st bit manipulation extensions */
 XEN_CPUFEATURE(HLE,   5*32+ 4) /*A  Hardware Lock Elision */
 XEN_CPUFEATURE(AVX2,  5*32+ 5) /*A  AVX2 instructions */
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(PKU,   6*32+ 3) /*H  Protection Keys for Userspace */
 XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*!  OS Protection Keys Enable */
 XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
 XEN_CPUFEATURE(RDPID, 6*32+22) /*A  RDPID instruction */
+XEN_CPUFEATURE(SGX_LAUNCH_CONTROL, 6*32+30) /*H Intel SGX Launch Control */
 
 /* AMD-defined CPU features, CPUID level 0x8007.edx, word 7 */
 XEN_CPUFEATURE(ITSC,  7*32+ 8) /*   Invariant TSC */
diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py
index 9ec4486f2b..1301eee310 100755
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -256,6 +256,9 @@ def crunch_numbers(state):
 AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD,
   AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW,
   AVX512_4FMAPS, AVX512_VPOPCNTDQ],
+
+# SGX Launch Control depends on SGX
+SGX: [SGX_LAUNCH_CONTROL],
 }
 
 deep_features = tuple(sorted(deps.keys()))
-- 
2.11.0




[Xen-devel] [RFC PATCH 00/15] RFC: SGX virtualization design and draft patches

2017-07-09 Thread Kai Huang

2.3 Additional Point: Live Migration, Snapshot Support (?)

Actually, from the hardware's point of view, SGX is not migratable, for two
reasons:

- The SGX key architecture cannot be virtualized.

Some keys are bound to the CPU, for example the Sealing key and the EREPORT
key. If a VM is migrated to another machine, the same enclave will derive
different keys. Taking the Sealing key as an example: it is typically used by
an enclave (which can obtain it via EGETKEY) to *seal* its secrets outside
the enclave (e.g. to persistent storage) for later use. If the Sealing key
changes after VM migration, the enclave can never get the sealed secrets
back, as the old Sealing key cannot be recovered.

- There is no ENCLS leaf to evict an EPC page to normal memory while at the
same time keeping its content in EPC. Currently, once an EPC page is evicted,
it becomes invalid. So technically we are unable to implement live migration
(or checkpointing, or snapshots) for enclaves.

But with some workarounds, and given some properties of the existing SGX
drivers, we are technically able to support live migration (and even
checkpointing and snapshots). This is because:

- Changing keys (which are bound to the CPU) is not a problem in practice.

Take the Sealing key as an example. Losing sealed data is not a problem,
because the Sealing key is only supposed to encrypt secrets that can be
provisioned again. The typical work model is: the enclave gets secrets
provisioned from a remote service provider, and uses the Sealing key to store
them for later use. When the enclave tries to *unseal* using the Sealing key
and the key has changed, the enclave will find the data corrupted (integrity
check failure), so it will ask for the secrets to be provisioned again from
remote. Another reason is that in a data center VMs typically share lots of
data, and as the Sealing key is bound to the CPU, data encrypted by one
enclave on one machine cannot be shared by another enclave on another
machine. So from the SGX app writer's point of view, the developer should
treat the Sealing key as changeable, and should handle loss of sealed data
anyway. The Sealing key should only be used to seal secrets that can easily
be provisioned again.
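To make the recommended work model concrete, here is a minimal sketch of the
unseal-or-reprovision flow. seal_data(), unseal_data() and
provision_from_remote() are hypothetical wrappers invented for illustration
(a real enclave would derive the sealing key via EGETKEY through an SDK);
they are not APIs from this series:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical helpers; names and signatures are assumptions. */
    extern int unseal_data(const uint8_t *blob, size_t blob_len,
                           uint8_t *secret, size_t secret_len);
    extern int seal_data(const uint8_t *secret, size_t secret_len,
                         uint8_t *blob, size_t *blob_len);
    extern int provision_from_remote(uint8_t *secret, size_t secret_len);

    static int load_secret(uint8_t *secret, size_t len,
                           uint8_t *blob, size_t *blob_len)
    {
        /* Try the locally sealed copy first. */
        if ( unseal_data(blob, *blob_len, secret, len) == 0 )
            return 0;

        /*
         * Integrity check failed -- e.g. the sealing key changed because
         * the VM now runs on another physical CPU.  Fall back to remote
         * provisioning, then re-seal with the new key.
         */
        if ( provision_from_remote(secret, len) != 0 )
            return -1;

        return seal_data(secret, len, blob, blob_len);
    }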

For other keys, such as the EREPORT key and the provisioning key (used for
local and remote attestation), losing them is not a problem either, due to
the second reason below.

- Sudden loss of EPC is not a problem.

On hardware, EPC is lost if the system goes to S3-S5, or is reset or shut
down, and the SGX driver needs to handle loss of EPC due to such power
transitions. This is done in cooperation between the SGX driver and the
userspace SGX SDK/apps. During live migration, however, there is no power
transition in the guest, so there is no EPC-loss event either; and
technically we cannot *really* live migrate an enclave (explained above), so
at first glance this looks infeasible. But in fact both the Linux and Windows
SGX drivers already support *sudden* loss of EPC (i.e. loss not tied to a
power transition), meaning both drivers are able to recover if EPC is lost at
any point at runtime. With this, we are technically able to support live
migration by simply ignoring EPC: after the VM is migrated, the destination
VM only suffers a *sudden* loss of EPC, which both the Windows and Linux SGX
drivers can already handle.
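At the application level this recovery is already visible today: with the
Intel SGX SDK, an ECALL into a lost enclave fails with SGX_ERROR_ENCLAVE_LOST
and the application is expected to rebuild the enclave and retry. A rough
sketch follows; ecall_do_work() and ENCLAVE_FILE are made-up names, and the
exact SDK prototypes are from memory, so treat them as assumptions:

    #include <sgx_urts.h>

    #define ENCLAVE_FILE "enclave.signed.so"   /* illustrative path */

    /* Generated by the SDK's edger8r tool from a (made-up) EDL file. */
    extern sgx_status_t ecall_do_work(sgx_enclave_id_t eid, int *retval);

    static sgx_enclave_id_t eid;

    static int do_work_with_retry(void)
    {
        int retval = -1;
        sgx_status_t ret = ecall_do_work(eid, &retval);

        if ( ret == SGX_ERROR_ENCLAVE_LOST )
        {
            /*
             * EPC contents were lost (power transition -- or migration,
             * with the workaround discussed here): rebuild and retry once.
             */
            sgx_destroy_enclave(eid);
            if ( sgx_create_enclave(ENCLAVE_FILE, SGX_DEBUG_FLAG, NULL,
                                    NULL, &eid, NULL) != SGX_SUCCESS )
                return -1;
            ret = ecall_do_work(eid, &retval);
        }

        return ret == SGX_SUCCESS ? retval : -1;
    }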

But we must point out that such *sudden* loss of EPC is not hardware
behavior, and SGX drivers for other OSes (such as FreeBSD) may not implement
this, so for those guests the destination VM will behave in an unexpected
manner. But I am not sure we need to care about other OSes.

For the same reason, we are able to support checkpointing for SGX guests
(Linux and Windows only).

For snapshots, we can support snapshotting an SGX guest by either:

- Suspending the guest before the snapshot (S3-S5). This works for all guests
  but requires the user to manually suspend the guest.
- Issuing a hypercall to destroy the guest's EPC in save_vm (see the sketch
  below). This only works for Linux and Windows but doesn't require user
  intervention.
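To illustrate the second option, here is a sketch of what the tool-side
plumbing might look like. The domctl subop and the libxc wrapper below are
made up for illustration and do not exist in this series:

    /* Hypothetical libxc wrapper for a "destroy guest EPC" domctl. */
    int xc_domain_destroy_epc(xc_interface *xch, uint32_t domid)
    {
        DECLARE_DOMCTL;

        domctl.cmd = XEN_DOMCTL_destroy_epc;   /* made-up subop */
        domctl.domain = (domid_t)domid;

        return do_domctl(xch, &domctl);
    }

save_vm would call this (for Linux and Windows guests only) just before
writing the memory image, leaving the restored guest to recover via the
*sudden* EPC loss path described above.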

What are your comments?

3. Reference

- Intel SGX Homepage
https://software.intel.com/en-us/sgx

- Linux SGX SDK
https://01.org/intel-software-guard-extensions

- Linux SGX driver for upstreaming
https://github.com/01org/linux-sgx

- Intel SGX Specification (SDM Vol 3D)

https://software.intel.com/sites/default/files/managed/7c/f1/332831-sdm-vol-3d.pdf

- Paper: Intel SGX Explained
https://eprint.iacr.org/2016/086.pdf

- ISCA 2015 tutorial slides for Intel® SGX - Intel® Software
https://software.intel.com/sites/default/files/332680-002.pdf

Kai Huang (15):
  xen: x86: expose SGX to HVM domain in CPU featureset
  xen: vmx: detect ENCLS 

[Xen-devel] [PATCH v2] x86/vmx: enable PML by default

2015-11-27 Thread Kai Huang
Since the PML series was merged (but disabled by default) we have conducted
lots of PML tests (live migration, GUI display) and PML has been working fine;
therefore turn it on by default.

The documentation of the PML command line option is adjusted accordingly as well.

Signed-off-by: Kai Huang 
Tested-by: Robert Hu 
Tested-by: Xudong Hao 
---

v1->v2: The documentation of the PML command line option is adjusted accordingly.
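For reference, with the default flipped, PML can still be turned off from the
Xen command line; an illustrative GRUB entry (paths and the rest of the
command line are examples only):

    multiboot /boot/xen.gz ept=no-pml console=vga
    module /boot/vmlinuz-3.16.0 root=/dev/sda1 ro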

In case you want some specific performance data before being convinced to
turn PML on by default, I have pasted, for your reference, the specjbb
performance data (running in a guest in log-dirty mode) that I gathered when
posting the PML patch series to the xen-devel mailing list for review.

=== specjbb performance ===

I measured specjbb performance in the guest both in video RAM tracking mode
(the most common case, I think) and in global log-dirty mode (I made a change
to the XL tool to keep the guest in global log-dirty mode indefinitely, see
below). The numbers show that PML does improve specjbb performance while the
guest is in log-dirty mode, and the more frequently dirty pages are queried,
the larger the performance gain. So while PML probably can't speed up the
live migration process directly, it will be beneficial for use cases such as
monitoring a guest's memory-dirtying rate.
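For reference, the XL tool change mentioned above amounts to something like
the libxc loop below. This is a sketch from memory of the interface of that
era -- the real xc_shadow_control() prototype takes hypercall-buffer
arguments, so treat the exact parameter lists as assumptions:

    #include <xenctrl.h>

    /*
     * Keep the guest in global log-dirty mode and repeatedly harvest the
     * dirty bitmap, so the log-dirty path (PML or write protection) stays
     * exercised while the benchmark runs inside the guest.
     */
    static void hammer_logdirty(xc_interface *xch, uint32_t domid,
                                unsigned long *bitmap, unsigned long pages)
    {
        xc_shadow_control(xch, domid, XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
                          NULL, 0, NULL, 0, NULL);

        for ( ;; )   /* "infinitely", as described above */
            xc_shadow_control(xch, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                              bitmap, pages, NULL, 0, NULL);
    }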

- video ram tracking:

WP  PML 
122805  123887
120792  123249
118577  123348
121856  125195
121286  122056
120139  123037

avg 120909  123462  

    100%    102.11%

performance gain:   2.11% 

- global log-dirty:

WP  PML
72862   79511
73466   81173
72989   81177
73138   81777
72811   80257
72486   80413

avg 72959   80718
    100%    110.63%

performance gain: 10.63%

Test machine: Broadwell server with 16 CPUs (1.6GHz) + 4G memory.
Xen hypervisor: latest upstream Xen
dom0 kernel: 3.16.0
guest: 4 vcpus + 1G memory.
guest os: ubuntu 14.04 with 3.13.0-24-generic kernel.

---
 docs/misc/xen-command-line.markdown | 2 +-
 xen/arch/x86/hvm/vmx/vmcs.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown 
b/docs/misc/xen-command-line.markdown
index c103894..47d148a 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -715,7 +715,7 @@ Controls EPT related features.
 
 > `pml`
 
-> Default: `false`
+> Default: `true`
 
 >> PML is a new hardware feature in Intel's Broadwell Server and further
 >> platforms which reduces hypervisor overhead of log-dirty mechanism by
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 7a7896e..dbf284d 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -64,7 +64,7 @@ integer_param("ple_gap", ple_gap);
 static unsigned int __read_mostly ple_window = 4096;
 integer_param("ple_window", ple_window);
 
-static bool_t __read_mostly opt_pml_enabled = 0;
+static bool_t __read_mostly opt_pml_enabled = 1;
 static s8 __read_mostly opt_ept_ad = -1;
 
 /*
-- 
2.5.0




Re: [Xen-devel] [PATCH] x86/vmx: enable PML by default

2015-11-27 Thread Kai Huang



On 11/27/2015 04:35 PM, Jan Beulich wrote:

On 27.11.15 at 08:57,  wrote:

Since PML series were merged (but disabled by default) we have conducted lots of
PML tests (live migration, GUI display) and PML has been working fine, therefore
turn it on by default.

Well, I'm not really opposed, but if you do this ...


---
  xen/arch/x86/hvm/vmx/vmcs.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

... you need to also adjust docs/misc/xen-command-line.markdown.

Oh. Thanks for pointing out. My mistake. I'll send out v2.

Thanks,
-Kai


Jan




[Xen-devel] [PATCH] x86/vmx: enable PML by default

2015-11-27 Thread Kai Huang
Since the PML series was merged (but disabled by default) we have conducted
lots of PML tests (live migration, GUI display) and PML has been working fine;
therefore turn it on by default.

Signed-off-by: Kai Huang 
Tested-by: Robert Hu 
Tested-by: Xudong Hao 
---

In case you want some specific performance data before being convinced to
turn PML on by default, I have pasted, for your reference, the specjbb
performance data (running in a guest in log-dirty mode) that I gathered when
posting the PML patch series to the xen-devel mailing list for review.

=== specjbb performance ===

I measured specjbb performance in the guest both in video RAM tracking mode
(the most common case, I think) and in global log-dirty mode (I made a change
to the XL tool to keep the guest in global log-dirty mode indefinitely, see
below). The numbers show that PML does improve specjbb performance while the
guest is in log-dirty mode, and the more frequently dirty pages are queried,
the larger the performance gain. So while PML probably can't speed up the
live migration process directly, it will be beneficial for use cases such as
monitoring a guest's memory-dirtying rate.

- video ram tracking:

WP  PML 
122805  123887
120792  123249
118577  123348
121856  125195
121286  122056
120139  123037

avg 120909  123462  

    100%    102.11%

performance gain:   2.11% 

- global log-dirty:

WP  PML
72862   79511
73466   81173
72989   81177
73138   81777
72811   80257
72486   80413

avg 72959   80718
    100%    110.63%

performance gain: 10.63%

Test machine: Broadwell server with 16 CPUs (1.6GHz) + 4G memory.
Xen hypervisor: latest upstream Xen
dom0 kernel: 3.16.0
guest: 4 vcpus + 1G memory.
guest os: ubuntu 14.04 with 3.13.0-24-generic kernel.

---
 xen/arch/x86/hvm/vmx/vmcs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 7a7896e..dbf284d 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -64,7 +64,7 @@ integer_param("ple_gap", ple_gap);
 static unsigned int __read_mostly ple_window = 4096;
 integer_param("ple_window", ple_window);
 
-static bool_t __read_mostly opt_pml_enabled = 0;
+static bool_t __read_mostly opt_pml_enabled = 1;
 static s8 __read_mostly opt_ept_ad = -1;
 
 /*
-- 
2.5.0




Re: [Xen-devel] [PATCH 0/2] Defer enabling of EPT A/D bit plus coding style fix

2015-10-20 Thread Kai Huang

Hi Kevin,

Would you comment on the two patches?

Thanks,
-Kai

On 10/20/2015 10:34 AM, Kai Huang wrote:

Patch 1 is v2 of deferring enabling of the EPT A/D bit until PML gets enabled,
with Jan's comments on v1 addressed. Patch 2 is a coding style fix of
for_each_vcpu in the existing PML functions, as suggested by Jan.

Kai Huang (2):
   x86/ept: defer enabling of EPT A/D bit until PML get enabled.
   x86/vmx: fix coding style of PML functions

  xen/arch/x86/hvm/vmx/vmcs.c| 32 
  xen/arch/x86/mm/p2m-ept.c  | 24 
  xen/include/asm-x86/hvm/vmx/vmcs.h |  2 ++
  3 files changed, 50 insertions(+), 8 deletions(-)






[Xen-devel] [PATCH 2/2] x86/vmx: fix coding style of PML functions

2015-10-19 Thread Kai Huang
According to Jan's comments, also fix the coding style of for_each_vcpu in
existing PML functions.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmcs.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index c11f3ec..4ea1ad1 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1500,7 +1500,7 @@ int vmx_domain_enable_pml(struct domain *d)
 if ( vmx_domain_pml_enabled(d) )
 return 0;
 
-for_each_vcpu( d, v )
+for_each_vcpu ( d, v )
 if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
 goto error;
 
@@ -1509,7 +1509,7 @@ int vmx_domain_enable_pml(struct domain *d)
 return 0;
 
  error:
-for_each_vcpu( d, v )
+for_each_vcpu ( d, v )
 if ( vmx_vcpu_pml_enabled(v) )
 vmx_vcpu_disable_pml(v);
 return rc;
@@ -1530,7 +1530,7 @@ void vmx_domain_disable_pml(struct domain *d)
 if ( !vmx_domain_pml_enabled(d) )
 return;
 
-for_each_vcpu( d, v )
+for_each_vcpu ( d, v )
 vmx_vcpu_disable_pml(v);
 
 d->arch.hvm_domain.vmx.status &= ~VMX_DOMAIN_PML_ENABLED;
@@ -1549,7 +1549,7 @@ void vmx_domain_flush_pml_buffers(struct domain *d)
 if ( !vmx_domain_pml_enabled(d) )
 return;
 
-for_each_vcpu( d, v )
+for_each_vcpu ( d, v )
 vmx_vcpu_flush_pml_buffer(v);
 }
 
-- 
2.1.4




[Xen-devel] [PATCH 1/2] x86/ept: defer enabling of EPT A/D bit until PML get enabled.

2015-10-19 Thread Kai Huang
The existing PML implementation turns on the EPT A/D bit unconditionally if
PML is supported by hardware. This works, but enabling of the EPT A/D bit can
be deferred until PML gets enabled. There's no point in enabling the extra
feature for every domain when we're not meaning to use it (yet).

Also add an ASSERT that the domain has been paused to ept_flush_pml_buffers,
to make it consistent with ept_{en,dis}able_pml.

Sanity tests of live migration and GUI display were run on a Broadwell machine.

Signed-off-by: Kai Huang 
Suggested-by: Jan Beulich 
---
 xen/arch/x86/hvm/vmx/vmcs.c| 24 
 xen/arch/x86/mm/p2m-ept.c  | 24 
 xen/include/asm-x86/hvm/vmx/vmcs.h |  2 ++
 3 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 3592a88..c11f3ec 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1553,6 +1553,30 @@ void vmx_domain_flush_pml_buffers(struct domain *d)
 vmx_vcpu_flush_pml_buffer(v);
 }
 
+static void vmx_vcpu_update_eptp(struct vcpu *v, u64 eptp)
+{
+vmx_vmcs_enter(v);
+__vmwrite(EPT_POINTER, eptp);
+vmx_vmcs_exit(v);
+}
+
+/*
+ * Update EPTP data to VMCS of all vcpus of the domain. Must be called when
+ * domain is paused.
+ */
+void vmx_domain_update_eptp(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+struct vcpu *v;
+
+ASSERT(atomic_read(&d->pause_count));
+
+for_each_vcpu ( d, v )
+vmx_vcpu_update_eptp(v, ept_get_eptp(&p2m->ept));
+
+ept_sync_domain(p2m);
+}
+
 int vmx_create_vmcs(struct vcpu *v)
 {
 struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 74ce9e0..86440fc 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -1129,21 +1129,39 @@ void ept_sync_domain(struct p2m_domain *p2m)
 
 static void ept_enable_pml(struct p2m_domain *p2m)
 {
+/* Domain must have been paused */
+ASSERT(atomic_read(&p2m->domain->pause_count));
+
 /*
- * No need to check if vmx_domain_enable_pml has succeeded or not, as
+ * No need to return whether vmx_domain_enable_pml has succeeded, as
  * ept_p2m_type_to_flags will do the check, and write protection will be
  * used if PML is not enabled.
  */
-vmx_domain_enable_pml(p2m->domain);
+if ( vmx_domain_enable_pml(p2m->domain) )
+return;
+
+/* Enable EPT A/D bit for PML */
+p2m->ept.ept_ad = 1;
+vmx_domain_update_eptp(p2m->domain);
 }
 
 static void ept_disable_pml(struct p2m_domain *p2m)
 {
+/* Domain must have been paused */
+ASSERT(atomic_read(&p2m->domain->pause_count));
+
 vmx_domain_disable_pml(p2m->domain);
+
+/* Disable EPT A/D bit */
+p2m->ept.ept_ad = 0;
+vmx_domain_update_eptp(p2m->domain);
 }
 
 static void ept_flush_pml_buffers(struct p2m_domain *p2m)
 {
+/* Domain must have been paused */
+ASSERT(atomic_read(&p2m->domain->pause_count));
+
 vmx_domain_flush_pml_buffers(p2m->domain);
 }
 
@@ -1166,8 +1184,6 @@ int ept_p2m_init(struct p2m_domain *p2m)
 
 if ( cpu_has_vmx_pml )
 {
-/* Enable EPT A/D bits if we are going to use PML. */
-ept->ept_ad = cpu_has_vmx_pml ? 1 : 0;
 p2m->enable_hardware_log_dirty = ept_enable_pml;
 p2m->disable_hardware_log_dirty = ept_disable_pml;
 p2m->flush_hardware_cached_dirty = ept_flush_pml_buffers;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index f1126d4..ec526db 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -518,6 +518,8 @@ int vmx_domain_enable_pml(struct domain *d);
 void vmx_domain_disable_pml(struct domain *d);
 void vmx_domain_flush_pml_buffers(struct domain *d);
 
+void vmx_domain_update_eptp(struct domain *d);
+
 #endif /* ASM_X86_HVM_VMX_VMCS_H__ */
 
 /*
-- 
2.1.4




[Xen-devel] [PATCH 0/2] Defer enabling of EPT A/D bit plus coding style fix

2015-10-19 Thread Kai Huang
Patch 1 is v2 of deferring enabling of the EPT A/D bit until PML gets enabled,
with Jan's comments on v1 addressed. Patch 2 is a coding style fix of
for_each_vcpu in the existing PML functions, as suggested by Jan.

Kai Huang (2):
  x86/ept: defer enabling of EPT A/D bit until PML get enabled.
  x86/vmx: fix coding style of PML functions

 xen/arch/x86/hvm/vmx/vmcs.c| 32 
 xen/arch/x86/mm/p2m-ept.c  | 24 
 xen/include/asm-x86/hvm/vmx/vmcs.h |  2 ++
 3 files changed, 50 insertions(+), 8 deletions(-)

-- 
2.1.4




Re: [Xen-devel] [PATCH] x86/ept: defer enabling of EPT A/D bit until PML get enabled.

2015-10-16 Thread Kai Huang



On 10/16/2015 04:17 PM, Jan Beulich wrote:

On 16.10.15 at 04:21,  wrote:

Existing PML implementation turns on EPT A/D bit unconditionally if PML is
supported by hardware. This works but enabling of EPT A/D bit can be
deferred
until PML get enabled. There's no point in enabling the extra feature for
every
domain when we're not meaning to use it (yet).

Sanity live migration and GUI display were tested on Broadwell Machine.

Signed-off-by: Kai Huang 
Signed-off-by: Jan Beulich 

There's so little in this patch that came from me that I don't think this is
warranted; but if you want to keep it, the order needs to be switched.
Instead I'd suggest Suggested-by:.

I'll change it to Suggested-by.




+void vmx_domain_update_eptp(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+struct vcpu *v;
+
+ASSERT(atomic_read(&d->pause_count));

This should imo check controller_pause_count.
This function is called between domain_pause and domain_unpause, and 
domain_pause increases d->pause_count, not d->controller_pause_count, so 
we should check d->pause_count, right?





+for_each_vcpu( d, v )

Coding style: You need to settle on whether you want to treat
for_each_vcpu like a keyword (then there's a blank missing before
the opening paren) or like a normal identifier (then the blanks
immediately inside the parens need to go away).

Oh. I will add a blank before the opening paren.




  static void ept_flush_pml_buffers(struct p2m_domain *p2m)
  {
+/* Domain must have been paused */
+ASSERT(atomic_read(&p2m->domain->pause_count));

This seems unrelated - did you really mean it to go into this patch?
This function is also supposed to be called when the domain is paused, so to
make it consistent with ept_{en,dis}able_pml I also added the ASSERT here. Is
this reasonable?


Thanks,
-Kai


Jan




[Xen-devel] [PATCH] x86/ept: defer enabling of EPT A/D bit until PML get enabled.

2015-10-15 Thread Kai Huang
The existing PML implementation turns on the EPT A/D bit unconditionally if
PML is supported by hardware. This works, but enabling of the EPT A/D bit can
be deferred until PML gets enabled. There's no point in enabling the extra
feature for every domain when we're not meaning to use it (yet).

Sanity tests of live migration and GUI display were run on a Broadwell machine.

Signed-off-by: Kai Huang 
Signed-off-by: Jan Beulich 
---
 xen/arch/x86/hvm/vmx/vmcs.c| 24 
 xen/arch/x86/mm/p2m-ept.c  | 24 
 xen/include/asm-x86/hvm/vmx/vmcs.h |  2 ++
 3 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 3592a88..cddab15 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1553,6 +1553,30 @@ void vmx_domain_flush_pml_buffers(struct domain *d)
 vmx_vcpu_flush_pml_buffer(v);
 }
 
+static void vmx_vcpu_update_eptp(struct vcpu *v, u64 eptp)
+{
+vmx_vmcs_enter(v);
+__vmwrite(EPT_POINTER, eptp);
+vmx_vmcs_exit(v);
+}
+
+/*
+ * Update EPTP data to VMCS of all vcpus of the domain. Must be called when
+ * domain is paused.
+ */
+void vmx_domain_update_eptp(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+struct vcpu *v;
+
+ASSERT(atomic_read(&d->pause_count));
+
+for_each_vcpu( d, v )
+vmx_vcpu_update_eptp(v, ept_get_eptp(&p2m->ept));
+
+ept_sync_domain(p2m);
+}
+
 int vmx_create_vmcs(struct vcpu *v)
 {
 struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 74ce9e0..86440fc 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -1129,21 +1129,39 @@ void ept_sync_domain(struct p2m_domain *p2m)
 
 static void ept_enable_pml(struct p2m_domain *p2m)
 {
+/* Domain must have been paused */
+ASSERT(atomic_read(&p2m->domain->pause_count));
+
 /*
- * No need to check if vmx_domain_enable_pml has succeeded or not, as
+ * No need to return whether vmx_domain_enable_pml has succeeded, as
  * ept_p2m_type_to_flags will do the check, and write protection will be
  * used if PML is not enabled.
  */
-vmx_domain_enable_pml(p2m->domain);
+if ( vmx_domain_enable_pml(p2m->domain) )
+return;
+
+/* Enable EPT A/D bit for PML */
+p2m->ept.ept_ad = 1;
+vmx_domain_update_eptp(p2m->domain);
 }
 
 static void ept_disable_pml(struct p2m_domain *p2m)
 {
+/* Domain must have been paused */
+ASSERT(atomic_read(&p2m->domain->pause_count));
+
 vmx_domain_disable_pml(p2m->domain);
+
+/* Disable EPT A/D bit */
+p2m->ept.ept_ad = 0;
+vmx_domain_update_eptp(p2m->domain);
 }
 
 static void ept_flush_pml_buffers(struct p2m_domain *p2m)
 {
+/* Domain must have been paused */
+ASSERT(atomic_read(&p2m->domain->pause_count));
+
 vmx_domain_flush_pml_buffers(p2m->domain);
 }
 
@@ -1166,8 +1184,6 @@ int ept_p2m_init(struct p2m_domain *p2m)
 
 if ( cpu_has_vmx_pml )
 {
-/* Enable EPT A/D bits if we are going to use PML. */
-ept->ept_ad = cpu_has_vmx_pml ? 1 : 0;
 p2m->enable_hardware_log_dirty = ept_enable_pml;
 p2m->disable_hardware_log_dirty = ept_disable_pml;
 p2m->flush_hardware_cached_dirty = ept_flush_pml_buffers;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index f1126d4..ec526db 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -518,6 +518,8 @@ int vmx_domain_enable_pml(struct domain *d);
 void vmx_domain_disable_pml(struct domain *d);
 void vmx_domain_flush_pml_buffers(struct domain *d);
 
+void vmx_domain_update_eptp(struct domain *d);
+
 #endif /* ASM_X86_HVM_VMX_VMCS_H__ */
 
 /*
-- 
2.1.4




Re: [Xen-devel] [PATCH] x86/EPT: defer enabling of A/D maintenance until PML get enabled

2015-10-15 Thread Kai Huang



On 10/15/2015 03:35 PM, Kai Huang wrote:



On 10/15/2015 03:11 PM, Jan Beulich wrote:

On 15.10.15 at 08:42,  wrote:

Thanks for your comments, Jan. Actually I am not happy with combining the
EPT A/D bit update with PML enabling into a single function. After thinking
again, how about adding a separate vmx function (e.g. vmx_domain_update_eptp)
to update the EPTP in the VMCS of all vcpus of the domain after
p2m->ept.ept_ad is updated? Another benefit is that this function can also
be used in the future for other runtime updates to p2m->ept.

What's your idea?
I don't mind, but that's really more of a question to the VMX 
maintainers.

Then I would prefer this way.

Kevin,

Do you have any comments on this thread?



--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -1129,17 +1129,26 @@ void ept_sync_domain(struct p2m_domain *p2m)

   static void ept_enable_pml(struct p2m_domain *p2m)
   {
   /*
- * No need to check if vmx_domain_enable_pml has succeeded or not, as
+ * No need to return if vmx_domain_enable_pml has succeeded or not, as

It seems to me that you'd better use "whether" instead of "if" now
(and then perhaps also drop the "or not").

OK. Thanks.


* ept_p2m_type_to_flags will do the check, and write protection will be
* used if PML is not enabled.
*/
-vmx_domain_enable_pml(p2m->domain);
+if ( vmx_domain_enable_pml(p2m->domain) )
+return;
+
+p2m->ept.ept_ad = 1;
+vmx_domain_update_eptp(p2m->domain);

Shouldn't you enable A/D _before_ enabling PML, at least without
having a domain-is-paused check here?
It looks like we don't have such a check. How about just adding
ASSERT(atomic_read(&d->pause_count)), the same as in vmx_domain_enable_pml?
I mean, we could enable A/D before enabling PML, but then we would need
additional code to clear the A/D bit if vmx_domain_enable_pml failed. My
thinking is that since the function is called while the domain is paused,
there is no difference between enabling A/D before or after enabling PML.
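For clarity, the alternative ordering being discussed would look something
like this (sketch only, not proposed code):

    static void ept_enable_pml(struct p2m_domain *p2m)
    {
        /* Enable A/D first ... */
        p2m->ept.ept_ad = 1;
        vmx_domain_update_eptp(p2m->domain);

        /* ... and undo it if PML enabling fails. */
        if ( vmx_domain_enable_pml(p2m->domain) )
        {
            p2m->ept.ept_ad = 0;
            vmx_domain_update_eptp(p2m->domain);
        }
    }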


Thanks,
-Kai


Thanks,
-Kai


Jan







Re: [Xen-devel] [PATCH] x86/EPT: defer enabling of A/D maintenance until PML get enabled

2015-10-15 Thread Kai Huang



On 10/15/2015 03:11 PM, Jan Beulich wrote:

On 15.10.15 at 08:42,  wrote:

Thanks for your comments, Jan. Actually I am not happy with combining the
EPT A/D bit update with PML enabling into a single function. After thinking
again, how about adding a separate vmx function (e.g. vmx_domain_update_eptp)
to update the EPTP in the VMCS of all vcpus of the domain after
p2m->ept.ept_ad is updated? Another benefit is that this function can also
be used in the future for other runtime updates to p2m->ept.

What's your idea?

I don't mind, but that's really more of a question to the VMX maintainers.

Then I would prefer this way.

Kevin,

Do you have any comments on this thread?



--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -1129,17 +1129,26 @@ void ept_sync_domain(struct p2m_domain *p2m)

   static void ept_enable_pml(struct p2m_domain *p2m)
   {
   /*
- * No need to check if vmx_domain_enable_pml has succeeded or not, as
+ * No need to return if vmx_domain_enable_pml has succeeded or not, as

It seems to me that you'd better use "whether" instead of "if" now
(and then perhaps also drop the "or not").

OK. Thanks.



* ept_p2m_type_to_flags will do the check, and write protection will be
* used if PML is not enabled.
*/
-vmx_domain_enable_pml(p2m->domain);
+if ( vmx_domain_enable_pml(p2m->domain) )
+return;
+
+p2m->ept.ept_ad = 1;
+vmx_domain_update_eptp(p2m->domain);

Shouldn't you enable A/D _before_ enabling PML, at least without
having a domain-is-paused check here?
It looks like we don't have such a check. How about just adding
ASSERT(atomic_read(&d->pause_count)), the same as in vmx_domain_enable_pml?


Thanks,
-Kai


Jan







Re: [Xen-devel] [PATCH] x86/EPT: defer enabling of A/D maintenance until PML get enabled

2015-10-14 Thread Kai Huang



On 10/14/2015 05:26 PM, Jan Beulich wrote:

On 14.10.15 at 11:08,  wrote:

After some thinking, just set/clear p2m->ept.ept_ad is not enough -- we
also need to __vmwrite it to VMCS's EPTP, and then call ept_sync_domain.

Ah, yes, this makes sense of course.


I have verified attached patch can work.

Thanks!


Which implementation would you prefer, existing code or with attached
patch? If you prefer the latter, please provide comments.

I think it's marginal whether to flip the bit in ept_{en,dis}able_pml()
or vmx_domain_{en,dis}able_pml(); the former would seem slightly
more logical.

There's one possible problem with the patch though: Deferring the
sync from the vcpu to the domain function is fine when the domain
function is the caller, but what about the calls out of vmx.c? The
calls look safe as the domain isn't running (yet or anymore) at that
point, but the respective comments may need adjustment (and
the disable one should also refer to vmx_domain_disable_pml()),
in order to avoid confusing future readers. Also you'd need to fix
coding style of these new comments.
Thanks for your comments, Jan. Actually I am not happy with combining the
EPT A/D bit update with PML enabling into a single function. After thinking
again, how about adding a separate vmx function (e.g. vmx_domain_update_eptp)
to update the EPTP in the VMCS of all vcpus of the domain after
p2m->ept.ept_ad is updated? Another benefit is that this function can also
be used in the future for other runtime updates to p2m->ept.


What's your idea?

Below is temporary code that I have verified to work. If you are OK with
this approach (comments are welcome), I will send out the formal patch.


diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 3592a88..cddab15 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1553,6 +1553,30 @@ void vmx_domain_flush_pml_buffers(struct domain *d)
 vmx_vcpu_flush_pml_buffer(v);
 }

+static void vmx_vcpu_update_eptp(struct vcpu *v, u64 eptp)
+{
+vmx_vmcs_enter(v);
+__vmwrite(EPT_POINTER, eptp);
+vmx_vmcs_exit(v);
+}
+
+/*
+ * Update EPTP data to VMCS of all vcpus of the domain. Must be called when
+ * domain is paused.
+ */
+void vmx_domain_update_eptp(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+struct vcpu *v;
+
+ASSERT(atomic_read(&d->pause_count));
+
+for_each_vcpu( d, v )
+vmx_vcpu_update_eptp(v, ept_get_eptp(&p2m->ept));
+
+ept_sync_domain(p2m);
+}
+
 int vmx_create_vmcs(struct vcpu *v)
 {
 struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 74ce9e0..cbba06a 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -1129,17 +1129,26 @@ void ept_sync_domain(struct p2m_domain *p2m)

 static void ept_enable_pml(struct p2m_domain *p2m)
 {
 /*
- * No need to check if vmx_domain_enable_pml has succeeded or not, as
+ * No need to return if vmx_domain_enable_pml has succeeded or not, as
  * ept_p2m_type_to_flags will do the check, and write protection will be
  * used if PML is not enabled.
  */
-vmx_domain_enable_pml(p2m->domain);
+if ( vmx_domain_enable_pml(p2m->domain) )
+return;
+
+p2m->ept.ept_ad = 1;
+vmx_domain_update_eptp(p2m->domain);
 }

 static void ept_disable_pml(struct p2m_domain *p2m)
 {
 vmx_domain_disable_pml(p2m->domain);
+
+p2m->ept.ept_ad = 0;
+vmx_domain_update_eptp(p2m->domain);
 }

 static void ept_flush_pml_buffers(struct p2m_domain *p2m)
@@ -1166,8 +1177,6 @@ int ept_p2m_init(struct p2m_domain *p2m)

 if ( cpu_has_vmx_pml )
 {
-/* Enable EPT A/D bits if we are going to use PML. */
-ept->ept_ad = cpu_has_vmx_pml ? 1 : 0;
 p2m->enable_hardware_log_dirty = ept_enable_pml;
 p2m->disable_hardware_log_dirty = ept_disable_pml;
 p2m->flush_hardware_cached_dirty = ept_flush_pml_buffers;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h

index f1126d4..ec526db 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -518,6 +518,8 @@ int vmx_domain_enable_pml(struct domain *d);
 void vmx_domain_disable_pml(struct domain *d);
 void vmx_domain_flush_pml_buffers(struct domain *d);

+void vmx_domain_update_eptp(struct domain *d);
+
 #endif /* ASM_X86_HVM_VMX_VMCS_H__ */


Thanks,
-Kai



Jan




Re: [Xen-devel] [PATCH] x86/EPT: defer enabling of A/D maintenance until PML get enabled

2015-10-14 Thread Kai Huang

Hi Jan,

After some thinking, just setting/clearing p2m->ept.ept_ad is not enough --
we also need to __vmwrite it to the VMCS's EPTP, and then call
ept_sync_domain. I have verified that the attached patch works.

Which implementation would you prefer, the existing code or the attached
patch? If you prefer the latter, please provide comments.


Thanks,
-Kai

On 10/14/2015 09:19 AM, Kai Huang wrote:

Hi Jan,

Our QA tested this patch but this patch broke PML. Neither GUI display 
(video ram tracking also uses PML) nor live migration works. I'll 
investigate what's wrong and get back to you.


Thanks,
-Kai

On 09/30/2015 08:45 PM, Kai Huang wrote:

On Wed, Sep 30, 2015 at 5:54 PM, Jan Beulich  wrote:

On 30.09.15 at 10:58,  wrote:

Good to me, if you have tested it. Sorry I cannot test it as I am
taking vacation  until Oct.8.

Note how I asked for help with testing ...

On Mon, Sep 28, 2015 at 10:42 PM, Jan Beulich  
wrote:

There's no point in enabling the extra feature for every domain when
we're not meaning to use it (yet). Just setting the flag should be
sufficient - the domain is required to be paused for PML enabling
anyway, i.e. hardware will pick up the new setting the next time
each vCPU of the guest gets scheduled.

Signed-off-by: Jan Beulich 
Cc: Kai Huang 
---
VT-x maintainers, Kai: Me lacking the hardware to test this, may I 
ask

for your help here?

... here. This patch can certainly wait until you get back from
vacation.
Thanks. I'll test it or ask someone who has a machine to test it after I
get back.


Thanks,
-Kai

Jan










From cd01ef0908ee6d0931ea15ff25606f76fe859757 Mon Sep 17 00:00:00 2001
From: Kai Huang 
Date: Wed, 14 Oct 2015 17:01:24 +0800
Subject: [PATCH] x86/ept: defer enabling EPT A/D bit until PML is enabled.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmcs.c | 20 
 xen/arch/x86/mm/p2m-ept.c   |  2 --
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 3592a88..9bb278b 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1382,6 +1382,8 @@ bool_t vmx_vcpu_pml_enabled(const struct vcpu *v)
 
 int vmx_vcpu_enable_pml(struct vcpu *v)
 {
+struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
+
 if ( vmx_vcpu_pml_enabled(v) )
 return 0;
 
@@ -1399,6 +1401,9 @@ int vmx_vcpu_enable_pml(struct vcpu *v)
 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
   v->arch.hvm_vmx.secondary_exec_control);
 
+/* we leave ept_sync_domain to vmx_domain_enable_pml */
+__vmwrite(EPT_POINTER, ept_get_eptp(&p2m->ept));
+
 vmx_vmcs_exit(v);
 
 return 0;
@@ -1406,6 +1411,8 @@ int vmx_vcpu_enable_pml(struct vcpu *v)
 
 void vmx_vcpu_disable_pml(struct vcpu *v)
 {
+struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
+
 if ( !vmx_vcpu_pml_enabled(v) )
 return;
 
@@ -1418,6 +1425,9 @@ void vmx_vcpu_disable_pml(struct vcpu *v)
 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
   v->arch.hvm_vmx.secondary_exec_control);
 
+/* we leave ept_sync_domain to vmx_domain_disable_pml */
+__vmwrite(EPT_POINTER, ept_get_eptp(&p2m->ept));
+
 vmx_vmcs_exit(v);
 
 v->domain->arch.paging.free_page(v->domain, v->arch.hvm_vmx.pml_pg);
@@ -1492,6 +1502,7 @@ bool_t vmx_domain_pml_enabled(const struct domain *d)
  */
 int vmx_domain_enable_pml(struct domain *d)
 {
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
 struct vcpu *v;
 int rc;
 
@@ -1500,10 +1511,14 @@ int vmx_domain_enable_pml(struct domain *d)
 if ( vmx_domain_pml_enabled(d) )
 return 0;
 
+p2m->ept.ept_ad = 1;
+
 for_each_vcpu( d, v )
 if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
 goto error;
 
+ept_sync_domain(p2m);
+
 d->arch.hvm_domain.vmx.status |= VMX_DOMAIN_PML_ENABLED;
 
 return 0;
@@ -1523,6 +1538,7 @@ int vmx_domain_enable_pml(struct domain *d)
  */
 void vmx_domain_disable_pml(struct domain *d)
 {
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
 struct vcpu *v;
 
 ASSERT(atomic_read(&d->pause_count));
@@ -1530,10 +1546,14 @@ void vmx_domain_disable_pml(struct domain *d)
 if ( !vmx_domain_pml_enabled(d) )
 return;
 
+p2m->ept.ept_ad = 0;
+
 for_each_vcpu( d, v )
 vmx_vcpu_disable_pml(v);
 
 d->arch.hvm_domain.vmx.status &= ~VMX_DOMAIN_PML_ENABLED;
+
+ept_sync_domain(p2m);
 }
 
 /*
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 74ce9e0..0d689b0 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -1166,8 +1166,6 @@ int ept_p2m_init(struct p2m_domain *p2m)
 
 if ( cpu_has_vmx_pml )
 {
-/* Enable EPT A/D bits if we are going to use PML. */
-ept->ept_ad = cpu_has_vmx_pml ? 1 : 0

Re: [Xen-devel] [PATCH] x86/EPT: defer enabling of A/D maintenance until PML get enabled

2015-10-13 Thread Kai Huang

Hi Jan,

Our QA tested this patch but this patch broke PML. Neither GUI display 
(video ram tracking also uses PML) nor live migration works. I'll 
investigate what's wrong and get back to you.


Thanks,
-Kai

On 09/30/2015 08:45 PM, Kai Huang wrote:

On Wed, Sep 30, 2015 at 5:54 PM, Jan Beulich  wrote:

On 30.09.15 at 10:58,  wrote:

Good to me, if you have tested it. Sorry I cannot test it as I am
taking vacation  until Oct.8.

Note how I asked for help with testing ...


On Mon, Sep 28, 2015 at 10:42 PM, Jan Beulich  wrote:

There's no point in enabling the extra feature for every domain when
we're not meaning to use it (yet). Just setting the flag should be
sufficient - the domain is required to be paused for PML enabling
anyway, i.e. hardware will pick up the new setting the next time
each vCPU of the guest gets scheduled.

Signed-off-by: Jan Beulich 
Cc: Kai Huang 
---
VT-x maintainers, Kai: Me lacking the hardware to test this, may I ask
for your help here?

... here. This patch can certainly wait until you get back from
vacation.

Thanks. I'll test it or ask someone who has a machine to test it after I get back.

Thanks,
-Kai

Jan









Re: [Xen-devel] [PATCH v3] x86/EPT: work around hardware erratum setting A bit

2015-10-13 Thread Kai Huang



On 10/02/2015 05:36 PM, Wei Liu wrote:

On Wed, Sep 30, 2015 at 01:25:49PM +0100, Wei Liu wrote:

On Wed, Sep 30, 2015 at 05:36:22AM -0600, Jan Beulich wrote:

Since commit 191b3f3344ee ("p2m/ept: enable PML in p2m-ept for
log-dirty"), the A and D bits of EPT paging entries are set
unconditionally, regardless of whether PML is enabled or not. This
causes a regression in Xen 4.6 on some processors due to Intel Errata
AVR41 -- HVM guests get severe memory corruption when the A bit is set
due to incorrect TLB flushing on mov to cr3. The errata affects the Atom
C2000 family (Avoton).

To fix, do not set the A bit on this processor family.

Signed-off-by: Ross Lagerwall 

Move feature suppression to feature detection code. Add command line
override.

Signed-off-by: Jan Beulich 


Release-acked-by: Wei Liu 

Thanks for handling this issue!

In light of both the author and vmx maintainer are on vacation until
October 8, I think we might as well commit this today.

Kevin and Kai, when you're back, please have a look at this patch. And,
if you disagree with the approach, please provide a patch to be
backported to 4.6.1.

Hi Wei,

Sorry for the late response. Our QA (Xudong and Robert, also copied here)
has conducted PML testing on the Xen 4.6 release (which contains this patch)
and it looks like everything works fine.


Thanks,
-Kai


Wei.






Re: [Xen-devel] [PATCH] x86/EPT: defer enabling of A/D maintenance until PML get enabled

2015-09-30 Thread Kai Huang
On Wed, Sep 30, 2015 at 5:54 PM, Jan Beulich  wrote:
>>>> On 30.09.15 at 10:58,  wrote:
>> Good to me, if you have tested it. Sorry I cannot test it as I am
>> taking vacation  until Oct.8.
>
> Note how I asked for help with testing ...
>
>> On Mon, Sep 28, 2015 at 10:42 PM, Jan Beulich  wrote:
>>> There's no point in enabling the extra feature for every domain when
>>> we're not meaning to use it (yet). Just setting the flag should be
>>> sufficient - the domain is required to be paused for PML enabling
>>> anyway, i.e. hardware will pick up the new setting the next time
>>> each vCPU of the guest gets scheduled.
>>>
>>> Signed-off-by: Jan Beulich 
>>> Cc: Kai Huang 
>>> ---
>>> VT-x maintainers, Kai: Me lacking the hardware to test this, may I ask
>>> for your help here?
>
> ... here. This patch can certainly wait until you get back from
> vacation.

Thanks. I'll test it or ask someone who has a machine to test it after I get back.

Thanks,
-Kai
>
> Jan
>



-- 
Thanks,
-Kai



Re: [Xen-devel] [PATCHv2 for-4.6] p2m/ept: Work around hardware errata setting A bit

2015-09-30 Thread Kai Huang
On Mon, Sep 28, 2015 at 10:09 PM, Jan Beulich  wrote:
 On 28.09.15 at 14:39,  wrote:
>> --- a/xen/arch/x86/mm/p2m-ept.c
>> +++ b/xen/arch/x86/mm/p2m-ept.c
>> @@ -34,6 +34,8 @@
>>
>>  #include "mm-locks.h"
>>
>> +static bool_t __read_mostly cpu_has_ept_ad;
>
> This should be
> #define cpu_has_ept_ad (vmx_ept_vpid_cap & VMX_EPT_AD_BIT)
> put next to the respective other ones in vmx.h.

+1.

>
>> @@ -1150,6 +1152,9 @@ int ept_p2m_init(struct p2m_domain *p2m)
>>  p2m->memory_type_changed = ept_memory_type_changed;
>>  p2m->audit_p2m = NULL;
>>
>> +/* Work around Erratum AVR41 on Avoton processors. */
>> +cpu_has_ept_ad = boot_cpu_data.x86_model != 0x4d;
>
> And this one then should turn off said flag (i.e. needs to be moved
> elsewhere).

However, if cpu_has_ept_ad is going to be a macro definition as suggested
above, it looks like we need another variable.

>
> Plus PML initialization should get a respective check added.

My first thought is to check whether cpu_has_ept_ad is set in ept_enable_pml.

Btw, sorry, I am taking vacation until Oct. 8, so please expect slow
responses if you need my comments.

Thanks,
-Kai

>
> Jan
>
>
> ___
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel



-- 
Thanks,
-Kai



Re: [Xen-devel] [PATCH] x86/EPT: defer enabling of A/D maintenance until PML get enabled

2015-09-30 Thread Kai Huang
Looks good to me, if you have tested it. Sorry I cannot test it myself, as I
am taking vacation until Oct. 8.

Thanks,
-Kai

On Mon, Sep 28, 2015 at 10:42 PM, Jan Beulich  wrote:
> There's no point in enabling the extra feature for every domain when
> we're not meaning to use it (yet). Just setting the flag should be
> sufficient - the domain is required to be paused for PML enabling
> anyway, i.e. hardware will pick up the new setting the next time
> each vCPU of the guest gets scheduled.
>
> Signed-off-by: Jan Beulich 
> Cc: Kai Huang 
> ---
> VT-x maintainers, Kai: Me lacking the hardware to test this, may I ask
> for your help here?
>
> --- a/xen/arch/x86/mm/p2m-ept.c
> +++ b/xen/arch/x86/mm/p2m-ept.c
> @@ -1127,6 +1127,7 @@ void ept_sync_domain(struct p2m_domain *
>
>  static void ept_enable_pml(struct p2m_domain *p2m)
>  {
> +p2m->ept.ept_ad = 1;
>  /*
>   * No need to check if vmx_domain_enable_pml has succeeded or not, as
>   * ept_p2m_type_to_flags will do the check, and write protection will be
> @@ -1137,6 +1138,7 @@ static void ept_enable_pml(struct p2m_do
>
>  static void ept_disable_pml(struct p2m_domain *p2m)
>  {
> +p2m->ept.ept_ad = 0;
>  vmx_domain_disable_pml(p2m->domain);
>  }
>
> @@ -1164,8 +1166,6 @@ int ept_p2m_init(struct p2m_domain *p2m)
>
>  if ( cpu_has_vmx_pml )
>  {
> -/* Enable EPT A/D bits if we are going to use PML. */
> -ept->ept_ad = cpu_has_vmx_pml ? 1 : 0;
>  p2m->enable_hardware_log_dirty = ept_enable_pml;
>  p2m->disable_hardware_log_dirty = ept_disable_pml;
>  p2m->flush_hardware_cached_dirty = ept_flush_pml_buffers;
>
>
>
>
>



-- 
Thanks,
-Kai



Re: [Xen-devel] [PATCH for-4.6] p2m/ept: Set the A bit only if PML is enabled

2015-09-28 Thread Kai Huang



On 09/24/2015 05:10 PM, Tim Deegan wrote:

At 01:02 -0600 on 24 Sep (1443056566), Jan Beulich wrote:

On 23.09.15 at 17:46,  wrote:

At 16:18 +0100 on 23 Sep (1443025126), Wei Liu wrote:

With the discussion still not finalised I'm a bit worried that this
issue will block the release.

I think we have a few options here. I will list them in order of my
preference. Please correct me if I'm talking non-sense, and feel free to
add more options if I miss anything.

1. Disable PML on broken chips, gate access to A bit (or AD) with PML.

I don't much like tying this to PML: this is not a PML-related bug and
there may be CPUs that have A/D but not PML.

Better to have a global read-mostly bool cpu_has_vmx_ept_broken_access_bit,
or whatever name the maintainers prefer. :)

Actually I'd suggest a positive identification (e.g. cpu_has_ept_ad),
which would get forced off on known broken chips. And then, in a
slight variation of the previously proposed patch, at least for the
immediate 4.6 needs do

--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -130,14 +130,18 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, 
ept_entry_t *entry,
  break;
  case p2m_ram_rw:
  entry->r = entry->w = entry->x = 1;
-entry->a = entry->d = 1;
+entry->a = entry->d = cpu_has_ept_ad;
  break;
  case p2m_mmio_direct:
  entry->r = entry->x = 1;
  entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
  entry->mfn);
-entry->a = 1;
-entry->d = entry->w;
+entry->a = cpu_has_ept_ad;
+entry->d = entry->w && cpu_has_ept_ad;
  break;
  case p2m_ram_logdirty:
  entry->r = entry->x = 1;


Sure, that works.  I still prefer putting the workaround on the CR3
operation, so all the cost happens on the broken chip, but I'll shut
up about that now. :)
Sorry for the late response on this issue. This is good by me too, as it
avoids the "if" gate.





etc along with adjusting the existing gating of PML on AD being
available (perhaps by simply stripping the respective bit from what
we read from MSR_IA32_VMX_EPT_VPID_CAP). Of course this
then ignores the fact that the erratum only affects the A bit, but
I think we can live with that.

I also think the currently slightly strange setting of the ept_ad bit
should be changed: There's no point setting the bit for domains
not getting PML enabled (and incurring the overhead of the
hardware updating the bits); imo this should instead be done in
ept_enable_pml() / vmx_domain_enable_pml() (and undone in
the respective disable function).

Yep.
Yes, this is slightly better. But I don't think keeping the current code,
which sets ept_ad in ept_p2m_init, would cause any performance regression, as
we unconditionally set the A/D bits to 1 in ept_p2m_type_to_flags to avoid
having the CPU set them later. Right?


For the erratum we are talking about here, the ept_ad bit simply won't be
set, as PML is not supported there.
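For concreteness, the positive identification suggested earlier in the
thread might look roughly like this (the macro shape follows Jan's
suggestion; the bit value and exact placement are my assumptions, not the
final patch):

    /* vmx.h: next to the other capability macros. */
    #define VMX_EPT_AD_BIT    0x00200000  /* assumed: bit 21 of
                                             MSR_IA32_VMX_EPT_VPID_CAP */
    #define cpu_has_ept_ad    (vmx_ept_vpid_cap & VMX_EPT_AD_BIT)

    /* vmcs.c, feature detection: force the capability off on Avoton. */
    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
         boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x4d )
        vmx_ept_vpid_cap &= ~VMX_EPT_AD_BIT;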


Thanks,
-Kai



2. Implement general framework to detect broken chips and apply quirks.

I take that there is no general framework at the moment, otherwise the
patch would have used that.

We already have code that detects specific chips and adjusts things,
e.g. init_intel() in arch/x86/cpu/intel.c.  That seems like a good
place to set the global flag above, or...

Together with the above I'm not sure that would be the best place
to deal with this (EPT specific) erratum; I think this would better be
contained to VMX/EPT code.

Agreed.

Tim.



Re: [Xen-devel] [v3 01/10] vmx: add new boot parameter to control PML enabling

2015-05-04 Thread Kai Huang



On 05/04/2015 03:52 PM, Jan Beulich wrote:

On 04.05.15 at 09:46,  wrote:

On 04/27/2015 02:56 PM, Jan Beulich wrote:

Kai Huang  04/25/15 5:00 PM >>>

On Fri, Apr 24, 2015 at 10:33 PM, Jan Beulich  wrote:

On 24.04.15 at 10:19,  wrote:

+}
+
+custom_param("ept", parse_ept_param);

And a superfluous blank line would want to be dropped here.

Sure. Will do both of your above comments if a further v4 is needed. Thanks.

And I suppose you are talking about the blank line before
custom_param("ept", parse_ept_param) ?

Yes.

Tim, Kevin and Andrew have provided their acks, so I think the v3 patch
series is OK to be merged?

I just got back to the office and would still like to make at least a brief
pass over the rest of the series before applying it.

Sure.

Thanks,
Kai



For your comments above, if you think it necessary, I can send another
incremental patch to address them, or you can simply do it for me. Is
this OK with you?

Yes, I've taken notes to do these adjustments while committing.

Jan




Re: [Xen-devel] [v3 00/10] PML (Page Modification Logging) support

2015-05-04 Thread Kai Huang



On 05/04/2015 03:40 PM, Tian, Kevin wrote:

From: Tim Deegan [mailto:t...@xen.org]
Sent: Thursday, April 30, 2015 7:04 PM

At 16:19 +0800 on 24 Apr (1429892368), Kai Huang wrote:

v2->v3:

- Merged v2 patch 02 (document change) to patch 01 as a single patch, and
   changed new parameter description as suggested by Andrew.
- changed vmx_vcpu_flush_pml_buffer to call mark_dirty for all logged GFNs,

and

   call p2m_change_type_one regardless of return value.
- Added ASSERT for vcpu (being current, or being non-running and

unrunnable) to

   vmx_vcpu_flush_pml_buffer
- Other refinement in coding style, comments description, etc.

Sanity test of live migration has been tested both with and without PML.

Acked-by: Tim Deegan 

Cheers,

Tim.

Acked-by: Kevin Tian  for all VMX related changes.

Thanks Kevin.

Thanks,
-Kai


Thanks
Kevin



Re: [Xen-devel] [v3 01/10] vmx: add new boot parameter to control PML enabling

2015-05-04 Thread Kai Huang



On 04/27/2015 02:56 PM, Jan Beulich wrote:

Kai Huang  04/25/15 5:00 PM >>>

On Fri, Apr 24, 2015 at 10:33 PM, Jan Beulich  wrote:

On 24.04.15 at 10:19,  wrote:

+}
+
+custom_param("ept", parse_ept_param);

And a superfluous blank line would want to be dropped here.

Sure. Will do both of your above comments if a further v4 is needed. Thanks.

And I suppose you are talking about the blank line before
custom_param("ept", parse_ept_param) ?

Yes.

Hi Jan,

Tim, Kevin and Andrew have provided their acks, so I think the v3 patch
series is OK to be merged?


For your comments above, if you think it necessary, I can send another
incremental patch to address them, or you can simply do it for me. Is
this OK with you?


Thanks,
-Kai


Jan




Re: [Xen-devel] [v3 00/10] PML (Page Modification Logging) support

2015-05-01 Thread Kai Huang
Thanks Tim!

On Thu, Apr 30, 2015 at 7:04 PM, Tim Deegan  wrote:
> At 16:19 +0800 on 24 Apr (1429892368), Kai Huang wrote:
>> v2->v3:
>>
>> - Merged v2 patch 02 (document change) to patch 01 as a single patch, and
>>   changed new parameter description as suggested by Andrew.
>> - changed vmx_vcpu_flush_pml_buffer to call mark_dirty for all logged GFNs, 
>> and
>>   call p2m_change_type_one regardless of return value.
>> - Added ASSERT for vcpu (being current, or being non-running and unrunnable) 
>> to
>>   vmx_vcpu_flush_pml_buffer
>> - Other refinement in coding style, comments description, etc.
>>
>> Sanity test of live migration has been tested both with and without PML.
>
> Acked-by: Tim Deegan 
>
> Cheers,
>
> Tim.
>



-- 
Thanks,
-Kai



Re: [Xen-devel] [v3 01/10] vmx: add new boot parameter to control PML enabling

2015-04-25 Thread Kai Huang
On Fri, Apr 24, 2015 at 10:33 PM, Jan Beulich  wrote:
 On 24.04.15 at 10:19,  wrote:
>> --- a/xen/arch/x86/hvm/vmx/vmcs.c
>> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
>> @@ -64,6 +64,36 @@ integer_param("ple_gap", ple_gap);
>>  static unsigned int __read_mostly ple_window = 4096;
>>  integer_param("ple_window", ple_window);
>>
>> +static bool_t __read_mostly opt_pml_enabled = 0;
>> +
>> +/*
>> + * The 'ept' parameter controls functionalities that depend on, or impact 
>> the
>> + * EPT mechanism. Optional comma separated value may contain:
>> + *
>> + *  pml Enable PML
>> + */
>> +static void __init parse_ept_param(char *s)
>> +{
>> +char *ss;
>> +
>> +do {
>> +bool_t val = !!strncmp(s, "no-", 3);
>> +if ( !val )
>
> In case another round is needed, a blank line is missing above.
>
>> +s += 3;
>> +
>> +ss = strchr(s, ',');
>> +if ( ss )
>> +*ss = '\0';
>> +
>> +if ( !strcmp(s, "pml") )
>> +opt_pml_enabled = val;
>> +
>> +s = ss + 1;
>> +} while ( ss );
>> +}
>> +
>> +custom_param("ept", parse_ept_param);
>
> And a superfluous blank line would want to be dropped here.

Sure. I will address both of your comments above if a further v4 is needed. Thanks.

And I suppose you are talking about the blank line before
custom_param("ept", parse_ept_param) ?

Thanks,
-Kai

>
> Jan
>
>



-- 
Thanks,
-Kai



[Xen-devel] [v3 05/10] vmx: add help functions to support PML

2015-04-24 Thread Kai Huang
This patch adds helper functions to enable/disable PML and to flush the PML
buffer, for a single vcpu and for a particular domain, for further use.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmcs.c| 179 +
 xen/include/asm-x86/hvm/vmx/vmcs.h |   9 ++
 2 files changed, 188 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 04fdca3..f797fde 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1323,6 +1323,185 @@ void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 
vector)
 &v->arch.hvm_vmx.eoi_exitmap_changed);
 }
 
+bool_t vmx_vcpu_pml_enabled(const struct vcpu *v)
+{
+return !!(v->arch.hvm_vmx.secondary_exec_control &
+  SECONDARY_EXEC_ENABLE_PML);
+}
+
+int vmx_vcpu_enable_pml(struct vcpu *v)
+{
+if ( vmx_vcpu_pml_enabled(v) )
+return 0;
+
+v->arch.hvm_vmx.pml_pg = v->domain->arch.paging.alloc_page(v->domain);
+if ( !v->arch.hvm_vmx.pml_pg )
+return -ENOMEM;
+
+vmx_vmcs_enter(v);
+
+__vmwrite(PML_ADDRESS, page_to_mfn(v->arch.hvm_vmx.pml_pg) << PAGE_SHIFT);
+__vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
+
+v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_PML;
+
+__vmwrite(SECONDARY_VM_EXEC_CONTROL,
+  v->arch.hvm_vmx.secondary_exec_control);
+
+vmx_vmcs_exit(v);
+
+return 0;
+}
+
+void vmx_vcpu_disable_pml(struct vcpu *v)
+{
+if ( !vmx_vcpu_pml_enabled(v) )
+return;
+
+/* Make sure we don't lose any logged GPAs */
+vmx_vcpu_flush_pml_buffer(v);
+
+vmx_vmcs_enter(v);
+
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+__vmwrite(SECONDARY_VM_EXEC_CONTROL,
+  v->arch.hvm_vmx.secondary_exec_control);
+
+vmx_vmcs_exit(v);
+
+v->domain->arch.paging.free_page(v->domain, v->arch.hvm_vmx.pml_pg);
+v->arch.hvm_vmx.pml_pg = NULL;
+}
+
+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+uint64_t *pml_buf;
+unsigned long pml_idx;
+
+ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+__vmread(GUEST_PML_INDEX, &pml_idx);
+
+/* Do nothing if PML buffer is empty */
+if ( pml_idx == (NR_PML_ENTRIES - 1) )
+goto out;
+
+pml_buf = __map_domain_page(v->arch.hvm_vmx.pml_pg);
+
+/*
+ * PML index can be either 2^16-1 (buffer is full), or 0 ~ NR_PML_ENTRIES-1
+ * (buffer is not full), and in the latter case the PML index always points
+ * to the next available entry.
+ */
+if ( pml_idx >= NR_PML_ENTRIES )
+pml_idx = 0;
+else
+pml_idx++;
+
+for ( ; pml_idx < NR_PML_ENTRIES; pml_idx++ )
+{
+unsigned long gfn = pml_buf[pml_idx] >> PAGE_SHIFT;
+
+/*
+ * Need to change type from log-dirty to normal memory for logged GFN.
+ * hap_track_dirty_vram depends on it to work. And we mark all logged
+ * GFNs to be dirty, as we cannot be sure whether it's safe to ignore
+ * GFNs on which p2m_change_type_one returns failure. The failure cases
+ * are very rare, and additional cost is negligible, but a missing mark
+ * is extremely difficult to debug.
+ */
+p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
+paging_mark_gfn_dirty(v->domain, gfn);
+}
+
+unmap_domain_page(pml_buf);
+
+/* Reset PML index */
+__vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
+
+ out:
+vmx_vmcs_exit(v);
+}
+
+bool_t vmx_domain_pml_enabled(const struct domain *d)
+{
+return !!(d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED);
+}
+
+/*
+ * This function enables PML for a particular domain. It should be called
+ * when the domain is paused.
+ *
+ * PML needs to be enabled globally for all vcpus of the domain, as the PML
+ * buffer and PML index are per-vcpu, but the EPT table is shared by vcpus,
+ * therefore enabling PML on only some of the vcpus won't work.
+ */
+int vmx_domain_enable_pml(struct domain *d)
+{
+struct vcpu *v;
+int rc;
+
+ASSERT(atomic_read(&d->pause_count));
+
+if ( vmx_domain_pml_enabled(d) )
+return 0;
+
+for_each_vcpu( d, v )
+if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
+goto error;
+
+d->arch.hvm_domain.vmx.status |= VMX_DOMAIN_PML_ENABLED;
+
+return 0;
+
+ error:
+for_each_vcpu( d, v )
+if ( vmx_vcpu_pml_enabled(v) )
+vmx_vcpu_disable_pml(v);
+return rc;
+}
+
+/*
+ * Disable PML for particular domain. Called when domain is paused.
+ *
+ * The same as enabling PML for domain, disabling PML should be done for all
+ * vcpus at once.
+ */
+void vmx_domain_disable_pml(struct domain *d)
+{
+struct vcpu *v;
+
+ASSERT(atomic_read(&d->pause_count));
+
+if ( !vmx_domain_pml_enabled(d) )
+return;
+
+for_each_vcpu( d, v )
+vmx_vcpu_disable_pml(v);
+
+d->arch.hvm_domain.vmx.status &= ~VMX_DOMAIN_PML_ENABLED;
+}
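
For reference, here is a minimal sketch of the GUEST_PML_INDEX convention that
vmx_vcpu_flush_pml_buffer() above relies on; the standalone helper and its name
are illustrative only, not part of the patch:

/* Illustration only: number of valid entries for a given GUEST_PML_INDEX,
 * following the convention used in vmx_vcpu_flush_pml_buffer() above.
 * Hardware fills the buffer from entry 511 downwards, decrementing the
 * index after each logged GPA. */
static unsigned int pml_entries_logged(unsigned long pml_idx)
{
    if ( pml_idx == NR_PML_ENTRIES - 1 )    /* 511: nothing logged yet */
        return 0;
    if ( pml_idx >= NR_PML_ENTRIES )        /* 2^16 - 1: buffer is full */
        return NR_PML_ENTRIES;
    /* Otherwise entries pml_idx+1 .. 511 hold valid GPAs. */
    return NR_PML_ENTRIES - 1 - pml_idx;
}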

[Xen-devel] [v3 09/10] log-dirty: refine common code to support PML

2015-04-24 Thread Kai Huang
Using PML, it's possible there are dirty GPAs logged in vcpus' PML buffers
when userspace peeks at/clears dirty pages, therefore we need to flush them
before reporting dirty pages to userspace. This applies to both video ram
tracking and paging_log_dirty_op.

This patch adds new p2m layer functions to enable/disable PML and flush PML
buffers. The new functions are named to be generic, to cover potential
further PML-like features for other platforms.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/hap/hap.c | 29 +
 xen/arch/x86/mm/p2m.c | 36 
 xen/arch/x86/mm/paging.c  | 10 ++
 xen/include/asm-x86/p2m.h | 11 +++
 4 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 4ecb2e2..1099670 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -121,7 +121,10 @@ int hap_track_dirty_vram(struct domain *d,
 p2m_change_type_range(d, ostart, oend,
   p2m_ram_logdirty, p2m_ram_rw);
 
-/* set l1e entries of range within P2M table to be read-only. */
+/*
+ * switch vram to log dirty mode, either by setting l1e entries of
+ * P2M table to be read-only, or via hardware-assisted log-dirty.
+ */
 p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
   p2m_ram_rw, p2m_ram_logdirty);
 
@@ -135,6 +138,9 @@ int hap_track_dirty_vram(struct domain *d,
 
 domain_pause(d);
 
+/* flush dirty GFNs potentially cached by hardware */
+p2m_flush_hardware_cached_dirty(d);
+
 /* get the bitmap */
 paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap);
 
@@ -190,9 +196,15 @@ static int hap_enable_log_dirty(struct domain *d, bool_t log_global)
 d->arch.paging.mode |= PG_log_dirty;
 paging_unlock(d);
 
+/* enable hardware-assisted log-dirty if it is supported */
+p2m_enable_hardware_log_dirty(d);
+
 if ( log_global )
 {
-/* set l1e entries of P2M table to be read-only. */
+/*
+ * switch to log dirty mode, either by setting l1e entries of P2M table
+ * to be read-only, or via hardware-assisted log-dirty.
+ */
 p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
 flush_tlb_mask(d->domain_dirty_cpumask);
 }
@@ -205,14 +217,23 @@ static int hap_disable_log_dirty(struct domain *d)
 d->arch.paging.mode &= ~PG_log_dirty;
 paging_unlock(d);
 
-/* set l1e entries of P2M table with normal mode */
+/* disable hardware-assisted log-dirty if it is supported */
+p2m_disable_hardware_log_dirty(d);
+
+/*
+ * switch to normal mode, either by setting l1e entries of P2M table to
+ * normal mode, or via hardware-assisted log-dirty.
+ */
 p2m_change_entry_type_global(d, p2m_ram_logdirty, p2m_ram_rw);
 return 0;
 }
 
 static void hap_clean_dirty_bitmap(struct domain *d)
 {
-/* set l1e entries of P2M table to be read-only. */
+/*
+ * switch to log-dirty mode, either by setting l1e entries of P2M table to
+ * be read-only, or via hardware-assisted log-dirty.
+ */
 p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
 flush_tlb_mask(d->domain_dirty_cpumask);
 }
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index df4a485..67edf89 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -239,6 +239,42 @@ void p2m_memory_type_changed(struct domain *d)
 }
 }
 
+void p2m_enable_hardware_log_dirty(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+if ( p2m->enable_hardware_log_dirty )
+{
+p2m_lock(p2m);
+p2m->enable_hardware_log_dirty(p2m);
+p2m_unlock(p2m);
+}
+}
+
+void p2m_disable_hardware_log_dirty(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+if ( p2m->disable_hardware_log_dirty )
+{
+p2m_lock(p2m);
+p2m->disable_hardware_log_dirty(p2m);
+p2m_unlock(p2m);
+}
+}
+
+void p2m_flush_hardware_cached_dirty(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+if ( p2m->flush_hardware_cached_dirty )
+{
+p2m_lock(p2m);
+p2m->flush_hardware_cached_dirty(p2m);
+p2m_unlock(p2m);
+}
+}
+
 mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
 p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
 unsigned int *page_order, bool_t locked)
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index 77c929b..59d4720 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -422,7 +422,17 @@ static int paging_log_dirty_op(struct domain *d,
 int i4, i3, i2;
 
 if ( !resuming )
+{
domain_pause(d);
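
For context, the three new p2m hooks above are no-ops until a p2m
implementation fills them in. A sketch of the wiring (patch 10 does this for
EPT; the right-hand function names below just follow that pattern and are
illustrative, not taken from the patch):

/* Sketch: a p2m backend opts in to hardware-assisted log-dirty by filling
 * in the new hooks during its init (function names illustrative). */
p2m->enable_hardware_log_dirty   = ept_enable_pml;
p2m->disable_hardware_log_dirty  = ept_disable_pml;
p2m->flush_hardware_cached_dirty = ept_flush_pml_buffers;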

[Xen-devel] [v3 01/10] vmx: add new boot parameter to control PML enabling

2015-04-24 Thread Kai Huang
A top-level EPT parameter "ept=" and a sub-option boolean "opt_pml_enabled"
are added to control PML. Other booleans can be added later for any other
EPT-related features.

The document description for the new parameter is also added.

Signed-off-by: Kai Huang 
---
 docs/misc/xen-command-line.markdown | 15 +++
 xen/arch/x86/hvm/vmx/vmcs.c | 30 ++
 2 files changed, 45 insertions(+)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 1dda1f0..4889e27 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -685,6 +685,21 @@ requirement can be relaxed.  This option is particularly useful for nested
 virtualization, to allow the L1 hypervisor to use EPT even if the L0 hypervisor
 does not provide VM\_ENTRY\_LOAD\_GUEST\_PAT.
 
+### ept (Intel)
+> `= List of ( pml )`
+
+> Default: `false`
+
+Controls EPT-related features. Currently Page Modification Logging (PML) is
+the only controllable feature, as a boolean sub-option.
+
+PML is a new hardware feature on Intel's Broadwell Server and later platforms.
+It reduces the hypervisor overhead of the log-dirty mechanism by automatically
+recording GPAs (guest physical addresses) when guest memory gets dirty, and
+therefore significantly reduces the number of EPT violations caused by the
+write protection of guest memory, which was a necessity to implement the
+log-dirty mechanism before PML.
+
### gdb
> `= <baud>[/<clock_hz>][,DPS[,<io-base>[,<irq>[,<port-bdf>[,<bridge-bdf>]]]] | pci | amt ] `
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 63007a9..79efa42 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -64,6 +64,36 @@ integer_param("ple_gap", ple_gap);
 static unsigned int __read_mostly ple_window = 4096;
 integer_param("ple_window", ple_window);
 
+static bool_t __read_mostly opt_pml_enabled = 0;
+
+/*
+ * The 'ept' parameter controls functionalities that depend on, or impact the
+ * EPT mechanism. Optional comma separated value may contain:
+ *
+ *  pml Enable PML
+ */
+static void __init parse_ept_param(char *s)
+{
+char *ss;
+
+do {
+bool_t val = !!strncmp(s, "no-", 3);
+if ( !val )
+s += 3;
+
+ss = strchr(s, ',');
+if ( ss )
+*ss = '\0';
+
+if ( !strcmp(s, "pml") )
+opt_pml_enabled = val;
+
+s = ss + 1;
+} while ( ss );
+}
+
+custom_param("ept", parse_ept_param);
+
 /* Dynamic (run-time adjusted) execution control flags. */
 u32 vmx_pin_based_exec_control __read_mostly;
 u32 vmx_cpu_based_exec_control __read_mostly;
-- 
2.1.0


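For illustration, the parser above takes a comma-separated list where each
item may carry a "no-" prefix, so (hypothetical command lines) the hypervisor
can be booted with:

xen ... ept=pml       # enable PML
xen ... ept=no-pml    # keep PML disabled (the default)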


[Xen-devel] [v3 08/10] vmx: disable PML in vmx_vcpu_destroy

2015-04-24 Thread Kai Huang
It's possible the domain still remains in log-dirty mode when it is about to be
destroyed, in which case we should manually disable PML for it.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index e5471b8..e189424 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -153,6 +153,14 @@ static int vmx_vcpu_initialise(struct vcpu *v)
 
 static void vmx_vcpu_destroy(struct vcpu *v)
 {
+/*
+ * There are cases that a domain still remains in log-dirty mode when it is
+ * about to be destroyed (ex, user types 'xl destroy <domid>'), in which case
+ * we should disable PML manually here. Note that vmx_vcpu_destroy is called
+ * prior to vmx_domain_destroy so we need to disable PML for each vcpu
+ * separately here.
+ */
+vmx_vcpu_disable_pml(v);
 vmx_destroy_vmcs(v);
 vpmu_destroy(v);
 passive_domain_destroy(v);
-- 
2.1.0




[Xen-devel] [v3 00/10] PML (Page Modification Logging) support

2015-04-24 Thread Kai Huang
v2->v3:

- Merged v2 patch 02 (document change) to patch 01 as a single patch, and
  changed new parameter description as suggested by Andrew.
- changed vmx_vcpu_flush_pml_buffer to call mark_dirty for all logged GFNs, and
  call p2m_change_type_one regardless of return value.
- Added ASSERT for vcpu (being current, or being non-running and unrunnable) to
  vmx_vcpu_flush_pml_buffer
- Other refinement in coding style, comments description, etc.

Sanity testing of live migration has been done both with and without PML.

Kai Huang (10):
  vmx: add new boot parameter to control PML enabling
  log-dirty: add new paging_mark_gfn_dirty
  vmx: add PML definition and feature detection
  vmx: add new data structure member to support PML
  vmx: add help functions to support PML
  vmx: handle PML buffer full VMEXIT
  vmx: handle PML enabling in vmx_vcpu_initialise
  vmx: disable PML in vmx_vcpu_destroy
  log-dirty: refine common code to support PML
  p2m/ept: enable PML in p2m-ept for log-dirty

 docs/misc/xen-command-line.markdown |  15 +++
 xen/arch/x86/hvm/vmx/vmcs.c | 227 
 xen/arch/x86/hvm/vmx/vmx.c  |  35 ++
 xen/arch/x86/mm/hap/hap.c   |  29 -
 xen/arch/x86/mm/p2m-ept.c   |  79 +++--
 xen/arch/x86/mm/p2m.c   |  36 ++
 xen/arch/x86/mm/paging.c|  41 +--
 xen/include/asm-x86/hvm/vmx/vmcs.h  |  26 -
 xen/include/asm-x86/hvm/vmx/vmx.h   |   4 +-
 xen/include/asm-x86/p2m.h   |  11 ++
 xen/include/asm-x86/paging.h|   2 +
 11 files changed, 482 insertions(+), 23 deletions(-)

-- 
2.1.0




[Xen-devel] [v3 07/10] vmx: handle PML enabling in vmx_vcpu_initialise

2015-04-24 Thread Kai Huang
It's possible the domain is already in log-dirty mode when a vcpu is created,
in which case we should enable PML for this vcpu if PML has been enabled for
the domain.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index f11ac46..e5471b8 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -117,6 +117,29 @@ static int vmx_vcpu_initialise(struct vcpu *v)
 return rc;
 }
 
+/*
+ * It's rare but still possible that the domain has already been in
+ * log-dirty mode when the vcpu is being created (commented by Tim), in
+ * which case we should enable PML for this vcpu if PML has been enabled
+ * for the domain, and failure to enable results in failure of creating
+ * this vcpu.
+ *
+ * Note even when there's no vcpu created for the domain yet,
+ * vmx_domain_enable_pml will return success, in which case
+ * vmx_domain_pml_enabled will also return true. And even if this is the
+ * first vcpu to be created with vmx_domain_pml_enabled being true, failure
+ * of enabling PML still results in failure of creating the vcpu, to avoid
+ * complicated logic to revert a PML-style EPT table to a non-PML-style one.
+ */
+if ( vmx_domain_pml_enabled(v->domain) )
+{
+if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
+{
+dprintk(XENLOG_ERR, "%pv: Failed to enable PML.\n", v);
+vmx_destroy_vmcs(v);
+return rc;
+}
+}
+
 vpmu_initialise(v);
 
 vmx_install_vlapic_mapping(v);
-- 
2.1.0




[Xen-devel] [v3 04/10] vmx: add new data structure member to support PML

2015-04-24 Thread Kai Huang
A new 4K page pointer is added to arch_vmx_struct as the PML buffer for the
vcpu. And a new 'status' field is added to vmx_domain to indicate whether PML
is enabled for the domain or not.

Signed-off-by: Kai Huang 
---
 xen/include/asm-x86/hvm/vmx/vmcs.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index f831a78..441e974 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -70,8 +70,12 @@ struct ept_data {
 cpumask_var_t synced_mask;
 };
 
+#define _VMX_DOMAIN_PML_ENABLED 0
+#define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED)
 struct vmx_domain {
 unsigned long apic_access_mfn;
+/* VMX_DOMAIN_* */
+unsigned int status;
 };
 
 struct pi_desc {
@@ -85,6 +89,8 @@ struct pi_desc {
 #define ept_get_eptp(ept) ((ept)->eptp)
 #define ept_get_synced_mask(ept) ((ept)->synced_mask)
 
+#define NR_PML_ENTRIES   512
+
 struct arch_vmx_struct {
 /* Virtual address of VMCS. */
 struct vmcs_struct  *vmcs;
@@ -142,6 +148,8 @@ struct arch_vmx_struct {
 /* Bitmap to control vmexit policy for Non-root VMREAD/VMWRITE */
 struct page_info *vmread_bitmap;
 struct page_info *vmwrite_bitmap;
+
+struct page_info *pml_pg;
 };
 
 int vmx_create_vmcs(struct vcpu *v);
-- 
2.1.0


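A short sketch of the idiom the new 'status' bit field enables; the real
accessors arrive in patch 5, so this is illustration only:

/* Set, test and clear the domain-wide PML flag (illustration only;
 * patch 5 wraps this in vmx_domain_pml_enabled() and friends). */
d->arch.hvm_domain.vmx.status |= VMX_DOMAIN_PML_ENABLED;

if ( d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED )
    /* PML is enabled on all vcpus of d */;

d->arch.hvm_domain.vmx.status &= ~VMX_DOMAIN_PML_ENABLED;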


[Xen-devel] [v3 02/10] log-dirty: add new paging_mark_gfn_dirty

2015-04-24 Thread Kai Huang
PML logs GPAs in the PML buffer. Original paging_mark_dirty takes an MFN as
parameter, but it gets the guest pfn internally and uses the guest pfn as the
index for looking up the radix log-dirty tree. In flushing the PML buffer,
calling paging_mark_dirty directly introduces redundant p2m lookups
(gfn->mfn->gfn), therefore we introduce paging_mark_gfn_dirty, which is the
bulk of paging_mark_dirty but takes a guest pfn as parameter, and in flushing
the PML buffer we call paging_mark_gfn_dirty directly. Original
paging_mark_dirty then simply becomes a wrapper of paging_mark_gfn_dirty.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/paging.c | 31 +--
 xen/include/asm-x86/paging.h |  2 ++
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index b54d76a..77c929b 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -266,24 +266,17 @@ static int paging_log_dirty_disable(struct domain *d, bool_t resuming)
 return ret;
 }
 
-/* Mark a page as dirty */
-void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
+/* Mark a page as dirty, taking the guest pfn as parameter */
+void paging_mark_gfn_dirty(struct domain *d, unsigned long pfn)
 {
-unsigned long pfn;
-mfn_t gmfn;
 int changed;
 mfn_t mfn, *l4, *l3, *l2;
 unsigned long *l1;
 int i1, i2, i3, i4;
 
-gmfn = _mfn(guest_mfn);
-
-if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) ||
- page_get_owner(mfn_to_page(gmfn)) != d )
+if ( !paging_mode_log_dirty(d) )
 return;
 
-/* We /really/ mean PFN here, even for non-translated guests. */
-pfn = get_gpfn_from_mfn(mfn_x(gmfn));
 /* Shared MFNs should NEVER be marked dirty */
 BUG_ON(SHARED_M2P(pfn));
 
@@ -351,6 +344,24 @@ out:
 return;
 }
 
+/* Mark a page as dirty */
+void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
+{
+unsigned long pfn;
+mfn_t gmfn;
+
+gmfn = _mfn(guest_mfn);
+
+if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) ||
+ page_get_owner(mfn_to_page(gmfn)) != d )
+return;
+
+/* We /really/ mean PFN here, even for non-translated guests. */
+pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+paging_mark_gfn_dirty(d, pfn);
+}
+
 
 /* Is this guest page dirty? */
 int paging_mfn_is_dirty(struct domain *d, mfn_t gmfn)
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index 53de715..c99324c 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -156,6 +156,8 @@ void paging_log_dirty_init(struct domain *d,
 
 /* mark a page as dirty */
 void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);
+/* mark a page as dirty, taking the guest pfn as parameter */
+void paging_mark_gfn_dirty(struct domain *d, unsigned long pfn);
 
 /* is this guest page dirty? 
  * This is called from inside paging code, with the paging lock held. */
-- 
2.1.0


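To illustrate the redundancy being removed, a hedged sketch of the two call
paths for a gfn taken straight from a PML buffer entry (the old-path call site
is hypothetical and never existed in this form; 't' is a local p2m_type_t):

/* Old path: needs a gfn->mfn lookup first, and paging_mark_dirty() then
 * converts the mfn back to a gfn internally (gfn -> mfn -> gfn): */
paging_mark_dirty(d, mfn_x(get_gfn_query_unlocked(d, gfn, &t)));

/* New path: the gfn indexes the radix log-dirty tree directly: */
paging_mark_gfn_dirty(d, gfn);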


[Xen-devel] [v3 03/10] vmx: add PML definition and feature detection

2015-04-24 Thread Kai Huang
The patch adds PML definition and feature detection. Note PML won't be detected
if PML is disabled via the boot parameter. PML is also disabled in
construct_vmcs, as it will only be enabled when the domain is switched to
log-dirty mode.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmcs.c| 18 ++
 xen/include/asm-x86/hvm/vmx/vmcs.h |  6 ++
 xen/include/asm-x86/hvm/vmx/vmx.h  |  1 +
 3 files changed, 25 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 79efa42..04fdca3 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -140,6 +140,7 @@ static void __init vmx_display_features(void)
 P(cpu_has_vmx_virtual_intr_delivery, "Virtual Interrupt Delivery");
 P(cpu_has_vmx_posted_intr_processing, "Posted Interrupt Processing");
 P(cpu_has_vmx_vmcs_shadowing, "VMCS shadowing");
+P(cpu_has_vmx_pml, "Page Modification Logging");
 #undef P
 
 if ( !printed )
@@ -237,6 +238,8 @@ static int vmx_init_vmcs_config(void)
 opt |= SECONDARY_EXEC_ENABLE_VPID;
 if ( opt_unrestricted_guest_enabled )
 opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
+if ( opt_pml_enabled )
+opt |= SECONDARY_EXEC_ENABLE_PML;
 
 /*
  * "APIC Register Virtualization" and "Virtual Interrupt Delivery"
@@ -283,6 +286,10 @@ static int vmx_init_vmcs_config(void)
  */
 if ( !(_vmx_ept_vpid_cap & VMX_VPID_INVVPID_ALL_CONTEXT) )
 _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+
+/* EPT A/D bits are required for PML */
+if ( !(_vmx_ept_vpid_cap & VMX_EPT_AD_BIT) )
+_vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 }
 
 if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
@@ -303,6 +310,14 @@ static int vmx_init_vmcs_config(void)
   SECONDARY_EXEC_UNRESTRICTED_GUEST);
 }
 
+/* PML cannot be supported if EPT is not used */
+if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) )
+_vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+/* Turn off opt_pml_enabled if PML feature is not present */
+if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML) )
+opt_pml_enabled = 0;
+
 if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
   ple_gap == 0 )
 {
@@ -1038,6 +1053,9 @@ static int construct_vmcs(struct vcpu *v)
 __vmwrite(POSTED_INTR_NOTIFICATION_VECTOR, posted_intr_vector);
 }
 
+/* Disable PML anyway here as it will only be enabled in log dirty mode */
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
 /* Host data selectors. */
 __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
 __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 6fce6aa..f831a78 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -215,6 +215,7 @@ extern u32 vmx_vmentry_control;
 #define SECONDARY_EXEC_ENABLE_INVPCID   0x1000
 #define SECONDARY_EXEC_ENABLE_VMFUNC0x2000
 #define SECONDARY_EXEC_ENABLE_VMCS_SHADOWING0x4000
+#define SECONDARY_EXEC_ENABLE_PML   0x00020000
 extern u32 vmx_secondary_exec_control;
 
 #define VMX_EPT_EXEC_ONLY_SUPPORTED 0x0001
@@ -226,6 +227,7 @@ extern u32 vmx_secondary_exec_control;
 #define VMX_EPT_INVEPT_INSTRUCTION  0x0010
 #define VMX_EPT_INVEPT_SINGLE_CONTEXT   0x0200
 #define VMX_EPT_INVEPT_ALL_CONTEXT  0x0400
+#define VMX_EPT_AD_BIT  0x00200000
 
 #define VMX_MISC_VMWRITE_ALL0x2000
 
@@ -274,6 +276,8 @@ extern u32 vmx_secondary_exec_control;
 (vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT)
 #define cpu_has_vmx_vmcs_shadowing \
 (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VMCS_SHADOWING)
+#define cpu_has_vmx_pml \
+(vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML)
 
 #define VMCS_RID_TYPE_MASK  0x8000
 
@@ -318,6 +322,7 @@ enum vmcs_field {
 GUEST_LDTR_SELECTOR = 0x080c,
 GUEST_TR_SELECTOR   = 0x080e,
 GUEST_INTR_STATUS   = 0x0810,
+GUEST_PML_INDEX = 0x0812,
 HOST_ES_SELECTOR= 0x0c00,
 HOST_CS_SELECTOR= 0x0c02,
 HOST_SS_SELECTOR= 0x0c04,
@@ -331,6 +336,7 @@ enum vmcs_field {
 VM_EXIT_MSR_STORE_ADDR  = 0x2006,
 VM_EXIT_MSR_LOAD_ADDR   = 0x2008,
 VM_ENTRY_MSR_LOAD_ADDR  = 0x200a,
+PML_ADDRESS = 0x200e,
 TSC_OFFSET  = 0x2010
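
Condensing the checks above: PML requires EPT, and PML requires EPT A/D bits.
A compact restatement of the gating (sketch only; the patch itself spreads
these checks across vmx_init_vmcs_config):

/* Sketch of the effective gating performed in vmx_init_vmcs_config(). */
if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) ||
     !(_vmx_ept_vpid_cap & VMX_EPT_AD_BIT) )
    _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML) )
    opt_pml_enabled = 0;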

[Xen-devel] [v3 10/10] p2m/ept: enable PML in p2m-ept for log-dirty

2015-04-24 Thread Kai Huang
This patch firstly enables EPT A/D bits if PML is used, as PML depends on EPT
A/D bits to work. The A bit is set for all present p2m types in middle and leaf
EPT entries, and the D bit is set for all writable types in the leaf EPT entry,
except for the log-dirty type with PML.

With PML, for 4K pages, instead of setting the EPT entry to read-only, we just
need to clear the D bit in order to log that GFN. For superpages, we still need
to set them read-only, as we need to split the superpage into 4K pages on EPT
violation.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/p2m-ept.c  | 79 ++
 xen/include/asm-x86/hvm/vmx/vmcs.h |  3 +-
 xen/include/asm-x86/hvm/vmx/vmx.h  |  3 +-
 3 files changed, 76 insertions(+), 9 deletions(-)

diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 5e95a83..a1b9eaf 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -102,9 +102,20 @@ static int atomic_write_ept_entry(ept_entry_t *entryptr, ept_entry_t new,
 return rc;
 }
 
-static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
+static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
+  p2m_type_t type, p2m_access_t access)
 {
-/* First apply type permissions */
+/*
+ * First apply type permissions.
+ *
+ * A/D bits are also manually set to avoid overhead of MMU having to set
+ * them later. Both A/D bits are safe to be updated directly as they are
+ * ignored by the processor if EPT A/D bits are not turned on.
+ *
+ * A bit is set for all present p2m types in middle and leaf EPT entries.
+ * D bit is set for all writable types in EPT leaf entry, except for
+ * log-dirty type with PML.
+ */
 switch(type)
 {
 case p2m_invalid:
@@ -118,27 +129,51 @@ static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
 break;
 case p2m_ram_rw:
 entry->r = entry->w = entry->x = 1;
+entry->a = entry->d = 1;
 break;
 case p2m_mmio_direct:
 entry->r = entry->x = 1;
 entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
 entry->mfn);
+entry->a = 1;
+entry->d = entry->w;
 break;
 case p2m_ram_logdirty:
+entry->r = entry->x = 1;
+/*
+ * In case of PML, we don't have to write-protect a 4K page, but
+ * only need to clear its D-bit; we still need to write-protect a
+ * superpage in order to split it into 4K pages on EPT
+ * violation.
+ */
+if ( vmx_domain_pml_enabled(p2m->domain)
+ && !is_epte_superpage(entry) )
+entry->w = 1;
+else
+entry->w = 0;
+entry->a = 1;
+/* For both PML or non-PML cases we clear D bit anyway */
+entry->d = 0;
+break;
 case p2m_ram_ro:
 case p2m_ram_shared:
 entry->r = entry->x = 1;
 entry->w = 0;
+entry->a = 1;
+entry->d = 0;
 break;
 case p2m_grant_map_rw:
 case p2m_map_foreign:
 entry->r = entry->w = 1;
 entry->x = 0;
+entry->a = entry->d = 1;
 break;
 case p2m_grant_map_ro:
 case p2m_mmio_write_dm:
 entry->r = 1;
 entry->w = entry->x = 0;
+entry->a = 1;
+entry->d = 0;
 break;
 }
 
@@ -194,6 +229,8 @@ static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
 ept_entry->access = p2m->default_access;
 
 ept_entry->r = ept_entry->w = ept_entry->x = 1;
+/* Manually set A bit to avoid overhead of MMU having to write it later. */
+ept_entry->a = 1;
 
 return 1;
 }
@@ -244,10 +281,9 @@ static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
 epte->sp = (level > 1);
 epte->mfn += i * trunk;
 epte->snp = (iommu_enabled && iommu_snoop);
-ASSERT(!epte->rsvd1);
 ASSERT(!epte->avail3);
 
-ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
+ept_p2m_type_to_flags(p2m, epte, epte->sa_p2mt, epte->access);
 
 if ( (level - 1) == target )
 continue;
@@ -489,7 +525,7 @@ static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
{
 e.sa_p2mt = p2m_is_logdirty_range(p2m, gfn + i, gfn + i)
 ? p2m_ram_logdirty : p2m_ram_rw;
- ept_p2m_type_to_flags(&e, e.sa_p2m
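
Summarising the flag assignments from the ept_p2m_type_to_flags() hunk above
("w" for mmio_direct means "not covered by mmio_ro_ranges"):

p2m type                         R W X   A D
ram_rw                           1 1 1   1 1
mmio_direct                      1 w 1   1 w
ram_logdirty (4K page, PML)      1 1 1   1 0
ram_logdirty (superpage or !PML) 1 0 1   1 0
ram_ro / ram_shared              1 0 1   1 0
grant_map_rw / map_foreign       1 1 0   1 1
grant_map_ro / mmio_write_dm     1 0 0   1 0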

[Xen-devel] [v3 06/10] vmx: handle PML buffer full VMEXIT

2015-04-24 Thread Kai Huang
We need to flush PML buffer when it's full.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 6c4f78c..f11ac46 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3178,6 +3178,10 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 vmx_handle_apic_write();
 break;
 
+case EXIT_REASON_PML_FULL:
+vmx_vcpu_flush_pml_buffer(v);
+break;
+
 case EXIT_REASON_ACCESS_GDTR_OR_IDTR:
 case EXIT_REASON_ACCESS_LDTR_OR_TR:
 case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
-- 
2.1.0




Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-24 Thread Kai Huang



On 04/24/2015 03:30 PM, Jan Beulich wrote:

On 24.04.15 at 08:32,  wrote:

On 04/17/2015 03:37 PM, Jan Beulich wrote:

On 17.04.15 at 09:23,  wrote:

I see. I will do as you suggested:

ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));

And v != current should be the only case requires the vcpu to be paused.

But if you require (or at least expect) the vCPU to be paused, this
isn't what I suggested. Afaict

ASSERT((v == current) || (!v->is_running && atomic_read(&v->pause_count)));

would then be the right check (and, while not be a full guarantee that
things wouldn't change behind your back, would at least increase
chances that the vCPU's runnable state won't change, as the vCPU
could have been non-runnable for reasons other than having been
paused).

Hi Jan,

I tried the ASSERT with atomic_read(&v->pause_count), and it turns out
the ASSERT would fail and panic Xen. The reason is domain_pause only
increases d->pause_count, but it doesn't internally increase
v->pause_count for all vcpus.

vmx_vcpu_flush_pml_buffer is only supposed to be called from PML buffer
full VMEXIT, and vmx_domain_flush_pml_buffer, before which domain_pause
should be called.

Sorry that obviously I had some misunderstanding regarding "require
(or at least expect) the vCPU to be paused", and it looks like
!vcpu_runnable(v) is the right choice.

What's your opinion?

Then either go with the slightly weaker original (still quoted at the
top) or (preferred by me) OR together both pause counts in the
condition.
Thanks. Then I'd like to use the original weaker one (below) instead of
explicitly OR-ing 'd->pause_count' and 'v->pause_count', as you are not
objecting to the former :)


ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));

Thanks,
-Kai


Jan





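For reference, the two assertion forms discussed in this thread, side by side:

/* Weaker form ultimately kept for v3: */
ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));

/* Stronger form initially suggested; it fires in practice because
 * domain_pause() only raises d->pause_count, not each v->pause_count: */
ASSERT((v == current) || (!v->is_running && atomic_read(&v->pause_count)));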


Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-23 Thread Kai Huang



On 04/17/2015 03:37 PM, Jan Beulich wrote:

On 17.04.15 at 09:23,  wrote:

On 04/17/2015 02:58 PM, Jan Beulich wrote:

On 17.04.15 at 08:51,  wrote:

On 04/17/2015 02:23 PM, Jan Beulich wrote:

On 17.04.15 at 05:10,  wrote:

On 04/16/2015 11:42 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+uint64_t *pml_buf;
+unsigned long pml_idx;
+
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+__vmread(GUEST_PML_INDEX, &pml_idx);

Don't you require the vCPU to be non-running or current when you
get here? If so, perhaps add a respective ASSERT()?

Yes an ASSERT would be better.

v->pause_count will be increased if vcpu is kicked out by domain_pause
explicitly, but looks the same thing won't be done if vcpu is kicked out
by PML buffer full VMEXIT. So should the ASSERT be done like below?

ASSERT(atomic_read(&v->pause_count) || (v == current));

For one I'd reverse the two parts. And then I think pause count
being non-zero is not a sufficient condition - if a non-synchronous
pause was issued against the vCPU it may still be running. I'd
suggest !vcpu_runnable(v) && !v->is_running, possibly with the
pause count check instead of the runnable one if the only
permitted case where v != current requires the vCPU to be
paused.

The vmx_vcpu_flush_pml_buffer is only supposed to be called in below cases:

   - When PML full VMEXIT happens
   - In paging_log_dirty_op & hap_track_dirty_vram, before reporting
dirty pages to userspace.
   - In vmx_vcpu_disable_pml, called from vmx_vcpu_destroy, or when
log-dirty mode is disabled.

In the latter two cases, domain_pause is guaranteed to be called before
vmx_vcpu_flush_pml_buffer is called, therefore looks there's no
possibility of non-synchronous pause of the vcpu.

Or are you suggesting we should suppose this function can be called from
any caller, and meanwhile is able to act reasonably?

No. All I'm saying is in order to protect against eventual undue
future callers, it should assert that its preconditions are met. I.e.
if the vCPU is expected to be paused, check that the pause
count is non-zero _and_ that the pause actually took effect.

I see. I will do as you suggested:

ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));

And v != current should be the only case requires the vcpu to be paused.

But if you require (or at least expect) the vCPU to be paused, this
isn't what I suggested. Afaict

ASSERT((v == current) || (!v->is_running && atomic_read(&v->pause_count)));

would then be the right check (and, while not be a full guarantee that
things wouldn't change behind your back, would at least increase
chances that the vCPU's runnable state won't change, as the vCPU
could have been non-runnable for reasons other than having been
paused).

Hi Jan,

I tried the ASSERT with atomic_read(&v->pause_count), and it turns out 
the ASSERT would fail and panic Xen. The reason is domain_pause only 
increases d->pause_count, but it doesn't internally increase 
v->pause_count for all vcpus.


vmx_vcpu_flush_pml_buffer is only supposed to be called from PML buffer 
full VMEXIT, and vmx_domain_flush_pml_buffer, before which domain_pause 
should be called.


Sorry that obviously I had some misunderstanding regarding "require
(or at least expect) the vCPU to be paused", and it looks like
!vcpu_runnable(v) is the right choice.


What's your opinion?

Thanks,
-Kai


Jan







Re: [Xen-devel] [v2 05/11] vmx: add new data structure member to support PML

2015-04-20 Thread Kai Huang



On 04/17/2015 10:31 AM, Kai Huang wrote:



On 04/17/2015 06:39 AM, Tian, Kevin wrote:

From: Kai Huang [mailto:kai.hu...@linux.intel.com]
Sent: Wednesday, April 15, 2015 3:04 PM

A new 4K page pointer is added to arch_vmx_struct as PML buffer for vcpu.
And a new 'status' field is added to vmx_domain to indicate whether PML is
enabled for the domain or not. The 'status' field also can be used for
further similar purposes.

not sure about the last sentence. what's the similar purpose to "whether PML
is enabled"? :-)
I mean potentially there might be such a feature in the future, and I
can't give you an example right now. If you are just commenting on the
description here but are fine with the current code, I can remove that
last sentence if you like. Or do you suggest just using a "bool_t
pml_enabled"? I am fine with both, but as there's no objection from
others I intend to keep it as 'unsigned int status', if you agree.

Hi Kevin,

What's your opinion here? Is 'unsigned int status' OK to you?





Note both new members don't have to be initialized to zero explicitly as both
vcpu and domain structure are zero-ed when they are created.

no initialization in this patch, so why explaining it here?
OK. It looks like it's common sense to all of you, so I'll just remove this
sentence.





Signed-off-by: Kai Huang 
---
  xen/include/asm-x86/hvm/vmx/vmcs.h | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index f831a78..2c679ac 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -70,8 +70,12 @@ struct ept_data {
  cpumask_var_t synced_mask;
  };

+#define _VMX_DOMAIN_PML_ENABLED 0
+#define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED)
  struct vmx_domain {
  unsigned long apic_access_mfn;
+/* VMX_DOMAIN_* */
+unsigned long status;
  };

  struct pi_desc {
@@ -142,6 +146,9 @@ struct arch_vmx_struct {
  /* Bitmap to control vmexit policy for Non-root VMREAD/VMWRITE */
  struct page_info *vmread_bitmap;
  struct page_info *vmwrite_bitmap;
+
+#define NR_PML_ENTRIES   512
+struct page_info *pml_pg;

move the macro out of the structure.

OK. I will move it just above the declaration of struct arch_vmx_struct.


and is pml_buffer or pml_buf more clear?


To me pml_buffer or pml_buf more likely suggests a virtual address through
which you can access the buffer directly, while pml_pg indicates it's a
pointer to a struct page_info. If you look at patch 6, you can find
statements like:


uint64_t *pml_buf;

pml_buf = __map_domain_page(v->arch.hvm_vmx.pml_pg);

So I intend to keep it.

And this one? Are you OK with 'pml_pg'?

Thanks,
-Kai


Thanks,
-Kai



  };

  int vmx_create_vmcs(struct vcpu *v);
--
2.1.0










Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-20 Thread Kai Huang
On Mon, Apr 20, 2015 at 4:29 PM, Tim Deegan  wrote:
> At 17:29 +0800 on 17 Apr (1429291763), Kai Huang wrote:
>>
>>
>> On 04/17/2015 04:36 PM, Tim Deegan wrote:
>> > At 11:32 +0800 on 17 Apr (1429270332), Kai Huang wrote:
>> >>
>> >> On 04/17/2015 08:10 AM, Tim Deegan wrote:
>> >>> At 22:57 +0000 on 16 Apr (1429225024), Tian, Kevin wrote:
>> >>>
>> >>>>> From: Kai Huang [mailto:kai.hu...@linux.intel.com]
>> >>>>> +if ( !p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty,
>> >>>>> +p2m_ram_rw) )
>> >>>>> +paging_mark_gfn_dirty(v->domain, gfn);
>> >>>> Should we handle error from p2m_change_type_one and consequently
>> >>>> making this flush function non-void?
>> >>> I don't think we need to return an error, but we should probably
>> >>> call mark_dirty here for anything except -EBUSY.
>> >> Hi Kevin, Tim,
>> >>
>> >> My intention here is to rule out the GFN with original type that is not
>> >> p2m_ram_logdirty, though with patch 11 it's not likely to have such a GFN
>> >> logged.
>> >>
>> >> Looks -EBUSY returns exactly when original type is not p2m_ram_logdirty,
>> >> so I think it might be OK to do as Tim suggested.
>> >>
>> >> But given the same thing has already been done in hap_track_dirty_vram
>> >> (hap_track_dirty_vram->paging_log_dirty_range, in which gfn is set in
>> >> bitmap with !p2m_change_type_one is true), and in EPT violation
>> >> (p2m_change_type_one is called unconditionally without checking return
>> >> value), I think it should be safe to do the current code here.
>> > The paging_log_dirty_range case is doing something quite different:
>> > it is making pages read-only so they can be tracked, and it needs to
>> > mark any page that couldn't be made read-only (because the guest can
>> > write to them).
>> Thanks for the comprehensive reply. However, it looks like I can't agree on some points.
>>
>> paging_log_dirty_range currently is only used by hap_track_dirty_vram
>> for video ram tracking, and it doesn't call paging_mark_dirty in any
>> case.
>
> Sure, it doesn't call paging_mark_dirty(), but instead it puts the
> marks into a bitmap directly.
>
>> Basically, paging_log_dirty_range only does below thing but
>> nothing else:
>>
>>  for ( i = 0, pfn = begin_pfn; pfn < begin_pfn + nr; i++, pfn++ )
>>  if ( !p2m_change_type_one(d, pfn, p2m_ram_rw, p2m_ram_logdirty) )
>>  dirty_bitmap[i >> 3] |= (1 << (i & 7));
>>
>>  From which we can see the purpose of this function (and let us put PML
>> away for now):
>>
>>  - change GFN's type from p2m_ram_rw back to p2m_ram_logdirty, in
>> order to be able to log the GFN again (more precisely, to track the GFN
>> again in EPT violation), given that the dirty page's p2m type
>> has been changed from p2m_ram_logdirty to p2m_ram_rw in EPT violation.
>>  - mark the dirty GFN in the bitmap only when the above change from
>> p2m_ram_rw to p2m_ram_logdirty is done successfully. It is reasonable,
>> as only a successful change from p2m_ram_rw to p2m_ram_logdirty means the
>> dirty page had been changed from p2m_ram_logdirty to p2m_ram_rw in EPT
>> violation.
>
> Right; so this code should probably also be setting the mark if
> p2m_change_type_one() returns anything except EBUSY.

Reasonable. But as no wrong behavior has been observed with the current
code so far, I intend to leave it unchanged. Is that OK with you?

>
> But in this vram code we can't just set the mark in all cases, because
> we need to detect the case where the type is still p2m_ram_logdirty --
> i.e. the page hasn't been written to.

Agreed.

>
>> Btw, from which we can also note that currently video ram tracking is
>> not via log-dirty radix tree but depends on p2m type change, this is the
>> reason we must call p2m_change_type_one in vmx_vcpu_flush_pml_buffer.
>
> I think we need to do that anyway, to make sure that next time we clear
> the bitmap, the change back from _rw to _logdirty clears the D bit.
>
> But it does suggest that we might want to flush the PML buffers in the
> vram function.

True.

>
>> > Now that I've written it out, and since we expect these races to be
>> > very rare, I've changed my mind: we should _always_ ca

Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-17 Thread Kai Huang



On 04/17/2015 04:36 PM, Tim Deegan wrote:

At 11:32 +0800 on 17 Apr (1429270332), Kai Huang wrote:


On 04/17/2015 08:10 AM, Tim Deegan wrote:

At 22:57 +0000 on 16 Apr (1429225024), Tian, Kevin wrote:


From: Kai Huang [mailto:kai.hu...@linux.intel.com]
+if ( !p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty,
+p2m_ram_rw) )
+paging_mark_gfn_dirty(v->domain, gfn);

Should we handle error from p2m_change_type_one and consequently
making this flush function non-void?

I don't think we need to return an error, but we should probably
call mark_dirty here for anything except -EBUSY.

Hi Kevin, Tim,

My intention here is to rule out the GFN with original type that is not
p2m_ram_logdirty, though with patch 11 it's not likely to have such a GFN
logged.

Looks -EBUSY returns exactly when original type is not p2m_ram_logdirty,
so I think it might be OK to do as Tim suggested.

But given the same thing has already been done in hap_track_dirty_vram
(hap_track_dirty_vram->paging_log_dirty_range, in which gfn is set in
bitmap with !p2m_change_type_one is true), and in EPT violation
(p2m_change_type_one is called unconditionally without checking return
value), I think it should be safe to do the current code here.

The paging_log_dirty_range case is doing something quite different:
it is making pages read-only so they can be tracked, and it needs to
mark any page that couldn't be made read-only (because the guest can
write to them).

Thanks for the comprehensive reply. However, it looks like I can't agree on some points.

paging_log_dirty_range currently is only used by hap_track_dirty_vram 
for video ram tracking, and it doesn't call paging_mark_dirty in any 
case. Basically, paging_log_dirty_range only does below thing but 
nothing else:


for ( i = 0, pfn = begin_pfn; pfn < begin_pfn + nr; i++, pfn++ )
if ( !p2m_change_type_one(d, pfn, p2m_ram_rw, p2m_ram_logdirty) )
dirty_bitmap[i >> 3] |= (1 << (i & 7));

From which we can see the purpose of this function (and let us put PML 
away for now):


- change GFN's type from p2m_ram_rw back to p2m_ram_logdirty, in
order to be able to log the GFN again (more precisely, to track the GFN
again in EPT violation), given that the dirty page's p2m type
has been changed from p2m_ram_logdirty to p2m_ram_rw in EPT violation.
- mark the dirty GFN in the bitmap only when the above change from
p2m_ram_rw to p2m_ram_logdirty is done successfully. It is reasonable,
as only a successful change from p2m_ram_rw to p2m_ram_logdirty means the
dirty page had been changed from p2m_ram_logdirty to p2m_ram_rw in EPT
violation.


Btw, from which we can also note that currently video ram tracking is 
not via log-dirty radix tree but depends on p2m type change, this is the 
reason we must call p2m_change_type_one in vmx_vcpu_flush_pml_buffer.



Its three cases are:
  - change succeeded: no mark, we will trap any new writes
  - EBUSY: mark, since we can't be sure we'll trap new writes
  - other error: mark, since we can't be sure we'll trap new writes
Regarding the above three cases, I assume you are referring to
changing p2m_ram_rw back to p2m_ram_logdirty in paging_log_dirty_range,
in which case paging_mark_dirty is not called at all, as I mentioned
above.




In this case we _know_ the guest has written to the page (because it's
in the PML log), so our only reason for not calling mark_dirty() is
if we see that someone else has changed the p2m (EBUSY) and that
someone else ought to already have DTRT.
I agree on this, given you said we can't be sure for the unsuccessful 
p2m type change.




Now that I've written it out, and since we expect these races to be
very rare, I've changed my mind: we should _always_ call mark_dirty
here.  The extra cost should be negligible, and a missing mark is
extremely difficult to debug.
Which is also good to me, and in this case we should also call 
p2m_change_type_one(.., p2m_ram_logdirty, p2m_ram_rw) unconditionally, 
as this is required for video ram tracking.


Thanks,
-Kai


Cheers,

Tim.



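The conclusion of this thread can be seen in v3 patch 5 above, where both
calls are made unconditionally for every logged GFN:

/* Final form in v3's vmx_vcpu_flush_pml_buffer(): change the type and mark
 * the page dirty unconditionally; a missing mark is far harder to debug
 * than the negligible extra cost. */
p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
paging_mark_gfn_dirty(v->domain, gfn);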


Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-17 Thread Kai Huang



On 04/17/2015 03:37 PM, Jan Beulich wrote:

On 17.04.15 at 09:23,  wrote:

On 04/17/2015 02:58 PM, Jan Beulich wrote:

On 17.04.15 at 08:51,  wrote:

On 04/17/2015 02:23 PM, Jan Beulich wrote:

On 17.04.15 at 05:10,  wrote:

On 04/16/2015 11:42 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+uint64_t *pml_buf;
+unsigned long pml_idx;
+
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+__vmread(GUEST_PML_INDEX, &pml_idx);

Don't you require the vCPU to be non-running or current when you
get here? If so, perhaps add a respective ASSERT()?

Yes an ASSERT would be better.

v->pause_count will be increased if vcpu is kicked out by domain_pause
explicitly, but looks the same thing won't be done if vcpu is kicked out
by PML buffer full VMEXIT. So should the ASSERT be done like below?

ASSERT(atomic_read(&v->pause_count) || (v == current));

For one I'd reverse the two parts. And then I think pause count
being non-zero is not a sufficient condition - if a non-synchronous
pause was issued against the vCPU it may still be running. I'd
suggest !vcpu_runnable(v) && !v->is_running, possibly with the
pause count check instead of the runnable one if the only
permitted case where v != current requires the vCPU to be
paused.

The vmx_vcpu_flush_pml_buffer is only supposed to be called in below cases:

   - When PML full VMEXIT happens
   - In paging_log_dirty_op & hap_track_dirty_vram, before reporting
dirty pages to userspace.
   - In vmx_vcpu_disable_pml, called from vmx_vcpu_destroy, or when
log-dirty mode is disabled.

In the latter two cases, domain_pause is guaranteed to be called before
vmx_vcpu_flush_pml_buffer is called, therefore looks there's no
possibility of non-synchronous pause of the vcpu.

Or are you suggesting we should suppose this function can be called from
any caller, and meanwhile is able to act reasonably?

No. All I'm saying is in order to protect against eventual undue
future callers, it should assert that its preconditions are met. I.e.
if the vCPU is expected to be paused, check that the pause
count is non-zero _and_ that the pause actually took effect.

I see. I will do as you suggested:

ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));

And v != current should be the only case requires the vcpu to be paused.

But if you require (or at least expect) the vCPU to be paused, this
isn't what I suggested. Afaict

ASSERT((v == current) || (!v->is_running && atomic_read(&v->pause_count)));

would then be the right check (and, while not be a full guarantee that
things wouldn't change behind your back, would at least increase
chances that the vCPU's runnable state won't change, as the vCPU
could have been non-runnable for reasons other than having been
paused).
You are right. Your last sentence convinced me. I didn't go that far. I 
will use atomic_read(&v->pause_count) instead of !vcpu_runnable(v).


Thanks,
-Kai


Jan







Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-17 Thread Kai Huang



On 04/17/2015 02:58 PM, Jan Beulich wrote:

On 17.04.15 at 08:51,  wrote:

On 04/17/2015 02:23 PM, Jan Beulich wrote:

On 17.04.15 at 05:10,  wrote:

On 04/16/2015 11:42 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+uint64_t *pml_buf;
+unsigned long pml_idx;
+
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+__vmread(GUEST_PML_INDEX, &pml_idx);

Don't you require the vCPU to be non-running or current when you
get here? If so, perhaps add a respective ASSERT()?

Yes an ASSERT would be better.

v->pause_count will be increased if vcpu is kicked out by domain_pause
explicitly, but looks the same thing won't be done if vcpu is kicked out
by PML buffer full VMEXIT. So should the ASSERT be done like below?

ASSERT(atomic_read(&v->pause_count) || (v == current));

For one I'd reverse the two parts. And then I think pause count
being non-zero is not a sufficient condition - if a non-synchronous
pause was issued against the vCPU it may still be running. I'd
suggest !vcpu_runnable(v) && !v->is_running, possibly with the
pause count check instead of the runnable one if the only
permitted case where v != current requires the vCPU to be
paused.

The vmx_vcpu_flush_pml_buffer is only supposed to be called in below cases:

  - When PML full VMEXIT happens
  - In paging_log_dirty_op & hap_track_dirty_vram, before reporting
dirty pages to userspace.
  - In vmx_vcpu_disable_pml, called from vmx_vcpu_destroy, or when
log-dirty mode is disabled.

In the latter two cases, domain_pause is guaranteed to be called before
vmx_vcpu_flush_pml_buffer is called, therefore looks there's no
possibility of non-synchronous pause of the vcpu.

Or are you suggesting we should suppose this function can be called from
any caller, and meanwhile is able to act reasonably?

No. All I'm saying is in order to protect against eventual undue
future callers, it should assert that its preconditions are met. I.e.
if the vCPU is expected to be paused, check that the pause
count is non-zero _and_ that the pause actually took effect.

I see. I will do as you suggested:

ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));

And v != current should be the only case requires the vcpu to be paused.

Thanks,
-Kai


Jan







Re: [Xen-devel] [v2 11/11] p2m/ept: enable PML in p2m-ept for log-dirty

2015-04-17 Thread Kai Huang



On 04/17/2015 02:28 PM, Jan Beulich wrote:

On 17.04.15 at 04:40,  wrote:

On 04/16/2015 11:54 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

This patch firstly enables EPT A/D bits if PML is used, as PML depends on EPT
A/D bits to work. A bit is set for all present leaf p2m types, D bit is set for
all writable types, except log-dirty type.

I think the tying of "leaf" to the A bit part of the description became
stale, as you're now also doing this for non-leaf ones.

You are right. How about just "A bit is set for all present p2m types, ..."?

Almost - adding "leaf" to the D bit part would still be desirable for
clarity.

That will be more accurate. Thanks for suggestion. Is below good to you?

"A bit is set for all present p2m types in middle and leaf EPT entries, 
and D bit is set for all writable types in the leaf EPT entry, except 
log-dirty type."


And I will update comments at the beginning of ept_p2m_type_to_flags 
accordingly.


Thanks,
-Kai


Jan







Re: [Xen-devel] [v2 10/11] log-dirty: refine common code to support PML

2015-04-16 Thread Kai Huang



On 04/17/2015 02:28 PM, Jan Beulich wrote:

On 17.04.15 at 04:46,  wrote:

On 04/16/2015 11:51 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

@@ -190,9 +196,15 @@ static int hap_enable_log_dirty(struct domain *d, bool_t log_global)
   d->arch.paging.mode |= PG_log_dirty;
   paging_unlock(d);
   
+/* enable hardware-assisted log-dirty if it is supported */

+p2m_enable_hardware_log_dirty(d);

I don't see that you would anywhere avoid setting up software
log-dirty handling - is that on purpose? If so, is there really a
win from adding PML?


   if ( log_global )
   {
-/* set l1e entries of P2M table to be read-only. */
+/*
+ * switch to log dirty mode, either by setting l1e entries of P2M table
+ * to be read-only, or via hardware-assisted log-dirty.
+ */
   p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);

Or did I miss you changing the behavior of this anywhere (as the
changed comment suggests)?

Both of your comments are done in patch 11.

Partly - the new behavior indeed gets added there, but the misconfig
VM exits still seem to be a necessary part of the logic, so the question
stands: Is there really a win from adding PML? Or wait, I think now I
recall - the benefit comes from avoiding the protection violation exits,
not the misconfig ones. Sorry for the noise then.
Yes, PML is targeted at significantly reducing the number of EPT violations
caused by write protection of guest memory, and thus reducing the hypervisor
overhead of the log-dirty mechanism.


Thanks,
-Kai


Jan






Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-16 Thread Kai Huang



On 04/17/2015 02:23 PM, Jan Beulich wrote:

On 17.04.15 at 05:10,  wrote:

On 04/16/2015 11:42 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+uint64_t *pml_buf;
+unsigned long pml_idx;
+
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+__vmread(GUEST_PML_INDEX, &pml_idx);

Don't you require the vCPU to be non-running or current when you
get here? If so, perhaps add a respective ASSERT()?

Yes an ASSERT would be better.

v->pause_count will be increased if vcpu is kicked out by domain_pause
explicitly, but looks the same thing won't be done if vcpu is kicked out
by PML buffer full VMEXIT. So should the ASSERT be done like below?

ASSERT(atomic_read(&v->pause_count) || (v == current));

For one I'd reverse the two parts. And then I think pause count
being non-zero is not a sufficient condition - if a non-synchronous
pause was issued against the vCPU it may still be running. I'd
suggest !vcpu_runnable(v) && !v->is_running, possibly with the
pause count check instead of the runnable one if the only
permitted case where v != current requires the vCPU to be
paused.

The vmx_vcpu_flush_pml_buffer is only supposed to be called in below cases:

- When PML full VMEXIT happens
- In paging_log_dirty_op & hap_track_dirty_vram, before reporting 
dirty pages to userspace.
- In vmx_vcpu_disable_pml, called from vmx_vcpu_destroy, or when 
log-dirty mode is disabled.


In the latter two cases, domain_pause is guaranteed to be called before 
vmx_vcpu_flush_pml_buffer is called, therefore looks there's no 
possibility of non-synchronous pause of the vcpu.


Or are you suggesting we should suppose this function can be called from 
any caller, and meanwhile is able to act reasonably?





+/*
+ * Need to change type from log-dirty to normal memory for logged GFN.
+ * hap_track_dirty_vram depends on it to work. And we really only need
+ * to mark GFNs which have been successfully changed from log-dirty to
+ * normal memory to be dirty.
+ */
+if ( !p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty,
+p2m_ram_rw) )

Indentation.

To be where exactly? Sorry, I didn't find an example to refer to in such a case.

p2m_ram_rw should align with the v in v->domain.

Understood. Will do.

Thanks,
-Kai


Jan






Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-16 Thread Kai Huang



On 04/17/2015 08:10 AM, Tim Deegan wrote:

At 22:57 +0000 on 16 Apr (1429225024), Tian, Kevin wrote:


From: Kai Huang [mailto:kai.hu...@linux.intel.com]
+if ( !p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty,
+p2m_ram_rw) )
+paging_mark_gfn_dirty(v->domain, gfn);

Should we handle error from p2m_change_type_one and consequently
making this flush function non-void?

I don't think we need to return an error, but we should probably
call mark_dirty here for anything except -EBUSY.

Hi Kevin, Tim,

My intention here is to rule out the GFN with original type that is not
p2m_ram_logdirty, though with patch 11 it's not likely to have such a GFN
logged.


Looks -EBUSY returns exactly when original type is not p2m_ram_logdirty, 
so I think it might be OK to do as Tim suggested.


But given the same thing has already been done in hap_track_dirty_vram 
(hap_track_dirty_vram->paging_log_dirty_range, in which gfn is set in 
bitmap with !p2m_change_type_one is true), and in EPT violation 
(p2m_change_type_one is called unconditionally without checking return 
value), I think it should be safe to do the current code here.


What are your comments?

Thanks,
-Kai




+d->arch.hvm_domain.vmx.status |= VMX_DOMAIN_PML_ENABLED;

I didn't see how this domain-wide flag is useful. Or if we really
want to go this way, you also need to clear this flag if enabling PML
on a vcpu fails in the vcpu hotplug phase, since the flag itself covers
all vcpus of the domain, so we must keep this intention in all
places.

IIUC we need this flag so that we know whether to enable PML on any
new vcpus.  If we fail to enable PML on hotplug, we fail the hotplug
(like for other vcpu bringup errors) so the invariant still holds.
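
(Concretely, patch 8 implements that check at vcpu creation; a compressed
sketch, with rc being the usual error variable and error cleanup omitted:

    if ( vmx_domain_pml_enabled(v->domain) &&
         (rc = vmx_vcpu_enable_pml(v)) != 0 )
        return rc;    /* vcpu creation fails, so the flag stays accurate */
)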

Cheers,

Tim.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel





Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-16 Thread Kai Huang



On 04/17/2015 06:57 AM, Tian, Kevin wrote:

From: Kai Huang [mailto:kai.hu...@linux.intel.com]
Sent: Wednesday, April 15, 2015 3:04 PM

This patch adds helper functions to enable/disable PML and to flush the PML
buffer, for a single vcpu and for a particular domain, for further use.

Signed-off-by: Kai Huang 
---
  xen/arch/x86/hvm/vmx/vmcs.c        | 178 +
  xen/include/asm-x86/hvm/vmx/vmcs.h |   9 ++
  2 files changed, 187 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index d120370..d3cb50f 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1328,6 +1328,184 @@ void vmx_clear_eoi_exit_bitmap(struct vcpu *v,
u8 vector)
  &v->arch.hvm_vmx.eoi_exitmap_changed);
  }

+bool_t vmx_vcpu_pml_enabled(const struct vcpu *v)
+{
+return !!(v->arch.hvm_vmx.secondary_exec_control &
+  SECONDARY_EXEC_ENABLE_PML);
+}
+
+int vmx_vcpu_enable_pml(struct vcpu *v)
+{
+struct domain *d = v->domain;
+
+if ( vmx_vcpu_pml_enabled(v) )
+return 0;
+
+v->arch.hvm_vmx.pml_pg = d->arch.paging.alloc_page(d);
+if ( !v->arch.hvm_vmx.pml_pg )
+return -ENOMEM;
+
+vmx_vmcs_enter(v);
+
+__vmwrite(PML_ADDRESS, page_to_mfn(v->arch.hvm_vmx.pml_pg) << PAGE_SHIFT);
+__vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
+
+v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_PML;
+
+__vmwrite(SECONDARY_VM_EXEC_CONTROL,
+  v->arch.hvm_vmx.secondary_exec_control);
+
+vmx_vmcs_exit(v);
+
+return 0;
+}
+
+void vmx_vcpu_disable_pml(struct vcpu *v)
+{
+if ( !vmx_vcpu_pml_enabled(v) )
+return;
+
+/* Make sure we don't lose any logged GPAs */
+vmx_vcpu_flush_pml_buffer(v);
+
+vmx_vmcs_enter(v);
+
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+__vmwrite(SECONDARY_VM_EXEC_CONTROL,
+  v->arch.hvm_vmx.secondary_exec_control);
+
+vmx_vmcs_exit(v);
+
+v->domain->arch.paging.free_page(v->domain, v->arch.hvm_vmx.pml_pg);
+v->arch.hvm_vmx.pml_pg = NULL;
+}
+
+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+uint64_t *pml_buf;
+unsigned long pml_idx;
+
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+__vmread(GUEST_PML_INDEX, &pml_idx);
+
+/* Do nothing if PML buffer is empty */
+if ( pml_idx == (NR_PML_ENTRIES - 1) )
+goto out;
+
+pml_buf = __map_domain_page(v->arch.hvm_vmx.pml_pg);
+
+/*
+ * PML index can be either 2^16-1 (buffer is full), or 0~511 (buffer is not

0~NR_PML_ENTRIES-1

Will do.




+ * full), and in latter case PML index always points to next available
+ * entity.
+ */
+if (pml_idx >= NR_PML_ENTRIES)
+pml_idx = 0;
+else
+pml_idx++;
+
+for ( ; pml_idx < NR_PML_ENTRIES; pml_idx++ )
+{
+unsigned long gfn = pml_buf[pml_idx] >> PAGE_SHIFT;
+/*
+ * Need to change type from log-dirty to normal memory for logged GFN.
+ * hap_track_dirty_vram depends on it to work. And we really only need
+ * to mark GFNs which have been successfully changed from log-dirty to
+ * normal memory to be dirty.
+ */
+if ( !p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty,
+p2m_ram_rw) )
+paging_mark_gfn_dirty(v->domain, gfn);

Should we handle error from p2m_change_type_one and consequently
making this flush function non-void?


+}
+
+unmap_domain_page(pml_buf);
+
+/* Reset PML index */
+__vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);

Like Jan pointed out, an assertion on the vcpu status is required here;
otherwise blindly resetting the PML index may race with new entries logged
by a running vcpu.

Yeah will do.

Thanks,
-Kai



+
+out:
+vmx_vmcs_exit(v);
+}
+
+bool_t vmx_domain_pml_enabled(const struct domain *d)
+{
+return !!(d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED);
+}
+
+/*
+ * This function enables PML for particular domain. It should be called when
+ * domain is paused.
+ *
+ * PML needs to be enabled globally for all vcpus of the domain, as PML buffer
+ * and PML index are per-vcpu, but EPT table is shared by vcpus, therefore
+ * enabling PML on partial vcpus won't work.
+ */
+int vmx_domain_enable_pml(struct domain *d)
+{
+struct vcpu *v;
+int rc;
+
+ASSERT(atomic_read(&d->pause_count));
+
+if ( vmx_domain_pml_enabled(d) )
+return 0;
+
+for_each_vcpu( d, v )
+if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
+goto error;
+
+d->arch.hvm_domain.vmx.status |= VMX_DOMAIN_PML_ENABLED;

I didn't see how this domain-wide flag is useful. Or if we really
want to go this way, you also need to clear this flag if enabling PML
on a vcpu fails in the vcpu hotplug phase, since the flag itself covers
all vcpus of the domain, so we must keep this intention in all
places.

Re: [Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-16 Thread Kai Huang



On 04/16/2015 11:42 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

+int vmx_vcpu_enable_pml(struct vcpu *v)
+{
+struct domain *d = v->domain;
+
+if ( vmx_vcpu_pml_enabled(v) )
+return 0;
+
+v->arch.hvm_vmx.pml_pg = d->arch.paging.alloc_page(d);

So you latch v->domain into d for this invocation, ...


+void vmx_vcpu_disable_pml(struct vcpu *v)
+{
+if ( !vmx_vcpu_pml_enabled(v) )
+return;
+
+/* Make sure we don't lose any logged GPAs */
+vmx_vcpu_flush_pml_buffer(v);
+
+vmx_vmcs_enter(v);
+
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+__vmwrite(SECONDARY_VM_EXEC_CONTROL,
+  v->arch.hvm_vmx.secondary_exec_control);
+
+vmx_vmcs_exit(v);
+
+v->domain->arch.paging.free_page(v->domain, v->arch.hvm_vmx.pml_pg);

... but not for this one. Please be consistent.

Hmm. My bad. I'll use v->domain in both functions.




+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+uint64_t *pml_buf;
+unsigned long pml_idx;
+
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+__vmread(GUEST_PML_INDEX, &pml_idx);

Don't you require the vCPU to be non-running or current when you
get here? If so, perhaps add a respective ASSERT()?

Yes an ASSERT would be better.

v->pause_count will be increased if the vcpu is kicked out by domain_pause
explicitly, but it looks like the same thing won't be done if the vcpu is
kicked out by a PML-buffer-full VMEXIT. So should the ASSERT be done like below?


ASSERT(atomic_read(&v->pause_count) || (v == current));




+
+/* Do nothing if PML buffer is empty */
+if ( pml_idx == (NR_PML_ENTRIES - 1) )
+goto out;
+
+pml_buf = __map_domain_page(v->arch.hvm_vmx.pml_pg);
+
+/*
+ * PML index can be either 2^16-1 (buffer is full), or 0~511 (buffer is not
+ * full), and in latter case PML index always points to next available
+ * entity.
+ */
+if (pml_idx >= NR_PML_ENTRIES)
+pml_idx = 0;
+else
+pml_idx++;
+
+for ( ; pml_idx < NR_PML_ENTRIES; pml_idx++ )
+{
+unsigned long gfn = pml_buf[pml_idx] >> PAGE_SHIFT;

Blank line here please.

Will do.




+/*
+ * Need to change type from log-dirty to normal memory for logged GFN.
+ * hap_track_dirty_vram depends on it to work. And we really only need
+ * to mark GFNs which hve been successfully changed from log-dirty to
+ * normal memory to be dirty.
+ */
+if ( !p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty,
+p2m_ram_rw) )

Indentation.

To be where exactly? Sorry, I couldn't find an example to refer to for such a case.




+paging_mark_gfn_dirty(v->domain, gfn);
+}
+
+unmap_domain_page(pml_buf);
+
+/* Reset PML index */
+__vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
+
+out:

Labels indented by at least one space please.

OK. I'll put one space before the "out:" label.

Thanks,
-Kai


Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel





Re: [Xen-devel] [v2 10/11] log-dirty: refine common code to support PML

2015-04-16 Thread Kai Huang



On 04/16/2015 11:51 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

@@ -190,9 +196,15 @@ static int hap_enable_log_dirty(struct domain *d, bool_t 
log_global)
  d->arch.paging.mode |= PG_log_dirty;
  paging_unlock(d);
  
+/* enable hardware-assisted log-dirty if it is supported */

+p2m_enable_hardware_log_dirty(d);

I don't see that you would anywhere avoid setting up software
log-dirty handling - is that on purpose? If so, is there really a
win from adding PML?


  if ( log_global )
  {
-/* set l1e entries of P2M table to be read-only. */
+/*
+ * switch to log dirty mode, either by setting l1e entries of P2M table
+ * to be read-only, or via hardware-assisted log-dirty.
+ */
  p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);

Or did I miss you changing the behavior of this anywhere (as the
changed comment suggests)?

Both of your comments are addressed in patch 11.

And I think even without patch 11 there's no harm in this patch, as
p2m_enable_hardware_log_dirty will essentially do nothing and write
protection will be used for log-dirty :)
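
(Indeed, the p2m wrapper added in patch 10 guards the call, so it is a no-op
until a later patch installs the hook:

    void p2m_enable_hardware_log_dirty(struct domain *d)
    {
        struct p2m_domain *p2m = p2m_get_hostp2m(d);

        if ( p2m->enable_hardware_log_dirty )   /* hook unset without patch 11 */
        {
            p2m_lock(p2m);
            p2m->enable_hardware_log_dirty(p2m);
            p2m_unlock(p2m);
        }
    }
)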


Thanks,
-Kai


Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel





Re: [Xen-devel] [v2 10/11] log-dirty: refine common code to support PML

2015-04-16 Thread Kai Huang



On 04/17/2015 07:07 AM, Tian, Kevin wrote:

From: Jan Beulich [mailto:jbeul...@suse.com]
Sent: Thursday, April 16, 2015 11:52 PM


On 15.04.15 at 09:03,  wrote:

@@ -190,9 +196,15 @@ static int hap_enable_log_dirty(struct domain *d,

bool_t log_global)

  d->arch.paging.mode |= PG_log_dirty;
  paging_unlock(d);

+/* enable hardware-assisted log-dirty if it is supported */
+p2m_enable_hardware_log_dirty(d);

I don't see that you would anywhere avoid setting up software
log-dirty handling - is that on purpose? If so, is there really a
win from adding PML?


  if ( log_global )
  {
-/* set l1e entries of P2M table to be read-only. */
+/*
+ * switch to log dirty mode, either by setting l1e entries of P2M

table

+ * to be read-only, or via hardware-assisted log-dirty.
+ */
  p2m_change_entry_type_global(d, p2m_ram_rw,

p2m_ram_logdirty);

Or did I miss you changing the behavior of this anywhere (as the
changed comment suggests)?


just found behavior is changed in [11/11]. :-)

Yes.

Thanks,
-Kai


Thanks
Kevin

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel





Re: [Xen-devel] [v2 11/11] p2m/ept: enable PML in p2m-ept for log-dirty

2015-04-16 Thread Kai Huang



On 04/16/2015 11:54 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

This patch firstly enables EPT A/D bits if PML is used, as PML depends on EPT
A/D bits to work. A bit is set for all present leaf p2m types, D bit is set for
all writable types, except log-dirty type.

I think the tying of "leaf" to the A bit part of the description became
stale, as you're now also doing this for non-leaf ones.

You are right. How about just "A bit is set for all present p2m types, ..."?

Thanks,
-Kai


Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel





Re: [Xen-devel] [v2 05/11] vmx: add new data structure member to support PML

2015-04-16 Thread Kai Huang



On 04/17/2015 06:39 AM, Tian, Kevin wrote:

From: Kai Huang [mailto:kai.hu...@linux.intel.com]
Sent: Wednesday, April 15, 2015 3:04 PM

A new 4K page pointer is added to arch_vmx_struct as PML buffer for vcpu. And a
new 'status' field is added to vmx_domain to indicate whether PML is enabled for
the domain or not. The 'status' field can also be used for further similar
purposes.

not sure about the last sentence. what's the similar purpose to "whether PML
is enabled"? :-)
I mean there might potentially be such a feature in the future, though I
can't give you an example right now. If you are just commenting on the
description here but are fine with the current code, I can remove that last
sentence if you like. Or do you suggest just using a "bool_t
pml_enabled"? I am fine with both, but as there seems to be no objection from
others I intend to keep it as 'unsigned int status', if you agree.
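
(For reference, the flag is a single bit in 'status'; patch 6 queries and
updates it as:

    return !!(d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED); /* query */
    d->arch.hvm_domain.vmx.status |=  VMX_DOMAIN_PML_ENABLED;          /* set */
    d->arch.hvm_domain.vmx.status &= ~VMX_DOMAIN_PML_ENABLED;          /* clear */
)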





Note both new members don't have to be initialized to zero explicitly as both
vcpu and domain structure are zero-ed when they are created.

no initialization in this patch, so why explain it here?
OK. It looks like this is common sense to all of you, so I'll just remove this
sentence.





Signed-off-by: Kai Huang 
---
  xen/include/asm-x86/hvm/vmx/vmcs.h | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index f831a78..2c679ac 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -70,8 +70,12 @@ struct ept_data {
  cpumask_var_t synced_mask;
  };

+#define _VMX_DOMAIN_PML_ENABLED0
+#define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED)
  struct vmx_domain {
  unsigned long apic_access_mfn;
+/* VMX_DOMAIN_* */
+unsigned long status;
  };

  struct pi_desc {
@@ -142,6 +146,9 @@ struct arch_vmx_struct {
  /* Bitmap to control vmexit policy for Non-root VMREAD/VMWRITE */
  struct page_info *vmread_bitmap;
  struct page_info *vmwrite_bitmap;
+
+#define NR_PML_ENTRIES   512
+struct page_info *pml_pg;

move the macro out of the structure.

OK. I will move it just above the declaration of struct arch_vmx_struct.


and is pml_buffer or pml_buf more clear?


To me pml_buffer or pml_buf suggests a virtual address through which you can
access the buffer directly, while pml_pg indicates it's a pointer to a
struct page_info. If you look at patch 6, you can find statements like:

uint64_t *pml_buf;

pml_buf = __map_domain_page(v->arch.hvm_vmx.pml_pg);

So I intend to keep it.

Thanks,
-Kai



  };

  int vmx_create_vmcs(struct vcpu *v);
--
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel





Re: [Xen-devel] [v2 04/11] vmx: add PML definition and feature detection.

2015-04-16 Thread Kai Huang



On 04/17/2015 06:35 AM, Tian, Kevin wrote:

From: Kai Huang [mailto:kai.hu...@linux.intel.com]
Sent: Wednesday, April 15, 2015 3:04 PM

The patch adds PML definition and feature detection. Note PML won't be detected
if PML is disabled via the boot parameter. PML is also disabled in construct_vmcs,
as it will only be enabled when the domain is switched to log-dirty mode.

Signed-off-by: Kai Huang 
---
  xen/arch/x86/hvm/vmx/vmcs.c| 22 ++
  xen/include/asm-x86/hvm/vmx/vmcs.h |  6 ++
  xen/include/asm-x86/hvm/vmx/vmx.h  |  1 +
  3 files changed, 29 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 4fff46d..d120370 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -141,6 +141,7 @@ static void __init vmx_display_features(void)
  P(cpu_has_vmx_virtual_intr_delivery, "Virtual Interrupt Delivery");
  P(cpu_has_vmx_posted_intr_processing, "Posted Interrupt Processing");
  P(cpu_has_vmx_vmcs_shadowing, "VMCS shadowing");
+P(cpu_has_vmx_pml, "Page Modification Logging");
  #undef P

  if ( !printed )
@@ -238,6 +239,8 @@ static int vmx_init_vmcs_config(void)
  opt |= SECONDARY_EXEC_ENABLE_VPID;
  if ( opt_unrestricted_guest_enabled )
  opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
+if ( opt_pml_enabled )
+opt |= SECONDARY_EXEC_ENABLE_PML;

  /*
   * "APIC Register Virtualization" and "Virtual Interrupt Delivery"
@@ -284,6 +287,14 @@ static int vmx_init_vmcs_config(void)
   */
  if ( !(_vmx_ept_vpid_cap & VMX_VPID_INVVPID_ALL_CONTEXT) )
  _vmx_secondary_exec_control &=
~SECONDARY_EXEC_ENABLE_VPID;
+
+/*
+ * PML cannot be supported if EPT A/D bits is not supported. Actually,
+ * PML should not be detected if EPT A/D bits is not supported, but for
+ * sure we do the check anyway.
+ */
+if ( !(_vmx_ept_vpid_cap & VMX_EPT_AD_BIT) )
+_vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
  }

the comment is not very clear. I think you just want to say "EPT A/D bit is
required for PML" or no comment at all.

Sure, I'll change it to "EPT A/D bit is required for PML".

Thanks,
-Kai



  if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
@@ -304,6 +315,14 @@ static int vmx_init_vmcs_config(void)
SECONDARY_EXEC_UNRESTRICTED_GUEST);
  }

+/* PML cannot be supported if EPT is not used */
+if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) )
+_vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+/* Turn off opt_pml_enabled if PML feature is not present */
+if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML) )
+opt_pml_enabled = 0;
+
if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
  ple_gap == 0 )
  {
@@ -1039,6 +1058,9 @@ static int construct_vmcs(struct vcpu *v)
__vmwrite(POSTED_INTR_NOTIFICATION_VECTOR, posted_intr_vector);
  }

+/* Disable PML anyway here as it will only be enabled in log dirty mode */
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
  /* Host data selectors. */
  __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
  __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 6fce6aa..f831a78 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -215,6 +215,7 @@ extern u32 vmx_vmentry_control;
  #define SECONDARY_EXEC_ENABLE_INVPCID   0x1000
  #define SECONDARY_EXEC_ENABLE_VMFUNC0x2000
  #define SECONDARY_EXEC_ENABLE_VMCS_SHADOWING0x4000
+#define SECONDARY_EXEC_ENABLE_PML   0x0002
  extern u32 vmx_secondary_exec_control;

  #define VMX_EPT_EXEC_ONLY_SUPPORTED 0x0001
@@ -226,6 +227,7 @@ extern u32 vmx_secondary_exec_control;
  #define VMX_EPT_INVEPT_INSTRUCTION  0x0010
  #define VMX_EPT_INVEPT_SINGLE_CONTEXT   0x0200
  #define VMX_EPT_INVEPT_ALL_CONTEXT  0x0400
+#define VMX_EPT_AD_BIT  0x0020

  #define VMX_MISC_VMWRITE_ALL0x2000

@@ -274,6 +276,8 @@ extern u32 vmx_secondary_exec_control;
  (vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT)
  #define cpu_has_vmx_vmcs_shadowing \
(vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VMCS_SHADOWING)
+#define cpu_has_vmx_pml \
+(vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML)

  #define VMCS_RID_TYPE_MASK  0x8000

@@ -318,6 +322,7 @@ enum vmcs_field {
  GUEST_LDTR_SELECTOR = 0x080c,
  GUEST_TR_SELECTOR 

Re: [Xen-devel] [v2 05/11] vmx: add new data structure member to support PML

2015-04-16 Thread Kai Huang



On 04/16/2015 11:33 PM, Jan Beulich wrote:

On 15.04.15 at 09:03,  wrote:

--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -70,8 +70,12 @@ struct ept_data {
  cpumask_var_t synced_mask;
  };
  
+#define _VMX_DOMAIN_PML_ENABLED0

+#define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED)
  struct vmx_domain {
  unsigned long apic_access_mfn;
+/* VMX_DOMAIN_* */
+unsigned long status;

Please us "unsigned int" until 32 bits wouldn't suffice anymore. This
will (on the average) produce slightly smaller code.

Sure.

Thanks,
-Kai


Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel





Re: [Xen-devel] [v2 00/11] PML (Paging Modification Logging) support

2015-04-16 Thread Kai Huang
Thanks Tim!

I'll send out the v3 addressing minor comments from Andrew and Jan
regarding to patch 1 & 2.

Thanks,
-Kai

On Thu, Apr 16, 2015 at 10:41 PM, Tim Deegan  wrote:
> At 15:03 +0800 on 15 Apr (1429110222), Kai Huang wrote:
>> This v2 patch series was rebased on latest upstream code.
>
> Looks good to me.
>
> Acked-by: Tim Deegan 
>
> ___
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel



-- 
Thanks,
-Kai

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [v2 02/11] doc: add description for new PML boot parameter

2015-04-15 Thread Kai Huang



On 04/15/2015 06:15 PM, Andrew Cooper wrote:

On 15/04/15 08:03, Kai Huang wrote:

This patch adds doc description for new boot parameter 'ept=pml'.

Signed-off-by: Kai Huang 

Personally, I would fold this patch into the previous so the
documentation is in the same patch as introduces the parameter.

Thanks for suggestion. Will do in the next version.




---
  docs/misc/xen-command-line.markdown | 14 ++
  1 file changed, 14 insertions(+)

diff --git a/docs/misc/xen-command-line.markdown 
b/docs/misc/xen-command-line.markdown
index 1dda1f0..ae30d02 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -685,6 +685,20 @@ requirement can be relaxed.  This option is particularly 
useful for nested
  virtualization, to allow the L1 hypervisor to use EPT even if the L0 
hypervisor
  does not provide VM\_ENTRY\_LOAD\_GUEST\_PAT.
  
+### ept (Intel)

+> `= List of [ pml ]`
+
+> Sub-options:
+
+> `pml`
+
+> This sub-option is boolean type and can be prefixed with `no-` to effect the
+> inverse meaning.
+
+> Default: `false`
+
+>> Control the use of Page Modification Logging for log-dirty.

Can you follow the same style as the "psr" option?  Specifically, I
don't think you need to re-describe what a boolean parameter is.
Thanks for the suggestion. I chose to follow the "iommu=" style as, to me,
the "ept=" parameter is more similar to the "iommu=" parameter -- both of
them are intended to have various boolean sub-options, some of which may or
may not be independent of each other.


Referring to the 'psr' parameter, does the change below look good to you?

+### ept (Intel)
+> `= List of ( pml )`
+
+> Default: `false`
+
+Controls EPT related features. Currently only Page Modification Logging (PML)
+is the controllable feature, as a boolean sub-option.
+
+PML is a new hardware feature in Intel's Broadwell Server and later platforms
+which reduces hypervisor overhead of the log-dirty mechanism by automatically
+recording GPAs (guest physical addresses) when guest memory gets dirty, and
+therefore significantly reduces the number of EPT violations caused by the
+write protection of guest memory, which was a necessity to implement the
+log-dirty mechanism before PML.
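
(For illustration, with this syntax the option would be given on the Xen
command line as "ept=pml" to enable PML, or "ept=no-pml" to explicitly keep
it disabled, the latter matching the default.)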

Thanks,
-Kai


~Andrew


+
  ### gdb
  > `= <baud>[/<clock_hz>][,DPS[,<io-base>[,<irq>[,<port-bdf>[,<bridge-bdf>]]]] | pci | amt ] `
  


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel





Re: [Xen-devel] [v2 01/11] vmx: add new boot parameter to control PML enabling

2015-04-15 Thread Kai Huang
On Wed, Apr 15, 2015 at 8:20 PM, Jan Beulich  wrote:
 On 15.04.15 at 09:03,  wrote:
>> +static void __init parse_ept_param(char *s)
>> +{
>> +char *ss;
>> +int val;
>
> bool_t, and would better move ...
>
>> +
>> +do {
>> +val = !!strncmp(s, "no-", 3);
>
> ... here (making the right side expression its intializer).

Hi Jan,

Thanks for the review. Do you mean the following?

do {
bool_t val = !!strncmp(s,  "no-", 3);
...
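
(Expanded, that reading would make the whole function look like this sketch,
based on the v2 patch with Jan's suggested change applied:

    static void __init parse_ept_param(char *s)
    {
        char *ss;

        do {
            /* val is 0 when the sub-option carries a "no-" prefix */
            bool_t val = !!strncmp(s, "no-", 3);

            if ( !val )
                s += 3;

            ss = strchr(s, ',');
            if ( ss )
                *ss = '\0';

            if ( !strcmp(s, "pml") )
                opt_pml_enabled = val;

            s = ss + 1;
        } while ( ss );
    }
)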

Thanks,
-Kai
>
> Jan
>
>
> ___
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel



-- 
Thanks,
-Kai

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [v2 11/11] p2m/ept: enable PML in p2m-ept for log-dirty

2015-04-15 Thread Kai Huang
This patch firstly enables EPT A/D bits if PML is used, as PML depends on EPT
A/D bits to work. A bit is set for all present leaf p2m types, D bit is set for
all writable types, except log-dirty type.

With PML, for 4K pages, instead of setting the EPT entry to read-only, we just need
to clear the D bit in order to log that GFN. For superpages, we still need to set it
to read-only as we need to split the superpage into 4K pages on EPT violation.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/p2m-ept.c  | 79 ++
 xen/include/asm-x86/hvm/vmx/vmcs.h |  3 +-
 xen/include/asm-x86/hvm/vmx/vmx.h  |  3 +-
 3 files changed, 76 insertions(+), 9 deletions(-)

diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 5e95a83..ff84c16 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -102,9 +102,20 @@ static int atomic_write_ept_entry(ept_entry_t *entryptr, 
ept_entry_t new,
 return rc;
 }
 
-static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, 
p2m_access_t access)
+static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
+  p2m_type_t type, p2m_access_t access)
 {
-/* First apply type permissions */
+/*
+ * First apply type permissions.
+ *
+ * A/D bits are also manually set to avoid overhead of MMU having to set
+ * them later. Both A/D bits are safe to be updated directly as they are
+ * ignored by processor if EPT A/D bits is not turned on.
+ *
+ * A bit is set for all present leaf types. D bit is set for all writable
+ * types and cleared for read-only types, as read-only types are apparently
+ * impossible to be dirty.
+ */
 switch(type)
 {
 case p2m_invalid:
@@ -118,27 +129,51 @@ static void ept_p2m_type_to_flags(ept_entry_t *entry, 
p2m_type_t type, p2m_acces
 break;
 case p2m_ram_rw:
 entry->r = entry->w = entry->x = 1;
+entry->a = entry->d = 1;
 break;
 case p2m_mmio_direct:
 entry->r = entry->x = 1;
 entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
 entry->mfn);
+entry->a = 1;
+entry->d = entry->w;
 break;
 case p2m_ram_logdirty:
+entry->r = entry->x = 1;
+/*
+ * In case of PML, we don't have to write protect 4K page, but
+ * only need to clear D-bit for it, but we still need to write
+ * protect super page in order to split it to 4K pages in EPT
+ * violation.
+ */
+if ( vmx_domain_pml_enabled(p2m->domain)
+ && !is_epte_superpage(entry) )
+entry->w = 1;
+else
+entry->w = 0;
+entry->a = 1;
+/* For both PML or non-PML cases we clear D bit anyway */
+entry->d = 0;
+break;
 case p2m_ram_ro:
 case p2m_ram_shared:
 entry->r = entry->x = 1;
 entry->w = 0;
+entry->a = 1;
+entry->d = 0;
 break;
 case p2m_grant_map_rw:
 case p2m_map_foreign:
 entry->r = entry->w = 1;
 entry->x = 0;
+entry->a = entry->d = 1;
 break;
 case p2m_grant_map_ro:
 case p2m_mmio_write_dm:
 entry->r = 1;
 entry->w = entry->x = 0;
+entry->a = 1;
+entry->d = 0;
 break;
 }
 
@@ -194,6 +229,8 @@ static int ept_set_middle_entry(struct p2m_domain *p2m, 
ept_entry_t *ept_entry)
 ept_entry->access = p2m->default_access;
 
 ept_entry->r = ept_entry->w = ept_entry->x = 1;
+/* Manually set A bit to avoid overhead of MMU having to write it later. */
+ept_entry->a = 1;
 
 return 1;
 }
@@ -244,10 +281,9 @@ static int ept_split_super_page(struct p2m_domain *p2m, 
ept_entry_t *ept_entry,
 epte->sp = (level > 1);
 epte->mfn += i * trunk;
 epte->snp = (iommu_enabled && iommu_snoop);
-ASSERT(!epte->rsvd1);
 ASSERT(!epte->avail3);
 
-ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
+ept_p2m_type_to_flags(p2m, epte, epte->sa_p2mt, epte->access);
 
 if ( (level - 1) == target )
 continue;
@@ -489,7 +525,7 @@ static int resolve_misconfig(struct p2m_domain *p2m, 
unsigned long gfn)
 {
  e.sa_p2mt = p2m_is_logdirty_range(p2m, gfn + i, gfn + 
i)
  ? p2m_ram_logdirty : p2m_ram_rw;
- ept_p2m_type_to_flags(&e, e.sa_p2mt, e.access);
+ ept_p2m_type_to_flags(p2m, &e, e.sa_p2mt, e.access);

[Xen-devel] [v2 01/11] vmx: add new boot parameter to control PML enabling

2015-04-15 Thread Kai Huang
A top level EPT parameter "ept=" and a sub boolean "opt_pml_enabled"
are added to control PML. Other booleans can be further added for any other EPT
related features.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmcs.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index d614638..4fff46d 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -64,6 +64,37 @@ integer_param("ple_gap", ple_gap);
 static unsigned int __read_mostly ple_window = 4096;
 integer_param("ple_window", ple_window);
 
+static bool_t __read_mostly opt_pml_enabled = 0;
+
+/*
+ * The 'ept' parameter controls functionalities that depend on, or impact the
+ * EPT mechanism. Optional comma separated value may contain:
+ *
+ *  pml Enable PML
+ */
+static void __init parse_ept_param(char *s)
+{
+char *ss;
+int val;
+
+do {
+val = !!strncmp(s, "no-", 3);
+if ( !val )
+s += 3;
+
+ss = strchr(s, ',');
+if ( ss )
+*ss = '\0';
+
+if ( !strcmp(s, "pml") )
+opt_pml_enabled = val;
+
+s = ss + 1;
+} while ( ss );
+}
+
+custom_param("ept", parse_ept_param);
+
 /* Dynamic (run-time adjusted) execution control flags. */
 u32 vmx_pin_based_exec_control __read_mostly;
 u32 vmx_cpu_based_exec_control __read_mostly;
-- 
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [v2 09/11] vmx: disable PML in vmx_vcpu_destroy

2015-04-15 Thread Kai Huang
It's possible a domain still remains in log-dirty mode when it is about to be
destroyed, in which case we should manually disable PML for it.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index ad9d7d4..821e90b 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -152,6 +152,14 @@ static int vmx_vcpu_initialise(struct vcpu *v)
 
 static void vmx_vcpu_destroy(struct vcpu *v)
 {
+/*
+ * There are cases that domain still remains in log-dirty mode when it is
+ * about to be destroyed (ex, user types 'xl destroy '), in which case
+ * we should disable PML manually here. Note that vmx_vcpu_destroy is called
+ * prior to vmx_domain_destroy so we need to disable PML for each vcpu
+ * separately here.
+ */
+vmx_vcpu_disable_pml(v);
 vmx_destroy_vmcs(v);
 vpmu_destroy(v);
 passive_domain_destroy(v);
-- 
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [v2 08/11] vmx: handle PML enabling in vmx_vcpu_initialise

2015-04-15 Thread Kai Huang
It's possible the domain is already in log-dirty mode when a vcpu is created, in
which case we should enable PML for this vcpu if PML has been enabled for the
domain.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 279e745..ad9d7d4 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -116,6 +116,29 @@ static int vmx_vcpu_initialise(struct vcpu *v)
 return rc;
 }
 
+/*
+ * It's rare but still possible that domain has already been in log-dirty
+ * mode when vcpu is being created (commented by Tim), in which case we
+ * should enable PML for this vcpu if PML has been enabled for the domain,
+ * and failure to enable results in failure of creating this vcpu.
+ *
+ * Note even there's no vcpu created for the domain, vmx_domain_enable_pml
+ * will return successful in which case vmx_domain_pml_enabled will also
+ * return true. And even this is the first vcpu to be created with
+ * vmx_domain_pml_enabled being true, failure of enabling PML still results
+ * in failure of creating vcpu, to avoid complicated logic to revert PML
+ * style EPT table to non-PML style EPT table.
+ */
+if ( vmx_domain_pml_enabled(v->domain) )
+{
+if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
+{
+dprintk(XENLOG_ERR, "%pv: Failed to enable PML.\n", v);
+vmx_destroy_vmcs(v);
+return rc;
+}
+}
+
 vpmu_initialise(v);
 
 vmx_install_vlapic_mapping(v);
-- 
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [v2 10/11] log-dirty: refine common code to support PML

2015-04-15 Thread Kai Huang
Using PML, it's possible there are dirty GPAs logged in vcpus' PML buffers
when userspace peeks at or clears dirty pages, therefore we need to flush them before
reporting dirty pages to userspace. This applies to both video ram tracking and
paging_log_dirty_op.

This patch adds new p2m layer functions to enable/disable PML and flush PML
buffers. The new functions are named generically to cover potential further
PML-like features for other platforms.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/hap/hap.c | 29 +
 xen/arch/x86/mm/p2m.c | 36 
 xen/arch/x86/mm/paging.c  | 10 ++
 xen/include/asm-x86/p2m.h | 11 +++
 4 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 4ecb2e2..1099670 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -121,7 +121,10 @@ int hap_track_dirty_vram(struct domain *d,
 p2m_change_type_range(d, ostart, oend,
   p2m_ram_logdirty, p2m_ram_rw);
 
-/* set l1e entries of range within P2M table to be read-only. */
+/*
+ * switch vram to log dirty mode, either by setting l1e entries of
+ * P2M table to be read-only, or via hardware-assisted log-dirty.
+ */
 p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
   p2m_ram_rw, p2m_ram_logdirty);
 
@@ -135,6 +138,9 @@ int hap_track_dirty_vram(struct domain *d,
 
 domain_pause(d);
 
+/* flush dirty GFNs potentially cached by hardware */
+p2m_flush_hardware_cached_dirty(d);
+
 /* get the bitmap */
 paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap);
 
@@ -190,9 +196,15 @@ static int hap_enable_log_dirty(struct domain *d, bool_t 
log_global)
 d->arch.paging.mode |= PG_log_dirty;
 paging_unlock(d);
 
+/* enable hardware-assisted log-dirty if it is supported */
+p2m_enable_hardware_log_dirty(d);
+
 if ( log_global )
 {
-/* set l1e entries of P2M table to be read-only. */
+/*
+ * switch to log dirty mode, either by setting l1e entries of P2M table
+ * to be read-only, or via hardware-assisted log-dirty.
+ */
 p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
 flush_tlb_mask(d->domain_dirty_cpumask);
 }
@@ -205,14 +217,23 @@ static int hap_disable_log_dirty(struct domain *d)
 d->arch.paging.mode &= ~PG_log_dirty;
 paging_unlock(d);
 
-/* set l1e entries of P2M table with normal mode */
+/* disable hardware-assisted log-dirty if it is supported */
+p2m_disable_hardware_log_dirty(d);
+
+/*
+ * switch to normal mode, either by setting l1e entries of P2M table to
+ * normal mode, or via hardware-assisted log-dirty.
+ */
 p2m_change_entry_type_global(d, p2m_ram_logdirty, p2m_ram_rw);
 return 0;
 }
 
 static void hap_clean_dirty_bitmap(struct domain *d)
 {
-/* set l1e entries of P2M table to be read-only. */
+/*
+ * switch to log-dirty mode, either by setting l1e entries of P2M table to
+ * be read-only, or via hardware-assisted log-dirty.
+ */
 p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
 flush_tlb_mask(d->domain_dirty_cpumask);
 }
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 6a06e9f..291a275 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -239,6 +239,42 @@ void p2m_memory_type_changed(struct domain *d)
 }
 }
 
+void p2m_enable_hardware_log_dirty(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+if ( p2m->enable_hardware_log_dirty )
+{
+p2m_lock(p2m);
+p2m->enable_hardware_log_dirty(p2m);
+p2m_unlock(p2m);
+}
+}
+
+void p2m_disable_hardware_log_dirty(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+if ( p2m->disable_hardware_log_dirty )
+{
+p2m_lock(p2m);
+p2m->disable_hardware_log_dirty(p2m);
+p2m_unlock(p2m);
+}
+}
+
+void p2m_flush_hardware_cached_dirty(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+if ( p2m->flush_hardware_cached_dirty )
+{
+p2m_lock(p2m);
+p2m->flush_hardware_cached_dirty(p2m);
+p2m_unlock(p2m);
+}
+}
+
 mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
 p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
 unsigned int *page_order, bool_t locked)
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index 77c929b..59d4720 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -422,7 +422,17 @@ static int paging_log_dirty_op(struct domain *d,
 int i4, i3, i2;
 
 if ( !resuming )
+{
 domain_pause(d)

[Xen-devel] [v2 06/11] vmx: add help functions to support PML

2015-04-15 Thread Kai Huang
This patch adds helper functions to enable/disable PML and to flush the PML
buffer, for a single vcpu and for a particular domain, for further use.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmcs.c| 178 +
 xen/include/asm-x86/hvm/vmx/vmcs.h |   9 ++
 2 files changed, 187 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index d120370..d3cb50f 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1328,6 +1328,184 @@ void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 
vector)
 &v->arch.hvm_vmx.eoi_exitmap_changed);
 }
 
+bool_t vmx_vcpu_pml_enabled(const struct vcpu *v)
+{
+return !!(v->arch.hvm_vmx.secondary_exec_control &
+  SECONDARY_EXEC_ENABLE_PML);
+}
+
+int vmx_vcpu_enable_pml(struct vcpu *v)
+{
+struct domain *d = v->domain;
+
+if ( vmx_vcpu_pml_enabled(v) )
+return 0;
+
+v->arch.hvm_vmx.pml_pg = d->arch.paging.alloc_page(d);
+if ( !v->arch.hvm_vmx.pml_pg )
+return -ENOMEM;
+
+vmx_vmcs_enter(v);
+
+__vmwrite(PML_ADDRESS, page_to_mfn(v->arch.hvm_vmx.pml_pg) << PAGE_SHIFT);
+__vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
+
+v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_PML;
+
+__vmwrite(SECONDARY_VM_EXEC_CONTROL,
+  v->arch.hvm_vmx.secondary_exec_control);
+
+vmx_vmcs_exit(v);
+
+return 0;
+}
+
+void vmx_vcpu_disable_pml(struct vcpu *v)
+{
+if ( !vmx_vcpu_pml_enabled(v) )
+return;
+
+/* Make sure we don't lose any logged GPAs */
+vmx_vcpu_flush_pml_buffer(v);
+
+vmx_vmcs_enter(v);
+
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+__vmwrite(SECONDARY_VM_EXEC_CONTROL,
+  v->arch.hvm_vmx.secondary_exec_control);
+
+vmx_vmcs_exit(v);
+
+v->domain->arch.paging.free_page(v->domain, v->arch.hvm_vmx.pml_pg);
+v->arch.hvm_vmx.pml_pg = NULL;
+}
+
+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+uint64_t *pml_buf;
+unsigned long pml_idx;
+
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+__vmread(GUEST_PML_INDEX, &pml_idx);
+
+/* Do nothing if PML buffer is empty */
+if ( pml_idx == (NR_PML_ENTRIES - 1) )
+goto out;
+
+pml_buf = __map_domain_page(v->arch.hvm_vmx.pml_pg);
+
+/*
+ * PML index can be either 2^16-1 (buffer is full), or 0~511 (buffer is not
+ * full), and in latter case PML index always points to next available
+ * entity.
+ */
+if (pml_idx >= NR_PML_ENTRIES)
+pml_idx = 0;
+else
+pml_idx++;
+
+for ( ; pml_idx < NR_PML_ENTRIES; pml_idx++ )
+{
+unsigned long gfn = pml_buf[pml_idx] >> PAGE_SHIFT;
+/*
+ * Need to change type from log-dirty to normal memory for logged GFN.
+ * hap_track_dirty_vram depends on it to work. And we really only need
+ * to mark GFNs which hve been successfully changed from log-dirty to
+ * normal memory to be dirty.
+ */
+if ( !p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty,
+p2m_ram_rw) )
+paging_mark_gfn_dirty(v->domain, gfn);
+}
+
+unmap_domain_page(pml_buf);
+
+/* Reset PML index */
+__vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
+
+out:
+vmx_vmcs_exit(v);
+}
+
+bool_t vmx_domain_pml_enabled(const struct domain *d)
+{
+return !!(d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED);
+}
+
+/*
+ * This function enables PML for particular domain. It should be called when
+ * domain is paused.
+ *
+ * PML needs to be enabled globally for all vcpus of the domain, as PML buffer
+ * and PML index are per-vcpu, but EPT table is shared by vcpus, therefore
+ * enabling PML on partial vcpus won't work.
+ */
+int vmx_domain_enable_pml(struct domain *d)
+{
+struct vcpu *v;
+int rc;
+
+ASSERT(atomic_read(&d->pause_count));
+
+if ( vmx_domain_pml_enabled(d) )
+return 0;
+
+for_each_vcpu( d, v )
+if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
+goto error;
+
+d->arch.hvm_domain.vmx.status |= VMX_DOMAIN_PML_ENABLED;
+
+return 0;
+
+error:
+for_each_vcpu( d, v )
+if ( vmx_vcpu_pml_enabled(v) )
+vmx_vcpu_disable_pml(v);
+return rc;
+}
+
+/*
+ * Disable PML for particular domain. Called when domain is paused.
+ *
+ * The same as enabling PML for domain, disabling PML should be done for all
+ * vcpus at once.
+ */
+void vmx_domain_disable_pml(struct domain *d)
+{
+struct vcpu *v;
+
+ASSERT(atomic_read(&d->pause_count));
+
+if ( !vmx_domain_pml_enabled(d) )
+return;
+
+for_each_vcpu( d, v )
+vmx_vcpu_disable_pml(v);
+
+d->arch.hvm_domain.vmx.status &= ~VMX_DOMAIN_PML_ENABLED;
+}
+
+/*
+ * Flush PML buffer of all vcpus, and upda

[Xen-devel] [v2 02/11] doc: add description for new PML boot parameter

2015-04-15 Thread Kai Huang
This patch adds doc description for new boot parameter 'ept=pml'.

Signed-off-by: Kai Huang 
---
 docs/misc/xen-command-line.markdown | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/docs/misc/xen-command-line.markdown 
b/docs/misc/xen-command-line.markdown
index 1dda1f0..ae30d02 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -685,6 +685,20 @@ requirement can be relaxed.  This option is particularly 
useful for nested
 virtualization, to allow the L1 hypervisor to use EPT even if the L0 hypervisor
 does not provide VM\_ENTRY\_LOAD\_GUEST\_PAT.
 
+### ept (Intel)
+> `= List of [ pml ]`
+
+> Sub-options:
+
+> `pml`
+
+> This sub-option is boolean type and can be prefixed with `no-` to effect the
+> inverse meaning.
+
+> Default: `false`
+
+>> Control the use of Page Modification Logging for log-dirty.
+
 ### gdb
 > `= <baud>[/<clock_hz>][,DPS[,<io-base>[,<irq>[,<port-bdf>[,<bridge-bdf>]]]] | pci | amt ] `
 
-- 
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [v2 03/11] log-dirty: add new paging_mark_gfn_dirty

2015-04-15 Thread Kai Huang
PML logs GPAs in the PML buffer. The original paging_mark_dirty takes an MFN as
parameter but it gets the guest pfn internally and uses the guest pfn as the index
for looking up the radix log-dirty tree. When flushing the PML buffer, calling
paging_mark_dirty directly introduces redundant p2m lookups (gfn->mfn->gfn),
therefore we introduce paging_mark_gfn_dirty, which is the bulk of
paging_mark_dirty but takes a guest pfn as parameter, and when flushing the PML
buffer we call paging_mark_gfn_dirty directly. The original paging_mark_dirty
then simply is a wrapper of paging_mark_gfn_dirty.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/paging.c | 31 +--
 xen/include/asm-x86/paging.h |  2 ++
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index b54d76a..77c929b 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -266,24 +266,17 @@ static int paging_log_dirty_disable(struct domain *d, 
bool_t resuming)
 return ret;
 }
 
-/* Mark a page as dirty */
-void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
+/* Mark a page as dirty, with taking guest pfn as parameter */
+void paging_mark_gfn_dirty(struct domain *d, unsigned long pfn)
 {
-unsigned long pfn;
-mfn_t gmfn;
 int changed;
 mfn_t mfn, *l4, *l3, *l2;
 unsigned long *l1;
 int i1, i2, i3, i4;
 
-gmfn = _mfn(guest_mfn);
-
-if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) ||
- page_get_owner(mfn_to_page(gmfn)) != d )
+if ( !paging_mode_log_dirty(d) )
 return;
 
-/* We /really/ mean PFN here, even for non-translated guests. */
-pfn = get_gpfn_from_mfn(mfn_x(gmfn));
 /* Shared MFNs should NEVER be marked dirty */
 BUG_ON(SHARED_M2P(pfn));
 
@@ -351,6 +344,24 @@ out:
 return;
 }
 
+/* Mark a page as dirty */
+void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
+{
+unsigned long pfn;
+mfn_t gmfn;
+
+gmfn = _mfn(guest_mfn);
+
+if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) ||
+ page_get_owner(mfn_to_page(gmfn)) != d )
+return;
+
+/* We /really/ mean PFN here, even for non-translated guests. */
+pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+paging_mark_gfn_dirty(d, pfn);
+}
+
 
 /* Is this guest page dirty? */
 int paging_mfn_is_dirty(struct domain *d, mfn_t gmfn)
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index 53de715..c99324c 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -156,6 +156,8 @@ void paging_log_dirty_init(struct domain *d,
 
 /* mark a page as dirty */
 void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);
+/* mark a page as dirty with taking guest pfn as parameter */
+void paging_mark_gfn_dirty(struct domain *d, unsigned long pfn);
 
 /* is this guest page dirty? 
  * This is called from inside paging code, with the paging lock held. */
-- 
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [v2 04/11] vmx: add PML definition and feature detection.

2015-04-15 Thread Kai Huang
The patch adds PML definition and feature detection. Note PML won't be detected
if PML is disabled via the boot parameter. PML is also disabled in construct_vmcs,
as it will only be enabled when the domain is switched to log-dirty mode.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmcs.c| 22 ++
 xen/include/asm-x86/hvm/vmx/vmcs.h |  6 ++
 xen/include/asm-x86/hvm/vmx/vmx.h  |  1 +
 3 files changed, 29 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 4fff46d..d120370 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -141,6 +141,7 @@ static void __init vmx_display_features(void)
 P(cpu_has_vmx_virtual_intr_delivery, "Virtual Interrupt Delivery");
 P(cpu_has_vmx_posted_intr_processing, "Posted Interrupt Processing");
 P(cpu_has_vmx_vmcs_shadowing, "VMCS shadowing");
+P(cpu_has_vmx_pml, "Page Modification Logging");
 #undef P
 
 if ( !printed )
@@ -238,6 +239,8 @@ static int vmx_init_vmcs_config(void)
 opt |= SECONDARY_EXEC_ENABLE_VPID;
 if ( opt_unrestricted_guest_enabled )
 opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
+if ( opt_pml_enabled )
+opt |= SECONDARY_EXEC_ENABLE_PML;
 
 /*
  * "APIC Register Virtualization" and "Virtual Interrupt Delivery"
@@ -284,6 +287,14 @@ static int vmx_init_vmcs_config(void)
  */
 if ( !(_vmx_ept_vpid_cap & VMX_VPID_INVVPID_ALL_CONTEXT) )
 _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+
+/*
+ * PML cannot be supported if EPT A/D bits is not supported. Actually,
+ * PML should not be detected if EPT A/D bits is not supported, but for
+ * sure we do the check anyway.
+ */
+if ( !(_vmx_ept_vpid_cap & VMX_EPT_AD_BIT) )
+_vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 }
 
 if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
@@ -304,6 +315,14 @@ static int vmx_init_vmcs_config(void)
   SECONDARY_EXEC_UNRESTRICTED_GUEST);
 }
 
+/* PML cannot be supported if EPT is not used */
+if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) )
+_vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+/* Turn off opt_pml_enabled if PML feature is not present */
+if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML) )
+opt_pml_enabled = 0;
+
 if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
   ple_gap == 0 )
 {
@@ -1039,6 +1058,9 @@ static int construct_vmcs(struct vcpu *v)
 __vmwrite(POSTED_INTR_NOTIFICATION_VECTOR, posted_intr_vector);
 }
 
+/* Disable PML anyway here as it will only be enabled in log dirty mode */
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
 /* Host data selectors. */
 __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
 __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 6fce6aa..f831a78 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -215,6 +215,7 @@ extern u32 vmx_vmentry_control;
 #define SECONDARY_EXEC_ENABLE_INVPCID   0x1000
 #define SECONDARY_EXEC_ENABLE_VMFUNC0x2000
 #define SECONDARY_EXEC_ENABLE_VMCS_SHADOWING0x4000
+#define SECONDARY_EXEC_ENABLE_PML   0x0002
 extern u32 vmx_secondary_exec_control;
 
 #define VMX_EPT_EXEC_ONLY_SUPPORTED 0x0001
@@ -226,6 +227,7 @@ extern u32 vmx_secondary_exec_control;
 #define VMX_EPT_INVEPT_INSTRUCTION  0x0010
 #define VMX_EPT_INVEPT_SINGLE_CONTEXT   0x0200
 #define VMX_EPT_INVEPT_ALL_CONTEXT  0x0400
+#define VMX_EPT_AD_BIT  0x0020
 
 #define VMX_MISC_VMWRITE_ALL0x2000
 
@@ -274,6 +276,8 @@ extern u32 vmx_secondary_exec_control;
 (vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT)
 #define cpu_has_vmx_vmcs_shadowing \
 (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VMCS_SHADOWING)
+#define cpu_has_vmx_pml \
+(vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML)
 
 #define VMCS_RID_TYPE_MASK  0x8000
 
@@ -318,6 +322,7 @@ enum vmcs_field {
 GUEST_LDTR_SELECTOR = 0x080c,
 GUEST_TR_SELECTOR   = 0x080e,
 GUEST_INTR_STATUS   = 0x0810,
+GUEST_PML_INDEX = 0x0812,
 HOST_ES_SELECTOR= 0x0c00,
 HOST_CS_SELECTOR= 0x0c02,
 HOST_SS_SELECTOR= 0x0c04,
@@ -331,6 +336,7 @@ enum vmcs_field {
 VM_EXIT_MSR_STORE_ADDR  = 0x

[Xen-devel] [v2 00/11] PML (Paging Modification Logging) support

2015-04-15 Thread Kai Huang
This v2 patch series was rebased on latest upstream code.

v1->v2:

Firstly, the sequence of this patch series was adjusted to address Andrew and
Tim's comments:

- Moved v1 patch 2 (new boot parameter for PML) to be patch 1.
- Added a new patch with the doc change for the newly added boot parameter
  controlling PML enabling, as patch 2.
- Added a new patch to introduce paging_mark_gfn_dirty, and made
  paging_mark_dirty as a wrapper of it, as patch 3.
- Merged v1 patch 1 (EPT A/D bits support) and v1 patch 10 (enabling PML in
  p2m-ept.c) to a single patch as they both operates on EPT A/D bits, with
  considering EPT A/D bits will be enabled only when PML is used as well.
  To me, looks a single patch is more straightforward.

The v2 patch series is organized as follows, with detailed changes described
under each patch.

patch 1: Add new boot parameter to control PML enabling
- adjusted the sequence of parsing function and boot parameter.

patch 2: new patch for adding description for new PML boot parameter
- inspired by "iommu=" parameter.

patch 3: add new paging_mark_gfn_dirty
- Introduced new paging_mark_gfn_dirty, which takes guest pfn as parameter,
  and made paging_mark_dirty a wrapper of paging_mark_gfn_dirty, commented
  by Tim.

patch 4: PML feature detection
- disable opt_pml_enabled if PML is not present, commented by Andrew.

patch 5 ~ 9: Add PML support in VMX
- changed vmx_*_{enable|disable}_pml, vmx_domain_flush_pml_buffer to be
  idempotent, commented by Tim. vmx_vcpu_flush_pml_buffer remains the same
  as it is also called in PML buffer full VMEXIT.
- changed vmx_{vcpu|domain}_pml_enabled to return bool_t, with taking const
  pointer (of vcpu or domain) as parameter, commented by Andrew.
- changed vmx_vcpu_flush_pml_buffer calling paging_mark_gfn_dirty instead of
  paging_mark_dirty, commented by Tim.
- changed various coding style issues and did several code refinements
  commented by Andrew.

patch 10: refine log-dirty common code to support PML
- removed PML buffer flush callback in log_dirty_domain in paging layer.
- changed to call p2m_flush_hardware_cached_dirty directly in
  hap_track_dirty_vram, and paging_log_dirty_op.

patch 11: enable EPT A/D bits and add PML support in p2m-ept.c
- Merged EPT A/D bits support with enabling PML in p2m-ept.c as it's more
  straightforward, with considering EPT A/D bits will only be enabled if PML
  is used, and both of them operates on EPT A/D bits.
- Manually set or clear A/D bits in ept_p2m_type_to_flags, and
  ept_set_middle_entry, commented by Tim.

Several sanity tests of live migration were done, and all tests worked well.

I also tested specjbb performance under global log-dirty, by using the same hack
mentioned in v1. The result is consistent with v1 (~10% improvement in global
log-dirty), and PML is beneficial in reducing hypervisor overhead in log-dirty
mode.

- global log-dirty:

        WP      PML (v1)    PML (v2)
        72862   79511       80007
        73466   81173       81614
        72989   81177       82047
        73138   81777       81975
        72811   80257       80139
        72486   80413       81127

avg     72959   80718       81151
        100%    110.63%     111.22%



Kai Huang (11):
  vmx: add new boot parameter to control PML enabling
  doc: add description for new PML boot parameter
  log-dirty: add new paging_mark_gfn_dirty
  vmx: add PML definition and feature detection.
  vmx: add new data structure member to support PML
  vmx: add help functions to support PML
  vmx: handle PML buffer full VMEXIT
  vmx: handle PML enabling in vmx_vcpu_initialise
  vmx: disable PML in vmx_vcpu_destroy
  log-dirty: refine common code to support PML
  p2m/ept: enable PML in p2m-ept for log-dirty

 docs/misc/xen-command-line.markdown |  14 +++
 xen/arch/x86/hvm/vmx/vmcs.c | 231 
 xen/arch/x86/hvm/vmx/vmx.c  |  35 ++
 xen/arch/x86/mm/hap/hap.c   |  29 -
 xen/arch/x86/mm/p2m-ept.c   |  79 ++--
 xen/arch/x86/mm/p2m.c   |  36 ++
 xen/arch/x86/mm/paging.c|  41 +--
 xen/include/asm-x86/hvm/vmx/vmcs.h  |  25 +++-
 xen/include/asm-x86/hvm/vmx/vmx.h   |   4 +-
 xen/include/asm-x86/p2m.h   |  11 ++
 xen/include/asm-x86/paging.h|   2 +
 11 files changed, 484 insertions(+), 23 deletions(-)

-- 
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [v2 05/11] vmx: add new data structure member to support PML

2015-04-15 Thread Kai Huang
A new 4K page pointer is added to arch_vmx_struct as PML buffer for vcpu. And a
new 'status' field is added to vmx_domain to indicate whether PML is enabled for
the domain or not. The 'status' field can also be used for further similar
purposes.

Note both new members don't have to be initialized to zero explicitly as both
vcpu and domain structure are zero-ed when they are created.

Signed-off-by: Kai Huang 
---
 xen/include/asm-x86/hvm/vmx/vmcs.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index f831a78..2c679ac 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -70,8 +70,12 @@ struct ept_data {
 cpumask_var_t synced_mask;
 };
 
+#define _VMX_DOMAIN_PML_ENABLED0
+#define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED)
 struct vmx_domain {
 unsigned long apic_access_mfn;
+/* VMX_DOMAIN_* */
+unsigned long status;
 };
 
 struct pi_desc {
@@ -142,6 +146,9 @@ struct arch_vmx_struct {
 /* Bitmap to control vmexit policy for Non-root VMREAD/VMWRITE */
 struct page_info *vmread_bitmap;
 struct page_info *vmwrite_bitmap;
+
+#define NR_PML_ENTRIES   512
+struct page_info *pml_pg;
 };
 
 int vmx_create_vmcs(struct vcpu *v);
-- 
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [v2 07/11] vmx: handle PML buffer full VMEXIT

2015-04-15 Thread Kai Huang
We need to flush PML buffer when it's full.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 2ac1492..279e745 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3177,6 +3177,10 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 vmx_handle_apic_write();
 break;
 
+case EXIT_REASON_PML_FULL:
+vmx_vcpu_flush_pml_buffer(v);
+break;
+
 case EXIT_REASON_ACCESS_GDTR_OR_IDTR:
 case EXIT_REASON_ACCESS_LDTR_OR_TR:
 case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
-- 
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH 00/10] PML (Paging Modification Logging) support

2015-04-07 Thread Kai Huang



On 04/07/2015 05:24 PM, Tim Deegan wrote:

Hi,

At 16:30 +0800 on 07 Apr (1428424218), Kai Huang wrote:

Hi Jan, Tim, other maintainers,

Do you have comments? Or should I send out the v2 addressing Andrew's
comments, as it's been more than a week since this patch series were
sent out?

I'm sorry, I was away last week so I haven't had a chance to review
these patches.  I'll probably be able to look at them on Thursday.

Yeah sure. Thanks Tim!

Thanks,
-Kai


Cheers,

Tim.


On 03/30/2015 01:50 PM, Kai Huang wrote:


On 03/28/2015 05:26 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

Hi all,

This patch series adds PML support to Xen. Please kindly help to
review it.

Overall this looks like a very good series, and it is particularly
helpful given the level of commenting.

Which platforms is/will PML be available for?

Hi Andrew,

Thanks for your quick review. PML will be available from Intel's
"Broadwell server" platform.

Thanks,
-Kai

~Andrew

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel




Re: [Xen-devel] [PATCH 00/10] PML (Paging Modification Logging) support

2015-04-07 Thread Kai Huang

Hi Jan, Tim, other maintainers,

Do you have comments? Or should I send out the v2 addressing Andrew's 
comments, as it's been more than a week since this patch series was 
sent out?


Thanks,
-Kai

On 03/30/2015 01:50 PM, Kai Huang wrote:



On 03/28/2015 05:26 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

Hi all,

This patch series adds PML support to Xen. Please kindly help to 
review it.

Overall this looks like a very good series, and it is particularly
helpful given the level of commenting.

Which platforms is/will PML be available for?

Hi Andrew,

Thanks for your quick review. PML will be available from Intel's 
"Broadwell server" platform.


Thanks,
-Kai


~Andrew



Re: [Xen-devel] [PATCH 02/10] VMX: New parameter to control PML enabling

2015-04-02 Thread Kai Huang
On Thu, Apr 2, 2015 at 5:58 PM, Andrew Cooper  wrote:
> On 02/04/15 06:46, Kai Huang wrote:
>>
>>
>> On 03/28/2015 04:42 AM, Andrew Cooper wrote:
>>> On 27/03/15 02:35, Kai Huang wrote:
>>>> A top level EPT parameter "ept=" and a sub boolean
>>>> "pml_enable" are
>>>> added to control PML. Other booleans can be further added for any
>>>> other EPT
>>>> related features.
>>>>
>>>> Signed-off-by: Kai Huang 
>>> Please patch docs/misc/xen-command-line.markdown as well.  See the
>>> existing "psr" option as a similar example.
>>>
>>> Also, as indicated in patch 1, I think patches 1 and 2 need swapping in
>>> the series.
>>>
>>>> ---
>>>>   xen/arch/x86/hvm/vmx/vmcs.c | 32 
>>>>   1 file changed, 32 insertions(+)
>>>>
>>>> diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
>>>> index 2f645fe..9b20a4b 100644
>>>> --- a/xen/arch/x86/hvm/vmx/vmcs.c
>>>> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
>>>> @@ -50,6 +50,16 @@ boolean_param("unrestricted_guest",
>>>> opt_unrestricted_guest_enabled);
>>>>   static bool_t __read_mostly opt_apicv_enabled = 1;
>>>>   boolean_param("apicv", opt_apicv_enabled);
>>>>   +static void parse_ept_param(char *s);
>>>> +/*
>>>> + * The 'ept' parameter controls functionalities that depend on, or
>>>> impact the
>>>> + * EPT mechanism. Optional comma separated value may contain:
>>>> + *
>>>> + *  pml Enable PML
>>>> + */
>>>> +custom_param("ept", parse_ept_param);
>>> It is common to put the custom_param() call below parse_ept_param() so
>>> you don't need to forward-declare the function.  The comment can happily
>>> live at the top of parse_ept_param().
>> Hi Andrew,
>>
>> It looks better to keep parse_ept_param() below custom_param(), as
>> simply moving parse_ept_param() above custom_param() results in the error
>> below (I also changed pml_enable to opt_pml_enabled), since it references
>> the opt_pml_enabled variable, which is defined below custom_param().
>> Actually, for the "iommu=" parameter, parse_iommu_param() was also
>> placed below custom_param().
>>
>> What do you think?
>>
>> vmcs.c: In function ‘parse_ept_param’:
>> vmcs.c:74:13: error: ‘opt_pml_enabled’ undeclared (first use in this
>> function)
>>  opt_pml_enabled = val;
>>  ^
>> vmcs.c:74:13: note: each undeclared identifier is reported only once
>> for each function it appears in
>> vmcs.c: At top level:
>> vmcs.c:81:29: error: ‘opt_pml_enabled’ defined but not used
>> [-Werror=unused-variable]
>>  static bool_t __read_mostly opt_pml_enabled = 0;
>
> The most concise way of doing this is:
>
> static bool_t __read_mostly opt_pml_enabled = 0;
>
> static void parse_ept_param(char *s)
> {
>   ...
> }
> custom_param("ept", parse_ept_param);
>
Sure. Thanks.


> ~Andrew
>



-- 
Thanks,
-Kai



Re: [Xen-devel] [PATCH 01/10] VMX: Enable EPT A/D bit support

2015-04-01 Thread Kai Huang



On 03/28/2015 04:38 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

PML requires A/D bit support so enable it for further use.

Signed-off-by: Kai Huang 
---
  xen/arch/x86/hvm/vmx/vmcs.c| 1 +
  xen/arch/x86/mm/p2m-ept.c  | 8 +++-
  xen/include/asm-x86/hvm/vmx/vmcs.h | 4 +++-
  xen/include/asm-x86/hvm/vmx/vmx.h  | 5 -
  4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index d614638..2f645fe 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -103,6 +103,7 @@ static void __init vmx_display_features(void)
  P(cpu_has_vmx_tpr_shadow, "APIC TPR shadow");
  P(cpu_has_vmx_ept, "Extended Page Tables (EPT)");
  P(cpu_has_vmx_vpid, "Virtual-Processor Identifiers (VPID)");
+P(cpu_has_vmx_ept_ad_bit, "EPT A/D bit");
  P(cpu_has_vmx_vnmi, "Virtual NMI");
  P(cpu_has_vmx_msr_bitmap, "MSR direct-access bitmap");
  P(cpu_has_vmx_unrestricted_guest, "Unrestricted Guest");
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index c2d7720..8650092 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -233,6 +233,9 @@ static int ept_split_super_page(struct p2m_domain *p2m, 
ept_entry_t *ept_entry,
  if ( !ept_set_middle_entry(p2m, &new_ept) )
  return 0;
  
+/* It's better to copy A bit of Middle entry from original entry */

+new_ept.a = ept_entry->a;

Surely d needs to be propagated as well?  Would it make sense to extend
ept_set_middle_entry() to do all of new_ept setup in one location?


+
  table = map_domain_page(new_ept.mfn);
  trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
  
@@ -244,7 +247,7 @@ static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,

  epte->sp = (level > 1);
  epte->mfn += i * trunk;
  epte->snp = (iommu_enabled && iommu_snoop);
-ASSERT(!epte->rsvd1);
+/* A/D bits are inherited from superpage */
  ASSERT(!epte->avail3);
  
  ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);

@@ -1071,6 +1074,9 @@ int ept_p2m_init(struct p2m_domain *p2m)
  /* set EPT page-walk length, now it's actual walk length - 1, i.e. 3 */
  ept->ept_wl = 3;
  
+/* Enable EPT A/D bit if it's supported by hardware */

+ept->ept_ad = cpu_has_vmx_ept_ad_bit ? 1 : 0;

This will incur overhead on all EPT operations.  It should only be
enabled if pml is going to be in use.  (I think you need reverse patches
1 and 2 in the series, and gate on pml_enable here)

Hi Andrew,

I'd like to also put patch 3 (PML feature detection) before this patch, 
as it's better to use cpu_has_vmx_pml to gate A/D bit enabling here. 
Theoretically, a simple "pml_enable = 1" here doesn't guarantee that 
cpu_has_vmx_pml is true, as PML may be turned off during 
vmx_init_vmcs_config.


And in this case I also want to delete the code below, as if PML is not 
enabled it will still print the feature even though EPT A/D bits are not 
actually enabled in hardware.


   P(cpu_has_vmx_ept_ad, "EPT A/D bit");

Thanks,
-Kai

+
  if ( !zalloc_cpumask_var(&ept->synced_mask) )
  return -ENOMEM;
  
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h

index 6fce6aa..4528346 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -62,7 +62,8 @@ struct ept_data {
  struct {
  u64 ept_mt :3,
  ept_wl :3,
-rsvd   :6,
+ept_ad :1,
+rsvd   :5,
  asr:52;

While you are making this change, can you add comments similar to
ept_entry_t describing the bits?


  };
  u64 eptp;
@@ -226,6 +227,7 @@ extern u32 vmx_secondary_exec_control;
  #define VMX_EPT_INVEPT_INSTRUCTION  0x0010
  #define VMX_EPT_INVEPT_SINGLE_CONTEXT   0x0200
  #define VMX_EPT_INVEPT_ALL_CONTEXT  0x0400
+#define VMX_EPT_AD_BIT_SUPPORT  0x0020
  
  #define VMX_MISC_VMWRITE_ALL0x2000
  
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h

index 91c5e18..9afd351 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -37,7 +37,8 @@ typedef union {
  emt :   3,  /* bits 5:3 - EPT Memory type */
  ipat:   1,  /* bit 6 - Ignore PAT memory type */
  sp  :   1,  /* bit 7 - Is this a superpage? */
-rsvd1   :   2,  /* bits 9:8 - Reserved for future use */
+a   :   1,  /* bit 8 - Access bit */
+d   :   1,  /* bit 9 - Dirty bit */
  recalc  :   1,  /* bit 10 - Software available 1 */
  snp 
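
As an aside, a tiny standalone sketch of the gating proposed here
(opt_pml_enabled and cpu_has_vmx_pml are stand-ins for the real command-line
flag and feature test; illustrative only, not the final patch):

#include <stdbool.h>
#include <stdio.h>

static bool opt_pml_enabled = true;  /* stand-in: "ept=pml" was given */
static bool cpu_has_vmx_pml = true;  /* stand-in: PML survived vmx_init_vmcs_config */

int main(void)
{
    /* Enable EPT A/D bits only when PML is actually usable. */
    bool ept_ad = opt_pml_enabled && cpu_has_vmx_pml;

    printf("ept_ad = %d\n", ept_ad);
    return 0;
}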

Re: [Xen-devel] [PATCH 02/10] VMX: New parameter to control PML enabling

2015-04-01 Thread Kai Huang



On 03/28/2015 04:42 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

A top level EPT parameter "ept=" and a sub boolean "pml_enable" are
added to control PML. Other booleans can be further added for any other EPT
related features.

Signed-off-by: Kai Huang 

Please patch docs/misc/xen-command-line.markdown as well.  See the
existing "psr" option as a similar example.

Also, as indicated in patch 1, I think patches 1 and 2 need swapping in
the series.


---
  xen/arch/x86/hvm/vmx/vmcs.c | 32 
  1 file changed, 32 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 2f645fe..9b20a4b 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -50,6 +50,16 @@ boolean_param("unrestricted_guest", 
opt_unrestricted_guest_enabled);
  static bool_t __read_mostly opt_apicv_enabled = 1;
  boolean_param("apicv", opt_apicv_enabled);
  
+static void parse_ept_param(char *s);

+/*
+ * The 'ept' parameter controls functionalities that depend on, or impact the
+ * EPT mechanism. Optional comma separated value may contain:
+ *
+ *  pml Enable PML
+ */
+custom_param("ept", parse_ept_param);

It is common to put the custom_param() call below parse_ept_param() so
you don't need to forward-declare the function.  The comment can happily
live at the top of parse_ept_param().

Hi Andrew,

It looks better to keep parse_ept_param() below custom_param(), as 
simply moving parse_ept_param() above custom_param() results in the error 
below (I also changed pml_enable to opt_pml_enabled), since it references 
the opt_pml_enabled variable, which is defined below custom_param(). 
Actually, for the "iommu=" parameter, parse_iommu_param() was also 
placed below custom_param().


What do you think?

vmcs.c: In function ‘parse_ept_param’:
vmcs.c:74:13: error: ‘opt_pml_enabled’ undeclared (first use in this 
function)

 opt_pml_enabled = val;
 ^
vmcs.c:74:13: note: each undeclared identifier is reported only once for 
each function it appears in

vmcs.c: At top level:
vmcs.c:81:29: error: ‘opt_pml_enabled’ defined but not used 
[-Werror=unused-variable]

 static bool_t __read_mostly opt_pml_enabled = 0;

Thanks,
-Kai




+static bool_t __read_mostly pml_enable = 0;
+
  /*
   * These two parameters are used to config the controls for Pause-Loop 
Exiting:
   * ple_gap:upper bound on the amount of time between two successive
@@ -92,6 +102,28 @@ DEFINE_PER_CPU(bool_t, vmxon);
  static u32 vmcs_revision_id __read_mostly;
  u64 __read_mostly vmx_basic_msr;
  
+/* Copied from parse_iommu_param */

Not a useful comment, as it is likely to diverge in the future.


+static void parse_ept_param(char *s)

__init

~Andrew


+{
+char *ss;
+int val;
+
+do {
+val = !!strncmp(s, "no-", 3);
+if ( !val )
+s += 3;
+
+ss = strchr(s, ',');
+if ( ss )
+*ss = '\0';
+
+if ( !strcmp(s, "pml") )
+pml_enable = val;
+
+s = ss + 1;
+} while ( ss );
+}
+
  static void __init vmx_display_features(void)
  {
  int printed = 0;
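
Given the parser quoted above, the corresponding Xen command-line usage would
presumably be:

    ept=pml       (enable PML)
    ept=no-pml    (explicitly disable PML; disabled is also the default)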




Re: [Xen-devel] [PATCH 05/10] VMX: add help functions to support PML

2015-03-30 Thread Kai Huang
On Mon, Mar 30, 2015 at 5:54 PM, Andrew Cooper
 wrote:
> On 30/03/15 07:43, Kai Huang wrote:
>>
>>
>> On 03/28/2015 05:09 AM, Andrew Cooper wrote:
>>> On 27/03/15 02:35, Kai Huang wrote:
>>>
>>>> +}
>>>> +
>>>> +int vmx_vcpu_enable_pml(struct vcpu *v)
>>>> +{
>>>> +struct domain *d = v->domain;
>>>> +
>>>> +ASSERT(!vmx_vcpu_pml_enabled(v));
>>>> +
>>>> +v->arch.hvm_vmx.pml_pg = d->arch.paging.alloc_page(d);
>>>> +if ( !v->arch.hvm_vmx.pml_pg )
>>>> +return -ENOMEM;
>>>> +
>>>> +vmx_vmcs_enter(v);
>>>> +
>>>> +__vmwrite(PML_ADDRESS, page_to_mfn(v->arch.hvm_vmx.pml_pg) <<
>>>> PAGE_SHIFT);
>>>> +__vmwrite(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
>>>> +
>>>> +v->arch.hvm_vmx.secondary_exec_control |=
>>>> SECONDARY_EXEC_ENABLE_PML;
>>>> +
>>>> +__vmwrite(SECONDARY_VM_EXEC_CONTROL,
>>>> +v->arch.hvm_vmx.secondary_exec_control);
>>> Alignment.
>> Do you mean to put 'v->arch.hvm_vmx.secondary_exec_control' on the
>> same line as '__vmwrite(SECONDARY_VM_EXEC_CONTROL,'? In that case
>> the line would be 81 characters long.
>
> Splitting the line is fine.  The v should be vertically in line with S
> from SECONDARY

Oh I got your point. Thanks. Will do.

>
>>
>>>
>>>> +unsigned long gfn;
>>>> +mfn_t mfn;
>>>> +p2m_type_t t;
>>>> +p2m_access_t a;
>>>> +
>>>> +gfn = pml_buf[pml_idx] >> PAGE_SHIFT;
>>>> +mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL);
>>>> +if ( mfn_x(mfn) == INVALID_MFN )
>>>> +{
>>>> +/*
>>>> + * Either EPT table entry for mapping the GFN has been
>>>> destroyed, or
>>>> + * there's something wrong with hardware behavior, in
>>>> both cases we
>>>> + * should report a warning.
>>>> + */
>>>> +dprintk(XENLOG_WARNING, "PML: vcpu %d: invalid GPA
>>>> 0x%lx logged\n",
>>>> +v->vcpu_id, pml_buf[pml_idx]);
>>> It would be shorter to log gfn rather than gpa.
>> Will do. And I'd also like to add the domain ID in the warning info.
>
> Ah of course - dprintk() doesn't identify current().  Use %pv with v.
>
>>
>>>
>>>> +{
>>>> +return (d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED)
>>>> ? 1 : 0;
>>>> +}
>>>> +
>>>> +/*
>>>> + * This function enables PML for particular domain. It should be
>>>> called when
>>>> + * domain is paused.
>>> In which case assert that the domain is paused, or call domain_pause()
>>> yourself to take an extra pause refcount.
>> Which function should I use to assert the domain is paused? I didn't find
>> a function like "domain_paused". Is below good enough?
>>
>> ASSERT(atomic_read(&d->pause_count));
>
> Hmm - we indeed don't have an appropriate helper.  That ASSERT() will do
> for now.
>
> ~Andrew
>



-- 
Thanks,
-Kai
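
For reference, the %pv rework suggested above would presumably turn the
warning into something like (illustrative, not the final patch):

    dprintk(XENLOG_WARNING, "PML: %pv: invalid GPA 0x%lx logged\n",
            v, pml_buf[pml_idx]);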



Re: [Xen-devel] [PATCH 01/10] VMX: Enable EPT A/D bit support

2015-03-30 Thread Kai Huang
On Mon, Mar 30, 2015 at 5:36 PM, Andrew Cooper
 wrote:
> On 30/03/15 07:11, Kai Huang wrote:
>>
>>
>> On 03/28/2015 04:38 AM, Andrew Cooper wrote:
>>> On 27/03/15 02:35, Kai Huang wrote:
>>>> PML requires A/D bit support so enable it for further use.
>>>>
>>>> Signed-off-by: Kai Huang 
>>>> ---
>>>>   xen/arch/x86/hvm/vmx/vmcs.c| 1 +
>>>>   xen/arch/x86/mm/p2m-ept.c  | 8 +++-
>>>>   xen/include/asm-x86/hvm/vmx/vmcs.h | 4 +++-
>>>>   xen/include/asm-x86/hvm/vmx/vmx.h  | 5 -
>>>>   4 files changed, 15 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
>>>> index d614638..2f645fe 100644
>>>> --- a/xen/arch/x86/hvm/vmx/vmcs.c
>>>> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
>>>> @@ -103,6 +103,7 @@ static void __init vmx_display_features(void)
>>>>   P(cpu_has_vmx_tpr_shadow, "APIC TPR shadow");
>>>>   P(cpu_has_vmx_ept, "Extended Page Tables (EPT)");
>>>>   P(cpu_has_vmx_vpid, "Virtual-Processor Identifiers (VPID)");
>>>> +P(cpu_has_vmx_ept_ad_bit, "EPT A/D bit");
>>>>   P(cpu_has_vmx_vnmi, "Virtual NMI");
>>>>   P(cpu_has_vmx_msr_bitmap, "MSR direct-access bitmap");
>>>>   P(cpu_has_vmx_unrestricted_guest, "Unrestricted Guest");
>>>> diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
>>>> index c2d7720..8650092 100644
>>>> --- a/xen/arch/x86/mm/p2m-ept.c
>>>> +++ b/xen/arch/x86/mm/p2m-ept.c
>>>> @@ -233,6 +233,9 @@ static int ept_split_super_page(struct
>>>> p2m_domain *p2m, ept_entry_t *ept_entry,
>>>>   if ( !ept_set_middle_entry(p2m, &new_ept) )
>>>>   return 0;
>>>>   +/* It's better to copy A bit of Middle entry from original
>>>> entry */
>>>> +new_ept.a = ept_entry->a;
>>> Surely d needs to be propagated as well?
>> No, it's not necessary. The D bit is not defined for middle-level EPT
>> entries; only leaf entries have a D bit.
>
> Ok - so the middle doesn't have a D.
>
> What about the superpage having D set? Surely that needs propagated down
> to the new shattered leaves?

Yes, shattered leaves will inherit both A and D bits from the original
superpage entry via the " *epte = *ept_entry; " statement in the code
below from ept_split_super_page.

for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
{
ept_entry_t *epte = table + i;

*epte = *ept_entry;
..

>
>>> Would it make sense to extend
>>> ept_set_middle_entry() to do all of new_ept setup in one location?
>> Yes, it certainly makes sense to move A-bit propagation into
>> ept_set_middle_entry, but this also requires passing a pointer to the
>> original EPT entry to ept_set_middle_entry as an additional parameter. And
>> ept_set_middle_entry is also called by ept_next_level, therefore
>> changing it requires a wider code change, something like below. While I
>> am fine with both, which solution do you prefer?
>>
>> +++ b/xen/arch/x86/mm/p2m-ept.c
>> @@ -208,7 +208,8 @@ static void ept_p2m_type_to_flags(struct
>> p2m_domain *p2m, ept_entry_t *entry,
>>  #define GUEST_TABLE_POD_PAGE3
>>
>>  /* Fill in middle levels of ept table */
>> -static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t
>> *ept_entry)
>> +static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t
>> *new_entry,
>> +ept_entry_t *ori_entry)
>
> const ept_entry_t *old_entry (for consistency with other similar
> functions, or even just 'new' and 'old' as you are already changing the
> names)
>
> This looks fine.  Being a static function with only two callsites, it is
> very likely to be inlined by the compiler.

Sure I will  do in this way.

>
> ~Andrew
>



-- 
Thanks,
-Kai



Re: [Xen-devel] [PATCH 07/10] VMX: handle PML enabling in vmx_vcpu_initialise

2015-03-30 Thread Kai Huang



On 03/28/2015 05:12 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

It's possible the domain is already in log-dirty mode when a vcpu is created, in
which case we should enable PML for this vcpu if PML has been enabled for the
domain.

Signed-off-by: Kai Huang 
---
  xen/arch/x86/hvm/vmx/vmx.c | 24 
  1 file changed, 24 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 453bcc5..fce3aa2 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -116,6 +116,30 @@ static int vmx_vcpu_initialise(struct vcpu *v)
  return rc;
  }
  
+/*

+ * It's rare but still possible that domain has already been in log-dirty
+ * mode when vcpu is being created (commented by Tim), in which case we
+ * should enable PML for this vcpu if PML has been enabled for the domain,
+ * and failure to enable results in failure of creating this vcpu.
+ *
+ * Note even there's no vcpu created for the domain, vmx_domain_enable_pml
+ * will return successful in which case vmx_domain_pml_enabled will also
+ * return true. And even this is the first vcpu to be created with
+ * vmx_domain_pml_enabled being true, failure of enabling PML still results
+ * in failure of creating vcpu, to avoid complicated logic to revert PML
+ * style EPT table to non-PML style EPT table.
+ */
+if ( vmx_domain_pml_enabled(v->domain) )
+{
+if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )

Given the comment here, is the assertion in the top of
vmx_vcpu_enable_pml() liable to trip?
Do you mean the assertion below, at the beginning of vmx_vcpu_enable_pml,
might not work here?


ASSERT(!vmx_vcpu_pml_enabled(v));

To me it asserts for this particular vcpu, not the domain, so even in 
this case the assertion is reasonable and should work fine, shouldn't it?





+{
+dprintk(XENLOG_ERR, "Failed to enable PML for vcpu %d\n",
+v->vcpu_id);

Please use %pv to identify the domain as well as vcpu.

Sure.

Thanks,
-Kai


~Andrew


+vmx_destroy_vmcs(v);
+return rc;
+}
+}
+
  vpmu_initialise(v);
  
  vmx_install_vlapic_mapping(v);




Re: [Xen-devel] [PATCH 05/10] VMX: add help functions to support PML

2015-03-29 Thread Kai Huang



On 03/28/2015 05:09 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

This patch adds help functions to enable/disable PML, and flush PML buffer for
single vcpu and particular domain for further use.

Signed-off-by: Kai Huang 
---
  xen/arch/x86/hvm/vmx/vmcs.c| 190 +
  xen/include/asm-x86/hvm/vmx/vmcs.h |   9 ++
  2 files changed, 199 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 2798b0b..17cbef4 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1326,6 +1326,196 @@ void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 
vector)
  &v->arch.hvm_vmx.eoi_exitmap_changed);
  }
  
+int vmx_vcpu_pml_enabled(struct vcpu *v)

bool_t vmx_vcpu_pml_enabled(const struct vcpu *v)

Will do.




+{
+return (v->arch.hvm_vmx.secondary_exec_control &
+SECONDARY_EXEC_ENABLE_PML) ? 1 : 0;

This would be slightly shorter as
!!(v->arch.hvm_vmx.secondary_exec_control & SECONDARY_EXEC_ENABLE_PML)

Will do.




+}
+
+int vmx_vcpu_enable_pml(struct vcpu *v)
+{
+struct domain *d = v->domain;
+
+ASSERT(!vmx_vcpu_pml_enabled(v));
+
+v->arch.hvm_vmx.pml_pg = d->arch.paging.alloc_page(d);
+if ( !v->arch.hvm_vmx.pml_pg )
+return -ENOMEM;
+
+vmx_vmcs_enter(v);
+
+__vmwrite(PML_ADDRESS, page_to_mfn(v->arch.hvm_vmx.pml_pg) << PAGE_SHIFT);
+__vmwrite(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+
+v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_PML;
+
+__vmwrite(SECONDARY_VM_EXEC_CONTROL,
+v->arch.hvm_vmx.secondary_exec_control);

Alignment.
Do you mean to put 'v->arch.hvm_vmx.secondary_exec_control' on the same 
line as '__vmwrite(SECONDARY_VM_EXEC_CONTROL,'? In that case the 
line would be 81 characters long.





+
+vmx_vmcs_exit(v);
+
+return 0;
+}
+
+void vmx_vcpu_disable_pml(struct vcpu *v)
+{
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+__vmwrite(SECONDARY_VM_EXEC_CONTROL,
+v->arch.hvm_vmx.secondary_exec_control);
+
+vmx_vmcs_exit(v);
+
+v->domain->arch.paging.free_page(v->domain, v->arch.hvm_vmx.pml_pg);
+v->arch.hvm_vmx.pml_pg = NULL;
+}
+
+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+uint64_t *pml_buf;
+unsigned long pml_idx;
+
+ASSERT(vmx_vcpu_pml_enabled(v));
+
+vmx_vmcs_enter(v);
+
+__vmread(GUEST_PML_INDEX, &pml_idx);
+
+/* Do nothing if PML buffer is empty */
+if ( pml_idx == (PML_ENTITY_NUM - 1) )
+goto out;
+
+pml_buf = map_domain_page(page_to_mfn(v->arch.hvm_vmx.pml_pg));

__map_domain_page() is a wrapper which takes a struct page_info

Will do.




+
+/*
+ * PML index can be either 2^16-1 (buffer is full), or 0~511 (buffer is not
+ * full), and in latter case PML index always points to next available
+ * entity.
+ */
+if (pml_idx >= PML_ENTITY_NUM)
+pml_idx = 0;
+else
+pml_idx++;
+
+for ( ; pml_idx < PML_ENTITY_NUM; pml_idx++ )
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);

This p2m_get_host_p2m() call should be hoisted out of the loop.

Will do.




+unsigned long gfn;
+mfn_t mfn;
+p2m_type_t t;
+p2m_access_t a;
+
+gfn = pml_buf[pml_idx] >> PAGE_SHIFT;
+mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL);
+if ( mfn_x(mfn) == INVALID_MFN )
+{
+/*
+ * Either EPT table entry for mapping the GFN has been destroyed, 
or
+ * there's something wrong with hardware behavior, in both cases we
+ * should report a warning.
+ */
+dprintk(XENLOG_WARNING, "PML: vcpu %d: invalid GPA 0x%lx logged\n",
+v->vcpu_id, pml_buf[pml_idx]);

It would be shorter to log gfn rather than gpa.

Will do. And I'd also like to add the domain ID in the warning info.




+continue;
+}
+
+/*
+ * Need to change type from log-dirty to normal memory for logged GFN.
+ * hap_track_dirty_vram depends on it to work. And we really only need
+ * to mark GFNs which have been successfully changed from log-dirty to
+ * normal memory to be dirty.
+ */
+if ( !p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty,
+p2m_ram_rw) )
+paging_mark_dirty(v->domain, mfn_x(mfn));
+}
+
+unmap_domain_page(pml_buf);
+
+/* Reset PML index */
+__vmwrite(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+
+out:
+vmx_vmcs_exit(v);
+}
+
+int vmx_domain_pml_enabled(struct domain *d)

bool_t and const as per vcpu variant.

Will do.




+{
+return (d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED) ? 1 : 0;
+}

Re: [Xen-devel] [PATCH 04/10] VMX: New data structure member to support PML

2015-03-29 Thread Kai Huang



On 03/28/2015 04:48 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

A new 4K page pointer is added to arch_vmx_struct as the PML buffer for the vcpu,
and a new 'status' field is added to vmx_domain to indicate whether PML is enabled
for the domain or not. The 'status' field can also be used for further similar
purposes.

Note neither new member has to be initialized to zero explicitly, as both the
vcpu and domain structures are zeroed when they are created.

Signed-off-by: Kai Huang 
---
  xen/include/asm-x86/hvm/vmx/vmcs.h | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 47b4df2..8cc1122 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -71,8 +71,12 @@ struct ept_data {
  cpumask_var_t synced_mask;
  };
  
+#define _VMX_DOMAIN_PML_ENABLED0

+#define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED)
  struct vmx_domain {
  unsigned long apic_access_mfn;
+/* VMX_DOMAIN_* */
+unsigned long status;
  };
  
  struct pi_desc {

@@ -143,6 +147,9 @@ struct arch_vmx_struct {
  /* Bitmap to control vmexit policy for Non-root VMREAD/VMWRITE */
  struct page_info *vmread_bitmap;
  struct page_info *vmwrite_bitmap;
+
+#define PML_ENTITY_NUM  512

This is the number of pml entries, not entities.  NR_PML_ENTRIES perhaps?

Yours is better indeed. Will do.




+struct page_info   *pml_pg;

Please align the fields vertically like vmwrite_bitmap.

Sure.

Thanks,
-Kai


~Andrew


  };
  
  int vmx_create_vmcs(struct vcpu *v);




Re: [Xen-devel] [PATCH 03/10] VMX: Add PML definition and feature detection.

2015-03-29 Thread Kai Huang



On 03/28/2015 04:46 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

The patch adds the PML definitions and feature detection. Note PML won't be
detected if PML is disabled via the boot parameter. PML is also disabled in
construct_vmcs, as it will only be enabled when the domain is switched to
log-dirty mode.

Signed-off-by: Kai Huang 
---
  xen/arch/x86/hvm/vmx/vmcs.c| 18 ++
  xen/include/asm-x86/hvm/vmx/vmcs.h |  5 +
  xen/include/asm-x86/hvm/vmx/vmx.h  |  1 +
  3 files changed, 24 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 9b20a4b..2798b0b 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -143,6 +143,7 @@ static void __init vmx_display_features(void)
  P(cpu_has_vmx_virtual_intr_delivery, "Virtual Interrupt Delivery");
  P(cpu_has_vmx_posted_intr_processing, "Posted Interrupt Processing");
  P(cpu_has_vmx_vmcs_shadowing, "VMCS shadowing");
+P(cpu_has_vmx_pml, "Page Modification Logging");
  #undef P
  
  if ( !printed )

@@ -240,6 +241,8 @@ static int vmx_init_vmcs_config(void)
  opt |= SECONDARY_EXEC_ENABLE_VPID;
  if ( opt_unrestricted_guest_enabled )
  opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
+if ( pml_enable )

This should be named opt_pml_enable in patch 1 or 2 to identify that it
is a command line option.

Sure.




+opt |= SECONDARY_EXEC_ENABLE_PML;
  
  /*

   * "APIC Register Virtualization" and "Virtual Interrupt Delivery"
@@ -286,6 +289,14 @@ static int vmx_init_vmcs_config(void)
   */
  if ( !(_vmx_ept_vpid_cap & VMX_VPID_INVVPID_ALL_CONTEXT) )
  _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+
+   /*
+* PML cannot be supported if EPT A/D bits is not supported. Actually,
+* PML should not be detected if EPT A/D bits is not supported, but for
+* sure we do it anyway.
+*/
+   if ( !(_vmx_ept_vpid_cap & VMX_EPT_AD_BIT_SUPPORT) )
+   _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
  }
  
  if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )

@@ -306,6 +317,10 @@ static int vmx_init_vmcs_config(void)
SECONDARY_EXEC_UNRESTRICTED_GUEST);
  }
  
+/* PML cannot be supported if we don't use EPT */

+if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) )
+_vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+

Somewhere in here you should clear pml_enable if hardware doesn't
support it.

Will do. Thanks for catching.

Thanks,
-Kai


~Andrew


  if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
ple_gap == 0 )
  {
@@ -1041,6 +1056,9 @@ static int construct_vmcs(struct vcpu *v)
  __vmwrite(POSTED_INTR_NOTIFICATION_VECTOR, posted_intr_vector);
  }
  
+/* Disable PML anyway here as it will only be enabled in log dirty mode */

+v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
  /* Host data selectors. */
  __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
  __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 4528346..47b4df2 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -216,6 +216,7 @@ extern u32 vmx_vmentry_control;
  #define SECONDARY_EXEC_ENABLE_INVPCID   0x1000
  #define SECONDARY_EXEC_ENABLE_VMFUNC0x2000
  #define SECONDARY_EXEC_ENABLE_VMCS_SHADOWING0x4000
+#define SECONDARY_EXEC_ENABLE_PML   0x0002
  extern u32 vmx_secondary_exec_control;
  
  #define VMX_EPT_EXEC_ONLY_SUPPORTED 0x0001

@@ -276,6 +277,8 @@ extern u32 vmx_secondary_exec_control;
  (vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT)
  #define cpu_has_vmx_vmcs_shadowing \
  (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VMCS_SHADOWING)
+#define cpu_has_vmx_pml \
+(vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML)
  
  #define VMCS_RID_TYPE_MASK  0x8000
  
@@ -320,6 +323,7 @@ enum vmcs_field {

  GUEST_LDTR_SELECTOR = 0x080c,
  GUEST_TR_SELECTOR   = 0x080e,
  GUEST_INTR_STATUS   = 0x0810,
+GUEST_PML_INDEX = 0x0812,
  HOST_ES_SELECTOR= 0x0c00,
  HOST_CS_SELECTOR= 0x0c02,
  HOST_SS_SELECTOR= 0x0c04,
@@ -333,6 +337,7 @@ enum vmcs_field {
  VM_EXIT_MSR_STORE_ADDR  = 0x2006,
  VM_EXIT_MSR_LOAD_ADDR   = 0x2008,
  VM_ENTRY_MSR_LOAD_ADDR  = 0x200a,
+PML_ADDRESS = 0x200e,

Re: [Xen-devel] [PATCH 02/10] VMX: New parameter to control PML enabling

2015-03-29 Thread Kai Huang



On 03/28/2015 04:42 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

A top level EPT parameter "ept=" and a sub boolean "pml_enable" are
added to control PML. Other booleans can be further added for any other EPT
related features.

Signed-off-by: Kai Huang 

Please patch docs/misc/xen-command-line.markdown as well.  See the
existing "psr" option as a similar example.

Will do. Thanks for pointing out.



Also, as indicated in patch 1, I think patches 1 and 2 need swapping in
the series.

Sure.




---
  xen/arch/x86/hvm/vmx/vmcs.c | 32 
  1 file changed, 32 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 2f645fe..9b20a4b 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -50,6 +50,16 @@ boolean_param("unrestricted_guest", 
opt_unrestricted_guest_enabled);
  static bool_t __read_mostly opt_apicv_enabled = 1;
  boolean_param("apicv", opt_apicv_enabled);
  
+static void parse_ept_param(char *s);

+/*
+ * The 'ept' parameter controls functionalities that depend on, or impact the
+ * EPT mechanism. Optional comma separated value may contain:
+ *
+ *  pml Enable PML
+ */
+custom_param("ept", parse_ept_param);

It is common to put the custom_param() call below parse_ept_param() so
you don't need to forward-declare the function.  The comment can happily
live at the top of parse_ept_param().

Will do.




+static bool_t __read_mostly pml_enable = 0;
+
  /*
   * These two parameters are used to config the controls for Pause-Loop 
Exiting:
   * ple_gap:upper bound on the amount of time between two successive
@@ -92,6 +102,28 @@ DEFINE_PER_CPU(bool_t, vmxon);
  static u32 vmcs_revision_id __read_mostly;
  u64 __read_mostly vmx_basic_msr;
  
+/* Copied from parse_iommu_param */

Not a useful comment, as it is likely to diverge in the future.
I will move comments for 'custom_param("ept", parse_ept_param)' here, as 
you suggested above, and this useless comment can be eliminated.





+static void parse_ept_param(char *s)

__init

Will do.

Thanks,
-Kai


~Andrew


+{
+char *ss;
+int val;
+
+do {
+val = !!strncmp(s, "no-", 3);
+if ( !val )
+s += 3;
+
+ss = strchr(s, ',');
+if ( ss )
+*ss = '\0';
+
+if ( !strcmp(s, "pml") )
+pml_enable = val;
+
+s = ss + 1;
+} while ( ss );
+}
+
  static void __init vmx_display_features(void)
  {
  int printed = 0;




Re: [Xen-devel] [PATCH 01/10] VMX: Enable EPT A/D bit support

2015-03-29 Thread Kai Huang



On 03/28/2015 04:38 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

PML requires A/D bit support so enable it for further use.

Signed-off-by: Kai Huang 
---
  xen/arch/x86/hvm/vmx/vmcs.c| 1 +
  xen/arch/x86/mm/p2m-ept.c  | 8 +++-
  xen/include/asm-x86/hvm/vmx/vmcs.h | 4 +++-
  xen/include/asm-x86/hvm/vmx/vmx.h  | 5 -
  4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index d614638..2f645fe 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -103,6 +103,7 @@ static void __init vmx_display_features(void)
  P(cpu_has_vmx_tpr_shadow, "APIC TPR shadow");
  P(cpu_has_vmx_ept, "Extended Page Tables (EPT)");
  P(cpu_has_vmx_vpid, "Virtual-Processor Identifiers (VPID)");
+P(cpu_has_vmx_ept_ad_bit, "EPT A/D bit");
  P(cpu_has_vmx_vnmi, "Virtual NMI");
  P(cpu_has_vmx_msr_bitmap, "MSR direct-access bitmap");
  P(cpu_has_vmx_unrestricted_guest, "Unrestricted Guest");
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index c2d7720..8650092 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -233,6 +233,9 @@ static int ept_split_super_page(struct p2m_domain *p2m, 
ept_entry_t *ept_entry,
  if ( !ept_set_middle_entry(p2m, &new_ept) )
  return 0;
  
+/* It's better to copy A bit of Middle entry from original entry */

+new_ept.a = ept_entry->a;

Surely d needs to be propagated as well?
No, it's not necessary. The D bit is not defined for middle-level EPT 
entries; only leaf entries have a D bit.

Would it make sense to extend
ept_set_middle_entry() to do all of new_ept setup in one location?
Yes, it certainly makes sense to move A-bit propagation into 
ept_set_middle_entry, but this also requires passing a pointer to the 
original EPT entry to ept_set_middle_entry as an additional parameter. And 
ept_set_middle_entry is also called by ept_next_level, therefore 
changing it requires a wider code change, something like below. While I am 
fine with both, which solution do you prefer?


+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -208,7 +208,8 @@ static void ept_p2m_type_to_flags(struct p2m_domain 
*p2m, ept_entry_t *entry,

 #define GUEST_TABLE_POD_PAGE3

 /* Fill in middle levels of ept table */
-static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t 
*ept_entry)
+static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t 
*new_entry,

+ept_entry_t *ori_entry)
 {
 struct page_info *pg;

@@ -216,11 +217,13 @@ static int ept_set_middle_entry(struct p2m_domain 
*p2m, ept_entry_t *ept_entry)

 if ( pg == NULL )
 return 0;

-ept_entry->epte = 0;
-ept_entry->mfn = page_to_mfn(pg);
-ept_entry->access = p2m->default_access;
+new_entry->epte = 0;
+new_entry->mfn = page_to_mfn(pg);
+new_entry->access = p2m->default_access;

-ept_entry->r = ept_entry->w = ept_entry->x = 1;
+new_entry->r = new_entry->w = new_entry->x = 1;
+
+new_entry->a = ori_entry->a;

 return 1;
 }
@@ -257,7 +260,7 @@ static int ept_split_super_page(struct p2m_domain 
*p2m, ept_entry_t *ept_entry,


 ASSERT(is_epte_superpage(ept_entry));

-if ( !ept_set_middle_entry(p2m, &new_ept) )
+if ( !ept_set_middle_entry(p2m, &new_ept, ept_entry) )
 return 0;

 table = map_domain_page(new_ept.mfn);
@@ -337,7 +340,7 @@ static int ept_next_level(struct p2m_domain *p2m, 
bool_t read_only,

 if ( read_only )
 return GUEST_TABLE_MAP_FAILED;

-if ( !ept_set_middle_entry(p2m, ept_entry) )
+if ( !ept_set_middle_entry(p2m, ept_entry, &e) )
 return GUEST_TABLE_MAP_FAILED;
 else
 e = atomic_read_ept_entry(ept_entry); /* Refresh */



+
  table = map_domain_page(new_ept.mfn);
  trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
  
@@ -244,7 +247,7 @@ static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,

  epte->sp = (level > 1);
  epte->mfn += i * trunk;
  epte->snp = (iommu_enabled && iommu_snoop);
-ASSERT(!epte->rsvd1);
+/* A/D bits are inherited from superpage */
  ASSERT(!epte->avail3);
  
  ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);

@@ -1071,6 +1074,9 @@ int ept_p2m_init(struct p2m_domain *p2m)
  /* set EPT page-walk length, now it's actual walk length - 1, i.e. 3 */
  ept->ept_wl = 3;
  
+/* Enable EPT A/D bit if it's supported by hardware */

+ept->ept_ad = cpu_has_vmx_ept_ad_bit ? 1 : 0;

This will incur overhead on all EPT operations.  It should only be
enabled if pml is going to be in use.  (I think you need reverse patches
1 and 2 in the series, and gate on pml_enable here)


Re: [Xen-devel] [PATCH 00/10] PML (Paging Modification Logging) support

2015-03-29 Thread Kai Huang



On 03/28/2015 05:26 AM, Andrew Cooper wrote:

On 27/03/15 02:35, Kai Huang wrote:

Hi all,

This patch series adds PML support to Xen. Please kindly help to review it.

Overall this looks like a very good series, and it is particularly
helpful given the level of commenting.

Which platforms is/will PML be available for?

Hi Andrew,

Thanks for your quick review. PML will be available from Intel's 
"Broadwell server" platform.


Thanks,
-Kai


~Andrew



[Xen-devel] [PATCH 09/10] log-dirty: Refine common code to support PML

2015-03-26 Thread Kai Huang
This patch adds several new callbacks at the paging/hap/p2m layers to support PML.

At the paging layer, a new callback is added to log_dirty_domain to flush
hardware-cached dirty pages into the log-dirty radix tree: with PML, dirty GPAs
may still be sitting in vcpus' PML buffers when userspace peeks at or clears
dirty pages, so we need to flush them before reporting dirty pages to userspace.

At the p2m layer, three new callbacks are added to p2m_domain to enable/disable
PML and to flush PML buffers. The PML enable/disable callbacks are called when
switching to log-dirty mode and when switching back to normal mode,
respectively. The flush callback is called from the paging layer when the PML
buffers are flushed manually.
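
As an aside, a minimal standalone sketch of the layering just described, with
the real work stubbed out (the struct layouts are illustrative, not the actual
Xen ones):

#include <stdio.h>

struct p2m_domain {
    void (*enable_hardware_log_dirty)(struct p2m_domain *p2m);
    void (*disable_hardware_log_dirty)(struct p2m_domain *p2m);
    void (*flush_hardware_cached_dirty)(struct p2m_domain *p2m);
};

/* vmx backend: would walk each vcpu's PML buffer and mark pages dirty. */
static void vmx_flush_pml_buffers(struct p2m_domain *p2m)
{
    (void)p2m;
    puts("flush vcpu PML buffers into the log-dirty radix tree");
}

/* paging layer: flush hardware-cached dirty pages; a no-op without PML. */
static void paging_flush_cached_dirty(struct p2m_domain *p2m)
{
    if (p2m->flush_hardware_cached_dirty)
        p2m->flush_hardware_cached_dirty(p2m);
}

int main(void)
{
    struct p2m_domain p2m = {
        .flush_hardware_cached_dirty = vmx_flush_pml_buffers,
    };

    /* Userspace peek/clear path: flush before reporting dirty pages. */
    paging_flush_cached_dirty(&p2m);
    return 0;
}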

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/hap/hap.c   | 16 +++-
 xen/arch/x86/mm/p2m.c   | 36 
 xen/arch/x86/mm/paging.c| 15 ++-
 xen/arch/x86/mm/shadow/common.c |  2 +-
 xen/include/asm-x86/domain.h|  1 +
 xen/include/asm-x86/p2m.h   | 11 +++
 xen/include/asm-x86/paging.h|  3 ++-
 7 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 4ecb2e2..25f2f58 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -135,6 +135,10 @@ int hap_track_dirty_vram(struct domain *d,
 
 domain_pause(d);
 
+/* flush dirty GFNs potentially cached by hardware */
+if ( d->arch.paging.log_dirty.flush_cached_dirty )
+d->arch.paging.log_dirty.flush_cached_dirty(d);
+
 /* get the bitmap */
 paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap);
 
@@ -190,6 +194,8 @@ static int hap_enable_log_dirty(struct domain *d, bool_t 
log_global)
 d->arch.paging.mode |= PG_log_dirty;
 paging_unlock(d);
 
+p2m_enable_hardware_log_dirty(d);
+
 if ( log_global )
 {
 /* set l1e entries of P2M table to be read-only. */
@@ -205,6 +211,8 @@ static int hap_disable_log_dirty(struct domain *d)
 d->arch.paging.mode &= ~PG_log_dirty;
 paging_unlock(d);
 
+p2m_disable_hardware_log_dirty(d);
+
 /* set l1e entries of P2M table with normal mode */
 p2m_change_entry_type_global(d, p2m_ram_logdirty, p2m_ram_rw);
 return 0;
@@ -217,6 +225,11 @@ static void hap_clean_dirty_bitmap(struct domain *d)
 flush_tlb_mask(d->domain_dirty_cpumask);
 }
 
+static void hap_flush_cached_dirty(struct domain *d)
+{
+p2m_flush_hardware_cached_dirty(d);
+}
+
 //
 /* HAP SUPPORT FUNCTIONS*/
 //
@@ -431,7 +444,8 @@ void hap_domain_init(struct domain *d)
 /* Use HAP logdirty mechanism. */
 paging_log_dirty_init(d, hap_enable_log_dirty,
   hap_disable_log_dirty,
-  hap_clean_dirty_bitmap);
+  hap_clean_dirty_bitmap,
+  hap_flush_cached_dirty);
 }
 
 /* return 0 for success, -errno for failure */
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 6a06e9f..291a275 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -239,6 +239,42 @@ void p2m_memory_type_changed(struct domain *d)
 }
 }
 
+void p2m_enable_hardware_log_dirty(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+if ( p2m->enable_hardware_log_dirty )
+{
+p2m_lock(p2m);
+p2m->enable_hardware_log_dirty(p2m);
+p2m_unlock(p2m);
+}
+}
+
+void p2m_disable_hardware_log_dirty(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+if ( p2m->disable_hardware_log_dirty )
+{
+p2m_lock(p2m);
+p2m->disable_hardware_log_dirty(p2m);
+p2m_unlock(p2m);
+}
+}
+
+void p2m_flush_hardware_cached_dirty(struct domain *d)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+if ( p2m->flush_hardware_cached_dirty )
+{
+p2m_lock(p2m);
+p2m->flush_hardware_cached_dirty(p2m);
+p2m_unlock(p2m);
+}
+}
+
 mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
 p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
 unsigned int *page_order, bool_t locked)
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index b54d76a..c2d336a 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -411,7 +411,18 @@ static int paging_log_dirty_op(struct domain *d,
 int i4, i3, i2;
 
 if ( !resuming )
+{
 domain_pause(d);
+
+/*
+ * Only need to flush when not resuming, as domain was paused in
+ * resuming case therefore it's not possible to have any new dirty
+ * page.
+ */
+if ( d->arch.paging.log_dirty.flush_cached_dirty )
+d->arch.paging.log_dirty.flush_cached_dirty(d);
