date:20150527

[PATCH V1 0/5] Enable ACPI support for KVM ARM

2015-05-27 Thread Wei Huang

Initial ACPI support for ARM64 has recently been accepted into the Linux
kernel, so now is a good time to revisit ACPI support for KVM. This patchset
enables ACPI for both arch_timer and vGIC by probing the related ACPI tables
and performing the necessary initialization.

Note that Alexander Spyridaki submitted similar patches before. Some of
his ideas were borrowed in this patchset, but with substantial changes.
In addition, we extend support to both GICv2 and GICv3.

This patchset would work better on top of recent GIC/IRQCHIP patches by
Hanjun Guo, who added support for gic_version in ACPI struct of GIC
distributor (search "ACPICA: Introduce GIC version for arm based system").

This patchset can be applied cleanly on top of Linux 4.1-rc1.

Wei Huang (5):
  kvm: arm64: Enable ACPI support for virt arch timer
  kvm: arm64: Dispatch virt GIC probing to device tree and ACPI
  kvm: arm64: Detect GIC version for proper ACPI vGIC probing
  kvm: arm64: Implement ACPI probing code for GICv2
  kvm: arm64: Implement ACPI probing code for GICv3

 include/kvm/arm_vgic.h|  36 +---
 virt/kvm/arm/arch_timer.c |  64 -
 virt/kvm/arm/vgic-v2.c|  65 +++--
 virt/kvm/arm/vgic-v3.c|  56 +--
 virt/kvm/arm/vgic.c   | 140 ++
 5 files changed, 320 insertions(+), 41 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH V1 2/5] kvm: arm64: Dispatch virt GIC probing to device tree and ACPI

2015-05-27 Thread Wei Huang

This patch creates a dispatch function to support virt GIC probing
in both device tree (DT) and ACPI environments. kvm_vgic_hyp_init()
will probe DT first. If that fails, it will try ACPI.

Signed-off-by: Wei Huang 
---
 include/kvm/arm_vgic.h | 18 +-
 virt/kvm/arm/vgic-v2.c |  8 
 virt/kvm/arm/vgic-v3.c |  8 
 virt/kvm/arm/vgic.c| 42 +++---
 4 files changed, 48 insertions(+), 28 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 133ea00..3ee732a 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -332,17 +332,17 @@ int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
 #define vgic_initialized(k)(!!((k)->arch.vgic.nr_cpus))
 #define vgic_ready(k)  ((k)->arch.vgic.ready)
 
-int vgic_v2_probe(struct device_node *vgic_node,
- const struct vgic_ops **ops,
- const struct vgic_params **params);
+int vgic_v2_dt_probe(struct device_node *vgic_node,
+const struct vgic_ops **ops,
+const struct vgic_params **params);
 #ifdef CONFIG_ARM_GIC_V3
-int vgic_v3_probe(struct device_node *vgic_node,
- const struct vgic_ops **ops,
- const struct vgic_params **params);
+int vgic_v3_dt_probe(struct device_node *vgic_node,
+const struct vgic_ops **ops,
+const struct vgic_params **params);
 #else
-static inline int vgic_v3_probe(struct device_node *vgic_node,
-   const struct vgic_ops **ops,
-   const struct vgic_params **params)
+static inline int vgic_v3_dt_probe(struct device_node *vgic_node,
+  const struct vgic_ops **ops,
+  const struct vgic_params **params)
 {
return -ENODEV;
 }
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
index f9b9c7c..295996f 100644
--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -167,7 +167,7 @@ static const struct vgic_ops vgic_v2_ops = {
 static struct vgic_params vgic_v2_params;
 
 /**
- * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT
+ * vgic_v2_dt_probe - probe for a GICv2 compatible interrupt controller in DT
  * @node:  pointer to the DT node
  * @ops:   address of a pointer to the GICv2 operations
  * @params:address of a pointer to HW-specific parameters
@@ -176,9 +176,9 @@ static struct vgic_params vgic_v2_params;
  * in *ops and the HW parameters in *params. Returns an error code
  * otherwise.
  */
-int vgic_v2_probe(struct device_node *vgic_node,
- const struct vgic_ops **ops,
- const struct vgic_params **params)
+int vgic_v2_dt_probe(struct device_node *vgic_node,
+const struct vgic_ops **ops,
+const struct vgic_params **params)
 {
int ret;
struct resource vctrl_res;
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index dff0602..91814e2 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -211,7 +211,7 @@ static const struct vgic_ops vgic_v3_ops = {
 static struct vgic_params vgic_v3_params;
 
 /**
- * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT
+ * vgic_v3_dt_probe - probe for a GICv3 compatible interrupt controller in DT
  * @node:  pointer to the DT node
  * @ops:   address of a pointer to the GICv3 operations
  * @params:address of a pointer to HW-specific parameters
@@ -220,9 +220,9 @@ static struct vgic_params vgic_v3_params;
  * in *ops and the HW parameters in *params. Returns an error code
  * otherwise.
  */
-int vgic_v3_probe(struct device_node *vgic_node,
- const struct vgic_ops **ops,
- const struct vgic_params **params)
+int vgic_v3_dt_probe(struct device_node *vgic_node,
+const struct vgic_ops **ops,
+const struct vgic_params **params)
 {
int ret = 0;
u32 gicv_idx;
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 78fb820..b4010f0 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -2088,32 +2089,51 @@ static struct notifier_block vgic_cpu_nb = {
 };
 
 static const struct of_device_id vgic_ids[] = {
-   { .compatible = "arm,cortex-a15-gic",   .data = vgic_v2_probe, },
-   { .compatible = "arm,cortex-a7-gic",.data = vgic_v2_probe, },
-   { .compatible = "arm,gic-400",  .data = vgic_v2_probe, },
-   { .compatible = "arm,gic-v3",   .data = vgic_v3_probe, },
+   { .compatible = "arm,cortex-a15-gic",   .data = vgic_v2_dt_probe, },
+   { .compatible = "arm,cortex-a7-gic",.data = vgic_v2_dt_probe, },
+   { .compatible = "arm,gic-400",  .data = vgic_v2_dt_probe, },
+   { .compatible = "arm,gic-v3",   .data = vgic_v3_dt_

[PATCH V1 3/5] kvm: arm64: Detect GIC version for proper ACPI vGIC probing

2015-05-27 Thread Wei Huang

There are two GICs (GICv2 and GICv3) supported by KVM. So it is necessary
to find out GIC version before calling ACPI probing functions defined
in vgic-v2.c and vgic-v3.c.

This patch detects the GIC version by checking the gic_version field of the
GIC distributor, which has been defined since ACPI 6.0. In the case of
ACPI 5.1, we use manual hardware discovery to find out the GIC version.

NOTE: This patch is based on a recent patch by Hanjun Guo.

Signed-off-by: Hanjun Guo 
Signed-off-by: Wei Huang 
---
 include/kvm/arm_vgic.h |  18 +
 virt/kvm/arm/vgic-v2.c |  10 +
 virt/kvm/arm/vgic-v3.c |  10 +
 virt/kvm/arm/vgic.c| 100 -
 4 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 3ee732a..7a44b08 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define VGIC_NR_IRQS_LEGACY256
@@ -335,10 +336,18 @@ int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
 int vgic_v2_dt_probe(struct device_node *vgic_node,
 const struct vgic_ops **ops,
 const struct vgic_params **params);
+#ifdef CONFIG_ACPI
+int vgic_v2_acpi_probe(struct acpi_madt_generic_interrupt *,
+  const struct vgic_ops **ops,
+  const struct vgic_params **params);
+#endif /* CONFIG_ACPI */
 #ifdef CONFIG_ARM_GIC_V3
 int vgic_v3_dt_probe(struct device_node *vgic_node,
 const struct vgic_ops **ops,
 const struct vgic_params **params);
+int vgic_v3_acpi_probe(struct acpi_madt_generic_interrupt *,
+  const struct vgic_ops **ops,
+  const struct vgic_params **params);
 #else
 static inline int vgic_v3_dt_probe(struct device_node *vgic_node,
   const struct vgic_ops **ops,
@@ -346,6 +355,15 @@ static inline int vgic_v3_dt_probe(struct device_node 
*vgic_node,
 {
return -ENODEV;
 }
+
+#ifdef CONFIG_ACPI
+int vgic_v3_acpi_probe(struct acpi_madt_generic_interrupt *,
+  const struct vgic_ops **ops,
+  const struct vgic_params **params)
+{
+   return -ENODEV;
+}
+#endif /* CONFIG_ACPI */
 #endif
 
 #endif
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
index 295996f..711de82 100644
--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -257,3 +258,12 @@ out:
of_node_put(vgic_node);
return ret;
 }
+
+#ifdef CONFIG_ACPI
+int vgic_v2_acpi_probe(struct acpi_madt_generic_interrupt *vgic_acpi,
+  const struct vgic_ops **ops,
+  const struct vgic_params **params)
+{
+   return -EINVAL;
+}
+#endif /* CONFIG_ACPI */
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index 91814e2..99d0f9f 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -285,3 +286,12 @@ out:
of_node_put(vgic_node);
return ret;
 }
+
+#ifdef CONFIG_ACPI
+int vgic_v3_acpi_probe(struct acpi_madt_generic_interrupt *vgic_acpi,
+  const struct vgic_ops **ops,
+  const struct vgic_params **params)
+{
+   return -EINVAL;
+}
+#endif /* CONFIG_ACPI */
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index b4010f0..cd09877 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -28,6 +28,7 @@
 #include 
 
 #include 
+#include 
 
 #include 
 #include 
@@ -2114,9 +2115,106 @@ static int kvm_vgic_dt_probe(void)
 }
 
 #ifdef CONFIG_ACPI
+u8 gic_version = ACPI_MADT_GIC_VER_UNKNOWN;
+phys_addr_t dist_phy_base;
+static struct acpi_madt_generic_interrupt *vgic_acpi;
+
+static void gic_get_acpi_header(struct acpi_subtable_header *header)
+{
+   vgic_acpi = (struct acpi_madt_generic_interrupt *)header;
+}
+
+static int gic_parse_distributor(struct acpi_subtable_header *header,
+const unsigned long end)
+{
+   struct acpi_madt_generic_distributor *dist;
+
+   dist = (struct acpi_madt_generic_distributor *)header;
+
+   if (BAD_MADT_ENTRY(dist, end))
+   return -EINVAL;
+
+   gic_version = dist->gic_version;
+   dist_phy_base = dist->base_address;
+
+   return 0;
+}
+
+static int gic_match_redist(struct acpi_subtable_header *header,
+   const unsigned long end)
+{
+   return 0;
+}
+
+static bool gic_redist_is_present(void)
+{
+   int count;
+
+   /* scan MADT table to find if we have redistributor entries */
+   count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR,
+ gic_match_redist, 0);
+
+   return (count > 0) ? true : false;
+}
+
 static int kvm_vgic_acpi_probe(void)
 {
-   return -EINVAL;
+

[PATCH V1 4/5] kvm: arm64: Implement ACPI probing code for GICv2

2015-05-27 Thread Wei Huang

This patch enables ACPI support for KVM virtual GICv2. KVM parses
ACPI table for virt GIC related information and initializes resources.

Signed-off-by: Alexander Spyridaki 
Signed-off-by: Wei Huang 
---
 virt/kvm/arm/vgic-v2.c | 49 -
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
index 711de82..01ce8a3 100644
--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -264,6 +264,53 @@ int vgic_v2_acpi_probe(struct acpi_madt_generic_interrupt 
*vgic_acpi,
   const struct vgic_ops **ops,
   const struct vgic_params **params)
 {
-   return -EINVAL;
+   struct vgic_params *vgic = &vgic_v2_params;
+   int irq_mode, ret;
+
+   /* IRQ trigger mode */
+   irq_mode = (vgic_acpi->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+   ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+   vgic->maint_irq = acpi_register_gsi(NULL, vgic_acpi->vgic_interrupt,
+   irq_mode, ACPI_ACTIVE_HIGH);
+   if (!vgic->maint_irq) {
+   kvm_err("Cannot register VGIC ACPI maintenance irq\n");
+   ret = -ENXIO;
+   goto out;
+   }
+
+   /* GICH resource */
+   vgic->vctrl_base = ioremap(vgic_acpi->gich_base_address, SZ_8K);
+   if (!vgic->vctrl_base) {
+   kvm_err("cannot ioremap GICH memory\n");
+   ret = -ENOMEM;
+   goto out;
+   }
+
+   vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR);
+   vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1;
+
+   ret = create_hyp_io_mappings(vgic->vctrl_base,
+vgic->vctrl_base + SZ_8K,
+vgic_acpi->gich_base_address);
+   if (ret) {
+   kvm_err("Cannot map GICH into hyp\n");
+   goto out;
+   }
+
+   vgic->vcpu_base = vgic_acpi->gicv_base_address;
+   vgic->can_emulate_gicv2 = true;
+   kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
+
+   kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n",
+(unsigned long long)vgic_acpi->gich_base_address,
+(unsigned long long)vgic_acpi->gicv_base_address,
+vgic->maint_irq);
+
+   vgic->type = VGIC_V2;
+   *ops = &vgic_v2_ops;
+   *params = vgic;
+
+out:
+   return ret;
 }
 #endif /* CONFIG_ACPI */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH V1 5/5] kvm: arm64: Implement ACPI probing code for GICv3

2015-05-27 Thread Wei Huang

This patch enables ACPI support for KVM virtual GICv3. KVM parses
ACPI table for virt GIC related information and initializes resources.

Signed-off-by: Wei Huang 
---
 virt/kvm/arm/vgic-v3.c | 40 +++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index 99d0f9f..2e4df78 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -292,6 +292,44 @@ int vgic_v3_acpi_probe(struct acpi_madt_generic_interrupt 
*vgic_acpi,
   const struct vgic_ops **ops,
   const struct vgic_params **params)
 {
-   return -EINVAL;
+   int ret = 0;
+   struct vgic_params *vgic = &vgic_v3_params;
+   int irq_mode;
+
+   /* IRQ trigger mode */
+   irq_mode = (vgic_acpi->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+   ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+   vgic->maint_irq = acpi_register_gsi(NULL, vgic_acpi->vgic_interrupt,
+   irq_mode, ACPI_ACTIVE_HIGH);
+   if (!vgic->maint_irq) {
+   kvm_err("Cannot register VGIC ACPI maintenance irq\n");
+   ret = -ENXIO;
+   goto out;
+   }
+
+   ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
+   vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1;
+   vgic->can_emulate_gicv2 = false;
+
+   vgic->vcpu_base = vgic_acpi->gicv_base_address;
+
+   if (vgic->vcpu_base == 0)
+   kvm_info("disabling GICv2 emulation\n");
+   else {
+   vgic->can_emulate_gicv2 = true;
+   kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+   KVM_DEV_TYPE_ARM_VGIC_V2);
+   }
+
+   kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3);
+
+   vgic->vctrl_base = NULL;
+   vgic->type = VGIC_V3;
+   vgic->max_gic_vcpus = KVM_MAX_VCPUS;
+
+   *ops = &vgic_v3_ops;
+   *params = vgic;
+out:
+   return ret;
 }
 #endif /* CONFIG_ACPI */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH V1 1/5] kvm: arm64: Enable ACPI support for virt arch timer

2015-05-27 Thread Wei Huang

This patch enables ACPI support for KVM virtual arch timer. It allows
KVM to parse ACPI table for arch timer PPI when DT table is not present.

Signed-off-by: Alexander Spyridaki 
Signed-off-by: Wei Huang 
---
 virt/kvm/arm/arch_timer.c | 64 +--
 1 file changed, 51 insertions(+), 13 deletions(-)

diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 98c95f2..7da9eb3 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -274,9 +275,46 @@ static const struct of_device_id arch_timer_of_match[] = {
{},
 };
 
-int kvm_timer_hyp_init(void)
+static int kvm_timer_ppi_dt_parse(unsigned int *ppi)
 {
struct device_node *np;
+
+   np = of_find_matching_node(NULL, arch_timer_of_match);
+   if (!np)
+   return -ENODEV;
+
+   *ppi = irq_of_parse_and_map(np, 2);
+   if (*ppi == 0) {
+   of_node_put(np);
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
+#ifdef CONFIG_ACPI
+struct acpi_table_gtdt *gtdt_acpi;
+static void arch_timer_acpi_parse(struct acpi_table_header *table)
+{
+   gtdt_acpi = container_of(table, struct acpi_table_gtdt, header);
+}
+
+static int kvm_timer_ppi_acpi_parse(unsigned int *ppi)
+{
+   /* Get the interrupt number from the GTDT table */
+   acpi_table_parse(ACPI_SIG_GTDT,
+(acpi_tbl_table_handler)arch_timer_acpi_parse);
+
+   if (!gtdt_acpi->virtual_timer_interrupt)
+   return -EINVAL;
+
+   *ppi = gtdt_acpi->virtual_timer_interrupt;
+   return 0;
+}
+#endif
+
+int kvm_timer_hyp_init(void)
+{
unsigned int ppi;
int err;
 
@@ -284,19 +322,20 @@ int kvm_timer_hyp_init(void)
if (!timecounter)
return -ENODEV;
 
-   np = of_find_matching_node(NULL, arch_timer_of_match);
-   if (!np) {
-   kvm_err("kvm_arch_timer: can't find DT node\n");
-   return -ENODEV;
-   }
+   /* PPI parsing: try DT first, then ACPI */
+   err = kvm_timer_ppi_dt_parse(&ppi);
+#ifdef CONFIG_ACPI
+   if (err && !acpi_disabled)
+   err = kvm_timer_ppi_acpi_parse(&ppi);
+#endif
 
-   ppi = irq_of_parse_and_map(np, 2);
-   if (!ppi) {
-   kvm_err("kvm_arch_timer: no virtual timer interrupt\n");
-   err = -EINVAL;
-   goto out;
+   if (err) {
+   kvm_err("kvm_arch_timer: can't find virtual timer info or "
+   "config virtual timer interrupt\n");
+   return err;
}
 
+   /* configure IRQ handler */
err = request_percpu_irq(ppi, kvm_arch_timer_handler,
 "kvm guest timer", kvm_get_running_vcpus());
if (err) {
@@ -319,14 +358,13 @@ int kvm_timer_hyp_init(void)
goto out_free;
}
 
-   kvm_info("%s IRQ%d\n", np->name, ppi);
+   kvm_info("timer IRQ%d\n", ppi);
on_each_cpu(kvm_timer_init_interrupt, NULL, 1);
 
goto out;
 out_free:
free_percpu_irq(ppi, kvm_get_running_vcpus());
 out:
-   of_node_put(np);
return err;
 }
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v6 8/8] macvtap/tun: cross-endian support for little-endian hosts

2015-05-27 Thread David Gibson

On Fri, Apr 24, 2015 at 02:50:36PM +0200, Greg Kurz wrote:
> The VNET_LE flag was introduced to fix accesses to virtio 1.0 headers
> that are always little-endian. It can also be used to handle the special
> case of a legacy little-endian device implemented by a big-endian host.
> 
> Let's add a flag and ioctls for big-endian devices as well. If both flags
> are set, little-endian wins.
> 
> Since this isn't a common usecase, the feature is controlled by a kernel
> config option (not set by default).
> 
> Both macvtap and tun are covered by this patch since they share the same
> API with userland.
> 
> Signed-off-by: Greg Kurz 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgp9Pvo0akB9k.pgp
Description: PGP signature

Re: [PATCH v6 6/8] virtio: add explicit big-endian support to memory accessors

2015-05-27 Thread David Gibson

On Fri, Apr 24, 2015 at 02:26:24PM +0200, Greg Kurz wrote:
> The current memory accessors logic is:
> - little endian if little_endian
> - native endian (i.e. no byteswap) if !little_endian
> 
> If we want to fully support cross-endian vhost, we also need to be
> able to convert to big endian.
> 
> Instead of changing the little_endian argument to some 3-value enum, this
> patch changes the logic to:
> - little endian if little_endian
> - big endian if !little_endian
> 
> The native endian case is handled by all users with a trivial helper. This
> patch doesn't change any functionality, nor does it add overhead.
> 
> Signed-off-by: Greg Kurz 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgpiWIvRcbzD2.pgp
Description: PGP signature

Re: [PATCH v6 1/8] virtio: introduce virtio_is_little_endian() helper

2015-05-27 Thread David Gibson

On Fri, Apr 24, 2015 at 02:24:27PM +0200, Greg Kurz wrote:
> Signed-off-by: Greg Kurz 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgpb03l0z2tFO.pgp
Description: PGP signature

Re: [PATCH v6 2/8] tun: add tun_is_little_endian() helper

2015-05-27 Thread David Gibson

On Fri, Apr 24, 2015 at 02:24:38PM +0200, Greg Kurz wrote:
> Signed-off-by: Greg Kurz 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgp6xoaowBbwy.pgp
Description: PGP signature

Re: [PATCH v6 7/8] vhost: cross-endian support for legacy devices

2015-05-27 Thread David Gibson

On Fri, Apr 24, 2015 at 02:27:24PM +0200, Greg Kurz wrote:
> This patch brings cross-endian support to vhost when used to implement
> legacy virtio devices. Since it is a relatively rare situation, the
> feature availability is controlled by a kernel config option (not set
> by default).
> 
> The vq->is_le boolean field is added to cache the endianness to be
> used for ring accesses. It defaults to native endian, as expected
> by legacy virtio devices. When the ring gets active, we force little
> endian if the device is modern. When the ring is deactivated, we
> revert to the native endian default.
> 
> If cross-endian was compiled in, a vq->user_be boolean field is added
> so that userspace may request a specific endianness. This field is
> used to override the default when activating the ring of a legacy
> device. It has no effect on modern devices.
> 
> Signed-off-by: Greg Kurz 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgprkOq_Rbkxt.pgp
Description: PGP signature

Re: [PATCH v6 5/8] vhost: introduce vhost_is_little_endian() helper

2015-05-27 Thread David Gibson

On Fri, Apr 24, 2015 at 02:25:12PM +0200, Greg Kurz wrote:
> Signed-off-by: Greg Kurz 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgpI3mX3BeZb0.pgp
Description: PGP signature

Re: [PATCH v6 4/8] vringh: introduce vringh_is_little_endian() helper

2015-05-27 Thread David Gibson

On Fri, Apr 24, 2015 at 02:24:58PM +0200, Greg Kurz wrote:
> Signed-off-by: Greg Kurz 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgpjoRhuLeJ7I.pgp
Description: PGP signature

Re: [PATCH v6 3/8] macvtap: introduce macvtap_is_little_endian() helper

2015-05-27 Thread David Gibson

On Fri, Apr 24, 2015 at 02:24:48PM +0200, Greg Kurz wrote:
> Signed-off-by: Greg Kurz 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgpQ75DSmzcZJ.pgp
Description: PGP signature

[PATCH 2/2] KVM: PPC: Book3S HV: Implement dynamic micro-threading on POWER8

2015-05-27 Thread Paul Mackerras

This builds on the ability to run more than one vcore on a physical
core by using the micro-threading (split-core) modes of the POWER8
chip.  Previously, only vcores from the same VM could be run together,
and (on POWER8) only if they had just one thread per core.  With the
ability to split the core on guest entry and unsplit it on guest exit,
we can run up to 8 vcpu threads from up to 4 different VMs, and we can
run multiple vcores with 2 or 4 vcpus per vcore.

Dynamic micro-threading is only available if the static configuration
of the cores is whole-core mode (unsplit), and only on POWER8.

To manage this, we introduce a new kvm_split_mode struct which is
shared across all of the subcores in the core, with a pointer in the
paca on each thread.  In addition we extend the core_info struct to
have information on each subcore.  When deciding whether to add a
vcore to the set already on the core, we now have two possibilities:
(a) piggyback the vcore onto an existing subcore, or (b) start a new
subcore.

Currently, when any vcpu needs to exit the guest and switch to host
virtual mode, we interrupt all the threads in all subcores and switch
the core back to whole-core mode.  It may be possible in future to
allow some of the subcores to keep executing in the guest while
subcore 0 switches to the host, but that is not implemented in this
patch.

This adds a module parameter called dynamic_mt_modes which controls
which micro-threading (split-core) modes the code will consider, as a
bitmap.  In other words, if it is 0, no micro-threading mode is
considered; if it is 2, only 2-way micro-threading is considered; if
it is 4, only 4-way, and if it is 6, both 2-way and 4-way
micro-threading mode will be considered.  The default is 6.

With this, we now have secondary threads which are the primary thread
for their subcore and therefore need to do the MMU switch.  These
threads will need to be started even if they have no vcpu to run, so
we use the vcore pointer in the PACA rather than the vcpu pointer to
trigger them.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_book3s_asm.h |  20 ++
 arch/powerpc/include/asm/kvm_host.h   |   3 +
 arch/powerpc/kernel/asm-offsets.c |   7 +
 arch/powerpc/kvm/book3s_hv.c  | 369 ++
 arch/powerpc/kvm/book3s_hv_builtin.c  |  25 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 106 +++--
 6 files changed, 469 insertions(+), 61 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 5bdfb5d..4024d24 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -25,6 +25,12 @@
 #define XICS_MFRR  0xc
 #define XICS_IPI   2   /* interrupt source # for IPIs */
 
+/* Maximum number of threads per physical core */
+#define MAX_THREADS8
+
+/* Maximum number of subcores per physical core */
+#define MAX_SUBCORES   4
+
 #ifdef __ASSEMBLY__
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -65,6 +71,19 @@ kvmppc_resume_\intno:
 
 #else  /*__ASSEMBLY__ */
 
+struct kvmppc_vcore;
+
+/* Struct used for coordinating micro-threading (split-core) mode changes */
+struct kvm_split_mode {
+   unsigned long   rpr;
+   unsigned long   pmmar;
+   unsigned long   ldbar;
+   u8  subcore_size;
+   u8  do_nap;
+   u8  napped[MAX_THREADS];
+   struct kvmppc_vcore *master_vcs[MAX_SUBCORES];
+};
+
 /*
  * This struct goes in the PACA on 64-bit processors.  It is used
  * to store host state that needs to be saved when we enter a guest
@@ -100,6 +119,7 @@ struct kvmppc_host_state {
u64 host_spurr;
u64 host_dscr;
u64 dec_expires;
+   struct kvm_split_mode *kvm_split_mode;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
u64 cfar;
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 2b74490..80eb29a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -302,6 +302,9 @@ struct kvmppc_vcore {
 #define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
 
+/* This bit is used when a vcore exit is triggered from outside the vcore */
+#define VCORE_EXIT_REQ 0x1
+
 /*
  * Values for vcore_state.
  * Note that these are arranged such that lower values
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index d333664..c3e11e0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -676,7 +676,14 @@ int main(void)
HSTATE_FIELD(HSTATE_DSCR, host_dscr);
HSTATE_FIELD(HSTATE_DABR, dabr);
HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+   HSTATE_FIELD(HSTATE_SPLIT_MODE, kvm_split_mode);
DEFINE(IPI_PRIORITY, IPI_PRIORITY);
+   DEFINE(KVM_SPLIT_RPR, offsetof(struct kvm_split_mode, rpr));

[PATCH 0/2] KVM: PPC: Book3S HV: Dynamic micro-threading/split-core

2015-05-27 Thread Paul Mackerras

This patch series provides a way to use more of the capacity of each
processor core when running guests configured with threads=1, 2 or 4
on a POWER8 host with HV KVM, without having to change the static
micro-threading (the official name for split-core) mode for the whole
machine.  The problem with setting the machine to static 2-way or
4-way micro-threading mode is that (a) then you can't run guests with
threads=8 and (b) selecting the right mode can be tricky and requires
knowledge of what guests you will be running.

Instead, with these two patches, we can now run more than one virtual
core (vcore) on a given physical core if possible, and if that means
we need to switch the core to 2-way or 4-way micro-threading mode,
then we do that on entry to the guests and switch back to whole-core
mode on exit (and we only switch the one core, not the whole machine).
The core mode switching is only done if the machine is in static
whole-core mode.

All of this only comes into effect when a core is over-committed.
When the machine is lightly loaded everything operates the same with
these patches as without.  Only when some core has a vcore that is
able to run while there is also another vcore that was wanting to run
on that core but got preempted does the logic kick in to try to run
both vcores at once.

Paul.
---

 arch/powerpc/include/asm/kvm_book3s_asm.h |  20 +
 arch/powerpc/include/asm/kvm_host.h   |  22 +-
 arch/powerpc/kernel/asm-offsets.c |   9 +
 arch/powerpc/kvm/book3s_hv.c  | 648 ++
 arch/powerpc/kvm/book3s_hv_builtin.c  |  32 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c  |   4 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 111 -
 7 files changed, 740 insertions(+), 106 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/2] KVM: PPC: Book3S HV: Make use of unused threads when running guests

2015-05-27 Thread Paul Mackerras

When running a virtual core of a guest that is configured with fewer
threads per core than the physical cores have, the extra physical
threads are currently unused.  This makes it possible to use them to
run one or more other virtual cores from the same guest when certain
conditions are met.  This applies on POWER7, and on POWER8 to guests
with one thread per virtual core.  (It doesn't apply to POWER8 guests
with multiple threads per vcore because they require a 1-1 virtual to
physical thread mapping in order to be able to use msgsndp and the
TIR.)

The idea is that we maintain a list of preempted vcores for each
physical cpu (i.e. each core, since the host runs single-threaded).
Then, when a vcore is about to run, it checks to see if there are
any vcores on the list for its physical cpu that could be
piggybacked onto this vcore's execution.  If so, those additional
vcores are put into state VCORE_PIGGYBACK and their runnable VCPU
threads are started as well as the original vcore, which is called
the master vcore.

After the vcores have exited the guest, the extra ones are put back
onto the preempted list if any of their VCPUs are still runnable and
not idle.

This means that vcpu->arch.ptid is no longer necessarily the same as
the physical thread that the vcpu runs on.  In order to make it easier
for code that wants to send an IPI to know which CPU to target, we
now store that in a new field in struct vcpu_arch, called thread_cpu.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_host.h |  19 +-
 arch/powerpc/kernel/asm-offsets.c   |   2 +
 arch/powerpc/kvm/book3s_hv.c| 333 ++--
 arch/powerpc/kvm/book3s_hv_builtin.c|   7 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c|   4 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   5 +
 6 files changed, 298 insertions(+), 72 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index d91f65b..2b74490 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -278,7 +278,9 @@ struct kvmppc_vcore {
u16 last_cpu;
u8 vcore_state;
u8 in_guest;
+   struct kvmppc_vcore *master_vcore;
struct list_head runnable_threads;
+   struct list_head preempt_list;
spinlock_t lock;
wait_queue_head_t wq;
spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
@@ -300,12 +302,18 @@ struct kvmppc_vcore {
 #define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
 
-/* Values for vcore_state */
+/*
+ * Values for vcore_state.
+ * Note that these are arranged such that lower values
+ * (< VCORE_SLEEPING) don't require stolen time accounting
+ * on load/unload, and higher values do.
+ */
 #define VCORE_INACTIVE 0
-#define VCORE_SLEEPING 1
-#define VCORE_PREEMPT  2
-#define VCORE_RUNNING  3
-#define VCORE_EXITING  4
+#define VCORE_PREEMPT  1
+#define VCORE_PIGGYBACK2
+#define VCORE_SLEEPING 3
+#define VCORE_RUNNING  4
+#define VCORE_EXITING  5
 
 /*
  * Struct used to manage memory for a virtual processor area
@@ -619,6 +627,7 @@ struct kvm_vcpu_arch {
int trap;
int state;
int ptid;
+   int thread_cpu;
bool timer_running;
wait_queue_head_t cpu_run;
 
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 0034b6b..d333664 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -512,6 +512,8 @@ int main(void)
DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
DEFINE(VCPU_HEIR, offsetof(struct kvm_vcpu, arch.emul_inst));
+   DEFINE(VCPU_CPU, offsetof(struct kvm_vcpu, cpu));
+   DEFINE(VCPU_THREAD_CPU, offsetof(struct kvm_vcpu, arch.thread_cpu));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 68d067a..2048309 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -81,6 +81,9 @@ static DECLARE_BITMAP(default_enabled_hcalls, 
MAX_HCALL_OPCODE/4 + 1);
 #define MPP_BUFFER_ORDER   3
 #endif
 
+static int target_smt_mode;
+module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
@@ -114,7 +117,7 @@ static bool kvmppc_ipi_thread(int cpu)
 
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
-   int cpu = vcpu->cpu;
+   int cpu;
wait_queue_head_t *wqp;
 
wqp = kvm_arch_vcpu_wq(vcpu);
@@ -123,10 +126,11 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu 
*vcpu)
++vcpu->stat.halt_wakeup;
}
 
-

[patch 1/3] x86: kvmclock: add flag to indicate pvclock counts from zero

2015-05-27 Thread Marcelo Tosatti

Setting sched clock stable for kvmclock causes the printk timestamps
to not start from zero, which is different from baremetal and 
can possibly break userspace. Add a flag to indicate that the
hypervisor sets the clock base at zero when kvmclock is initialized.

Signed-off-by: Marcelo Tosatti 

---
 arch/x86/include/asm/pvclock-abi.h |1 +
 1 file changed, 1 insertion(+)

Index: kvm/arch/x86/include/asm/pvclock-abi.h
===
--- kvm.orig/arch/x86/include/asm/pvclock-abi.h 2014-11-06 23:59:14.615913334 
-0200
+++ kvm/arch/x86/include/asm/pvclock-abi.h  2015-05-27 17:40:53.435192771 
-0300
@@ -41,5 +41,6 @@
 
 #define PVCLOCK_TSC_STABLE_BIT (1 << 0)
 #define PVCLOCK_GUEST_STOPPED  (1 << 1)
+#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[patch 0/3] kvmclock: allow stable sched clock

2015-05-27 Thread Marcelo Tosatti

kvmclock provides the behaviour sched_clock users expect.
Mark it as stable allowing nohz_full in guests.
See individual patches for more details.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[patch 3/3] KVM: x86: zero kvmclock_offset when vcpu0 initializes kvmclock system MSR

2015-05-27 Thread Marcelo Tosatti

Initialize kvmclock base, on kvmclock system MSR write time,
so that the guest sees kvmclock counting from zero.

This matches baremetal behaviour when kvmclock in guest
sets sched clock stable.

Signed-off-by: Marcelo Tosatti 

---
 arch/x86/kvm/x86.c |5 +
 1 file changed, 5 insertions(+)

Index: kvm/arch/x86/kvm/x86.c
===
--- kvm.orig/arch/x86/kvm/x86.c 2015-05-27 17:40:46.948189811 -0300
+++ kvm/arch/x86/kvm/x86.c  2015-05-27 22:43:47.340413347 -0300
@@ -1703,6 +1703,8 @@
/* If the host uses TSC clocksource, then it is stable */
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+   if (ka->kvmclk_counts_from_zero)
+   pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO;
 
vcpu->hv_clock.flags = pvclock_flags;
 
@@ -2282,6 +2284,9 @@
&vcpu->requests);
 
ka->boot_vcpu_runs_old_kvmclock = tmp;
+
+   ka->kvmclock_offset = -get_kernel_ns();
+   ka->kvmclk_counts_from_zero = true;
}
 
vcpu->arch.time = data;


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[patch 2/3] x86: kvmclock: set scheduler clock stable

2015-05-27 Thread Marcelo Tosatti

From: Luiz Capitulino 

If you try to enable NOHZ_FULL on a guest today, you'll get
the following error when the guest tries to deactivate the
scheduler tick:

 WARNING: CPU: 3 PID: 2182 at kernel/time/tick-sched.c:192 
can_stop_full_tick+0xb9/0x290()
 NO_HZ FULL will not work with unstable sched clock
 CPU: 3 PID: 2182 Comm: kworker/3:1 Not tainted 4.0.0-10545-gb9bb6fb #204
 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
 Workqueue: events flush_to_ldisc
  8162a0c7 88011f583e88 814e6ba0 0002
  88011f583ed8 88011f583ec8 8104d095 88011f583eb8
   0003 0001 0001
 Call Trace:
[] dump_stack+0x4f/0x7b
  [] warn_slowpath_common+0x85/0xc0
  [] warn_slowpath_fmt+0x46/0x50
  [] can_stop_full_tick+0xb9/0x290
  [] tick_nohz_irq_exit+0x8d/0xb0
  [] irq_exit+0xc5/0x130
  [] smp_apic_timer_interrupt+0x4a/0x60
  [] apic_timer_interrupt+0x6e/0x80
[] ? _raw_spin_unlock_irqrestore+0x31/0x60
  [] __wake_up+0x48/0x60
  [] n_tty_receive_buf_common+0x49c/0xba0
  [] ? tty_ldisc_ref+0x1f/0x70
  [] n_tty_receive_buf2+0x14/0x20
  [] flush_to_ldisc+0xe0/0x120
  [] process_one_work+0x1d5/0x540
  [] ? process_one_work+0x151/0x540
  [] worker_thread+0x121/0x470
  [] ? process_one_work+0x540/0x540
  [] kthread+0xef/0x110
  [] ? __kthread_parkme+0xa0/0xa0
  [] ret_from_fork+0x42/0x70
  [] ? __kthread_parkme+0xa0/0xa0
 ---[ end trace 06e3507544a38866 ]---

However, it turns out that kvmclock does provide a stable
sched_clock callback. So, let the scheduler know this, which
in turn makes NOHZ_FULL work in the guest.

Signed-off-by: Marcelo Tosatti 
Signed-off-by: Luiz Capitulino 

---
 arch/x86/kernel/kvmclock.c |   17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

Index: kvm/arch/x86/kernel/kvmclock.c
===
--- kvm.orig/arch/x86/kernel/kvmclock.c 2015-05-27 18:00:53.616391551 -0300
+++ kvm/arch/x86/kernel/kvmclock.c  2015-05-27 22:43:14.474432962 -0300
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -217,8 +218,10 @@
 
 void __init kvmclock_init(void)
 {
+   struct pvclock_vcpu_time_info *vcpu_time;
unsigned long mem;
-   int size;
+   int size, cpu;
+   u8 flags;
 
size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 
@@ -263,8 +266,18 @@
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
 
+   flags = PVCLOCK_COUNTS_FROM_ZERO;
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
-   pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+   flags |= PVCLOCK_TSC_STABLE_BIT;
+
+   pvclock_set_flags(flags);
+
+   cpu = get_cpu();
+   vcpu_time = &hv_clock[cpu].pvti;
+   flags = pvclock_read_flags(vcpu_time);
+   if (flags & PVCLOCK_COUNTS_FROM_ZERO)
+   set_sched_clock_stable();
+   put_cpu();
 }
 
 int __init kvm_setup_vsyscall_timeinfo(void)


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Bug 98841] New: android emulator broken since d7a2a24 kernel commit

2015-05-27 Thread Bandan Das

bugzilla-dae...@bugzilla.kernel.org writes:

> https://bugzilla.kernel.org/show_bug.cgi?id=98841
>
> Bug ID: 98841
>Summary: android emulator broken since d7a2a24 kernel commit
>Product: Virtualization
>Version: unspecified
> Kernel Version: 3.17
>   Hardware: All
> OS: Linux
>   Tree: Mainline
> Status: NEW
>   Severity: normal
>   Priority: P1
>  Component: kvm
>   Assignee: virtualization_...@kernel-bugs.osdl.org
>   Reporter: adrian.sa...@asandu.eu
> Regression: No
>
> Hello,
>
> For a while now I've noticed the android emulator not wanting to work anymore
> ..
> With a lil' bit of git bisect I've found this kernel commit
> https://git.kernel.org/cgit/linux/kernel/git/stable/linux-stable.git/commit/?id=d7a2a246a1b5a0b0c803e800019600051e1e6f1a
> that seems to break it.
>
> I've mailed the author .. submited a report to the google guys ( not sure if I
> did it right ) https://code.google.com/p/android/issues/detail?id=174557 ...
>
> The problem was first reported by someone @ fedora .. 
> https://bugzilla.redhat.com/show_bug.cgi?id=1187982
>
> Alot of details ( run logs ) are there.
>
> Let me know if I can help test stuff.

Ok, a quick look and it seems the patch is doing the right thing
(as the spec says). For PHYSBASE it's masking out bits 8-11 and for PHYSMASK
bits 0-10. Are you sure Android Emulator isn't attempting to
write to these reserved regions ?

What happens if you enable ignore_msrs ?
echo Y > /sys/module/kvm/parameters/ignore_msrs

Bandan

> Thanks in advance.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Bug 98841] android emulator broken since d7a2a24 kernel commit

2015-05-27 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=98841

Adrian Sandu  changed:

   What|Removed |Added

 Kernel Version|3.17|4.0.4

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC 3/6] VSOCK: Introduce virtio-vsock-common.ko

2015-05-27 Thread Stefan Hajnoczi

From: Asias He 

This module contains the common code and header files for the following
virtio-vsock and virtio-vhost kernel modules.

Signed-off-by: Asias He 
Signed-off-by: Stefan Hajnoczi 
---
 include/linux/virtio_vsock.h|  207 +
 include/uapi/linux/virtio_ids.h |1 +
 include/uapi/linux/virtio_vsock.h   |   80 ++
 net/vmw_vsock/virtio_transport_common.c | 1248 +++
 4 files changed, 1536 insertions(+)
 create mode 100644 include/linux/virtio_vsock.h
 create mode 100644 include/uapi/linux/virtio_vsock.h
 create mode 100644 net/vmw_vsock/virtio_transport_common.c

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
new file mode 100644
index 000..01d84a5
--- /dev/null
+++ b/include/linux/virtio_vsock.h
@@ -0,0 +1,207 @@
+/*
+ * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
+ * anyone can use the definitions to implement compatible drivers/servers:
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *notice, this list of conditions and the following disclaimer in the
+ *documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *may be used to endorse or promote products derived from this software
+ *without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS 
IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (C) Red Hat, Inc., 2013-2015
+ * Copyright (C) Asias He , 2013
+ * Copyright (C) Stefan Hajnoczi , 2015
+ */
+
+#ifndef _LINUX_VIRTIO_VSOCK_H
+#define _LINUX_VIRTIO_VSOCK_H
+
+#include 
+#include 
+#include 
+
+#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE  128
+#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE  (1024 * 256)
+#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE  (1024 * 256)
+#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE   (1024 * 4)
+#define VIRTIO_VSOCK_MAX_BUF_SIZE  0xUL
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE   (1024 * 1024 * 16)
+#define VIRTIO_VSOCK_MAX_DGRAM_SIZE(1024 * 64)
+
+struct vsock_transport_recv_notify_data;
+struct vsock_transport_send_notify_data;
+struct sockaddr_vm;
+struct vsock_sock;
+
+enum {
+   VSOCK_VQ_CTRL   = 0,
+   VSOCK_VQ_RX = 1, /* for host to guest data */
+   VSOCK_VQ_TX = 2, /* for guest to host data */
+   VSOCK_VQ_MAX= 3,
+};
+
+/* virtio transport socket state */
+struct virtio_transport {
+   struct virtio_transport_pkt_ops *ops;
+   struct vsock_sock *vsk;
+
+   u32 buf_size;
+   u32 buf_size_min;
+   u32 buf_size_max;
+
+   struct mutex tx_lock;
+   struct mutex rx_lock;
+
+   struct list_head rx_queue;
+   u32 rx_bytes;
+
+   /* Protected by trans->tx_lock */
+   u32 tx_cnt;
+   u32 buf_alloc;
+   u32 peer_fwd_cnt;
+   u32 peer_buf_alloc;
+   /* Protected by trans->rx_lock */
+   u32 fwd_cnt;
+
+   u16 dgram_id;
+};
+
+struct virtio_vsock_pkt {
+   struct virtio_vsock_hdr hdr;
+   struct virtio_transport *trans;
+   struct work_struct work;
+   struct list_head list;
+   void *buf;
+   u32 len;
+   u32 off;
+};
+
+struct virtio_vsock_pkt_info {
+   u32 remote_cid, remote_port;
+   struct msghdr *msg;
+   u32 pkt_len;
+   u16 type;
+   u16 op;
+   u32 flags;
+   u16 dgram_id;
+   u16 dgram_len;
+};
+
+struct virtio_transport_pkt_ops {
+   int (*send_pkt)(struct vsock_sock *vsk,
+   struct virtio_vsock_pkt_info *info);
+};
+
+void virtio_vsock_dumppkt(const char *func,
+ const struct virtio_vsock_pkt *pkt);
+
+struct sock *
+virtio_transport_get_pending(struct sock *listener,
+struct virtio_vsock_pkt *pkt);
+struct virtio_vsock_pkt *
+virtio_transport_alloc_pkt(struct vsock_sock *vsk,
+

[PATCH v2 03/13] KVM: x86: pass the whole hflags field to emulator and back

2015-05-27 Thread Paolo Bonzini

The hflags field will contain information about system management mode
and will be useful for the emulator.  Pass the entire field rather than
just the guest-mode information.

Signed-off-by: Paolo Bonzini 
---
RFC->v1: introduce kvm_set_hflags
---
 arch/x86/include/asm/kvm_emulate.h |  5 -
 arch/x86/kvm/emulate.c |  6 +++---
 arch/x86/kvm/x86.c | 10 +-
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h 
b/arch/x86/include/asm/kvm_emulate.h
index 57a9d94fe160..7410879a41f7 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -262,6 +262,9 @@ enum x86emul_mode {
X86EMUL_MODE_PROT64,/* 64-bit (long) mode.*/
 };
 
+/* These match some of the HF_* flags defined in kvm_host.h  */
+#define X86EMUL_GUEST_MASK   (1 << 5) /* VCPU is in guest-mode */
+
 struct x86_emulate_ctxt {
const struct x86_emulate_ops *ops;
 
@@ -273,8 +276,8 @@ struct x86_emulate_ctxt {
 
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
+   int emul_flags;
 
-   bool guest_mode; /* guest running a nested guest */
bool perm_ok; /* do not check permissions if true */
bool ud;/* inject an #UD if host doesn't support insn */
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9b655d113fc6..a1c6c25552e9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4895,7 +4895,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
fetch_possible_mmx_operand(ctxt, &ctxt->dst);
}
 
-   if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+   if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && 
ctxt->intercept) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
  X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -4924,7 +4924,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
 
-   if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+   if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d 
& Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
  X86_ICPT_POST_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -4978,7 +4978,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 special_insn:
 
-   if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+   if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & 
Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
  X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 747bf7da550b..70072f94318e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5236,7 +5236,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 (cs_l && is_long_mode(vcpu))   ? X86EMUL_MODE_PROT64 :
 cs_db  ? X86EMUL_MODE_PROT32 :
  X86EMUL_MODE_PROT16;
-   ctxt->guest_mode = is_guest_mode(vcpu);
+   BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
+   ctxt->emul_flags = vcpu->arch.hflags;
 
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
@@ -5405,6 +5406,11 @@ static bool retry_instruction(struct x86_emulate_ctxt 
*ctxt,
 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
 
+void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+{
+   vcpu->arch.hflags = emul_flags;
+}
+
 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
 {
@@ -5604,6 +5610,8 @@ restart:
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+   if (vcpu->arch.hflags != ctxt->emul_flags)
+   kvm_set_hflags(vcpu, ctxt->emul_flags);
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
kvm_vcpu_check_singlestep(vcpu, rflags, &r);
-- 
1.8.3.1


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2 06/13] KVM: x86: save/load state on SMM switch

2015-05-27 Thread Paolo Bonzini

The big ugly one.  This patch adds support for switching in and out of
system management mode, respectively upon receiving KVM_REQ_SMI and upon
executing a RSM instruction.  Both 32- and 64-bit formats are supported
for the SMM state save area.

Signed-off-by: Paolo Bonzini 
---
RFC->v1: shift access rights left by 8 for 32-bit format
 move tracepoint to kvm_set_hflags
 fix NMI handling
---
 arch/x86/kvm/cpuid.h   |   8 ++
 arch/x86/kvm/emulate.c | 248 -
 arch/x86/kvm/trace.h   |  22 +
 arch/x86/kvm/x86.c | 225 +++-
 4 files changed, 501 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 496b3695d3d3..dd05b9cef6ae 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu 
*vcpu)
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 
+static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
+{
+   struct kvm_cpuid_entry2 *best;
+
+   best = kvm_find_cpuid_entry(vcpu, 0x8001, 0);
+   return best && (best->edx & bit(X86_FEATURE_LM));
+}
+
 static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
 {
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e763a9b8c26b..1da4dd88465d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2259,12 +2259,258 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
 }
 
+static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
+{
+   u32 eax, ebx, ecx, edx;
+
+   eax = 0x8001;
+   ecx = 0;
+   ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+   return edx & bit(X86_FEATURE_LM);
+}
+
+#define get_smstate(type, smbase, offset)\
+   ({\
+type __val;  \
+int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val,   \
+sizeof(__val), NULL);\
+if (r != X86EMUL_CONTINUE)   \
+return X86EMUL_UNHANDLEABLE; \
+__val;   \
+   })
+
+static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
+{
+   desc->g= (flags >> 23) & 1;
+   desc->d= (flags >> 22) & 1;
+   desc->l= (flags >> 21) & 1;
+   desc->avl  = (flags >> 20) & 1;
+   desc->p= (flags >> 15) & 1;
+   desc->dpl  = (flags >> 13) & 3;
+   desc->s= (flags >> 12) & 1;
+   desc->type = (flags >>  8) & 15;
+}
+
+static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+   struct desc_struct desc;
+   int offset;
+   u16 selector;
+
+   selector = get_smstate(u32, smbase, 0x7fa8 + n * 4);
+
+   if (n < 3)
+   offset = 0x7f84 + n * 12;
+   else
+   offset = 0x7f2c + (n - 3) * 12;
+
+   set_desc_base(&desc,  get_smstate(u32, smbase, offset + 8));
+   set_desc_limit(&desc, get_smstate(u32, smbase, offset + 4));
+   rsm_set_desc_flags(&desc, get_smstate(u32, smbase, offset));
+   ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
+   return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+   struct desc_struct desc;
+   int offset;
+   u16 selector;
+   u32 base3;
+
+   offset = 0x7e00 + n * 16;
+
+   selector =get_smstate(u16, smbase, offset);
+   rsm_set_desc_flags(&desc, get_smstate(u16, smbase, offset + 2) << 8);
+   set_desc_limit(&desc, get_smstate(u32, smbase, offset + 4));
+   set_desc_base(&desc,  get_smstate(u32, smbase, offset + 8));
+   base3 =   get_smstate(u32, smbase, offset + 12);
+
+   ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
+   return X86EMUL_CONTINUE;
+}
+
+static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+u64 cr0, u64 cr4)
+{
+   int bad;
+
+   /*
+* First enable PAE, long mode needs it before CR0.PG = 1 is set.
+* Then enable protected mode.  However, PCID cannot be enabled
+* if EFER.LMA=0, so set it separately.
+*/
+   bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+   if (bad)
+   return X86EMUL_UNHANDLEABLE;
+
+   bad = ctxt->ops->set_cr(ctxt, 0, cr0);
+   if (bad)
+   return X86EMUL_UNHANDLEABLE;
+
+   if (cr4 & X86_CR4_PCIDE) {
+   bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+   if (bad)
+   return X86EMUL_UNHANDLEABLE;
+

[PATCH v2 00/13] SMM implementation for KVM

2015-05-27 Thread Paolo Bonzini

This brings together the remaining parts of SMM.  For now I've left the
"weird" interaction between SMM and NMI blocking, and I'm using the same
format for the state save area (which is also the one used by QEMU) as
the RFC.

It builds on the previous cleanup patches, which (with the exception
of "KVM: x86: pass kvm_mmu_page to gfn_to_rmap") are now in kvm/queue.
The first six patches are more or less the same as the previous version,
while the address spaces part hopefully touches all affected functions
now.

Patches 1-6 implement the SMM API and world switch; patches 7-12
implement the multiple address spaces; patch 13 ties the loose
ends and advertises the capability.

Tested with SeaBIOS and OVMF, where SMM provides the trusted base
for secure boot.

Thanks,

Paolo

Paolo Bonzini (13):
  KVM: x86: introduce num_emulated_msrs
  KVM: x86: pass host_initiated to functions that read MSRs
  KVM: x86: pass the whole hflags field to emulator and back
  KVM: x86: API changes for SMM support
  KVM: x86: stubs for SMM support
  KVM: x86: save/load state on SMM switch
  KVM: add vcpu-specific functions to read/write/translate GFNs
  KVM: implement multiple address spaces
  KVM: x86: pass kvm_mmu_page to gfn_to_rmap
  KVM: x86: use vcpu-specific functions to read/write/translate GFNs
  KVM: x86: work on all available address spaces
  KVM: x86: add SMM to the MMU role, support SMRAM address space
  KVM: x86: advertise KVM_CAP_X86_SMM

 Documentation/virtual/kvm/api.txt|  52 ++-
 arch/powerpc/include/asm/kvm_book3s_64.h |   2 +-
 arch/x86/include/asm/kvm_emulate.h   |   9 +-
 arch/x86/include/asm/kvm_host.h  |  44 ++-
 arch/x86/include/asm/vmx.h   |   1 +
 arch/x86/include/uapi/asm/kvm.h  |  11 +-
 arch/x86/kvm/cpuid.h |   8 +
 arch/x86/kvm/emulate.c   | 262 +-
 arch/x86/kvm/kvm_cache_regs.h|   5 +
 arch/x86/kvm/lapic.c |   4 +-
 arch/x86/kvm/mmu.c   | 171 +-
 arch/x86/kvm/mmu_audit.c |  16 +-
 arch/x86/kvm/paging_tmpl.h   |  18 +-
 arch/x86/kvm/svm.c   |  73 ++--
 arch/x86/kvm/trace.h |  22 ++
 arch/x86/kvm/vmx.c   | 106 +++---
 arch/x86/kvm/x86.c   | 562 ++-
 include/linux/kvm_host.h |  49 ++-
 include/uapi/linux/kvm.h |   6 +-
 virt/kvm/kvm_main.c  | 237 ++---
 20 files changed, 1337 insertions(+), 321 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2 02/13] KVM: x86: pass host_initiated to functions that read MSRs

2015-05-27 Thread Paolo Bonzini

SMBASE is only readable from SMM for the VCPU, but it must be always
accessible if userspace is accessing it.  Thus, all functions that
read MSRs are changed to accept a struct msr_data; the host_initiated
and index fields are pre-initialized, while the data field is filled
on return.

Signed-off-by: Paolo Bonzini 
--
RFC->v1: fix pasto in do_get_msr
---
 arch/x86/include/asm/kvm_host.h |   6 +--
 arch/x86/kvm/svm.c  |  54 ++--
 arch/x86/kvm/vmx.c  |  62 ---
 arch/x86/kvm/x86.c  | 106 
 4 files changed, 127 insertions(+), 101 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7276107b35df..4e299fcd0eb6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -721,7 +721,7 @@ struct kvm_x86_ops {
void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
-   int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+   int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
void (*get_segment)(struct kvm_vcpu *vcpu,
@@ -941,7 +941,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 
 void kvm_enable_efer_bits(u64);
 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 struct x86_emulate_ctxt;
@@ -970,7 +970,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b9f9e1073e50..a08df4145173 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3069,42 +3069,42 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 
host_tsc)
svm_scale_tsc(vcpu, host_tsc);
 }
 
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
struct vcpu_svm *svm = to_svm(vcpu);
 
-   switch (ecx) {
+   switch (msr_info->index) {
case MSR_IA32_TSC: {
-   *data = svm->vmcb->control.tsc_offset +
+   msr_info->data = svm->vmcb->control.tsc_offset +
svm_scale_tsc(vcpu, native_read_tsc());
 
break;
}
case MSR_STAR:
-   *data = svm->vmcb->save.star;
+   msr_info->data = svm->vmcb->save.star;
break;
 #ifdef CONFIG_X86_64
case MSR_LSTAR:
-   *data = svm->vmcb->save.lstar;
+   msr_info->data = svm->vmcb->save.lstar;
break;
case MSR_CSTAR:
-   *data = svm->vmcb->save.cstar;
+   msr_info->data = svm->vmcb->save.cstar;
break;
case MSR_KERNEL_GS_BASE:
-   *data = svm->vmcb->save.kernel_gs_base;
+   msr_info->data = svm->vmcb->save.kernel_gs_base;
break;
case MSR_SYSCALL_MASK:
-   *data = svm->vmcb->save.sfmask;
+   msr_info->data = svm->vmcb->save.sfmask;
break;
 #endif
case MSR_IA32_SYSENTER_CS:
-   *data = svm->vmcb->save.sysenter_cs;
+   msr_info->data = svm->vmcb->save.sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
-   *data = svm->sysenter_eip;
+   msr_info->data = svm->sysenter_eip;
break;
case MSR_IA32_SYSENTER_ESP:
-   *data = svm->sysenter_esp;
+   msr_info->data = svm->sysenter_esp;
break;
/*
 * Nobody will change the following 5 values in the VMCB so we can
@@ -3112,31 +3112,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned 
ecx, u64 *data)
 * implemented.
 */
case MSR_IA32_DEBUGCTLMSR:
-   *data = svm->vmcb->save.dbgctl;
+   msr_info->data = svm->vmcb->save.dbgctl;
break;
case MSR_IA32_LASTBRANCHFROMIP:
-   *data = svm->vmcb->save.br_from;
+   msr_info->data = svm->vmcb->save.br_from;
break;
case MSR_IA32_LASTBRANCHTOIP:
-   *data = svm->vmcb->save.br_to;
+   msr_info->data = svm->vmcb->save.br_to;
b

[PATCH v2 01/13] KVM: x86: introduce num_emulated_msrs

2015-05-27 Thread Paolo Bonzini

We will want to filter away MSR_IA32_SMBASE from the emulated_msrs if
the host CPU does not support SMM virtualization.  Introduce the
logic to do that, and also move paravirt MSRs to emulated_msrs for
simplicity and to get rid of KVM_SAVE_MSRS_BEGIN.

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/x86.c | 40 +++-
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2211213a84e7..9dd41de10eb5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -925,17 +925,11 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  *
  * This list is modified at module load time to reflect the
  * capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in the beginning of the list.
+ * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * may depend on host virtualization features rather than host cpu features.
  */
 
-#define KVM_SAVE_MSRS_BEGIN12
 static u32 msrs_to_save[] = {
-   MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-   MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
-   HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-   HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
-   HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
-   MSR_KVM_PV_EOI_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -947,7 +941,14 @@ static u32 msrs_to_save[] = {
 
 static unsigned num_msrs_to_save;
 
-static const u32 emulated_msrs[] = {
+static u32 emulated_msrs[] = {
+   MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+   MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+   HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+   HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+   HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+   MSR_KVM_PV_EOI_EN,
+
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
@@ -955,6 +956,8 @@ static const u32 emulated_msrs[] = {
MSR_IA32_MCG_CTL,
 };
 
+static unsigned num_emulated_msrs;
+
 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
if (efer & efer_reserved_bits)
@@ -2924,7 +2927,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
goto out;
n = msr_list.nmsrs;
-   msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+   msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
goto out;
r = -E2BIG;
@@ -2936,7 +2939,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
goto out;
if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
 &emulated_msrs,
-ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+num_emulated_msrs * sizeof(u32)))
goto out;
r = 0;
break;
@@ -4202,8 +4205,7 @@ static void kvm_init_msr_list(void)
u32 dummy[2];
unsigned i, j;
 
-   /* skip the first msrs in the list. KVM-specific */
-   for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
+   for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
continue;
 
@@ -4228,6 +4230,18 @@ static void kvm_init_msr_list(void)
j++;
}
num_msrs_to_save = j;
+
+   for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
+   switch (emulated_msrs[i]) {
+   default:
+   break;
+   }
+
+   if (j < i)
+   emulated_msrs[j] = emulated_msrs[i];
+   j++;
+   }
+   num_emulated_msrs = j;
 }
 
 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
-- 
1.8.3.1


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2 05/13] KVM: x86: stubs for SMM support

2015-05-27 Thread Paolo Bonzini

This patch adds the interface between x86.c and the emulator: the
SMBASE register, a new emulator flag, the RSM instruction.  It also
adds a new request bit that will be used by the KVM_SMI ioctl.

Signed-off-by: Paolo Bonzini 
--
RFC->v1: make SMBASE host-readable only
 add support for latching an SMI
 do not reset SMBASE on INIT
v1->v2: do not expose SMBASE in the VMX misc enable MSR
---
 arch/x86/include/asm/kvm_emulate.h |  4 +++
 arch/x86/include/asm/kvm_host.h|  1 +
 arch/x86/kvm/emulate.c | 10 +-
 arch/x86/kvm/lapic.c   |  4 ++-
 arch/x86/kvm/svm.c |  1 +
 arch/x86/kvm/x86.c | 64 --
 include/linux/kvm_host.h   |  1 +
 9 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h 
b/arch/x86/include/asm/kvm_emulate.h
index 7410879a41f7..e16466ec473c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -193,6 +193,8 @@ struct x86_emulate_ops {
int (*cpl)(struct x86_emulate_ctxt *ctxt);
int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+   u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
+   void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 
*pdata);
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
@@ -264,6 +266,8 @@ enum x86emul_mode {
 
 /* These match some of the HF_* flags defined in kvm_host.h  */
 #define X86EMUL_GUEST_MASK   (1 << 5) /* VCPU is in guest-mode */
+#define X86EMUL_SMM_MASK (1 << 6)
+#define X86EMUL_SMM_INSIDE_NMI_MASK  (1 << 7)
 
 struct x86_emulate_ctxt {
const struct x86_emulate_ops *ops;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d52d7aea375f..12a7318887ad 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -368,6 +368,7 @@ struct kvm_vcpu_arch {
int32_t apic_arb_prio;
int mp_state;
u64 ia32_misc_enable_msr;
+   u64 smbase;
bool tpr_access_reporting;
u64 ia32_xss;
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a1c6c25552e9..e763a9b8c26b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2259,6 +2259,14 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
 }
 
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+   if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
+   return emulate_ud(ctxt);
+
+   return X86EMUL_UNHANDLEABLE;
+}
+
 static void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *cs, struct desc_struct *ss)
@@ -4197,7 +4205,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
-   DI(ImplicitOps, rsm),
+   II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c789e00dfa8b..b8e47e2b1d94 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -808,7 +808,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
break;
 
case APIC_DM_SMI:
-   apic_debug("Ignoring guest SMI\n");
+   result = 1;
+   kvm_make_request(KVM_REQ_SMI, vcpu);
+   kvm_vcpu_kick(vcpu);
break;
 
case APIC_DM_NMI:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a08df4145173..c48748cf638e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3394,6 +3394,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm 
*svm) = {
[SVM_EXIT_MWAIT]= mwait_interception,
[SVM_EXIT_XSETBV]   = xsetbv_interception,
[SVM_EXIT_NPF]  = pf_interception,
+   [SVM_EXIT_RSM]  = emulate_on_interception,
 };
 
 static void dump_vmcb(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0deb83f526c0..ee21e4eaf3a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -954,6 +954,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
+   MSR_IA32_SMBASE,
 };
 
 static unsigned num_emulated_msrs;
@@ -2280,6 +2281,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, st

[PATCH v2 08/13] KVM: implement multiple address spaces

2015-05-27 Thread Paolo Bonzini

Signed-off-by: Paolo Bonzini 
---
v1->v2: new

 Documentation/virtual/kvm/api.txt| 12 ++
 arch/powerpc/include/asm/kvm_book3s_64.h |  2 +-
 include/linux/kvm_host.h | 26 ++--
 include/uapi/linux/kvm.h |  1 +
 virt/kvm/kvm_main.c  | 70 
 5 files changed, 79 insertions(+), 32 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 51523b70b6b2..91ab5f2354aa 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -254,6 +254,11 @@ since the last call to this ioctl.  Bit 0 is the first 
page in the
 memory slot.  Ensure the entire structure is cleared to avoid padding
 issues.
 
+If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specify
+the address space for which you want to return the dirty bitmap.
+They must be less than the value that KVM_CHECK_EXTENSION returns for
+the KVM_CAP_MULTI_ADDRESS_SPACE capability.
+
 
 4.9 KVM_SET_MEMORY_ALIAS
 
@@ -924,6 +929,13 @@ slot.  When changing an existing slot, it may be moved in 
the guest
 physical memory space, or its flags may be modified.  It may not be
 resized.  Slots may not overlap in guest physical address space.
 
+If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot"
+specify the address space which is being modified.  They must be
+less than the value that KVM_CHECK_EXTENSION returns for the
+KVM_CAP_MULTI_ADDRESS_SPACE capability.  Slots in separate address spaces
+are unrelated; the restriction on overlapping slots only applies within
+each address space.
+
 Memory for the region is taken starting at the address denoted by the
 field userspace_addr, which must point at user addressable memory for
 the entire memory slot size.  Any object may back this memory, including
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 3536d12eb798..2aa79c864e91 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -430,7 +430,7 @@ static inline void note_hpte_modification(struct kvm *kvm,
  */
 static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 {
-   return rcu_dereference_raw_notrace(kvm->memslots);
+   return rcu_dereference_raw_notrace(kvm->memslots[0]);
 }
 
 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ba1ea43998e4..9564fd78c547 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -44,6 +44,10 @@
 /* Two fragments for cross MMIO pages. */
 #define KVM_MAX_MMIO_FRAGMENTS 2
 
+#ifndef KVM_ADDRESS_SPACE_NUM
+#define KVM_ADDRESS_SPACE_NUM  1
+#endif
+
 /*
  * For the normal pfn, the highest 12 bits should be zero,
  * so we can mask bit 62 ~ bit 52  to indicate the error pfn,
@@ -331,6 +335,13 @@ struct kvm_kernel_irq_routing_entry {
 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 #endif
 
+#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
+{
+   return 0;
+}
+#endif
+
 /*
  * Note:
  * memslots are not sorted by id anymore, please use id_to_memslot()
@@ -349,7 +360,7 @@ struct kvm {
spinlock_t mmu_lock;
struct mutex slots_lock;
struct mm_struct *mm; /* userspace tied to this vm */
-   struct kvm_memslots *memslots;
+   struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM];
struct srcu_struct srcu;
struct srcu_struct irq_srcu;
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
@@ -464,16 +475,23 @@ void kvm_exit(void);
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
 
-static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
+static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
-   return rcu_dereference_check(kvm->memslots,
+   return rcu_dereference_check(kvm->memslots[as_id],
srcu_read_lock_held(&kvm->srcu)
|| lockdep_is_held(&kvm->slots_lock));
 }
 
+static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
+{
+   return __kvm_memslots(kvm, 0);
+}
+
 static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
 {
-   return kvm_memslots(vcpu->kvm);
+   int as_id = kvm_arch_vcpu_memslots_id(vcpu);
+
+   return __kvm_memslots(vcpu->kvm, as_id);
 }
 
 static inline struct kvm_memory_slot *
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index eace8babd227..5ff1038437e3 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -816,6 +816,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_HWRNG 115
 #define KVM_CAP_DISABLE_QUIRKS 116
 #define KVM_CAP_X86_SMM 117
+#define KVM_CAP_MULTI_ADDRESS_SPACE 118
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9ef1a7e3c050..10ae7e348dcc 100644

[PATCH v2 04/13] KVM: x86: API changes for SMM support

2015-05-27 Thread Paolo Bonzini

This patch includes changes to the external API for SMM support.
All the changes are predicated on the availability of a new
capability, KVM_CAP_X86_SMM, which is added at the end of the
patch series.

Signed-off-by: Paolo Bonzini 
---
RFC->v1: add smi.pending and smi.rsm_unmasks_nmi fields, reduce padding
 in struct kvm_vcpu_events; remove memset of events struct,
 instead zero smi.pad.  KVM_CAP_X86_SMM frozen at 117.
---
 Documentation/virtual/kvm/api.txt | 40 +--
 arch/x86/include/asm/kvm_host.h   |  3 +++
 arch/x86/include/uapi/asm/kvm.h   | 11 ++-
 arch/x86/kvm/kvm_cache_regs.h |  5 +
 arch/x86/kvm/x86.c| 34 +++--
 include/uapi/linux/kvm.h  |  5 -
 6 files changed, 88 insertions(+), 10 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 695544420ff2..51523b70b6b2 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -820,11 +820,21 @@ struct kvm_vcpu_events {
} nmi;
__u32 sipi_vector;
__u32 flags;
+   struct {
+   __u8 smm;
+   __u8 pending;
+   __u8 smm_inside_nmi;
+   __u8 pad;
+   } smi;
 };
 
-KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
-interrupt.shadow contains a valid state. Otherwise, this field is undefined.
+Only two bits are defined in the flags field:
+
+- KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
+  interrupt.shadow contains a valid state.
 
+- KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
+  smi contains a valid state.
 
 4.32 KVM_SET_VCPU_EVENTS
 
@@ -841,17 +851,20 @@ vcpu.
 See KVM_GET_VCPU_EVENTS for the data structure.
 
 Fields that may be modified asynchronously by running VCPUs can be excluded
-from the update. These fields are nmi.pending and sipi_vector. Keep the
-corresponding bits in the flags field cleared to suppress overwriting the
-current in-kernel state. The bits are:
+from the update. These fields are nmi.pending, sipi_vector, smi.smm,
+and smi.pending. Keep the corresponding bits in the flags field cleared to
+suppress overwriting the current in-kernel state. The bits are:
 
 KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
 KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
+KVM_VCPUEVENT_VALID_SMM - transfer the smi sub-struct.
 
 If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
 the flags field to signal that interrupt.shadow contains a valid state and
 shall be written into the VCPU.
 
+KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.
+
 
 4.33 KVM_GET_DEBUGREGS
 
@@ -2979,6 +2992,16 @@ len must be a multiple of sizeof(struct kvm_s390_irq). 
It must be > 0
 and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
 which is the maximum number of possibly pending cpu-local interrupts.
 
+4.90 KVM_SMI
+
+Capability: KVM_CAP_X86_SMM
+Architectures: x86
+Type: vcpu ioctl
+Parameters: none
+Returns: 0 on success, -1 on error
+
+Queues an SMI on the thread's vcpu.
+
 5. The kvm_run structure
 
 
@@ -3014,7 +3037,12 @@ an interrupt can be injected now with KVM_INTERRUPT.
 The value of the current interrupt flag.  Only valid if in-kernel
 local APIC is not used.
 
-   __u8 padding2[2];
+   __u16 flags;
+
+More architecture-specific flags detailing state of the VCPU that may
+affect the device's behavior.  The only currently defined flag is
+KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the
+VCPU is in system management mode.
 
/* in (pre_kvm_run), out (post_kvm_run) */
__u64 cr8;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4e299fcd0eb6..d52d7aea375f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -471,6 +471,7 @@ struct kvm_vcpu_arch {
atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected;/* Trying to inject an NMI this entry */
+   bool smi_pending;/* SMI queued after currently running handler */
 
struct mtrr_state_type mtrr_state;
u64 pat;
@@ -1115,6 +1116,8 @@ enum {
 #define HF_NMI_MASK(1 << 3)
 #define HF_IRET_MASK   (1 << 4)
 #define HF_GUEST_MASK  (1 << 5) /* VCPU is in guest-mode */
+#define HF_SMM_MASK(1 << 6)
+#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
 
 /*
  * Hardware virtualization extension instructions may fault if a
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 2fec75e4b1e1..30100a3c1bed 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,8 @@ struct kvm_ioapic_state {

[PATCH v2 07/13] KVM: add vcpu-specific functions to read/write/translate GFNs

2015-05-27 Thread Paolo Bonzini

We need to hide SMRAM from guests not running in SMM.  Therefore, all uses
of kvm_read_guest* and kvm_write_guest* must be changed to use different
address spaces depending on whether the VCPU is in system management mode.
We need to introduce a new family of functions for this.

For now, the functions are the same as the existing per-VM ones, except
for the type of the first argument, but later they will be changed to
use one of many "struct kvm_memslots" stored in struct kvm.

Whenever possible, slot-based functions are introduced, with two wrappers
for generic and vcpu-based actions.  kvm_read_guest and kvm_write_guest
are copied into kvm_vcpu_read_guest and kvm_vcpu_write_guest.

Signed-off-by: Paolo Bonzini 
---
v1->v2: new

 include/linux/kvm_host.h |  24 +++
 virt/kvm/kvm_main.c  | 167 +++
 2 files changed, 178 insertions(+), 13 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b019fee6d941..ba1ea43998e4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -471,6 +471,11 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm 
*kvm)
|| lockdep_is_held(&kvm->slots_lock));
 }
 
+static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
+{
+   return kvm_memslots(vcpu->kvm);
+}
+
 static inline struct kvm_memory_slot *
 id_to_memslot(struct kvm_memslots *slots, int id)
 {
@@ -576,6 +581,25 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
+struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
+struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t 
gfn);
+pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
+pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
+unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
+unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool 
*writable);
+int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int 
offset,
+int len);
+int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
+  unsigned long len);
+int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
+   unsigned long len);
+int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void 
*data,
+ int offset, int len);
+int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
+unsigned long len);
+void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
+
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 int kvm_vcpu_yield_to(struct kvm_vcpu *target);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6c8e124006ad..9ef1a7e3c050 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1100,6 +1100,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, 
gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
+struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t 
gfn)
+{
+   return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
+}
+
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
@@ -1175,6 +1180,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
+unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+   return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
+
 /*
  * If writable is set to false, the hva returned by this function is only
  * allowed to be read.
@@ -1197,6 +1208,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t 
gfn, bool *writable)
return gfn_to_hva_memslot_prot(slot, gfn, writable);
 }
 
+unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool 
*writable)
+{
+   struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+   return gfn_to_hva_memslot_prot(slot, gfn, writable);
+}
+
 static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int write, struct page **page)
 {
@@ -1412,12 +1430,24 @@ pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
+pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+   return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), 
gfn);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
+
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
+pfn

[PATCH v2 09/13] KVM: x86: pass kvm_mmu_page to gfn_to_rmap

2015-05-27 Thread Paolo Bonzini

This is always available (with one exception in the auditing code).
Later we will also use the role to look up the right memslots array.

Signed-off-by: Paolo Bonzini 
---
v1->v2: new

 arch/x86/kvm/mmu.c   | 10 +-
 arch/x86/kvm/mmu_audit.c |  8 ++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index deb8862cfd54..6ea24812007a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1043,12 +1043,12 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int 
level,
 /*
  * Take gfn and return the reverse mapping to it.
  */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct 
kvm_mmu_page *sp)
 {
struct kvm_memory_slot *slot;
 
slot = gfn_to_memslot(kvm, gfn);
-   return __gfn_to_rmap(gfn, level, slot);
+   return __gfn_to_rmap(gfn, sp->role.level, slot);
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1066,7 +1066,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, 
gfn_t gfn)
 
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
-   rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+   rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
return pte_list_add(vcpu, spte, rmapp);
 }
 
@@ -1078,7 +1078,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 
sp = page_header(__pa(spte));
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
-   rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+   rmapp = gfn_to_rmap(kvm, gfn, sp);
pte_list_remove(spte, rmapp);
 }
 
@@ -1612,7 +1612,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 
*spte, gfn_t gfn)
 
sp = page_header(__pa(spte));
 
-   rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+   rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
 
kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 368d53497314..9d99f17aa3be 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -146,7 +146,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 
*sptep)
return;
}
 
-   rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+   rmapp = gfn_to_rmap(kvm, gfn, rev_sp);
if (!*rmapp) {
if (!__ratelimit(&ratelimit_state))
return;
@@ -191,11 +191,15 @@ static void audit_write_protection(struct kvm *kvm, 
struct kvm_mmu_page *sp)
unsigned long *rmapp;
u64 *sptep;
struct rmap_iterator iter;
+   struct kvm_memslots *slots;
+   struct kvm_memory_slot *slot;
 
if (sp->role.direct || sp->unsync || sp->role.invalid)
return;
 
-   rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
+   slots = kvm_memslots(kvm);
+   slot = __gfn_to_memslot(slots, sp->gfn);
+   rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
 
for_each_rmap_spte(rmapp, &iter, sptep)
if (is_writable_pte(*sptep))
-- 
1.8.3.1


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2 11/13] KVM: x86: work on all available address spaces

2015-05-27 Thread Paolo Bonzini

This patch has no semantic change, but it prepares for the introduction
of a second address space for system management mode.

A new function x86_set_memory_region (and the "slots_lock taken"
counterpart __x86_set_memory_region) is introduced in order to
operate on all address spaces when adding or deleting private
memory slots.

Signed-off-by: Paolo Bonzini 
---
v1->v2: new

 arch/x86/include/asm/kvm_host.h |  5 +++
 arch/x86/kvm/mmu.c  | 84 ++---
 arch/x86/kvm/vmx.c  |  6 +--
 arch/x86/kvm/x86.c  | 40 ++--
 4 files changed, 91 insertions(+), 44 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2fd420255c2f..5a5e13af6e03 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1189,4 +1189,9 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, 
u64 *data);
 void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
 void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
 
+int __x86_set_memory_region(struct kvm *kvm,
+   const struct kvm_userspace_memory_region *mem);
+int x86_set_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ad67b56c6832..a749490bc1db 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1503,30 +1503,33 @@ static int kvm_handle_hva_range(struct kvm *kvm,
struct kvm_memory_slot *memslot;
struct slot_rmap_walk_iterator iterator;
int ret = 0;
+   int i;
 
-   slots = kvm_memslots(kvm);
-
-   kvm_for_each_memslot(memslot, slots) {
-   unsigned long hva_start, hva_end;
-   gfn_t gfn_start, gfn_end;
+   for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+   slots = __kvm_memslots(kvm, i);
+   kvm_for_each_memslot(memslot, slots) {
+   unsigned long hva_start, hva_end;
+   gfn_t gfn_start, gfn_end;
 
-   hva_start = max(start, memslot->userspace_addr);
-   hva_end = min(end, memslot->userspace_addr +
-   (memslot->npages << PAGE_SHIFT));
-   if (hva_start >= hva_end)
-   continue;
-   /*
-* {gfn(page) | page intersects with [hva_start, hva_end)} =
-* {gfn_start, gfn_start+1, ..., gfn_end-1}.
-*/
-   gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-   gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-   for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
-   PT_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1,
-   &iterator)
-   ret |= handler(kvm, iterator.rmap, memslot,
-  iterator.gfn, iterator.level, data);
+   hva_start = max(start, memslot->userspace_addr);
+   hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+   if (hva_start >= hva_end)
+   continue;
+   /*
+* {gfn(page) | page intersects with [hva_start, 
hva_end)} =
+* {gfn_start, gfn_start+1, ..., gfn_end-1}.
+*/
+   gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+   gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, 
memslot);
+
+   for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
+PT_MAX_HUGEPAGE_LEVEL,
+gfn_start, gfn_end - 1,
+&iterator)
+   ret |= handler(kvm, iterator.rmap, memslot,
+  iterator.gfn, iterator.level, 
data);
+   }
}
 
return ret;
@@ -4536,21 +4539,23 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t 
gfn_start, gfn_t gfn_end)
 {
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
-
-   slots = kvm_memslots(kvm);
+   int i;
 
spin_lock(&kvm->mmu_lock);
-   kvm_for_each_memslot(memslot, slots) {
-   gfn_t start, end;
-
-   start = max(gfn_start, memslot->base_gfn);
-   end = min(gfn_end, memslot->base_gfn + memslot->npages);
-   if (start >= end)
-   continue;
+   for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+   slots = __kvm_memslots(kvm, i);
+   kvm_for_each_memslot(memslot, slots) {
+   gfn_t start, end;
+
+

[PATCH v2 12/13] KVM: x86: add SMM to the MMU role, support SMRAM address space

2015-05-27 Thread Paolo Bonzini

This is now very simple to do.  The only interesting part is a simple
trick to find the right memslot in gfn_to_rmap, retrieving the address
space from the spte role word.  The same trick is used in the auditing
code.

The comment on top of union kvm_mmu_page_role has been stale forever,
so remove it.  Speaking of stale code, remove pad_for_nice_hex_output
too: it was splitting the "access" bitfield across two bytes and thus
had effectively turned into pad_for_ugly_hex_output.

Signed-off-by: Paolo Bonzini 
---
v1->v2: new

 arch/x86/include/asm/kvm_host.h | 26 +++---
 arch/x86/kvm/mmu.c  | 15 ---
 arch/x86/kvm/mmu_audit.c| 10 +++---
 arch/x86/kvm/x86.c  |  2 ++
 4 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5a5e13af6e03..47006683f2fe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -184,23 +184,12 @@ struct kvm_mmu_memory_cache {
void *objects[KVM_NR_MEM_OBJS];
 };
 
-/*
- * kvm_mmu_page_role, below, is defined as:
- *
- *   bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- *   bits 4:7 - page table level for this shadow (1-4)
- *   bits 8:9 - page table quadrant for 2-level guests
- *   bit   16 - direct mapping of virtual to physical mapping at gfn
- *  used for real mode and two-dimensional paging
- *   bits 17:19 - common access permissions for all ptes in this shadow page
- */
 union kvm_mmu_page_role {
unsigned word;
struct {
unsigned level:4;
unsigned cr4_pae:1;
unsigned quadrant:2;
-   unsigned pad_for_nice_hex_output:6;
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
@@ -208,6 +197,15 @@ union kvm_mmu_page_role {
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
+   unsigned :8;
+
+   /*
+* This is left at the top of the word so that
+* kvm_memslots_for_spte_role can extract it with a
+* simple shift.  While there is room, give it a whole
+* byte so it is also faster to load it from memory.
+*/
+   unsigned smm:8;
};
 };
 
@@ -1120,6 +1118,12 @@ enum {
 #define HF_SMM_MASK(1 << 6)
 #define HF_SMM_INSIDE_NMI_MASK (1 << 7)
 
+#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+#define KVM_ADDRESS_SPACE_NUM 2
+
+#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 
: 0)
+#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
+
 /*
  * Hardware virtualization extension instructions may fault if a
  * reboot turns off virtualization while processes are running.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a749490bc1db..8e9b1758b7a7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -806,13 +806,15 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 
 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
+   struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
gfn_t gfn;
int i;
 
gfn = sp->gfn;
-   slot = gfn_to_memslot(kvm, gfn);
+   slots = kvm_memslots_for_spte_role(kvm, sp->role);
+   slot = __gfn_to_memslot(slots, gfn);
for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count += 1;
@@ -822,13 +824,15 @@ static void account_shadowed(struct kvm *kvm, struct 
kvm_mmu_page *sp)
 
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
+   struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
gfn_t gfn;
int i;
 
gfn = sp->gfn;
-   slot = gfn_to_memslot(kvm, gfn);
+   slots = kvm_memslots_for_spte_role(kvm, sp->role);
+   slot = __gfn_to_memslot(slots, gfn);
for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count -= 1;
@@ -1045,9 +1049,11 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
  */
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct 
kvm_mmu_page *sp)
 {
+   struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
 
-   slot = gfn_to_memslot(kvm, gfn);
+   slots = kvm_memslots_for_spte_role(kvm, sp->role);
+   slot = __gfn_to_memslot(slots, gfn);
return __gfn_to_rmap(gfn, sp->role.level, slot);
 }
 
@@ -3924,6 +3930,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
struct kvm_mmu *context = &vcpu->arch.mmu;
 
context->base_role.word = 0;
+

[PATCH v2 10/13] KVM: x86: use vcpu-specific functions to read/write/translate GFNs

2015-05-27 Thread Paolo Bonzini

We need to hide SMRAM from guests not running in SMM.  Therefore,
all uses of kvm_read_guest* and kvm_write_guest* must be changed to
check whether the VCPU is in system management mode and use a
different set of memslots.  Switch from kvm_* to the newly-introduced
kvm_vcpu_*, which call into kvm_arch_vcpu_memslots_id.

Signed-off-by: Paolo Bonzini 
---
v1->v2: new

 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu.c  | 62 -
 arch/x86/kvm/mmu_audit.c|  2 +-
 arch/x86/kvm/paging_tmpl.h  | 18 ++--
 arch/x86/kvm/svm.c  | 12 
 arch/x86/kvm/vmx.c  | 32 ++---
 arch/x86/kvm/x86.c  | 32 ++---
 7 files changed, 80 insertions(+), 80 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 12a7318887ad..2fd420255c2f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -887,7 +887,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
   struct kvm_memory_slot *slot,
   gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots 
*slots);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6ea24812007a..ad67b56c6832 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte)
return gen;
 }
 
-static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
 {
-   return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
+   return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
 }
 
-static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
   unsigned access)
 {
-   unsigned int gen = kvm_current_mmio_generation(kvm);
+   unsigned int gen = kvm_current_mmio_generation(vcpu);
u64 mask = generation_mmio_spte_mask(gen);
 
access &= ACC_WRITE_MASK | ACC_USER_MASK;
@@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte)
return (spte & ~mask) & ~PAGE_MASK;
 }
 
-static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
  pfn_t pfn, unsigned access)
 {
if (unlikely(is_noslot_pfn(pfn))) {
-   mark_mmio_spte(kvm, sptep, gfn, access);
+   mark_mmio_spte(vcpu, sptep, gfn, access);
return true;
}
 
return false;
 }
 
-static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 {
unsigned int kvm_gen, spte_gen;
 
-   kvm_gen = kvm_current_mmio_generation(kvm);
+   kvm_gen = kvm_current_mmio_generation(vcpu);
spte_gen = get_mmio_spte_generation(spte);
 
trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -837,14 +837,14 @@ static void unaccount_shadowed(struct kvm *kvm, struct 
kvm_mmu_page *sp)
kvm->arch.indirect_shadow_pages--;
 }
 
-static int has_wrprotected_page(struct kvm *kvm,
+static int has_wrprotected_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
int level)
 {
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
 
-   slot = gfn_to_memslot(kvm, gfn);
+   slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
return linfo->write_count;
@@ -876,7 +876,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t 
gfn,
 {
struct kvm_memory_slot *slot;
 
-   slot = gfn_to_memslot(vcpu->kvm, gfn);
+   slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
  (no_dirty_log && slot->dirty_bitmap))
slot = NULL;
@@ -901,7 +901,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t 
large_gfn)
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
 
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-   if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
+   if (has_wrprotected_page(vcpu, large_gfn, level))
break;
 
return level - 1;
@@ -1336,18 +1336,18 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm 
*kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static bool rmap_write_pr

[RFC 5/6] VSOCK: Introduce vhost-vsock.ko

2015-05-27 Thread Stefan Hajnoczi

From: Asias He 

VM sockets vhost transport implementation. This module runs in the host
kernel.

Signed-off-by: Asias He 
Signed-off-by: Stefan Hajnoczi 
---
 drivers/vhost/vsock.c | 596 ++
 drivers/vhost/vsock.h |   4 +
 2 files changed, 600 insertions(+)
 create mode 100644 drivers/vhost/vsock.c
 create mode 100644 drivers/vhost/vsock.h

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
new file mode 100644
index 000..a9514aa
--- /dev/null
+++ b/drivers/vhost/vsock.c
@@ -0,0 +1,596 @@
+/*
+ * vhost transport for vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He 
+ * Stefan Hajnoczi 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include "vhost.h"
+#include "vsock.h"
+
+#define VHOST_VSOCK_DEFAULT_HOST_CID   2
+
+static int vhost_transport_socket_init(struct vsock_sock *vsk,
+  struct vsock_sock *psk);
+
+enum {
+   VHOST_VSOCK_FEATURES = VHOST_FEATURES,
+};
+
+/* Used to track all the vhost_vsock instances on the system. */
+static LIST_HEAD(vhost_vsock_list);
+static DEFINE_MUTEX(vhost_vsock_mutex);
+
+struct vhost_vsock_virtqueue {
+   struct vhost_virtqueue vq;
+};
+
+struct vhost_vsock {
+   /* Vhost device */
+   struct vhost_dev dev;
+   /* Vhost vsock virtqueue*/
+   struct vhost_vsock_virtqueue vqs[VSOCK_VQ_MAX];
+   /* Link to global vhost_vsock_list*/
+   struct list_head list;
+   /* Head for pkt from host to guest */
+   struct list_head send_pkt_list;
+   /* Work item to send pkt */
+   struct vhost_work send_pkt_work;
+   /* Wait queue for send pkt */
+   wait_queue_head_t queue_wait;
+   /* Used for global tx buf limitation */
+   u32 total_tx_buf;
+   /* Guest context id this vhost_vsock instance handles */
+   u32 guest_cid;
+};
+
+static u32 vhost_transport_get_local_cid(void)
+{
+   u32 cid = VHOST_VSOCK_DEFAULT_HOST_CID;
+   return cid;
+}
+
+static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
+{
+   struct vhost_vsock *vsock;
+
+   mutex_lock(&vhost_vsock_mutex);
+   list_for_each_entry(vsock, &vhost_vsock_list, list) {
+   if (vsock->guest_cid == guest_cid) {
+   mutex_unlock(&vhost_vsock_mutex);
+   return vsock;
+   }
+   }
+   mutex_unlock(&vhost_vsock_mutex);
+
+   return NULL;
+}
+
+static void
+vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
+   struct vhost_virtqueue *vq)
+{
+   struct virtio_vsock_pkt *pkt;
+   bool added = false;
+   unsigned out, in;
+   struct sock *sk;
+   int head, ret;
+
+   mutex_lock(&vq->mutex);
+   vhost_disable_notify(&vsock->dev, vq);
+   for (;;) {
+   if (list_empty(&vsock->send_pkt_list)) {
+   vhost_enable_notify(&vsock->dev, vq);
+   break;
+   }
+
+   head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+&out, &in, NULL, NULL);
+   pr_debug("%s: head = %d\n", __func__, head);
+   if (head < 0)
+   break;
+
+   if (head == vq->num) {
+   if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
+   vhost_disable_notify(&vsock->dev, vq);
+   continue;
+   }
+   break;
+   }
+
+   /* TODO check out == 0 and in >= 1 */
+
+   pkt = list_first_entry(&vsock->send_pkt_list,
+  struct virtio_vsock_pkt, list);
+   list_del_init(&pkt->list);
+
+   /* FIXME: no assumption of frame layout */
+   ret = __copy_to_user(vq->iov[0].iov_base, &pkt->hdr,
+sizeof(pkt->hdr));
+   if (ret) {
+   virtio_transport_free_pkt(pkt);
+   vq_err(vq, "Faulted on copying pkt hdr\n");
+   break;
+   }
+   if (pkt->buf && pkt->len > 0) {
+   /* TODO avoid iov[1].iov_base buffer overflow, check 
pkt->len! */
+   ret = __copy_to_user(vq->iov[1].iov_base, pkt->buf,
+   pkt->len);
+   if (ret) {
+   virtio_transport_free_pkt(pkt);
+   vq_err(vq, "Faulted on copying pkt buf\n");
+   break;
+   }
+   }
+
+   vhost_add_used(vq, head, pkt->len);
+   added = true;
+
+   virtio_transport_dec_tx_pkt(pkt);
+   vsock->total_tx_buf -= pkt->len;
+

[PATCH v2 13/13] KVM: x86: advertise KVM_CAP_X86_SMM

2015-05-27 Thread Paolo Bonzini

Signed-off-by: Paolo Bonzini 
--
RFC->v1: depend on support for real mode CS base above 1M
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm.c  |  6 ++
 arch/x86/kvm/vmx.c  |  6 ++
 arch/x86/kvm/x86.c  | 15 +++
 4 files changed, 28 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 47006683f2fe..8ca32cfbcbd8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -709,6 +709,7 @@ struct kvm_x86_ops {
int (*hardware_setup)(void);   /* __init */
void (*hardware_unsetup)(void);/* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+   bool (*cpu_has_high_real_mode_segbase)(void);
void (*cpuid_update)(struct kvm_vcpu *vcpu);
 
/* Create, but do not attach this VCPU */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6ff1faf4a2e8..680753186489 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4080,6 +4080,11 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
 }
 
+static bool svm_has_high_real_mode_segbase(void)
+{
+   return true;
+}
+
 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
return 0;
@@ -4353,6 +4358,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+   .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
 
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b1f13ff0cad3..e53800e2692e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8139,6 +8139,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu 
*vcpu)
local_irq_enable();
 }
 
+static bool vmx_has_high_real_mode_segbase(void)
+{
+   return enable_unrestricted_guest || emulate_invalid_guest_state;
+}
+
 static bool vmx_mpx_supported(void)
 {
return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
@@ -10296,6 +10301,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
+   .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
 
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2fa345d9bd6c..948bd38ec1cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2896,6 +2896,17 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
 #endif
r = 1;
break;
+   case KVM_CAP_X86_SMM:
+   /* SMBASE is usually relocated above 1M on modern chipsets,
+* and SMM handlers might indeed rely on 4G segment limits,
+* so do not report SMM to be available if real mode is
+* emulated via vm86 mode.  Still, do not go to great lengths
+* to avoid userspace's usage of the feature, because it is a
+* fringe case that is not enabled except via specific settings
+* of the module parameters.
+*/
+   r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
+   break;
case KVM_CAP_COALESCED_MMIO:
r = KVM_COALESCED_MMIO_PAGE_OFFSET;
break;
@@ -4289,6 +4300,10 @@ static void kvm_init_msr_list(void)
 
for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
switch (emulated_msrs[i]) {
+   case MSR_IA32_SMBASE:
+   if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
+   continue;
+   break;
default:
break;
}
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC 6/6] VSOCK: Add Makefile and Kconfig

2015-05-27 Thread Stefan Hajnoczi

From: Asias He 

Enable virtio-vsock and vhost-vsock.

Signed-off-by: Asias He 
Signed-off-by: Stefan Hajnoczi 
---
 drivers/vhost/Kconfig   |  4 
 drivers/vhost/Kconfig.vsock |  7 +++
 drivers/vhost/Makefile  |  4 
 net/vmw_vsock/Kconfig   | 18 ++
 net/vmw_vsock/Makefile  |  2 ++
 5 files changed, 35 insertions(+)
 create mode 100644 drivers/vhost/Kconfig.vsock

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 017a1e8..169fb19 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -32,3 +32,7 @@ config VHOST
---help---
  This option is selected by any driver which needs to access
  the core of vhost.
+
+if STAGING
+source "drivers/vhost/Kconfig.vsock"
+endif
diff --git a/drivers/vhost/Kconfig.vsock b/drivers/vhost/Kconfig.vsock
new file mode 100644
index 000..3491865
--- /dev/null
+++ b/drivers/vhost/Kconfig.vsock
@@ -0,0 +1,7 @@
+config VHOST_VSOCK
+   tristate "vhost virtio-vsock driver"
+   depends on VSOCKETS && EVENTFD
+   select VIRTIO_VSOCKETS_COMMON
+   default n
+   ---help---
+   Say M here to enable the vhost-vsock for virtio-vsock guests
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index e0441c3..6b012b9 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -4,5 +4,9 @@ vhost_net-y := net.o
 obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
 vhost_scsi-y := scsi.o
 
+obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
+vhost_vsock-y := vsock.o
+
 obj-$(CONFIG_VHOST_RING) += vringh.o
+
 obj-$(CONFIG_VHOST)+= vhost.o
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
index 14810ab..74e0bc8 100644
--- a/net/vmw_vsock/Kconfig
+++ b/net/vmw_vsock/Kconfig
@@ -26,3 +26,21 @@ config VMWARE_VMCI_VSOCKETS
 
  To compile this driver as a module, choose M here: the module
  will be called vmw_vsock_vmci_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS
+   tristate "virtio transport for Virtual Sockets"
+   depends on VSOCKETS && VIRTIO
+   select VIRTIO_VSOCKETS_COMMON
+   help
+ This module implements a virtio transport for Virtual Sockets.
+
+ Enable this transport if your Virtual Machine runs on Qemu/KVM.
+
+ To compile this driver as a module, choose M here: the module
+ will be called virtio_vsock_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS_COMMON
+   tristate
+   ---help---
+ This option is selected by any driver which needs to access
+ the virtio_vsock.
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index 2ce52d7..cf4c294 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -1,5 +1,7 @@
 obj-$(CONFIG_VSOCKETS) += vsock.o
 obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS) += virtio_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += virtio_transport_common.o
 
 vsock-y += af_vsock.o vsock_addr.o
 
-- 
2.4.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC 4/6] VSOCK: Introduce virtio-vsock.ko

2015-05-27 Thread Stefan Hajnoczi

From: Asias He 

VM sockets virtio transport implementation. This module runs in the guest
kernel.

Signed-off-by: Asias He 
Signed-off-by: Stefan Hajnoczi 
---
 net/vmw_vsock/virtio_transport.c | 450 +++
 1 file changed, 450 insertions(+)
 create mode 100644 net/vmw_vsock/virtio_transport.c

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
new file mode 100644
index 000..ebe1eef
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport.c
@@ -0,0 +1,450 @@
+/*
+ * virtio transport for vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He 
+ * Stefan Hajnoczi 
+ *
+ * Some of the code is taken from Gerd Hoffmann's
+ * early virtio-vsock proof-of-concept bits.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static struct workqueue_struct *virtio_vsock_workqueue;
+static struct virtio_vsock *the_virtio_vsock;
+static void virtio_vsock_rx_fill(struct virtio_vsock *vsock);
+
+struct virtio_vsock {
+   /* Virtio device */
+   struct virtio_device *vdev;
+   /* Virtio virtqueue */
+   struct virtqueue *vqs[VSOCK_VQ_MAX];
+   /* Wait queue for send pkt */
+   wait_queue_head_t queue_wait;
+   /* Work item to send pkt */
+   struct work_struct tx_work;
+   /* Work item to recv pkt */
+   struct work_struct rx_work;
+   /* Mutex to protect send pkt*/
+   struct mutex tx_lock;
+   /* Mutex to protect recv pkt*/
+   struct mutex rx_lock;
+   /* Number of recv buffers */
+   int rx_buf_nr;
+   /* Number of max recv buffers */
+   int rx_buf_max_nr;
+   /* Used for global tx buf limitation */
+   u32 total_tx_buf;
+   /* Guest context id, just like guest ip address */
+   u32 guest_cid;
+};
+
+static struct virtio_vsock *virtio_vsock_get(void)
+{
+   return the_virtio_vsock;
+}
+
+static u32 virtio_transport_get_local_cid(void)
+{
+   struct virtio_vsock *vsock = virtio_vsock_get();
+
+   return vsock->guest_cid;
+}
+
+static int
+virtio_transport_send_pkt(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt_info *info)
+{
+   u32 src_cid, src_port, dst_cid, dst_port;
+   int ret, in_sg = 0, out_sg = 0;
+   struct virtio_transport *trans;
+   struct virtio_vsock_pkt *pkt;
+   struct virtio_vsock *vsock;
+   struct scatterlist hdr, buf, *sgs[2];
+   struct virtqueue *vq;
+   u32 pkt_len = info->pkt_len;
+   DEFINE_WAIT(wait);
+
+   vsock = virtio_vsock_get();
+   if (!vsock)
+   return -ENODEV;
+
+   src_cid = virtio_transport_get_local_cid();
+   src_port = vsk->local_addr.svm_port;
+   if (!info->remote_cid) {
+   dst_cid = vsk->remote_addr.svm_cid;
+   dst_port = vsk->remote_addr.svm_port;
+   } else {
+   dst_cid = info->remote_cid;
+   dst_port = info->remote_port;
+   }
+
+   trans = vsk->trans;
+   vq = vsock->vqs[VSOCK_VQ_TX];
+
+   if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
+   pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+   pkt_len = virtio_transport_get_credit(trans, pkt_len);
+   /* Do not send zero length OP_RW pkt*/
+   if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
+   return pkt_len;
+
+   /* Respect global tx buf limitation */
+   mutex_lock(&vsock->tx_lock);
+   while (pkt_len + vsock->total_tx_buf > VIRTIO_VSOCK_MAX_TX_BUF_SIZE) {
+   prepare_to_wait_exclusive(&vsock->queue_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+   mutex_unlock(&vsock->tx_lock);
+   schedule();
+   mutex_lock(&vsock->tx_lock);
+   finish_wait(&vsock->queue_wait, &wait);
+   }
+   vsock->total_tx_buf += pkt_len;
+   mutex_unlock(&vsock->tx_lock);
+
+   pkt = virtio_transport_alloc_pkt(vsk, info, pkt_len,
+src_cid, src_port,
+dst_cid, dst_port);
+   if (!pkt) {
+   /* TODO what about decrementing total_tx_buf */
+   virtio_transport_put_credit(trans, pkt_len);
+   return -ENOMEM;
+   }
+
+   pr_debug("%s:info->pkt_len= %d\n", __func__, info->pkt_len);
+
+   /* Will be released in virtio_transport_send_pkt_work */
+   sock_hold(&trans->vsk->sk);
+   virtio_transport_inc_tx_pkt(pkt);
+
+   /* Put pkt in the virtqueue */
+   sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+   sgs[out_sg++] = &hdr;
+   if (info->msg && info->pkt_len > 0) {
+   sg_init_one(&buf, pkt->buf, pkt->len);
+   sgs[out_sg++] = &buf;
+   }
+
+   mutex_lock(&vsock->tx_lock);
+   while ((ret = virtqueue_add_sgs(vq, sgs, out_sg, in_s

[RFC 1/6] VSOCK: Introduce vsock_find_unbound_socket and vsock_bind_dgram_generic

2015-05-27 Thread Stefan Hajnoczi

From: Asias He 

Signed-off-by: Asias He 
Signed-off-by: Stefan Hajnoczi 
---
 include/net/af_vsock.h   |  2 ++
 net/vmw_vsock/af_vsock.c | 70 
 2 files changed, 72 insertions(+)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 172632d..d52b984 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -172,8 +172,10 @@ void vsock_insert_connected(struct vsock_sock *vsk);
 void vsock_remove_bound(struct vsock_sock *vsk);
 void vsock_remove_connected(struct vsock_sock *vsk);
 struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
+struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr);
 struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
 struct sockaddr_vm *dst);
 void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
+int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr);
 
 #endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 2ec86e6..ae3ce3d 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -224,6 +224,17 @@ static struct sock *__vsock_find_bound_socket(struct 
sockaddr_vm *addr)
return NULL;
 }
 
+static struct sock *__vsock_find_unbound_socket(struct sockaddr_vm *addr)
+{
+   struct vsock_sock *vsk;
+
+   list_for_each_entry(vsk, vsock_unbound_sockets, bound_table)
+   if (addr->svm_port == vsk->local_addr.svm_port)
+   return sk_vsock(vsk);
+
+   return NULL;
+}
+
 static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
  struct sockaddr_vm *dst)
 {
@@ -299,6 +310,21 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm 
*addr)
 }
 EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
 
+struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr)
+{
+   struct sock *sk;
+
+   spin_lock_bh(&vsock_table_lock);
+   sk = __vsock_find_unbound_socket(addr);
+   if (sk)
+   sock_hold(sk);
+
+   spin_unlock_bh(&vsock_table_lock);
+
+   return sk;
+}
+EXPORT_SYMBOL_GPL(vsock_find_unbound_socket);
+
 struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
 struct sockaddr_vm *dst)
 {
@@ -533,6 +559,50 @@ static int __vsock_bind_stream(struct vsock_sock *vsk,
return 0;
 }
 
+int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr)
+{
+   static u32 port = LAST_RESERVED_PORT + 1;
+   struct sockaddr_vm new_addr;
+
+   vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);
+
+   if (addr->svm_port == VMADDR_PORT_ANY) {
+   bool found = false;
+   unsigned int i;
+
+   for (i = 0; i < MAX_PORT_RETRIES; i++) {
+   if (port <= LAST_RESERVED_PORT)
+   port = LAST_RESERVED_PORT + 1;
+
+   new_addr.svm_port = port++;
+
+   if (!__vsock_find_unbound_socket(&new_addr)) {
+   found = true;
+   break;
+   }
+   }
+
+   if (!found)
+   return -EADDRNOTAVAIL;
+   } else {
+   /* If port is in reserved range, ensure caller
+* has necessary privileges.
+*/
+   if (addr->svm_port <= LAST_RESERVED_PORT &&
+   !capable(CAP_NET_BIND_SERVICE)) {
+   return -EACCES;
+   }
+
+   if (__vsock_find_unbound_socket(&new_addr))
+   return -EADDRINUSE;
+   }
+
+   vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(vsock_bind_dgram_generic);
+
 static int __vsock_bind_dgram(struct vsock_sock *vsk,
  struct sockaddr_vm *addr)
 {
-- 
2.4.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC 2/6] Add dgram_skb to vsock_sock

2015-05-27 Thread Stefan Hajnoczi

From: Asias He 

This list will be used to match received packets when multiple packets
are used because datagram size is larger than the receive buffer size.

Signed-off-by: Stefan Hajnoczi 
---
 include/net/af_vsock.h   | 1 +
 net/vmw_vsock/af_vsock.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index d52b984..bc9055c 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -58,6 +58,7 @@ struct vsock_sock {
 */
struct list_head pending_links;
struct list_head accept_queue;
+   struct list_head dgram_skb;
bool rejected;
struct delayed_work dwork;
u32 peer_shutdown;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index ae3ce3d..0b3c498 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -684,6 +684,7 @@ struct sock *__vsock_create(struct net *net,
vsk->listener = NULL;
INIT_LIST_HEAD(&vsk->pending_links);
INIT_LIST_HEAD(&vsk->accept_queue);
+   INIT_LIST_HEAD(&vsk->dgram_skb); /* TODO free list entries on shutdown 
and limit list size or timeout somehow? */
vsk->rejected = false;
vsk->sent_request = false;
vsk->ignore_connecting_rst = false;
-- 
2.4.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC 0/6] Add virtio transport for AF_VSOCK

2015-05-27 Thread Stefan Hajnoczi

This patch series adds a virtio transport for AF_VSOCK (net/vmw_vsock/).
AF_VSOCK is designed for communication between virtual machines and
hypervisors.  It is currently only implemented for VMware's VMCI transport.

This series implements the proposed virtio-vsock device specification from
here:
http://comments.gmane.org/gmane.comp.emulators.virtio.devel/855

Most of the work was done by Asias He and Gerd Hoffmann a while back.  I have
picked up the series again.

The QEMU userspace changes are here:
https://github.com/stefanha/qemu/commits/vsock

Why virtio-vsock?
-----------------
Guest<->host communication is currently done over the virtio-serial device.
This makes it hard to port sockets API-based applications and is limited to
static ports.

virtio-vsock uses the sockets API so that applications can rely on familiar
SOCK_STREAM and SOCK_DGRAM semantics.  Applications on the host can easily
connect to guest agents because the sockets API allows multiple connections to
a listen socket (unlike virtio-serial).  This simplifies the guest<->host
communication and eliminates the need for extra processes on the host to
arbitrate virtio-serial ports.

Overview
--------
This series adds 3 pieces:

1. virtio_transport_common.ko - core virtio vsock code that uses vsock.ko

2. virtio_transport.ko - guest driver

3. drivers/vhost/vsock.ko - host driver

Howto
-----
The following kernel options are needed:
  CONFIG_VSOCKETS=y
  CONFIG_VIRTIO_VSOCKETS=y
  CONFIG_VIRTIO_VSOCKETS_COMMON=y
  CONFIG_VHOST_VSOCK=m

Launch QEMU as follows:
  # qemu ... -device vhost-vsock-pci,id=vhost-vsock-pci0

Guest and host can communicate via AF_VSOCK sockets.  The host's CID (address)
is 2 and the guest is automatically assigned a CID (use VMADDR_CID_ANY (-1) to
bind to it).

Status
------
I am auditing and testing the code, while iterating the virtio device
specification.  There is scope to change both the implementation (these
patches) and the virtio device specification.

TODO:
 * Flexible virtqueue descriptor layout
 * Avoid Linux-specific constants in packet headers (SOCK_STREAM/SOCK_DGRAM)
 * Send RST if there is no listening SOCK_STREAM socket
 * Add missing input validation for packet headers and vhost ioctls

Asias He (6):
  VSOCK: Introduce vsock_find_unbound_socket and
vsock_bind_dgram_generic
  Add dgram_skb to vsock_sock
  VSOCK: Introduce virtio-vsock-common.ko
  VSOCK: Introduce virtio-vsock.ko
  VSOCK: Introduce vhost-vsock.ko
  VSOCK: Add Makefile and Kconfig

 drivers/vhost/Kconfig   |4 +
 drivers/vhost/Kconfig.vsock |7 +
 drivers/vhost/Makefile  |4 +
 drivers/vhost/vsock.c   |  596 +++
 drivers/vhost/vsock.h   |4 +
 include/linux/virtio_vsock.h|  207 +
 include/net/af_vsock.h  |3 +
 include/uapi/linux/virtio_ids.h |1 +
 include/uapi/linux/virtio_vsock.h   |   80 ++
 net/vmw_vsock/Kconfig   |   18 +
 net/vmw_vsock/Makefile  |2 +
 net/vmw_vsock/af_vsock.c|   71 ++
 net/vmw_vsock/virtio_transport.c|  450 +++
 net/vmw_vsock/virtio_transport_common.c | 1248 +++
 14 files changed, 2695 insertions(+)
 create mode 100644 drivers/vhost/Kconfig.vsock
 create mode 100644 drivers/vhost/vsock.c
 create mode 100644 drivers/vhost/vsock.h
 create mode 100644 include/linux/virtio_vsock.h
 create mode 100644 include/uapi/linux/virtio_vsock.h
 create mode 100644 net/vmw_vsock/virtio_transport.c
 create mode 100644 net/vmw_vsock/virtio_transport_common.c

-- 
2.4.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] vfio/pci: Fix racy vfio_device_get_from_dev() call

2015-05-27 Thread Alex Williamson

Testing the driver for a PCI device is racy; it can be all but
complete in the release path and still report the driver as ours.
Therefore we can't trust drvdata to be valid.  This race can sometimes
be seen when one port of a multifunction device is being unbound from
the vfio-pci driver while another function is being released by the
user and attempting a bus reset.  The device in the remove path is
found as a dependent device for the bus reset of the release path
device, the driver is still set to vfio-pci, but the drvdata has
already been cleared, resulting in a null pointer dereference.

To resolve this, fix vfio_device_get_from_dev() to not take the
dev_get_drvdata() shortcut and instead traverse through the
iommu_group, vfio_group, vfio_device path to get a reference we
can trust.  Once we have that reference, we know the device isn't
in transition and we can test to make sure the driver is still what
we expect, so that we don't interfere with devices we don't own.

Signed-off-by: Alex Williamson 
---
 drivers/vfio/pci/vfio_pci.c |   16 +---
 drivers/vfio/vfio.c |   27 +++
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index e9851ad..964ad57 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1056,19 +1056,21 @@ struct vfio_devices {
 static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
 {
struct vfio_devices *devs = data;
-   struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);
-
-   if (pci_drv != &vfio_pci_driver)
-   return -EBUSY;
+   struct vfio_device *device;
 
if (devs->cur_index == devs->max_index)
return -ENOSPC;
 
-   devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev);
-   if (!devs->devices[devs->cur_index])
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (!device)
return -EINVAL;
 
-   devs->cur_index++;
+   if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+   vfio_device_put(device);
+   return -EBUSY;
+   }
+
+   devs->devices[devs->cur_index++] = device;
return 0;
 }
 
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index e1278fe..2fb29df 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -661,18 +661,29 @@ int vfio_add_group_dev(struct device *dev,
 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 
 /**
- * Get a reference to the vfio_device for a device that is known to
- * be bound to a vfio driver.  The driver implicitly holds a
- * vfio_device reference between vfio_add_group_dev and
- * vfio_del_group_dev.  We can therefore use drvdata to increment
- * that reference from the struct device.  This additional
- * reference must be released by calling vfio_device_put.
+ * Get a reference to the vfio_device for a device.  Even if the
+ * caller thinks they own the device, they could be racing with a
+ * release call path, so we can't trust drvdata for the shortcut.
+ * Go the long way around, from the iommu_group to the vfio_group
+ * to the vfio_device.
  */
 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 {
-   struct vfio_device *device = dev_get_drvdata(dev);
+   struct iommu_group *iommu_group;
+   struct vfio_group *group;
+   struct vfio_device *device;
+
+   iommu_group = iommu_group_get(dev);
+   if (!iommu_group)
+   return NULL;
 
-   vfio_device_get(device);
+   group = vfio_group_get_from_iommu(iommu_group);
+   iommu_group_put(iommu_group);
+   if (!group)
+   return NULL;
+
+   device = vfio_group_get_device(group, dev);
+   vfio_group_put(group);
 
return device;
 }

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] Announcing qboot, a minimal x86 firmware for QEMU

2015-05-27 Thread Dr. David Alan Gilbert

* Paolo Bonzini (pbonz...@redhat.com) wrote:
> 
> 
> On 26/05/2015 23:25, Christopher Covington wrote:
> > On 05/25/2015 08:53 AM, Paolo Bonzini wrote:
> >>
> >> On 22/05/2015 13:12, Daniel P. Berrange wrote:
> >>> In
> >>> particular I don't see why we need to have a SATA controller and ISA/LPC
> >>> bridge in every virt machine - root PCI bus only should be possible, as you
> >>> can provide disks via virtio-blk or virtio-scsi and serial, parallel, mouse,
> >>> floppy via PCI devices and/or by adding a USB bus in the cases where you
> >>> really need one.
> >>
> >> I think removing the ISA/LPC bridge is hard.  It includes the real-time
> >> clock and fw_cfg, for example.
> > 
> > Could VirtIO specified replacements make sense for these peripherals?
> 
> Not really.  virtio is too heavyweight and you'd be reinventing the
> wheel unnecessarily.

I see reasons to replace some but not all of these components; and there's no
point in replacing the ISA/LPC bridge since it's got nothing at all in it.

> For example, ARM's "-M virt" uses a pl011 block for the RTC, and also
> uses fw_cfg.  Another commonly used ISA device is the UART, for which
> again -M virt uses a pl031.

I don't see much point in replacing the simple PC uart with
anything virtio; I can imagine that you might want to go down
to something really trivial with none of the bells and whistles;
but a UART is pretty simple.

The PC RTC though, it's a bit of a disaster that's had 30 years
of random cruft added into it to hold random things that should never
have been there.

Dave

> 
> Paolo
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] Announcing qboot, a minimal x86 firmware for QEMU

2015-05-27 Thread Paolo Bonzini

On 27/05/2015 14:50, Christopher Covington wrote:
>> Not really.  virtio is too heavyweight
> 
> I'd be curious to read where in your estimation this weight lies. Is it
> one-time initialization or recurring? Is it specific to the PCI transport or
> does MMIO suffer from it as well?

It's heavyweight in the sense that virtio requires you to design a
communication mechanism based on ring buffers.  It's much harder than a
few ad-hoc registers.

And I know everyone is upset about the attack surface of QEMU these
days, but effort would be much better spent adding QEMU-specific
customizations to a static analysis tool (that e.g. would derive bound
checks for the MemoryRegion read/write ops and be able to prove that
said ops cannot access arrays out of their bounds).

>> and you'd be reinventing the wheel unnecessarily.
> 
> In my mind the utility of peripherals that are instruction set architecture
> agnostic and can work over several transports is in reducing the amount of
> (emulator/hypervisor, firmware, and OS) code used, and therefore in need of
> maintenance, for common system emulation and virtualization use cases.

A fully processor-agnostic hardware architecture is a non-goal.  You'll
always have stuff like interrupt controllers that is extremely tied to
the processor.

If you want to abstract hardware, use the firmware, Luke!  Things such
as UEFI and ACPI are there for exactly this reason.  We will be able to
reuse a lot of x86 hotplug code on ARM using ACPI.  And if you don't
want to use ACPI, you can always write native OS drivers for the same
hotplug hardware.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] Announcing qboot, a minimal x86 firmware for QEMU

2015-05-27 Thread Christopher Covington

On 05/27/2015 05:30 AM, Paolo Bonzini wrote:
> 
> 
> On 26/05/2015 23:25, Christopher Covington wrote:
>> On 05/25/2015 08:53 AM, Paolo Bonzini wrote:
>>>
>>> On 22/05/2015 13:12, Daniel P. Berrange wrote:
>>>> In
>>>> particular I don't see why we need to have a SATA controller and ISA/LPC
>>>> bridge in every virt machine - root PCI bus only should be possible, as you
>>>> can provide disks via virtio-blk or virtio-scsi and serial, parallel, mouse,
>>>> floppy via PCI devices and/or by adding a USB bus in the cases where you
>>>> really need one.
>>>
>>> I think removing the ISA/LPC bridge is hard.  It includes the real-time
>>> clock and fw_cfg, for example.
>>
>> Could VirtIO specified replacements make sense for these peripherals?
> 
> Not really.  virtio is too heavyweight

I'd be curious to read where in your estimation this weight lies. Is it
one-time initialization or recurring? Is it specific to the PCI transport or
does MMIO suffer from it as well?

> and you'd be reinventing the wheel unnecessarily.

In my mind the utility of peripherals that are instruction set architecture
agnostic and can work over several transports is in reducing the amount of
(emulator/hypervisor, firmware, and OS) code used, and therefore in need of
maintenance, for common system emulation and virtualization use cases.

> For example, ARM's "-M virt" uses a pl011 block for the RTC, and also
> uses fw_cfg.  Another commonly used ISA device is the UART, for which
> again -M virt uses a pl031.

(UART is PL011; RTC is PL031)

Thanks,
Chris

-- 
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] Announcing qboot, a minimal x86 firmware for QEMU

2015-05-27 Thread Paolo Bonzini



On 27/05/2015 13:54, Peter Maydell wrote:
> On 27 May 2015 at 10:30, Paolo Bonzini  wrote:
>> > For example, ARM's "-M virt" uses a pl011 block for the RTC, and also
>> > uses fw_cfg.  Another commonly used ISA device is the UART, for which
>> > again -M virt uses a pl031.
> Partly we do that because there were a number of reports that trying
> to use virtio for the console didn't work reliably... Using the
> stock UART that is widely supported in UEFI/uboot/kernel was a
> conservative design choice.
> 
> The next thing that's likely to appear in "virt" is a PL061
> GPIO device, which you need for CPU hotplug and external-shutdown-request
> notifications.

Indeed, and the x86 Q35 chipset puts the ACPI registers... in the
ISA/LPC bridge. :)

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] Announcing qboot, a minimal x86 firmware for QEMU

2015-05-27 Thread Peter Maydell

On 27 May 2015 at 10:30, Paolo Bonzini  wrote:
> For example, ARM's "-M virt" uses a pl011 block for the RTC, and also
> uses fw_cfg.  Another commonly used ISA device is the UART, for which
> again -M virt uses a pl031.

Partly we do that because there were a number of reports that trying
to use virtio for the console didn't work reliably... Using the
stock UART that is widely supported in UEFI/uboot/kernel was a
conservative design choice.

The next thing that's likely to appear in "virt" is a PL061
GPIO device, which you need for CPU hotplug and external-shutdown-request
notifications.

-- PMM
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 10/23] userfaultfd: add new syscall to provide memory externalization

2015-05-27 Thread Thomas Martitz

Andrea Arcangeli  redhat.com> writes:

> 
> Once an userfaultfd has been created and certain region of the process
> virtual address space have been registered into it, the thread
> responsible for doing the memory externalization can manage the page
> faults in userland by talking to the kernel using the userfaultfd
> protocol.
> 
> poll() can be used to know when there are new pending userfaults to be
> read (POLLIN).
> 

Hello,

I already asked this for v3 but got no reply, so trying again:

I'm wondering why a new syscall was chosen over a simple special file
/dev/userfault (analogous to /dev/shm) to obtain an fd. In my book the
special file has only advantages: no additional syscall is needed, system
admins can tweak access to this feature via normal file permissions, and
signaling the availability of the feature in the kernel simply by the
existence of the dev file.

I already wondered the same for memfd(). Here I can perhaps follow that
there is a need for such fds before /dev is mounted (because PID1 might need
it). But not for this case as devtmpfs should be mounted early enough.

Not saying it's the wrong decision, but I want to learn about the rationale.

Best regards.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Announcing qboot, a minimal x86 firmware for QEMU

2015-05-27 Thread Paolo Bonzini

On 27/05/2015 11:36, Avi Kivity wrote:
> 
> 
> On 05/27/2015 12:30 PM, Paolo Bonzini wrote:
>>
>> On 26/05/2015 23:25, Christopher Covington wrote:
>>> On 05/25/2015 08:53 AM, Paolo Bonzini wrote:
>>>> On 22/05/2015 13:12, Daniel P. Berrange wrote:
>>>>> In
>>>>> particular I don't see why we need to have a SATA controller and
>>>>> ISA/LPC
>>>>> bridge in every virt machine - root PCI bus only should be
>>>>> possible, as you
>>>>> can provide disks via virtio-blk or virtio-scsi and serial,
>>>>> parallel, mouse,
>>>>> floppy via PCI devices and/or by adding a USB bus in the cases
>>>>> where you
>>>>> really need one.
>>>> I think removing the ISA/LPC bridge is hard.  It includes the real-time
>>>> clock and fw_cfg, for example.
>>> Could VirtIO specified replacements make sense for these peripherals?
>> Not really.  virtio is too heavyweight and you'd be reinventing the
>> wheel unnecessarily.
>>
>> For example, ARM's "-M virt" uses a pl011 block for the RTC, and also
>> uses fw_cfg.  Another commonly used ISA device is the UART, for which
>> again -M virt uses a pl031.
>>
> 
> The RTC can be replaced by kvmclock, the keyboard by virtio-console. 
> Maybe we can provide an msr- or pci- based interface to fw_cfg.

The RTC is used for more than clock unfortunately.  S3 support uses it
for example, both to tell the firmware that it's an S3 resume and for
resuming when the alarm fires.

All in all, getting rid of ISA seems like chasing windmills.  If you
want to do that, fine, but then do not do that on a minimal firmware
like qboot or SeaBIOS.  For example, UEFI provides run-time services
that let you access some low-level devices like this, and that's part of
why the ARM virtual machine image specification mandates UEFI support.
But even then, the ARM virtual machine image specification lets you
choose between pl031 and a Xen pv console, and doesn't specify
virtio-console...

Paolo

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] Announcing qboot, a minimal x86 firmware for QEMU

2015-05-27 Thread Avi Kivity




On 05/27/2015 12:30 PM, Paolo Bonzini wrote:


On 26/05/2015 23:25, Christopher Covington wrote:

On 05/25/2015 08:53 AM, Paolo Bonzini wrote:

On 22/05/2015 13:12, Daniel P. Berrange wrote:

In
particular I don't see why we need to have a SATA controller and ISA/LPC
bridge in every virt machine - root PCI bus only should be possible, as you
can provide disks via virtio-blk or virtio-scsi and serial, parallel, mouse,
floppy via PCI devices and/or by adding a USB bus in the cases where you
really need one.

I think removing the ISA/LPC bridge is hard.  It includes the real-time
clock and fw_cfg, for example.

Could VirtIO specified replacements make sense for these peripherals?

Not really.  virtio is too heavyweight and you'd be reinventing the
wheel unnecessarily.

For example, ARM's "-M virt" uses a pl011 block for the RTC, and also
uses fw_cfg.  Another commonly used ISA device is the UART, for which
again -M virt uses a pl031.



The RTC can be replaced by kvmclock, the keyboard by virtio-console.  
Maybe we can provide an msr- or pci- based interface to fw_cfg.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] Announcing qboot, a minimal x86 firmware for QEMU

2015-05-27 Thread Paolo Bonzini



On 26/05/2015 23:25, Christopher Covington wrote:
> On 05/25/2015 08:53 AM, Paolo Bonzini wrote:
>>
>> On 22/05/2015 13:12, Daniel P. Berrange wrote:
>>> In
>>> particular I don't see why we need to have a SATA controller and ISA/LPC
>>> bridge in every virt machine - root PCI bus only should be possible, as you
>>> can provide disks via virtio-blk or virtio-scsi and serial, parallel, mouse,
>>> floppy via PCI devices and/or by adding a USB bus in the cases where you
>>> really need one.
>>
>> I think removing the ISA/LPC bridge is hard.  It includes the real-time
>> clock and fw_cfg, for example.
> 
> Could VirtIO specified replacements make sense for these peripherals?

Not really.  virtio is too heavyweight and you'd be reinventing the
wheel unnecessarily.

For example, ARM's "-M virt" uses a pl011 block for the RTC, and also
uses fw_cfg.  Another commonly used ISA device is the UART, for which
again -M virt uses a pl031.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 11/12] KVM: x86: add KVM_MEM_X86_SMRAM memory slot flag

2015-05-27 Thread Paolo Bonzini



On 26/05/2015 20:45, Avi Kivity wrote:
> Is this generic enough?  For example, a system could configure itself so
> that an SMRAM region goes to mmio, hiding real RAM.

That would work, because in !SMM you'd go to userspace and do MMIO
there.  But this is absolutely not generic enough.  Your proposed
alternative of having two tables is really neat to implement in both KVM
and QEMU.

Paolo

> 
> I see two alternatives:
> 
> - have three states: SMM, !SMM, both
> - define two tables: SMM, !SMM, both spanning the entire address space
> 
> you should probably document how dirty bitmap handling happens in the
> presence of SMM.
> 
>> +
>>   It is recommended to use this API instead of the
>> KVM_SET_MEMORY_REGION ioctl.
>>   The KVM_SET_MEMORY_REGION does not allow fine grained control over
>> memory
>>   allocation and is deprecated.
>> diff --git a/arch/x86/include/uapi/asm/kvm.h
>> b/arch/x86/include/uapi/asm/kvm.h
>> index 30100a3c1bed..46df15bc844f 100644
>> --- a/arch/x86/include/uapi/asm/kvm.h
>> +++ b/arch/x86/include/uapi/asm/kvm.h
>> @@ -45,6 +45,9 @@
>>   #define __KVM_HAVE_XCRS
>>   #define __KVM_HAVE_READONLY_MEM
>>   +#define __KVM_ARCH_VALID_FLAGSKVM_MEM_X86_SMRAM
>> +#define KVM_MEM_X86_SMRAM(1 << 24)
>> +
>>   /* Architectural interrupt line count. */
>>   #define KVM_NR_INTERRUPTS 256
>>   diff --git a/arch/x86/kvm/smram.c b/arch/x86/kvm/smram.c
>> index 73616edab631..e7dd933673a4 100644
>> --- a/arch/x86/kvm/smram.c
>> +++ b/arch/x86/kvm/smram.c
>> @@ -19,10 +19,23 @@
>> #include 
>>   #include 
>> +#include "kvm_cache_regs.h"
>> struct kvm_memory_slot *x86_gfn_to_memslot(struct kvm_vcpu *vcpu,
>> gfn_t gfn)
>>   {
>> -struct kvm_memory_slot *slot = gfn_to_memslot(vcpu->kvm, gfn);
>> +/* By using search_memslots directly the compiler can optimize away
>> + * the "if (found)" check below.
>> + *
>> + * It cannot do the same for gfn_to_memslot because it is not
>> inlined,
>> + * and it also cannot do the same for __gfn_to_memslot because the
>> + * kernel is compiled with -fno-delete-null-pointer-checks.
>> + */
>> +bool found;
>> +struct kvm_memslots *memslots = kvm_memslots(vcpu->kvm);
>> +struct kvm_memory_slot *slot = search_memslots(memslots, gfn,
>> &found);
>> +
>> +if (found && unlikely(slot->flags & KVM_MEM_X86_SMRAM) &&
>> !is_smm(vcpu))
>> +return NULL;
>> return slot;
>>   }
>> @@ -112,7 +125,15 @@ EXPORT_SYMBOL_GPL(x86_read_guest);
>>   int x86_gfn_to_hva_cache_init(struct kvm *kvm, struct
>> gfn_to_hva_cache *ghc,
>> gpa_t gpa, unsigned long len)
>>   {
>> -return kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, len);
>> +int r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, len);
>> +
>> +if (r < 0)
>> +return r;
>> +
>> +/* Use slow path for reads and writes to SMRAM.  */
>> +if (ghc->memslot && (ghc->memslot->flags & KVM_MEM_X86_SMRAM))
>> +ghc->memslot = NULL;
>> +return r;
>>   }
>>   EXPORT_SYMBOL_GPL(x86_gfn_to_hva_cache_init);
>>   diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 19d09a08885b..ae7c60262369 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -810,16 +810,18 @@ static inline void kvm_guest_exit(void)
>>* gfn_to_memslot() itself isn't here as an inline because that would
>>* bloat other code too much.
>>*/
>> -static inline struct kvm_memory_slot *
>> -search_memslots(struct kvm_memslots *slots, gfn_t gfn)
>> +static __always_inline struct kvm_memory_slot *
>> +search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool *found)
>>   {
>>   int start = 0, end = slots->used_slots;
>>   int slot = atomic_read(&slots->lru_slot);
>>   struct kvm_memory_slot *memslots = slots->memslots;
>> if (gfn >= memslots[slot].base_gfn &&
>> -gfn < memslots[slot].base_gfn + memslots[slot].npages)
>> +gfn < memslots[slot].base_gfn + memslots[slot].npages) {
>> +*found = true;
>>   return &memslots[slot];
>> +}
>> while (start < end) {
>>   slot = start + (end - start) / 2;
>> @@ -833,16 +835,20 @@ search_memslots(struct kvm_memslots *slots,
>> gfn_t gfn)
>>   if (gfn >= memslots[start].base_gfn &&
>>   gfn < memslots[start].base_gfn + memslots[start].npages) {
>>   atomic_set(&slots->lru_slot, start);
>> +*found = true;
>>   return &memslots[start];
>>   }
>>   +*found = false;
>>   return NULL;
>>   }
>> static inline struct kvm_memory_slot *
>>   __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
>>   {
>> -return search_memslots(slots, gfn);
>> +bool found;
>> +
>> +return search_memslots(slots, gfn, &found);
>>   }
>> static inline unsigned long
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 0fcc5d28f3a9..46bff2082479 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -716,6 +716,10 @@ st

55 matches

Mail list logo