RE: [PATCH 1/2] KVM: x86: set TMR when the interrupt is accepted

2015-08-13 Thread Zhang, Yang Z
Zhang, Yang Z wrote on 2015-08-04:
 Paolo Bonzini wrote on 2015-08-04:
 
 
 On 04/08/2015 02:46, Zhang, Yang Z wrote:
 It is a problem for split irqchip, where the EOI exit bitmap can
 be inferred from the IOAPIC routes but the TMR cannot.  The
 hardware behavior on the other hand can be implemented purely within the 
 LAPIC.
 
 So updating the TMR within LAPIC is the only solution to handle it?
 
 It's the simplest and the one that makes most sense.  Considering
 that TMR is a pretty obscure feature, it's unlikely that it will be
 accelerated in the future.
 
 You may be right. It is safe if no future hardware plans to use it.
 Let me check with our hardware team to see whether it will be used or not in 
 future.

After checking with Jun, there is no guarantee that the guest running on
another CPU will operate properly if the hypervisor modifies the vTMR from
another CPU. So the hypervisor should not do it.

 
 
 Paolo
 
 
 Best regards,
 Yang



Best regards,
Yang


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/4] irqchip: GICv3: Don't deactivate interrupts forwarded to a guest

2015-08-13 Thread Marc Zyngier
Commit 0a4377de3056 (genirq: Introduce irq_set_vcpu_affinity() to
target an interrupt to a VCPU) added just what we needed at the
lowest level to allow an interrupt to be deactivated by a guest.

When such a request reaches the GIC, it knows it doesn't need to
perform the deactivation anymore, and can safely leave the guest
do its magic. This of course requires additional support in both
VFIO and KVM.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 drivers/irqchip/irq-gic-v3.c | 35 +--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 61190fb..01c6329 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -70,6 +70,11 @@ static inline int gic_irq_in_rdist(struct irq_data *d)
return gic_irq(d) < 32;
 }
 
+static inline bool forwarded_irq(struct irq_data *d)
+{
+   return d->handler_data != NULL;
+}
+
 static inline void __iomem *gic_dist_base(struct irq_data *d)
 {
if (gic_irq_in_rdist(d))/* SGI+PPI - SGI_base for this CPU */
@@ -231,6 +236,18 @@ static void gic_poke_irq(struct irq_data *d, u32 offset)
 static void gic_mask_irq(struct irq_data *d)
 {
gic_poke_irq(d, GICD_ICENABLER);
+   /*
+* When masking a forwarded interrupt, make sure it is
+* deactivated as well.
+*
+* This ensures that an interrupt that is getting
+* disabled/masked will not get stuck, because there is
+* noone to deactivate it (guest is being terminated).
+*/
+   if (static_key_true(supports_deactivate)) {
+   if (forwarded_irq(d))
+   gic_poke_irq(d, GICD_ICACTIVER);
+   }
 }
 
 static void gic_unmask_irq(struct irq_data *d)
@@ -296,8 +313,11 @@ static int gic_irq_get_irqchip_state(struct irq_data *d,
 static void gic_eoi_irq(struct irq_data *d)
 {
if (static_key_true(supports_deactivate)) {
-   /* No need to deactivate an LPI */
-   if (gic_irq(d) >= 8192)
+   /*
+    * No need to deactivate an LPI, or an interrupt that
+    * is getting forwarded to a vcpu.
+    */
+   if (gic_irq(d) >= 8192 || forwarded_irq(d))
return;
gic_write_dir(gic_irq(d));
} else {
@@ -331,6 +351,16 @@ static int gic_set_type(struct irq_data *d, unsigned int 
type)
return gic_configure_irq(irq, type, base, rwp_wait);
 }
 
+static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
+{
+   if (static_key_true(supports_deactivate)) {
+   d->handler_data = vcpu;
+   return 0;
+   }
+
+   return -EINVAL;
+}
+
 static u64 gic_mpidr_to_affinity(u64 mpidr)
 {
u64 aff;
@@ -681,6 +711,7 @@ static struct irq_chip gic_chip = {
.irq_set_affinity   = gic_set_affinity,
.irq_get_irqchip_state  = gic_irq_get_irqchip_state,
.irq_set_irqchip_state  = gic_irq_set_irqchip_state,
+   .irq_set_vcpu_affinity  = gic_irq_set_vcpu_affinity,
.flags  = IRQCHIP_SET_TYPE_MASKED,
 };
 
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/4] irqchip: GIC: Convert to EOImode == 1

2015-08-13 Thread Marc Zyngier
So far, GICv2 has been used with EOImode == 0. The effect of this
mode is to perform the priority drop and the deactivation of the
interrupt at the same time.

While this works perfectly for Linux (we only have a single priority),
it causes issues when an interrupt is forwarded to a guest, and when
we want the guest to perform the EOI itself.

For this case, the GIC architecture provides EOImode == 1, where:
- A write to the EOI register drops the priority of the interrupt and leaves
it active. Other interrupts at the same priority level can now be taken,
but the active interrupt cannot be taken again
- A write to the DIR marks the interrupt as inactive, meaning it can
now be taken again.

We only enable this feature when booted in HYP mode and when
the device-tree reports a suitable CPU interface. Observable behaviour
should remain unchanged.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 drivers/irqchip/irq-gic.c   | 51 +++--
 include/linux/irqchip/arm-gic.h |  4 
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 4dd8826..b020c3a 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -46,6 +46,7 @@
 #include asm/irq.h
 #include asm/exception.h
 #include asm/smp_plat.h
+#include asm/virt.h
 
 #include irq-gic-common.h
 #include irqchip.h
@@ -82,6 +83,8 @@ static DEFINE_RAW_SPINLOCK(irq_controller_lock);
 #define NR_GIC_CPU_IF 8
 static u8 gic_cpu_map[NR_GIC_CPU_IF] __read_mostly;
 
+static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
+
 #ifndef MAX_GIC_NR
 #define MAX_GIC_NR 1
 #endif
@@ -137,6 +140,14 @@ static inline unsigned int gic_irq(struct irq_data *d)
return d->hwirq;
 }
 
+static inline bool primary_gic_irq(struct irq_data *d)
+{
+   if (MAX_GIC_NR > 1)
+   return irq_data_get_irq_chip_data(d) == gic_data[0];
+
+   return true;
+}
+
 /*
  * Routines to acknowledge, disable and enable interrupts
  */
@@ -164,7 +175,14 @@ static void gic_unmask_irq(struct irq_data *d)
 
 static void gic_eoi_irq(struct irq_data *d)
 {
-   writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_EOI);
+   u32 deact_offset = GIC_CPU_EOI;
+
+   if (static_key_true(supports_deactivate)) {
+   if (primary_gic_irq(d))
+   deact_offset = GIC_CPU_DEACTIVATE;
+   }
+
+   writel_relaxed(gic_irq(d), gic_cpu_base(d) + deact_offset);
 }
 
 static int gic_irq_set_irqchip_state(struct irq_data *d,
@@ -272,11 +290,15 @@ static void __exception_irq_entry gic_handle_irq(struct 
pt_regs *regs)
irqnr = irqstat & GICC_IAR_INT_ID_MASK;
 
if (likely(irqnr > 15 && irqnr < 1021)) {
+   if (static_key_true(supports_deactivate))
+   writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
handle_domain_irq(gic-domain, irqnr, regs);
continue;
}
if (irqnr < 16) {
writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
+   if (static_key_true(supports_deactivate))
+   writel_relaxed(irqstat, cpu_base + 
GIC_CPU_DEACTIVATE);
 #ifdef CONFIG_SMP
handle_IPI(irqnr, regs);
 #endif
@@ -359,6 +381,10 @@ static void gic_cpu_if_up(void)
 {
void __iomem *cpu_base = gic_data_cpu_base(gic_data[0]);
u32 bypass = 0;
+   u32 mode = 0;
+
+   if (static_key_true(supports_deactivate))
+   mode = GIC_CPU_CTRL_EOImodeNS;
 
/*
* Preserve bypass disable bits to be written back later
@@ -366,7 +392,7 @@ static void gic_cpu_if_up(void)
bypass = readl(cpu_base + GIC_CPU_CTRL);
bypass &= GICC_DIS_BYPASS_MASK;
 
-   writel_relaxed(bypass | GICC_ENABLE, cpu_base + GIC_CPU_CTRL);
+   writel_relaxed(bypass | mode | GICC_ENABLE, cpu_base + GIC_CPU_CTRL);
 }
 
 
@@ -986,6 +1012,8 @@ void __init gic_init_bases(unsigned int gic_nr, int 
irq_start,
register_cpu_notifier(gic_cpu_notifier);
 #endif
set_handle_irq(gic_handle_irq);
+   if (static_key_true(supports_deactivate))
+   pr_info("GIC: Using split EOI/Deactivate mode\n");
}
 
gic_dist_init(gic);
@@ -1001,6 +1029,7 @@ gic_of_init(struct device_node *node, struct device_node 
*parent)
 {
void __iomem *cpu_base;
void __iomem *dist_base;
+   struct resource cpu_res;
u32 percpu_offset;
int irq;
 
@@ -1013,6 +1042,16 @@ gic_of_init(struct device_node *node, struct device_node 
*parent)
cpu_base = of_iomap(node, 1);
WARN(!cpu_base, "unable to map gic cpu registers\n");
 
+   of_address_to_resource(node, 1, cpu_res);
+
+   /*
+* Disable split EOI/Deactivate if either HYP is not available
+* or the CPU interface is too small.
+*/
+  

[PATCH v2 0/4] irqchip: GICv2/v3: Add support for irq_vcpu_affinity

2015-08-13 Thread Marc Zyngier
The GICv2 and GICv3 architectures allow an active physical interrupt
to be forwarded to a guest, and the guest to indirectly perform the
deactivation of the interrupt by performing an EOI on the virtual
interrupt (see for example the GICv2 spec, 3.2.1).

This allows some substantial performance improvement for level
triggered interrupts that otherwise have to be masked/unmasked in
VFIO, not to mention the required trap back to KVM when the guest
performs an EOI.

To enable this, the GICs need to be switched to a different EOImode,
where a taken interrupt can be left active (which prevents the same
interrupt from being taken again), while other interrupts are still
being processed normally.

We also use the new irq_set_vcpu_affinity hook that was introduced for
Intel's Posted Interrupts to determine whether or not to perform the
deactivation at EOI-time.

As all of this only makes sense when the kernel can behave as a
hypervisor, we only enable this mode on detecting that the kernel was
actually booted in HYP mode, and that the GIC supports this feature.

This series is a complete rework of a RFC I sent over a year ago:

http://lists.infradead.org/pipermail/linux-arm-kernel/2014-June/266328.html

Since then, a lot has been either merged (the irqchip_state) or reworked
(my active-timer series: http://www.spinics.net/lists/kvm/msg118768.html),
and this implements the last few bits for Eric Auger's series to
finally make it into the kernel:

https://lkml.org/lkml/2015/7/2/268
https://lkml.org/lkml/2015/7/6/291

With all these patches combined, physical interrupt routing from the
kernel into a VM becomes possible.

This has been tested on Juno (GICv2) and FastModel (GICv3). A branch
is available at:

git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git 
irq/gic-irq-vcpu-affinity-v2

* From v1:
  - Fixes after review from Eric
  - Got rid of the cascaded GICv2 hack (it was broken anyway)
  - Folded the LPI deactivation patch (it makes more sense as part of
the main one).
  - Some clarifying comments about the deactivate on mask
  - I haven't retained Eric's Reviewed/Tested-by, as the code as
significantly changed on GICv2

Marc Zyngier (4):
  irqchip: GICv3: Convert to EOImode == 1
  irqchip: GICv3: Don't deactivate interrupts forwarded to a guest
  irqchip: GIC: Convert to EOImode == 1
  irqchip: GIC: Don't deactivate interrupts forwarded to a guest

 drivers/irqchip/irq-gic-v3.c   |  68 +--
 drivers/irqchip/irq-gic.c  | 109 -
 include/linux/irqchip/arm-gic-v3.h |   9 +++
 include/linux/irqchip/arm-gic.h|   4 ++
 4 files changed, 184 insertions(+), 6 deletions(-)

-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 4/4] irqchip: GIC: Don't deactivate interrupts forwarded to a guest

2015-08-13 Thread Marc Zyngier
Commit 0a4377de3056 (genirq: Introduce irq_set_vcpu_affinity() to
target an interrupt to a VCPU) added just what we needed at the
lowest level to allow an interrupt to be deactivated by a guest.

When such a request reaches the GIC, it knows it doesn't need to
perform the deactivation anymore, and can safely leave the guest
do its magic. This of course requires additional support in both
VFIO and KVM.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 drivers/irqchip/irq-gic.c | 58 +++
 1 file changed, 58 insertions(+)

diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index b020c3a..ea691be 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -148,6 +148,34 @@ static inline bool primary_gic_irq(struct irq_data *d)
return true;
 }
 
+static inline bool cascading_gic_irq(struct irq_data *d)
+{
+   /*
+* If handler_data pointing to one of the secondary GICs, then
+* this is the cascading interrupt, and it cannot possibly be
+* forwarded.
+*/
+   if (d->handler_data >= (void *)(gic_data + 1) &&
+       d->handler_data < (void *)(gic_data + MAX_GIC_NR))
+   return true;
+
+   return false;
+}
+
+static inline bool forwarded_irq(struct irq_data *d)
+{
+   /*
+* A forwarded interrupt:
+* - is on the primary GIC
+* - has its handler_data set to a value
+* - that isn't a secondary GIC
+*/
+   if (primary_gic_irq(d) && d->handler_data && !cascading_gic_irq(d))
+   return true;
+
+   return false;
+}
+
 /*
  * Routines to acknowledge, disable and enable interrupts
  */
@@ -166,6 +194,18 @@ static int gic_peek_irq(struct irq_data *d, u32 offset)
 static void gic_mask_irq(struct irq_data *d)
 {
gic_poke_irq(d, GIC_DIST_ENABLE_CLEAR);
+   /*
+* When masking a forwarded interrupt, make sure it is
+* deactivated as well.
+*
+* This ensures that an interrupt that is getting
+* disabled/masked will not get stuck, because there is
+* noone to deactivate it (guest is being terminated).
+*/
+   if (static_key_true(supports_deactivate)) {
+   if (forwarded_irq(d))
+   gic_poke_irq(d, GIC_DIST_ACTIVE_CLEAR);
+   }
 }
 
 static void gic_unmask_irq(struct irq_data *d)
@@ -178,6 +218,10 @@ static void gic_eoi_irq(struct irq_data *d)
u32 deact_offset = GIC_CPU_EOI;
 
if (static_key_true(supports_deactivate)) {
+   /* Do not deactivate an IRQ forwarded to a vcpu. */
+   if (forwarded_irq(d))
+   return;
+
if (primary_gic_irq(d))
deact_offset = GIC_CPU_DEACTIVATE;
}
@@ -251,6 +295,19 @@ static int gic_set_type(struct irq_data *d, unsigned int 
type)
return gic_configure_irq(gicirq, type, base, NULL);
 }
 
+static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
+{
+   /* Only interrupts on the primary GIC can be forwarded to a vcpu. */
+   if (static_key_true(supports_deactivate)) {
+   if (primary_gic_irq(d)  !cascading_gic_irq(d)) {
+   d->handler_data = vcpu;
+   return 0;
+   }
+   }
+
+   return -EINVAL;
+}
+
 #ifdef CONFIG_SMP
 static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
bool force)
@@ -346,6 +403,7 @@ static struct irq_chip gic_chip = {
 #endif
.irq_get_irqchip_state  = gic_irq_get_irqchip_state,
.irq_set_irqchip_state  = gic_irq_set_irqchip_state,
+   .irq_set_vcpu_affinity  = gic_irq_set_vcpu_affinity,
.flags  = IRQCHIP_SET_TYPE_MASKED,
 };
 
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/5] KVM: add kvm_has_request wrapper

2015-08-13 Thread Radim Krčmář
2015-08-12 21:57+0200, Christian Borntraeger:
 kvm_check_request is now somewhat a misnomer (what is the difference between 
 test and check?)

kvm_check_request has always been poetic;  it uses two meanings of
check, examine and tick off, at the same time.

We also want something that clears the request, so kvm_drop_request was
my best candidate so far.

 for the new interface. maybe we can rename kvm_check_request in a separate 
 patch somewhen.

I wonder why haven't we copied the naming convention from bit operations
(or if programming would be better if German was its language),

  kvm_test_request
  kvm_set_request
  kvm_clear_request
  kvm_test_and_clear_request

The only disadvantage is that
  kvm_test_and_clear_request
is longer than
  kvm_check_request
   123456789
by whooping 9 characters.

I could live with that.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 5/5] KVM: refactor asynchronous vcpu ioctl dispatch

2015-08-13 Thread Radim Krčmář
2015-08-12 22:03+0200, Christian Borntraeger:
 Am 05.08.2015 um 18:33 schrieb Radim Krčmář:
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 @@ -2252,12 +2252,15 @@ static long kvm_vcpu_ioctl(struct file *filp,
   * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
   * so vcpu_load() would break it.
   */
 +switch (ioctl) {
  #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
 -if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == 
 KVM_INTERRUPT)
 -return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 +case KVM_S390_INTERRUPT:
 +case KVM_S390_IRQ:
 +case KVM_INTERRUPT:
 
 When you are at it you might want to put the KVM_S390* within CONFIG_S390 and
 KVM_INTERRUPT within CONFIG_PPC || CONFIG_MIPS

Sure, thanks.

 This might speed up the switch statement for s390/ppc/mips a tiny bit. It 
 will add
 another ifdef, though. Paolo?

For v3, I will name the decision as an inline function, which should
make the #ifing more acceptable (at the cost of not having ioctls #defs
in the body of kvm_vcpu_ioctl).  Something like this,

static inline bool kvm_asynchronous_ioctl(unsigned ioctl)
{
switch (ioctl) {
#if defined(CONFIG_S390)
case KVM_S390_INTERRUPT:
case KVM_S390_IRQ:
#endif
#if defined(CONFIG_MIPS)
case KVM_INTERRUPT:
#endif
case KVM_USER_EXIT:
return true;
}
return false;
}

[...]
if (kvm_asynchronous_ioctl(ioctl))
return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/4] irqchip: GICv3: Convert to EOImode == 1

2015-08-13 Thread Marc Zyngier
So far, GICv3 has been used with EOImode == 0. The effect of this
mode is to perform the priority drop and the deactivation of the
interrupt at the same time.

While this works perfectly for Linux (we only have a single priority),
it causes issues when an interrupt is forwarded to a guest, and when
we want the guest to perform the EOI itself.

For this case, the GIC architecture provides EOImode == 1, where:
- A write to ICC_EOIR1_EL1 drops the priority of the interrupt and leaves
it active. Other interrupts at the same priority level can now be taken,
but the active interrupt cannot be taken again
- A write to ICC_DIR_EL1 marks the interrupt as inactive, meaning it can
now be taken again.

This patch converts the driver to be able to use this new mode, depending
on whether or not the kernel can behave as a hypervisor. No feature change.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 drivers/irqchip/irq-gic-v3.c   | 37 +
 include/linux/irqchip/arm-gic-v3.h |  9 +
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index c52f7ba..61190fb 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -30,6 +30,7 @@
 #include asm/cputype.h
 #include asm/exception.h
 #include asm/smp_plat.h
+#include asm/virt.h
 
 #include irq-gic-common.h
 #include irqchip.h
@@ -50,6 +51,7 @@ struct gic_chip_data {
 };
 
 static struct gic_chip_data gic_data __read_mostly;
+static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
 
#define gic_data_rdist()   (this_cpu_ptr(&gic_data.rdists.rdist))
#define gic_data_rdist_rd_base()   (gic_data_rdist()->rd_base)
@@ -293,7 +295,14 @@ static int gic_irq_get_irqchip_state(struct irq_data *d,
 
 static void gic_eoi_irq(struct irq_data *d)
 {
-   gic_write_eoir(gic_irq(d));
+   if (static_key_true(supports_deactivate)) {
+   /* No need to deactivate an LPI */
+   if (gic_irq(d) >= 8192)
+   return;
+   gic_write_dir(gic_irq(d));
+   } else {
+   gic_write_eoir(gic_irq(d));
+   }
 }
 
 static int gic_set_type(struct irq_data *d, unsigned int type)
@@ -343,15 +352,24 @@ static asmlinkage void __exception_irq_entry 
gic_handle_irq(struct pt_regs *regs
 
if (likely(irqnr > 15 && irqnr < 1020) || irqnr >= 8192) {
int err;
+
+   if (static_key_true(supports_deactivate))
+   gic_write_eoir(irqnr);
+
err = handle_domain_irq(gic_data.domain, irqnr, regs);
if (err) {
WARN_ONCE(true, "Unexpected interrupt received!\n");
-   gic_write_eoir(irqnr);
+   if (static_key_true(supports_deactivate))
+   gic_write_dir(irqnr);
+   else
+   gic_write_eoir(irqnr);
}
continue;
}
if (irqnr < 16) {
gic_write_eoir(irqnr);
+   if (static_key_true(supports_deactivate))
+   gic_write_dir(irqnr);
 #ifdef CONFIG_SMP
handle_IPI(irqnr, regs);
 #else
@@ -451,8 +469,13 @@ static void gic_cpu_sys_reg_init(void)
/* Set priority mask register */
gic_write_pmr(DEFAULT_PMR_VALUE);
 
-   /* EOI deactivates interrupt too (mode 0) */
-   gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop_dir);
+   if (static_key_true(supports_deactivate)) {
+   /* EOI drops priority only (mode 1) */
+   gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop);
+   } else {
+   /* EOI deactivates interrupt too (mode 0) */
+   gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop_dir);
+   }
 
/* ... and let's hit the road... */
gic_write_grpen1(1);
@@ -820,6 +843,12 @@ static int __init gic_of_init(struct device_node *node, 
struct device_node *pare
if (of_property_read_u64(node, redistributor-stride, redist_stride))
redist_stride = 0;
 
+   if (!is_hyp_mode_available())
+   static_key_slow_dec(supports_deactivate);
+
+   if (static_key_true(supports_deactivate))
+   pr_info("GIC: Using split EOI/Deactivate mode\n");
+
gic_data.dist_base = dist_base;
gic_data.redist_regions = rdist_regs;
gic_data.nr_redist_regions = nr_redist_regions;
diff --git a/include/linux/irqchip/arm-gic-v3.h 
b/include/linux/irqchip/arm-gic-v3.h
index ffbc034..bc98832 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -104,6 +104,8 @@
 #define GICR_SYNCR 0x00C0
 #define GICR_MOVLPIR   0x0100
 #define GICR_MOVALLR

Re: [PATCH 1/2] KVM: x86: set TMR when the interrupt is accepted

2015-08-13 Thread Paolo Bonzini


On 13/08/2015 08:35, Zhang, Yang Z wrote:
 You may be right. It is safe if no future hardware plans to use
 it. Let me check with our hardware team to see whether it will be
 used or not in future.
 
 After checking with Jun, there is no guarantee that the guest running
 on another CPU will operate properly if the hypervisor modifies the vTMR
 from another CPU. So the hypervisor should not do it.

I guess I can cause a vmexit on level-triggered interrupts, it's not a
big deal, but no weasel words, please.

What's going to break, and where is it documented?

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/5] KVM: add kvm_has_request wrapper

2015-08-13 Thread Christian Borntraeger
Am 13.08.2015 um 11:29 schrieb Paolo Bonzini:
 
 
 On 13/08/2015 11:11, Radim Krčmář wrote:
 for the new interface. maybe we can rename kvm_check_request in a separate 
 patch somewhen.
 I wonder why haven't we copied the naming convention from bit operations
 (or if programming would be better if German was its language),

   kvm_test_request
   kvm_set_request
   kvm_clear_request
   kvm_test_and_clear_request

 The only disadvantage is that
   kvm_test_and_clear_request
 is longer than
   kvm_check_request
123456789
 by whooping 9 characters.

 I could live with that.
 
 Yes, that would be much better.

+1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/14] arm64: Add 16K page size support

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

This patch turns on the 16K page support in the kernel. We
support 48bit VA (4 level page tables) and 47bit VA (3 level
page tables).

Cc: Mark Rutland mark.rutl...@arm.com
Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Cc: Steve Capper steve.cap...@linaro.org
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/Kconfig   |   25 -
 arch/arm64/include/asm/fixmap.h  |4 +++-
 arch/arm64/include/asm/kvm_arm.h |   12 
 arch/arm64/include/asm/page.h|2 ++
 arch/arm64/include/asm/sysreg.h  |2 ++
 arch/arm64/include/asm/thread_info.h |2 ++
 arch/arm64/kernel/head.S |7 ++-
 arch/arm64/mm/proc.S |4 +++-
 8 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b247897..8327edf 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -167,7 +167,8 @@ config PGTABLE_LEVELS
default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
-   default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
+   default 3 if ARM64_16K_PAGES && ARM64_VA_BITS_47
+   default 4 if !ARM64_64K_PAGES && ARM64_VA_BITS_48
 
 source init/Kconfig
 
@@ -444,6 +445,13 @@ config ARM64_4K_PAGES
help
  This feature enables 4KB pages support.
 
+config ARM64_16K_PAGES
+   bool 16KB
+   help
+ The system will use 16KB pages support. AArch32 emulation
+ requires applications compiled with 16K(or multiple of 16K)
+ aligned segments.
+
 config ARM64_64K_PAGES
bool 64KB
help
@@ -457,6 +465,7 @@ endchoice
 choice
prompt Virtual address space size
default ARM64_VA_BITS_39 if ARM64_4K_PAGES
+   default ARM64_VA_BITS_47 if ARM64_16K_PAGES
default ARM64_VA_BITS_42 if ARM64_64K_PAGES
help
  Allows choosing one of multiple possible virtual address
@@ -471,6 +480,10 @@ config ARM64_VA_BITS_42
bool 42-bit
depends on ARM64_64K_PAGES
 
+config ARM64_VA_BITS_47
+   bool 47-bit
+   depends on ARM64_16K_PAGES
+
 config ARM64_VA_BITS_48
bool 48-bit
 
@@ -480,6 +493,7 @@ config ARM64_VA_BITS
int
default 39 if ARM64_VA_BITS_39
default 42 if ARM64_VA_BITS_42
+   default 47 if ARM64_VA_BITS_47
default 48 if ARM64_VA_BITS_48
 
 config CPU_BIG_ENDIAN
@@ -550,7 +564,7 @@ config ARCH_WANT_GENERAL_HUGETLB
def_bool y
 
 config ARCH_WANT_HUGE_PMD_SHARE
-   def_bool y if ARM64_4K_PAGES
+   def_bool y if ARM64_4K_PAGES || ARM64_16K_PAGES
 
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
def_bool y
@@ -587,6 +601,7 @@ config XEN
 config FORCE_MAX_ZONEORDER
int
default 14 if (ARM64_64K_PAGES && TRANSPARENT_HUGEPAGE)
+   default 12 if (ARM64_16K_PAGES && TRANSPARENT_HUGEPAGE)
default 11
 
 menuconfig ARMV8_DEPRECATED
@@ -773,9 +788,9 @@ config COMPAT
  the user helper functions, VFP support and the ptrace interface are
  handled appropriately by the kernel.
 
- If you also enabled CONFIG_ARM64_64K_PAGES, please be aware that you
- will only be able to execute AArch32 binaries that were compiled with
- 64k aligned segments.
+ If you use a page size other than 4KB(i.e, 16KB or 64KB), please be 
aware
+ that you will only be able to execute AArch32 binaries that were 
compiled
+ with page size aligned segments.
 
  If you want to execute 32-bit userspace applications, say Y.
 
diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index c0739187..f44a390 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -55,8 +55,10 @@ enum fixed_addresses {
 * Temporary boot-time mappings, used by early_ioremap(),
 * before ioremap() is functional.
 */
-#ifdef CONFIG_ARM64_64K_PAGES
+#if defined(CONFIG_ARM64_64K_PAGES)
 #define NR_FIX_BTMAPS  4
+#elif  defined (CONFIG_ARM64_16K_PAGES)
+#define NR_FIX_BTMAPS  16
 #else
 #define NR_FIX_BTMAPS  64
 #endif
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index dcaf799..4d6a022 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -113,6 +113,7 @@
#define VTCR_EL2_TG0_MASK  (3 << 14)
#define VTCR_EL2_TG0_4K(0 << 14)
#define VTCR_EL2_TG0_64K   (1 << 14)
+#define VTCR_EL2_TG0_16K   (2 << 14)
#define VTCR_EL2_SH0_MASK  (3 << 12)
#define VTCR_EL2_SH0_INNER (3 << 12)
#define VTCR_EL2_ORGN0_MASK(3 << 10)
@@ -134,6 +135,8 @@
  *
  * Note that when using 4K pages, we concatenate two first level page tables
  * together.
+ * With 16K pages, we concatenate 16 first level page tables and enter at
+ * level 

[PATCH 14/14] arm64: 36 bit VA

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

36bit VA lets us use 2 level page tables while limiting the
available address space to 64GB.

Cc: Mark Rutland mark.rutl...@arm.com
Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/Kconfig |8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 8327edf..0407fd3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -164,6 +164,7 @@ config FIX_EARLYCON_MEM
 
 config PGTABLE_LEVELS
int
+   default 2 if ARM64_16K_PAGES && ARM64_VA_BITS_36
default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
@@ -472,6 +473,10 @@ choice
  space sizes. The level of translation table is determined by
  a combination of page size and virtual address space size.
 
+config ARM64_VA_BITS_36
+   bool 36-bit
+   depends on ARM64_16K_PAGES
+
 config ARM64_VA_BITS_39
bool 39-bit
depends on ARM64_4K_PAGES
@@ -491,6 +496,7 @@ endchoice
 
 config ARM64_VA_BITS
int
+   default 36 if ARM64_VA_BITS_36
default 39 if ARM64_VA_BITS_39
default 42 if ARM64_VA_BITS_42
default 47 if ARM64_VA_BITS_47
@@ -564,7 +570,7 @@ config ARCH_WANT_GENERAL_HUGETLB
def_bool y
 
 config ARCH_WANT_HUGE_PMD_SHARE
-   def_bool y if ARM64_4K_PAGES || ARM64_16K_PAGES
+   def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
 
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
def_bool y
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/14] arm: kvm: Move fake PGD handling to arch specific files

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

Rearrange the code for fake pgd handling, which is applicable
to only ARM64. The intention is to keep the common code cleaner,
unaware of the underlying hacks.

Cc: kvm...@lists.cs.columbia.edu
Cc: christoffer.d...@linaro.org
Cc: marc.zyng...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm/include/asm/kvm_mmu.h   |7 ++
 arch/arm/kvm/mmu.c   |   44 +-
 arch/arm64/include/asm/kvm_mmu.h |   43 +
 3 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 405aa18..1c9aa8a 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -173,6 +173,13 @@ static inline unsigned int kvm_get_hwpgd_size(void)
return PTRS_PER_S2_PGD * sizeof(pgd_t);
 }
 
+static inline pgd_t *kvm_setup_fake_pgd(pgd_t *pgd)
+{
+   return pgd;
+}
+
+static inline void kvm_free_fake_pgd(pgd_t *pgd) {}
+
 struct kvm;
 
 #define kvm_flush_dcache_to_poc(a,l)   __cpuc_flush_dcache_area((a), (l))
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 7b42012..b210622 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -677,43 +677,11 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
 * guest, we allocate a fake PGD and pre-populate it to point
 * to the next-level page table, which will be the real
 * initial page table pointed to by the VTTBR.
-*
-* When KVM_PREALLOC_LEVEL==2, we allocate a single page for
-* the PMD and the kernel will use folded pud.
-* When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
-* pages.
 */
-   if (KVM_PREALLOC_LEVEL > 0) {
-   int i;
-
-   /*
-* Allocate fake pgd for the page table manipulation macros to
-* work.  This is not used by the hardware and we have no
-* alignment requirement for this allocation.
-*/
-   pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
-   GFP_KERNEL | __GFP_ZERO);
-
-   if (!pgd) {
-   kvm_free_hwpgd(hwpgd);
-   return -ENOMEM;
-   }
-
-   /* Plug the HW PGD into the fake one. */
-   for (i = 0; i < PTRS_PER_S2_PGD; i++) {
-   if (KVM_PREALLOC_LEVEL == 1)
-   pgd_populate(NULL, pgd + i,
-(pud_t *)hwpgd + i * PTRS_PER_PUD);
-   else if (KVM_PREALLOC_LEVEL == 2)
-   pud_populate(NULL, pud_offset(pgd, 0) + i,
-(pmd_t *)hwpgd + i * PTRS_PER_PMD);
-   }
-   } else {
-   /*
-* Allocate actual first-level Stage-2 page table used by the
-* hardware for Stage-2 page table walks.
-*/
-   pgd = (pgd_t *)hwpgd;
+   pgd = kvm_setup_fake_pgd(hwpgd);
+   if (IS_ERR(pgd)) {
+   kvm_free_hwpgd(hwpgd);
+   return PTR_ERR(pgd);
}
 
kvm_clean_pgd(pgd);
@@ -820,9 +788,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 
unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
kvm_free_hwpgd(kvm_get_hwpgd(kvm));
-   if (KVM_PREALLOC_LEVEL > 0)
-   kfree(kvm->arch.pgd);
-
+   kvm_free_fake_pgd(kvm->arch.pgd);
	kvm->arch.pgd = NULL;
 }
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 6150567..2567fe8 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -198,6 +198,49 @@ static inline unsigned int kvm_get_hwpgd_size(void)
return PTRS_PER_S2_PGD * sizeof(pgd_t);
 }
 
+/*
+ * Allocate fake pgd for the page table manipulation macros to
+ * work.  This is not used by the hardware and we have no
+ * alignment requirement for this allocation.
+ */
+static inline pgd_t* kvm_setup_fake_pgd(pgd_t *hwpgd)
+{
+   int i;
+   pgd_t *pgd;
+
+   if (!KVM_PREALLOC_LEVEL)
+   return hwpgd;
+   /*
+* When KVM_PREALLOC_LEVEL==2, we allocate a single page for
+* the PMD and the kernel will use folded pud.
+* When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
+* pages.
+*/
+   pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
+   GFP_KERNEL | __GFP_ZERO);
+
+   if (!pgd)
+   return ERR_PTR(-ENOMEM);
+
+   /* Plug the HW PGD into the fake one. */
+   for (i = 0; i < PTRS_PER_S2_PGD; i++) {
+   if (KVM_PREALLOC_LEVEL == 1)
+   pgd_populate(NULL, pgd + i,
+(pud_t *)hwpgd + i * PTRS_PER_PUD);
+   else if (KVM_PREALLOC_LEVEL == 2)
+  

[PATCH 11/14] arm64: kvm: Rewrite fake pgd handling

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

The existing fake pgd handling code assumes that the stage-2 entry
level can only be one level down that of the host, which may not be
true always(e.g, with the introduction of 16k pagesize).

e.g.
With 16k page size and 48bit VA and 40bit IPA we have the following
split for page table levels:

level:  0   1 2 3
bits : [47] [46 - 36] [35 - 25] [24 - 14] [13 - 0]
 ^   ^ ^
 |   | |
   host entry| x stage-2 entry
 |
IPA -x

The stage-2 entry level is 2, due to the concatenation of 16tables
at level 2(mandated by the hardware). So, we need to fake two levels
to actually reach the hyp page table. This case cannot be handled
with the existing code, as, all we know about is KVM_PREALLOC_LEVEL
which kind of stands for two different pieces of information.

1) Whether we have fake page table entry levels.
2) The entry level of stage-2 translation.

We lose the information about the number of fake levels that
we may have to use. Also, KVM_PREALLOC_LEVEL computation itself
is wrong, as we assume the hw entry level is always 1 level down
from the host.

This patch introduces two separate indicators:
1) Accurate entry level for stage-2 translation - HYP_PGTABLE_ENTRY_LEVEL -
   using the new helpers.
2) Number of levels of fake pagetable entries. (KVM_FAKE_PGTABLE_LEVELS)

The following conditions hold true for all cases(with 40bit IPA)
1) The stage-2 entry level = 2
2) Number of fake page-table entries is in the inclusive range [0, 2].

Cc: kvm...@lists.cs.columbia.edu
Cc: christoffer.d...@linaro.org
Cc: marc.zyng...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/include/asm/kvm_mmu.h |  114 --
 1 file changed, 61 insertions(+), 53 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 2567fe8..72cfd9e 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -41,18 +41,6 @@
  */
 #define TRAMPOLINE_VA  (HYP_PAGE_OFFSET_MASK  PAGE_MASK)
 
-/*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
- * levels in addition to the PGD and potentially the PUD which are
- * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2
- * tables use one level of tables less than the kernel.
- */
-#ifdef CONFIG_ARM64_64K_PAGES
-#define KVM_MMU_CACHE_MIN_PAGES1
-#else
-#define KVM_MMU_CACHE_MIN_PAGES2
-#endif
-
 #ifdef __ASSEMBLY__
 
 /*
@@ -80,6 +68,26 @@
 #define KVM_PHYS_SIZE  (1UL << KVM_PHYS_SHIFT)
 #define KVM_PHYS_MASK  (KVM_PHYS_SIZE - 1UL)
 
+/*
+ * At stage-2 entry level, upto 16 tables can be concatenated and
+ * the hardware expects us to use concatenation, whenever possible.
+ * So, number of page table levels for KVM_PHYS_SHIFT is always
+ * the number of normal page table levels for (KVM_PHYS_SHIFT - 4).
+ */
+#define HYP_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)
+/* Number of bits normally addressed by HYP_PGTABLE_LEVELS */
+#define HYP_PGTABLE_SHIFT  ARM64_HW_PGTABLE_LEVEL_SHIFT(HYP_PGTABLE_LEVELS + 1)
+#define HYP_PGDIR_SHIFT    ARM64_HW_PGTABLE_LEVEL_SHIFT(HYP_PGTABLE_LEVELS)
+#define HYP_PGTABLE_ENTRY_LEVEL    (4 - HYP_PGTABLE_LEVELS)
+
+/*
+ * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
+ * levels in addition to the PGD and potentially the PUD which are
+ * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2
+ * tables use one level of tables less than the kernel.
+ */
+#define KVM_MMU_CACHE_MIN_PAGES(HYP_PGTABLE_LEVELS - 1)
+
 int create_hyp_mappings(void *from, void *to);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
@@ -145,56 +153,41 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 #define kvm_pud_addr_end(addr, end)pud_addr_end(addr, end)
 #define kvm_pmd_addr_end(addr, end)pmd_addr_end(addr, end)
 
-/*
- * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address
- * the entire IPA input range with a single pgd entry, and we would only need
- * one pgd entry.  Note that in this case, the pgd is actually not used by
- * the MMU for Stage-2 translations, but is merely a fake pgd used as a data
- * structure for the kernel pgtable macros to work.
- */
-#if PGDIR_SHIFT  KVM_PHYS_SHIFT
-#define PTRS_PER_S2_PGD_SHIFT  0
+/* Number of concatenated tables in stage-2 entry level */
+#if KVM_PHYS_SHIFT  HYP_PGTABLE_SHIFT
+#define S2_ENTRY_TABLES_SHIFT  (KVM_PHYS_SHIFT - HYP_PGTABLE_SHIFT)
 #else
-#define PTRS_PER_S2_PGD_SHIFT  (KVM_PHYS_SHIFT - PGDIR_SHIFT)
+#define S2_ENTRY_TABLES_SHIFT  0
 #endif
+#define S2_ENTRY_TABLES(1  (S2_ENTRY_TABLES_SHIFT))
+
+/* Number of page table levels we fake to reach the hw pgtable for hyp */
+#define KVM_FAKE_PGTABLE_LEVELS

Re: [PATCH v5 5/5] KVM: eventfd: add irq bypass consumer management

2015-08-13 Thread Eric Auger
Hi Alex,
On 08/12/2015 09:05 PM, Alex Williamson wrote:
 On Mon, 2015-08-10 at 15:31 +0200, Eric Auger wrote:
 This patch adds the registration/unregistration of an
 irq_bypass_consumer on irqfd assignment/deassignment.

 Signed-off-by: Eric Auger eric.au...@linaro.org
 Signed-off-by: Feng Wu feng...@intel.com

 ---

 v4 - v5:
 - due to removal of static inline stubs, add
   #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
   around consumer registration/unregistration
 - add pr_info when registration fails

 v2 - v3 (Feng Wu):
 - Use kvm_arch_irq_bypass_start
 - Remove kvm_arch_irq_bypass_update
 - Add member 'struct irq_bypass_producer *producer' in
   'struct kvm_kernel_irqfd', it is needed by posted interrupt.
 - Remove 'irq_bypass_unregister_consumer' in kvm_irqfd_deassign()

 v1 - v2:
 - populate of kvm and gsi removed
 - unregister the consumer on irqfd_shutdown
 ---
  include/linux/kvm_irqfd.h |  2 ++
  virt/kvm/eventfd.c| 15 +++
  2 files changed, 17 insertions(+)

 diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
 index f926b39..0c1de05 100644
 --- a/include/linux/kvm_irqfd.h
 +++ b/include/linux/kvm_irqfd.h
 @@ -64,6 +64,8 @@ struct kvm_kernel_irqfd {
  struct list_head list;
  poll_table pt;
  struct work_struct shutdown;
 +struct irq_bypass_consumer consumer;
 +struct irq_bypass_producer *producer;
  };
  
  #endif /* __LINUX_KVM_IRQFD_H */
 diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
 index 647ffb8..d7a230f 100644
 --- a/virt/kvm/eventfd.c
 +++ b/virt/kvm/eventfd.c
 @@ -35,6 +35,7 @@
  #include linux/srcu.h
  #include linux/slab.h
  #include linux/seqlock.h
 +#include linux/irqbypass.h
  #include trace/events/kvm.h
  
  #include kvm/iodev.h
 @@ -140,6 +141,9 @@ irqfd_shutdown(struct work_struct *work)
  /*
   * It is now safe to release the object's resources
   */
 +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
 +irq_bypass_unregister_consumer(&irqfd->consumer);
 +#endif
  eventfd_ctx_put(irqfd-eventfd);
  kfree(irqfd);
  }
 @@ -379,6 +383,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd 
 *args)
   * we might race against the POLLHUP
   */
  fdput(f);
 +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
 +irqfd->consumer.token = (void *)irqfd->eventfd;
 +irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
 +irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
 +irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
 +irqfd->consumer.start = kvm_arch_irq_bypass_start;
 +ret = irq_bypass_register_consumer(&irqfd->consumer);
 +if (ret)
 +pr_info(irq bypass consumer (token %p) registration fails: 
 %d\n,
 +irqfd-consumer.token, ret);
 +#endif
 
 Does this series compile on its own?  Aren't all these arch function
 unresolved?
yes it does since by default CONFIG_HAVE_KVM_IRQ_BYPASS is not defined

Best Regards

Eric
 
  
  return 0;
  
 
 
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 00/14] arm64: 16K translation granule support

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

This series enables the 16K page size support on Linux for arm64.
This series adds support for 48bit VA(4 level), 47bit VA(3 level) and
36bit VA(2 level) with 16K. 16K was a late addition to the architecture
and is not implemented by all CPUs. Added a check to ensure the
selected granule size is supported by the CPU, failing which the CPU
won't proceed with booting.

KVM bits have been tested on a fast model with GICv3 using Andre's kvmtool
with gicv3 support[1].

Patches 1-7 cleans up the kernel page size handling code.
Patches 8-11 Fixes some issues with the KVM bits, mainly the fake PGD
 handling code.
Patch 12Adds a check to ensure the CPU supports the selected granule size.
Patch 13-14 Adds the 16k page size support bits.

This series applies on top of for-next/core branch of the aarch64 tree and is
also available here:

git://linux-arm.org/linux-skp.git  16k/v1

[1] git://linux-arm.org/kvmtool.git gicv3/v4

TODO:
 1) Testing on a silicon
 2) Analyse the performance of HugePages with 16K (32MB) on a
silicon.
 3) SMMU driver

Suzuki K. Poulose (14):
  arm64: Move swapper pagetable definitions
  arm64: Handle section maps for swapper/idmap
  arm64: Introduce helpers for page table levels
  arm64: Calculate size for idmap_pg_dir at compile time
  arm64: Handle 4 level page table for swapper
  arm64: Clean config usages for page size
  arm64: Kconfig: Fix help text about AArch32 support with 64K pages
  arm64: kvm: Fix {V}TCR_EL2_TG0 mask
  arm64: Cleanup VTCR_EL2 computation
  arm: kvm: Move fake PGD handling to arch specific files
  arm64: kvm: Rewrite fake pgd handling
  arm64: Check for selected granule support
  arm64: Add 16K page size support
  arm64: 36 bit VA

 arch/arm/include/asm/kvm_mmu.h  |7 ++
 arch/arm/kvm/mmu.c  |   44 ++
 arch/arm64/Kconfig  |   37 +++--
 arch/arm64/Kconfig.debug|2 +-
 arch/arm64/include/asm/boot.h   |1 +
 arch/arm64/include/asm/fixmap.h |4 +-
 arch/arm64/include/asm/kernel-pgtable.h |   77 ++
 arch/arm64/include/asm/kvm_arm.h|   29 +--
 arch/arm64/include/asm/kvm_mmu.h|  135 +--
 arch/arm64/include/asm/page.h   |   20 +
 arch/arm64/include/asm/pgtable-hwdef.h  |   15 +++-
 arch/arm64/include/asm/sysreg.h |8 ++
 arch/arm64/include/asm/thread_info.h|4 +-
 arch/arm64/kernel/head.S|   71 +---
 arch/arm64/kernel/vmlinux.lds.S |1 +
 arch/arm64/mm/mmu.c |   70 +++-
 arch/arm64/mm/proc.S|4 +-
 17 files changed, 337 insertions(+), 192 deletions(-)
 create mode 100644 arch/arm64/include/asm/kernel-pgtable.h

-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/5] KVM: add kvm_has_request wrapper

2015-08-13 Thread Paolo Bonzini


On 13/08/2015 11:11, Radim Krčmář wrote:
  for the new interface. maybe we can rename kvm_check_request in a separate 
  patch somewhen.
 I wonder why haven't we copied the naming convention from bit operations
 (or if programming would be better if German was its language),
 
   kvm_test_request
   kvm_set_request
   kvm_clear_request
   kvm_test_and_clear_request
 
 The only disadvantage is that
   kvm_test_and_clear_request
 is longer than
   kvm_check_request
123456789
 by whooping 9 characters.
 
 I could live with that.

Yes, that would be much better.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 5/5] KVM: refactor asynchronous vcpu ioctl dispatch

2015-08-13 Thread Paolo Bonzini


On 12/08/2015 22:03, Christian Borntraeger wrote:
   #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
  -  if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == 
  KVM_INTERRUPT)
  -  return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
  +  case KVM_S390_INTERRUPT:
  +  case KVM_S390_IRQ:
  +  case KVM_INTERRUPT:
 When you are it you might want to put the KVM_S390* withing CONFIG_S390 and
 KVM_INTERRUPT within CONFIG_PPC || CONFIG_MIPS
 
 This might speed up the switch statement for s390/ppc/mips a tiny bit. It 
 will add
 another ifdef, though. Paolo?

Sure.  I wasn't sure of KVM_INTERRUPT's usage on s390.

I'm okay with keeping the switch inline too, but if Radim prefers a
function that's also fine.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 12/14] arm64: Check for selected granule support

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

Ensure that the selected page size is supported by the
CPU(s).

Cc: Mark Rutland mark.rutl...@arm.com
Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/include/asm/sysreg.h |6 ++
 arch/arm64/kernel/head.S|   24 +++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index a7f3d4b..e01d323 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -87,4 +87,10 @@ static inline void config_sctlr_el1(u32 clear, u32 set)
 }
 #endif
 
+#define ID_AA64MMFR0_TGran4_SHIFT  28
+#define ID_AA64MMFR0_TGran64_SHIFT 24
+
+#define ID_AA64MMFR0_TGran4_ENABLED0x0
+#define ID_AA64MMFR0_TGran64_ENABLED   0x0
+
 #endif /* __ASM_SYSREG_H */
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 01b8e58..0cb04db 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -31,10 +31,11 @@
 #include asm/cputype.h
 #include asm/kernel-pgtable.h
 #include asm/memory.h
-#include asm/thread_info.h
 #include asm/pgtable-hwdef.h
 #include asm/pgtable.h
 #include asm/page.h
+#include asm/sysreg.h
+#include asm/thread_info.h
 #include asm/virt.h
 
 #define __PHYS_OFFSET  (KERNEL_START - TEXT_OFFSET)
@@ -606,9 +607,25 @@ ENDPROC(__secondary_switched)
  *  x27 = *virtual* address to jump to upon completion
  *
  * other registers depend on the function called upon completion
+ * Checks if the selected granule size is supported by the CPU.
  */
+#if defined(CONFIG_ARM64_64K_PAGES)
+
+#define ID_AA64MMFR0_TGran_SHIFT   ID_AA64MMFR0_TGran64_SHIFT
+#define ID_AA64MMFR0_TGran_ENABLED ID_AA64MMFR0_TGran64_ENABLED
+
+#else
+
+#define ID_AA64MMFR0_TGran_SHIFT   ID_AA64MMFR0_TGran4_SHIFT
+#define ID_AA64MMFR0_TGran_ENABLED ID_AA64MMFR0_TGran4_ENABLED
+
+#endif
	.section ".idmap.text", "ax"
 __enable_mmu:
+   mrs x1, ID_AA64MMFR0_EL1
+   ubfx x2, x1, #ID_AA64MMFR0_TGran_SHIFT, 4
+   cmp x2, #ID_AA64MMFR0_TGran_ENABLED
+   b.ne __no_granule_support
ldr x5, =vectors
msr vbar_el1, x5
msr ttbr0_el1, x25  // load TTBR0
@@ -626,3 +643,8 @@ __enable_mmu:
isb
br  x27
 ENDPROC(__enable_mmu)
+
+__no_granule_support:
+   wfe
+   b __no_granule_support
+ENDPROC(__no_granule_support)
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/14] arm64: kvm: Fix {V}TCR_EL2_TG0 mask

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

{V}TCR_EL2_TG0 is a 2bit wide field, where:

 00 - 4K
 01 - 64K
 10 - 16K

But we use only 1 bit, which has worked well so far since
we never cared about 16K. Fix it for 16K support.

Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Cc: Marc Zyngier marc.zyng...@arm.com
Cc: Christoffer Dall christoffer.d...@linaro.org
Cc: kvm...@lists.cs.columbia.edu
Acked-by: Mark Rutland mark.rutl...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/include/asm/kvm_arm.h |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index ac6fafb..52dc9cc 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -98,7 +98,7 @@
 #define TCR_EL2_TBI	(1 << 20)
 #define TCR_EL2_PS (7 << 16)
 #define TCR_EL2_PS_40B (2 << 16)
-#define TCR_EL2_TG0	(1 << 14)
+#define TCR_EL2_TG0	(3 << 14)
 #define TCR_EL2_SH0	(3 << 12)
 #define TCR_EL2_ORGN0  (3 << 10)
 #define TCR_EL2_IRGN0  (3 << 8)
@@ -110,7 +110,7 @@
 
 /* VTCR_EL2 Registers bits */
 #define VTCR_EL2_PS_MASK   (7 << 16)
-#define VTCR_EL2_TG0_MASK  (1 << 14)
+#define VTCR_EL2_TG0_MASK  (3 << 14)
 #define VTCR_EL2_TG0_4K	(0 << 14)
 #define VTCR_EL2_TG0_64K   (1 << 14)
 #define VTCR_EL2_SH0_MASK  (3 << 12)
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/14] arm64: Handle 4 level page table for swapper

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

At the moment, we only support maximum of 3-level page table for
swapper. With 48bit VA, 64K has only 3 levels and 4K uses section
mapping. Add support for 4-level page table for swapper, needed
by 16K pages.

Cc: Ard Biesheuvel ard.biesheu...@linaro.org
Cc: Mark Rutland mark.rutl...@arm.com
Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/kernel/head.S |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 46670bf..01b8e58 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -271,7 +271,10 @@ ENDPROC(preserve_boot_args)
  */
.macro  create_pgd_entry, tbl, virt, tmp1, tmp2
create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2
-#if SWAPPER_PGTABLE_LEVELS == 3
+#if SWAPPER_PGTABLE_LEVELS > 3
+   create_table_entry \tbl, \virt, PUD_SHIFT, PTRS_PER_PUD, \tmp1, \tmp2
+#endif
+#if SWAPPER_PGTABLE_LEVELS > 2
create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, PTRS_PER_PTE, 
\tmp1, \tmp2
 #endif
.endm
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/14] arm64: Handle section maps for swapper/idmap

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

We use section maps with 4K page size to create the
swapper/idmaps. So far we have used !64K or 4K checks
to handle the case where we use the section maps. This
patch adds a symbol to make it clear those cases.

Cc: Ard Biesheuvel ard.biesheu...@linaro.org
Cc: Mark Rutland mark.rutl...@arm.com
Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/include/asm/kernel-pgtable.h |   31 +-
 arch/arm64/mm/mmu.c |   70 ++-
 2 files changed, 51 insertions(+), 50 deletions(-)

diff --git a/arch/arm64/include/asm/kernel-pgtable.h 
b/arch/arm64/include/asm/kernel-pgtable.h
index 622929d..5876a36 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -19,6 +19,13 @@
 #ifndef __ASM_KERNEL_PGTABLE_H
 #define __ASM_KERNEL_PGTABLE_H
 
+/* With 4K pages, we use section maps. */
+#ifdef CONFIG_ARM64_4K_PAGES
+#define ARM64_SWAPPER_USES_SECTION_MAPS 1
+#else
+#define ARM64_SWAPPER_USES_SECTION_MAPS 0
+#endif
+
 /*
  * The idmap and swapper page tables need some space reserved in the kernel
  * image. Both require pgd, pud (4 levels only) and pmd tables to (section)
@@ -28,26 +35,28 @@
  * could be increased on the fly if system RAM is out of reach for the default
  * VA range, so 3 pages are reserved in all cases.
  */
-#ifdef CONFIG_ARM64_64K_PAGES
-#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS)
-#else
+#if ARM64_SWAPPER_USES_SECTION_MAPS
 #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1)
+#else
+#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS)
 #endif
 
 #define SWAPPER_DIR_SIZE   (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
 #define IDMAP_DIR_SIZE (3 * PAGE_SIZE)
 
 /* Initial memory map size */
-#ifdef CONFIG_ARM64_64K_PAGES
-#define SWAPPER_BLOCK_SHIFTPAGE_SHIFT
-#define SWAPPER_BLOCK_SIZE PAGE_SIZE
-#define SWAPPER_TABLE_SHIFTPMD_SHIFT
-#else
+#if ARM64_SWAPPER_USES_SECTION_MAPS
 #define SWAPPER_BLOCK_SHIFTSECTION_SHIFT
 #define SWAPPER_BLOCK_SIZE SECTION_SIZE
 #define SWAPPER_TABLE_SHIFTPUD_SHIFT
+#else
+#define SWAPPER_BLOCK_SHIFTPAGE_SHIFT
+#define SWAPPER_BLOCK_SIZE PAGE_SIZE
+#define SWAPPER_TABLE_SHIFTPMD_SHIFT
 #endif
 
+/* The size of the initial kernel direct mapping */
+#define SWAPPER_INIT_MAP_SIZE  (_AC(1, UL) << SWAPPER_TABLE_SHIFT)
 
 /*
  * Initial memory map attributes.
@@ -55,10 +64,10 @@
 #define SWAPPER_PTE_FLAGS  PTE_TYPE_PAGE | PTE_AF | PTE_SHARED
 #define SWAPPER_PMD_FLAGS  PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S
 
-#ifdef CONFIG_ARM64_64K_PAGES
-#define SWAPPER_MM_MMUFLAGSPTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS
-#else
+#if ARM64_SWAPPER_USES_SECTION_MAPS
 #define SWAPPER_MM_MMUFLAGSPMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS
+#else
+#define SWAPPER_MM_MMUFLAGSPTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS
 #endif
 
 
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 9211b85..71230488 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -32,6 +32,7 @@
 
 #include asm/cputype.h
 #include asm/fixmap.h
+#include asm/kernel-pgtable.h
 #include asm/sections.h
 #include asm/setup.h
 #include asm/sizes.h
@@ -353,14 +354,11 @@ static void __init map_mem(void)
 * memory addressable from the initial direct kernel mapping.
 *
 * The initial direct kernel mapping, located at swapper_pg_dir, gives
-* us PUD_SIZE (4K pages) or PMD_SIZE (64K pages) memory starting from
-* PHYS_OFFSET (which must be aligned to 2MB as per
-* Documentation/arm64/booting.txt).
+* us PUD_SIZE (with SECTION maps, i.e, 4K) or PMD_SIZE (without
+* SECTION maps, i.e, 64K pages) memory starting from PHYS_OFFSET
+* (which must be aligned to 2MB as per 
Documentation/arm64/booting.txt).
 */
-   if (IS_ENABLED(CONFIG_ARM64_64K_PAGES))
-   limit = PHYS_OFFSET + PMD_SIZE;
-   else
-   limit = PHYS_OFFSET + PUD_SIZE;
+   limit = PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE;
memblock_set_current_limit(limit);
 
/* map all the memory banks */
@@ -371,21 +369,24 @@ static void __init map_mem(void)
if (start = end)
break;
 
-#ifndef CONFIG_ARM64_64K_PAGES
-   /*
-* For the first memory bank align the start address and
-* current memblock limit to prevent create_mapping() from
-* allocating pte page tables from unmapped memory.
-* When 64K pages are enabled, the pte page table for the
-* first PGDIR_SIZE is already present in swapper_pg_dir.
-*/
-   if (start < limit)
-   start = ALIGN(start, PMD_SIZE);
-   if (end > limit) {
-   limit = end & PMD_MASK;
-

[PATCH 07/14] arm64: Kconfig: Fix help text about AArch32 support with 64K pages

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

Update the help text for ARM64_64K_PAGES to reflect the reality
about AArch32 support.

Cc: Mark Rutland mark.rutl...@arm.com
Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/Kconfig |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index d1fb2a3..b247897 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -449,8 +449,8 @@ config ARM64_64K_PAGES
help
  This feature enables 64KB pages support (4KB by default)
  allowing only two levels of page tables and faster TLB
- look-up. AArch32 emulation is not available when this feature
- is enabled.
+ look-up. AArch32 emulation requires applications compiled
+ with 64K aligned segments.
 
 endchoice
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/14] arm64: Clean config usages for page size

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

We use !CONFIG_ARM64_64K_PAGES for CONFIG_ARM64_4K_PAGES
(and vice versa) in code. It all worked well, so far since
we only had two options. Now, with the introduction of 16K,
these cases will break. This patch cleans up the code to
use the required CONFIG symbol expression without the assumption
that !64K = 4K (and vice versa)

Cc: Ard Biesheuvel ard.biesheu...@linaro.org
Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Cc: Steve Capper steve.cap...@linaro.org
Acked-by: Mark Rutland mark.rutl...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/Kconfig   |4 ++--
 arch/arm64/Kconfig.debug |2 +-
 arch/arm64/include/asm/thread_info.h |2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 78b89fa..d1fb2a3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -550,7 +550,7 @@ config ARCH_WANT_GENERAL_HUGETLB
def_bool y
 
 config ARCH_WANT_HUGE_PMD_SHARE
-   def_bool y if !ARM64_64K_PAGES
+   def_bool y if ARM64_4K_PAGES
 
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
def_bool y
@@ -762,7 +762,7 @@ source fs/Kconfig.binfmt
 
 config COMPAT
bool Kernel support for 32-bit EL0
-   depends on !ARM64_64K_PAGES || EXPERT
+   depends on ARM64_4K_PAGES || EXPERT
select COMPAT_BINFMT_ELF
select HAVE_UID16
select OLD_SIGSUSPEND3
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index d6285ef..c24d6ad 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -77,7 +77,7 @@ config DEBUG_RODATA
   If in doubt, say Y
 
 config DEBUG_ALIGN_RODATA
-   depends on DEBUG_RODATA && !ARM64_64K_PAGES
+   depends on DEBUG_RODATA && ARM64_4K_PAGES
bool Align linker sections up to SECTION_SIZE
help
  If this option is enabled, sections that may potentially be marked as
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index dcd06d1..d9c8c9f 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -23,7 +23,7 @@
 
 #include linux/compiler.h
 
-#ifndef CONFIG_ARM64_64K_PAGES
+#ifdef CONFIG_ARM64_4K_PAGES
 #define THREAD_SIZE_ORDER  2
 #endif
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/14] arm64: Calculate size for idmap_pg_dir at compile time

2015-08-13 Thread Suzuki K. Poulose
From: Suzuki K. Poulose suzuki.poul...@arm.com

Now that we can calculate the number of levels required for
mapping a va width, reserve exact number of pages that would
be required to cover the idmap. The idmap should be able to handle
the maximum physical address size supported.

Cc: Ard Biesheuvel ard.biesheu...@linaro.org
Cc: Mark Rutland mark.rutl...@arm.com
Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
 arch/arm64/include/asm/boot.h   |1 +
 arch/arm64/include/asm/kernel-pgtable.h |7 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h
index 81151b6..678b63e 100644
--- a/arch/arm64/include/asm/boot.h
+++ b/arch/arm64/include/asm/boot.h
@@ -2,6 +2,7 @@
 #ifndef __ASM_BOOT_H
 #define __ASM_BOOT_H
 
+#include asm/page.h
 #include asm/sizes.h
 
 /*
diff --git a/arch/arm64/include/asm/kernel-pgtable.h 
b/arch/arm64/include/asm/kernel-pgtable.h
index 5876a36..def7168 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -33,16 +33,19 @@
  * map to pte level. The swapper also maps the FDT (see __create_page_tables
  * for more information). Note that the number of ID map translation levels
  * could be increased on the fly if system RAM is out of reach for the default
- * VA range, so 3 pages are reserved in all cases.
+ * VA range, so pages required to map highest possible PA are reserved in all
+ * cases.
  */
 #if ARM64_SWAPPER_USES_SECTION_MAPS
 #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1)
+#define IDMAP_PGTABLE_LEVELS   (ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT) - 1)
 #else
 #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS)
+#define IDMAP_PGTABLE_LEVELS   (ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT))
 #endif
 
 #define SWAPPER_DIR_SIZE   (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
-#define IDMAP_DIR_SIZE (3 * PAGE_SIZE)
+#define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
 
 /* Initial memory map size */
 #if ARM64_SWAPPER_USES_SECTION_MAPS
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 08/15] KVM: arm64: introduce ITS emulation file with stub functions

2015-08-13 Thread Eric Auger
On 07/10/2015 04:21 PM, Andre Przywara wrote:
 The ARM GICv3 ITS emulation code goes into a separate file, but
 needs to be connected to the GICv3 emulation, of which it is an
 option.
 Introduce the skeleton with function stubs to be filled later.
 Introduce the basic ITS data structure and initialize it, but don't
 return any success yet, as we are not yet ready for the show.
 
 Signed-off-by: Andre Przywara andre.przyw...@arm.com
 ---
  arch/arm64/kvm/Makefile|   1 +
  include/kvm/arm_vgic.h |   6 ++
  include/linux/irqchip/arm-gic-v3.h |   1 +
  virt/kvm/arm/its-emul.c| 125 
 +
  virt/kvm/arm/its-emul.h|  35 +++
  virt/kvm/arm/vgic-v3-emul.c|  24 ++-
  6 files changed, 189 insertions(+), 3 deletions(-)
  create mode 100644 virt/kvm/arm/its-emul.c
  create mode 100644 virt/kvm/arm/its-emul.h
 
 diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
 index f90f4aa..9803307 100644
 --- a/arch/arm64/kvm/Makefile
 +++ b/arch/arm64/kvm/Makefile
 @@ -25,5 +25,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
  kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v2-switch.o
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
 +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/its-emul.o
  kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v3-switch.o
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index 8c6cb0e..9e9d4aa 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -156,6 +156,11 @@ struct vgic_io_device {
   struct kvm_io_device dev;
  };
  
 +struct vgic_its {
 + boolenabled;
 + spinlock_t  lock;
 +};
 +
  struct vgic_dist {
   spinlock_t  lock;
   boolin_kernel;
 @@ -264,6 +269,7 @@ struct vgic_dist {
   u64 *pendbaser;
  
   boollpis_enabled;
 + struct vgic_its its;
  };
  
  struct vgic_v2_cpu_if {
 diff --git a/include/linux/irqchip/arm-gic-v3.h 
 b/include/linux/irqchip/arm-gic-v3.h
 index ffbc034..df4e527 100644
 --- a/include/linux/irqchip/arm-gic-v3.h
 +++ b/include/linux/irqchip/arm-gic-v3.h
 @@ -177,6 +177,7 @@
  #define GITS_CWRITER 0x0088
  #define GITS_CREADR  0x0090
  #define GITS_BASER   0x0100
 +#define GITS_IDREGS_BASE 0xffd0
  #define GITS_PIDR2   GICR_PIDR2
  
  #define GITS_TRANSLATER  0x10040
 diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
 new file mode 100644
 index 000..659dd39
 --- /dev/null
 +++ b/virt/kvm/arm/its-emul.c
 @@ -0,0 +1,125 @@
 +/*
 + * GICv3 ITS emulation
 + *
 + * Copyright (C) 2015 ARM Ltd.
 + * Author: Andre Przywara andre.przyw...@arm.com
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License version 2 as
 + * published by the Free Software Foundation.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program.  If not, see http://www.gnu.org/licenses/.
 + */
 +
 +#include linux/cpu.h
 +#include linux/kvm.h
 +#include linux/kvm_host.h
 +#include linux/interrupt.h
 +
 +#include linux/irqchip/arm-gic-v3.h
 +#include kvm/arm_vgic.h
 +
 +#include asm/kvm_emulate.h
 +#include asm/kvm_arm.h
 +#include asm/kvm_mmu.h
 +
 +#include vgic.h
 +#include its-emul.h
 +
 +static bool handle_mmio_misc_gits(struct kvm_vcpu *vcpu,
 +   struct kvm_exit_mmio *mmio,
 +   phys_addr_t offset)
 +{
 + return false;
 +}
 +
 +static bool handle_mmio_gits_idregs(struct kvm_vcpu *vcpu,
 + struct kvm_exit_mmio *mmio,
 + phys_addr_t offset)
 +{
 + return false;
 +}
 +
 +static bool handle_mmio_gits_cbaser(struct kvm_vcpu *vcpu,
 + struct kvm_exit_mmio *mmio,
 + phys_addr_t offset)
 +{
 + return false;
 +}
 +
 +static bool handle_mmio_gits_cwriter(struct kvm_vcpu *vcpu,
 +  struct kvm_exit_mmio *mmio,
 +  phys_addr_t offset)
 +{
 + return false;
 +}
 +
 +static bool handle_mmio_gits_creadr(struct kvm_vcpu *vcpu,
 + struct kvm_exit_mmio *mmio,
 + phys_addr_t offset)
 +{
 + return false;
 +}
 +
 +static const struct vgic_io_range vgicv3_its_ranges[] = {
 + {
 + .base   = GITS_CTLR,
 

Re: [PATCH 12/14] arm64: Check for selected granule support

2015-08-13 Thread Steve Capper
On 13 August 2015 at 12:34, Suzuki K. Poulose suzuki.poul...@arm.com wrote:
 From: Suzuki K. Poulose suzuki.poul...@arm.com

 Ensure that the selected page size is supported by the
 CPU(s).

 Cc: Mark Rutland mark.rutl...@arm.com
 Cc: Catalin Marinas catalin.mari...@arm.com
 Cc: Will Deacon will.dea...@arm.com
 Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
 ---
  arch/arm64/include/asm/sysreg.h |6 ++
  arch/arm64/kernel/head.S|   24 +++-
  2 files changed, 29 insertions(+), 1 deletion(-)

 diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
 index a7f3d4b..e01d323 100644
 --- a/arch/arm64/include/asm/sysreg.h
 +++ b/arch/arm64/include/asm/sysreg.h
 @@ -87,4 +87,10 @@ static inline void config_sctlr_el1(u32 clear, u32 set)
  }
  #endif

 +#define ID_AA64MMFR0_TGran4_SHIFT  28
 +#define ID_AA64MMFR0_TGran64_SHIFT 24
 +
 +#define ID_AA64MMFR0_TGran4_ENABLED0x0
 +#define ID_AA64MMFR0_TGran64_ENABLED   0x0
 +
  #endif /* __ASM_SYSREG_H */
 diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
 index 01b8e58..0cb04db 100644
 --- a/arch/arm64/kernel/head.S
 +++ b/arch/arm64/kernel/head.S
 @@ -31,10 +31,11 @@
  #include asm/cputype.h
  #include asm/kernel-pgtable.h
  #include asm/memory.h
 -#include asm/thread_info.h
  #include asm/pgtable-hwdef.h
  #include asm/pgtable.h
  #include asm/page.h
 +#include asm/sysreg.h
 +#include asm/thread_info.h
  #include asm/virt.h

  #define __PHYS_OFFSET  (KERNEL_START - TEXT_OFFSET)
 @@ -606,9 +607,25 @@ ENDPROC(__secondary_switched)
   *  x27 = *virtual* address to jump to upon completion
   *
   * other registers depend on the function called upon completion
 + * Checks if the selected granule size is supported by the CPU.
   */
 +#if defined(CONFIG_ARM64_64K_PAGES)
 +
 +#define ID_AA64MMFR0_TGran_SHIFT   ID_AA64MMFR0_TGran64_SHIFT
 +#define ID_AA64MMFR0_TGran_ENABLED ID_AA64MMFR0_TGran64_ENABLED
 +
 +#else
 +
 +#define ID_AA64MMFR0_TGran_SHIFT   ID_AA64MMFR0_TGran4_SHIFT
 +#define ID_AA64MMFR0_TGran_ENABLED ID_AA64MMFR0_TGran4_ENABLED
 +
 +#endif
 .section.idmap.text, ax
  __enable_mmu:
 +   mrs x1, ID_AA64MMFR0_EL1
 +   ubfx x2, x1, #ID_AA64MMFR0_TGran_SHIFT, 4
 +   cmp x2, #ID_AA64MMFR0_TGran_ENABLED
 +   b.ne __no_granule_support
 ldr x5, =vectors
 msr vbar_el1, x5
 msr ttbr0_el1, x25  // load TTBR0
 @@ -626,3 +643,8 @@ __enable_mmu:
 isb
 br  x27
  ENDPROC(__enable_mmu)
 +
 +__no_granule_support:
 +   wfe
 +   b __no_granule_support
 +ENDPROC(__no_granule_support)
 --
 1.7.9.5


Hi Suzuki,
Is it possible to tell the user that the kernel has failed to boot due
to the kernel granule being unsupported?

Cheers,
--
Steve
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] KVM: x86: return bool from x86_ops.sync_pir_to_irr

2015-08-13 Thread Radim Krčmář
True means that we have added PIR to IRR.

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm.c  |  4 ++--
 arch/x86/kvm/vmx.c  | 12 +++-
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09acaa64ef8e..b73696b59d77 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -831,7 +831,7 @@ struct kvm_x86_ops {
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
-   void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
+   bool (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 189e46479dd5..cd4ad20951c4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3758,9 +3758,9 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
return;
 }
 
-static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+static bool svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
-   return;
+   return false;
 }
 
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4cf25b90dbe0..e3ae8c236cca 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -817,7 +817,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
 static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
+static bool vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 static int alloc_identity_pagetable(struct kvm *kvm);
@@ -4430,19 +4430,21 @@ static void vmx_deliver_posted_interrupt(struct 
kvm_vcpu *vcpu, int vector)
kvm_vcpu_kick(vcpu);
 }
 
-static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+static bool vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
 
if (!pi_test_and_clear_on(&vmx->pi_desc))
-   return;
+   return false;
 
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+
+   return true;
 }
 
-static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
+static bool vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
 {
-   return;
+   return false;
 }
 
 /*
-- 
2.5.0

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] KVM: x86: fix edge EOI and IOAPIC reconfig race

2015-08-13 Thread Radim Krčmář
(The main problem is that we care about EOI of edge interrupts, but our
 house of cards started a long time ago, so overturning that decision is
 not ideal for a stable fix.)

KVM uses eoi_exit_bitmap to track vectors that need an action on EOI.
The problem is that IOAPIC can be reconfigured while an interrupt with
old configuration is pending and eoi_exit_bitmap only remembers the
newest configuration so EOI from the pending interrupt is not
recognized.

This is not a problem for level interrupts, because IOAPIC sends
interrupt with the new configuration.
And then there are edge interrupts with ACK notifiers, like i8254 timer;
things can happen in this order
 1) IOAPIC inject a vector from i8254
 2) guest reconfigures that vector's VCPU and therefore eoi_exit_bitmap
on original VCPU gets cleared
 3) guest's handler for the vector does EOI
 4) KVM's EOI handler doesn't pass that vector to IOAPIC because it is
not in that VCPU's eoi_exit_bitmap
 5) i8254 stops working

A simple solution is to set the IOAPIC vector in eoi_exit_bitmap if the
vector is in PIR/IRR/ISR.

This creates an unwanted situation if the vector is reused by a
non-IOAPIC source, but I think it is so rare that we don't want to make
the solution more sophisticated.  The simple solution also doesn't work
if we are reconfiguring the vector.  (Shouldn't happen in the wild and
I'd rather fix users of ACK notifiers instead of working around that.)

The are no races because ioapic injection and reconfig are locked.

Fixes: 638e7c03efea (KVM: x86: Add EOI exit bitmap inference)
[Before 638e7c03efea, this bug happened only with APICv.]
Fixes: c7c9c56ca26f (x86, apicv: add virtual interrupt delivery support)
Cc: sta...@vger.kernel.org
Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 arch/x86/kvm/ioapic.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 2dcda0f188ba..85d25fe25e39 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -239,6 +239,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 
*eoi_exit_bitmap)
union kvm_ioapic_redirect_entry *e;
int index;
 
+   if (kvm_x86_ops->sync_pir_to_irr(vcpu))
+   kvm_make_request(KVM_REQ_EVENT, vcpu);
+
spin_lock(&ioapic->lock);
for (index = 0; index < IOAPIC_NUM_PINS; index++) {
e = &ioapic->redirtbl[index];
@@ -246,7 +249,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 
*eoi_exit_bitmap)
kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, 
index) ||
index == RTC_GSI) {
if (kvm_apic_match_dest(vcpu, NULL, 0,
-   e->fields.dest_id, e->fields.dest_mode))
+e->fields.dest_id, e->fields.dest_mode) ||
+   (e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
+kvm_apic_pending_eoi(vcpu, e->fields.vector)))
__set_bit(e-fields.vector,
(unsigned long *)eoi_exit_bitmap);
}
-- 
2.5.0

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] KVM: x86: fix edge EOI and IOAPIC reconfig race

2015-08-13 Thread Radim Krčmář
[1/2] changes sync_pir_to_irr interface to avoid a definitely
superfluous KVM_REQ_EVENT in [2/2].  It's possible that we don't need
KVM_REQ_EVENT, but a slight slowdown in execution is worth the time
needed for a proof.  (A quick test showed that KVM_REQ_EVENT is always
already set if sync_pir_to_irr does something.)

[2/2] is applicable to userspace split IOAPIC as well, but I hope that
we want it to work like normal IOAPIC (no ACK for edge irq), so I
haven't fixed it.


Radim Krčmář (2):
  KVM: x86: return bool from x86_ops.sync_pir_to_irr
  KVM: x86: fix edge EOI and IOAPIC reconfig race

 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/ioapic.c   |  7 ++-
 arch/x86/kvm/svm.c  |  4 ++--
 arch/x86/kvm/vmx.c  | 12 +++-
 4 files changed, 16 insertions(+), 9 deletions(-)

-- 
2.5.0

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 12/14] arm64: Check for selected granule support

2015-08-13 Thread Suzuki K. Poulose

On 13/08/15 13:28, Steve Capper wrote:

On 13 August 2015 at 12:34, Suzuki K. Poulose suzuki.poul...@arm.com wrote:

From: Suzuki K. Poulose suzuki.poul...@arm.com

Ensure that the selected page size is supported by the
CPU(s).

Cc: Mark Rutland mark.rutl...@arm.com
Cc: Catalin Marinas catalin.mari...@arm.com
Cc: Will Deacon will.dea...@arm.com
Signed-off-by: Suzuki K. Poulose suzuki.poul...@arm.com
---
  arch/arm64/include/asm/sysreg.h |6 ++
  arch/arm64/kernel/head.S|   24 +++-
  2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index a7f3d4b..e01d323 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -87,4 +87,10 @@ static inline void config_sctlr_el1(u32 clear, u32 set)
  }
  #endif

+#define ID_AA64MMFR0_TGran4_SHIFT  28
+#define ID_AA64MMFR0_TGran64_SHIFT 24
+
+#define ID_AA64MMFR0_TGran4_ENABLED0x0
+#define ID_AA64MMFR0_TGran64_ENABLED   0x0
+
  #endif /* __ASM_SYSREG_H */
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 01b8e58..0cb04db 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -31,10 +31,11 @@
  #include asm/cputype.h
  #include asm/kernel-pgtable.h
  #include asm/memory.h
-#include asm/thread_info.h
  #include asm/pgtable-hwdef.h
  #include asm/pgtable.h
  #include asm/page.h
+#include asm/sysreg.h
+#include asm/thread_info.h
  #include asm/virt.h

  #define __PHYS_OFFSET  (KERNEL_START - TEXT_OFFSET)
@@ -606,9 +607,25 @@ ENDPROC(__secondary_switched)
   *  x27 = *virtual* address to jump to upon completion
   *
   * other registers depend on the function called upon completion
+ * Checks if the selected granule size is supported by the CPU.
   */
+#if defined(CONFIG_ARM64_64K_PAGES)
+
+#define ID_AA64MMFR0_TGran_SHIFT   ID_AA64MMFR0_TGran64_SHIFT
+#define ID_AA64MMFR0_TGran_ENABLED ID_AA64MMFR0_TGran64_ENABLED
+
+#else
+
+#define ID_AA64MMFR0_TGran_SHIFT   ID_AA64MMFR0_TGran4_SHIFT
+#define ID_AA64MMFR0_TGran_ENABLED ID_AA64MMFR0_TGran4_ENABLED
+
+#endif
 .section.idmap.text, ax
  __enable_mmu:
+   mrs x1, ID_AA64MMFR0_EL1
+   ubfx x2, x1, #ID_AA64MMFR0_TGran_SHIFT, 4
+   cmp x2, #ID_AA64MMFR0_TGran_ENABLED
+   b.ne __no_granule_support
 ldr x5, =vectors
 msr vbar_el1, x5
 msr ttbr0_el1, x25  // load TTBR0
@@ -626,3 +643,8 @@ __enable_mmu:
 isb
 br  x27
  ENDPROC(__enable_mmu)
+
+__no_granule_support:
+   wfe
+   b __no_granule_support
+ENDPROC(__no_granule_support)
--
1.7.9.5



Hi Suzuki,
Is it possible to tell the user that the kernel has failed to boot due
to the kernel granule being unsupported?


We don't have anything up at this time. The looping address is actually a clue
to the (expert) user. Not sure we can do something, until we get something like 
DEBUG_LL(?)
Or we should let it continue and end in a panic(?). The current situation can 
boot a
multi-cluster system with boot cluster having the Tgran support(which doesn't 
make a
strong use case though). I will try out some options and get back to you.


Thanks
Suzuki



Cheers,
--
Steve



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 09/15] KVM: arm64: implement basic ITS register handlers

2015-08-13 Thread Eric Auger
On 07/10/2015 04:21 PM, Andre Przywara wrote:
 Add emulation for some basic MMIO registers used in the ITS emulation.
 This includes:
 - GITS_{CTLR,TYPER,IIDR}
 - ID registers
 - GITS_{CBASER,CREADR,CWRITER}
   those implement the ITS command buffer handling
 
 Most of the handlers are pretty straight forward
straightforward?
, but CWRITER goes
 some extra miles to allow fine grained locking. The idea here
 is to let only the first instance iterate through the command ring
 buffer, CWRITER accesses on other VCPUs meanwhile will be picked up
 by that first instance and handled as well. The ITS lock is thus only
 hold for very small periods of time and is dropped before the actual
 command handler is called.
 
 Signed-off-by: Andre Przywara andre.przyw...@arm.com
 ---
  include/kvm/arm_vgic.h |   3 +
  include/linux/irqchip/arm-gic-v3.h |   8 ++
  virt/kvm/arm/its-emul.c| 205 
 +
  virt/kvm/arm/its-emul.h|   1 +
  virt/kvm/arm/vgic-v3-emul.c|   2 +
  5 files changed, 219 insertions(+)
 
 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index 9e9d4aa..b432055 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -159,6 +159,9 @@ struct vgic_io_device {
  struct vgic_its {
   boolenabled;
   spinlock_t  lock;
 + u64 cbaser;
 + int creadr;
 + int cwriter;
  };
  
  struct vgic_dist {
 diff --git a/include/linux/irqchip/arm-gic-v3.h 
 b/include/linux/irqchip/arm-gic-v3.h
 index df4e527..0b450c7 100644
 --- a/include/linux/irqchip/arm-gic-v3.h
 +++ b/include/linux/irqchip/arm-gic-v3.h
 @@ -179,15 +179,23 @@
  #define GITS_BASER   0x0100
  #define GITS_IDREGS_BASE 0xffd0
  #define GITS_PIDR2   GICR_PIDR2
 +#define GITS_PIDR4   0xffd0
 +#define GITS_CIDR0   0xfff0
 +#define GITS_CIDR1   0xfff4
 +#define GITS_CIDR2   0xfff8
 +#define GITS_CIDR3   0xfffc
  
  #define GITS_TRANSLATER  0x10040
  
 #define GITS_CTLR_ENABLE (1U << 0)
 #define GITS_CTLR_QUIESCENT  (1U << 31)
  
 +#define GITS_TYPER_PLPIS (1UL << 0)
 +#define GITS_TYPER_IDBITS_SHIFT  8
 #define GITS_TYPER_DEVBITS_SHIFT 13
 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
 #define GITS_TYPER_PTA   (1UL << 19)
 +#define GITS_TYPER_HWCOLLCNT_SHIFT   24
  
  #define GITS_CBASER_VALID(1UL  63)
  #define GITS_CBASER_nCnB (0UL  59)
 diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
 index 659dd39..b498f06 100644
 --- a/virt/kvm/arm/its-emul.c
 +++ b/virt/kvm/arm/its-emul.c
 @@ -32,10 +32,62 @@
  #include vgic.h
  #include its-emul.h
  
 +#define BASER_BASE_ADDRESS(x) ((x)  0xf000ULL)
 +
 +/* The distributor lock is held by the VGIC MMIO handler. */
  static bool handle_mmio_misc_gits(struct kvm_vcpu *vcpu,
 struct kvm_exit_mmio *mmio,
 phys_addr_t offset)
  {
 + struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
 + u32 reg;
 + bool was_enabled;
 +
 + switch (offset & ~3) {
 + case 0x00:  /* GITS_CTLR */
 + /* We never defer any command execution. */
 + reg = GITS_CTLR_QUIESCENT;
 + if (its->enabled)
 + reg |= GITS_CTLR_ENABLE;
 + was_enabled = its->enabled;
 + vgic_reg_access(mmio, reg, offset & 3,
 + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
 + its->enabled = !!(reg & GITS_CTLR_ENABLE);
 + return !was_enabled && its->enabled;
 + case 0x04:  /* GITS_IIDR */
 + reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
 + vgic_reg_access(mmio, reg, offset & 3,
 + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
 + break;
 + case 0x08:  /* GITS_TYPER */
 + /*
 +  * We use linear CPU numbers for redistributor addressing,
 +  * so GITS_TYPER.PTA is 0.
 +  * To avoid memory waste on the guest side, we keep the
 +  * number of IDBits and DevBits low for the time being.
 +  * This could later be made configurable by userland.
 +  * Since we have all collections in linked list, we claim
 +  * that we can hold all of the collection tables in our
 +  * own memory and that the ITT entry size is 1 byte (the
 +  * smallest possible one).
 +  */
 + reg = GITS_TYPER_PLPIS;
 + reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
 + reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
 + reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
 +  

Re: [PATCH 12/14] arm64: Check for selected granule support

2015-08-13 Thread Catalin Marinas
On Thu, Aug 13, 2015 at 03:45:07PM +0100, Suzuki K. Poulose wrote:
 On 13/08/15 13:28, Steve Capper wrote:
 On 13 August 2015 at 12:34, Suzuki K. Poulose suzuki.poul...@arm.com wrote:
   __enable_mmu:
 +   mrs x1, ID_AA64MMFR0_EL1
 +   ubfxx2, x1, #ID_AA64MMFR0_TGran_SHIFT, 4
 +   cmp x2, #ID_AA64MMFR0_TGran_ENABLED
 +   b.ne__no_granule_support
  ldr x5, =vectors
  msr vbar_el1, x5
  msr ttbr0_el1, x25  // load TTBR0
 @@ -626,3 +643,8 @@ __enable_mmu:
  isb
  br  x27
   ENDPROC(__enable_mmu)
 +
 +__no_granule_support:
 +   wfe
 +   b __no_granule_support
 +ENDPROC(__no_granule_support)
 --
 1.7.9.5
 
 
 Is it possible to tell the user that the kernel has failed to boot due
 to the kernel granule being unsupported?
 
 We don't have anything up at this time. The looping address is actually a 
 clue
 to the (expert) user. Not sure we can do something, until we get something 
 like DEBUG_LL(?)

No.

 Or we should let it continue and end in a panic(?). The current situation can 
 boot a
 multi-cluster system with boot cluster having the Tgran support(which doesn't 
 make a
 strong use case though). I will try out some options and get back to you.

If the boot CPU does not support 16KB pages, in general there isn't much
we can do since the console printing is done after we enabled the MMU.
Even mapping the UART address requires fixmap support and the PAGE_SIZE
is hard-coded in the kernel image. The DT is also mapped at run-time.

While in theory it's possible to fall back to a 4KB page size just
enough to load the DT and figure out the early console, I suggest we
just live with the looping address clue.

-- 
Catalin
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC/PATCH 2/3] x86/signal/64: Try to preserve hardware SS across 64-bit signal delivery

2015-08-13 Thread Andy Lutomirski
On Thu, Aug 13, 2015 at 2:41 PM, Linus Torvalds
torva...@linux-foundation.org wrote:
 On Thu, Aug 13, 2015 at 2:26 PM, Andy Lutomirski l...@amacapital.net wrote:

 VERW is no good, because it considers non-present segments to be
 writable.  Test cases for the win!

 Seriously? That's crazy. I don't think I've actually ever used VERW,
 but the documentation certainly says that the segment has to be
 writable, and I quote

   The validation performed is the same as is performed when a segment
 selector is loaded into the DS, ES, FS, or GS register, and the
 indicated access (read or write) is performed

 which damn well shouldn't work for non-present segments. Odd.


I can try to come up with a self-contained test case, but I'm
reasonably confident that I did it right and that I sprinkled the
right printks around.

--Andy
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC/PATCH 2/3] x86/signal/64: Try to preserve hardware SS across 64-bit signal delivery

2015-08-13 Thread Andy Lutomirski
On Thu, Aug 13, 2015 at 2:49 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Thu, Aug 13, 2015 at 2:41 PM, Linus Torvalds
 torva...@linux-foundation.org wrote:
 On Thu, Aug 13, 2015 at 2:26 PM, Andy Lutomirski l...@amacapital.net wrote:

 VERW is no good, because it considers non-present segments to be
 writable.  Test cases for the win!

 Seriously? That's crazy. I don't think I've actually ever used VERW,
 but the documentation certainly says that the segment has to be
 writable, and I quote

   The validation performed is the same as is performed when a segment
 selector is loaded into the DS, ES, FS, or GS register, and the
 indicated access (read or write) is performed

 which damn well shouldn't work for non-present segments. Odd.


 I can try to come up with a self-contained test case, but I'm
 reasonably confident that I did it right and that I sprinkled the
 right printks around.

The SDM pseudocode, the APM's description:

A segment is writable if all of the following apply:
 - the selector is not a null selector.
 - the descriptor is within the GDT or LDT limit.
 - the segment is a writable data segment.
 - the descriptor DPL is greater than or equal to both the CPL and RPL.

and the SDM's bullet points:

To set the ZF flag, the following conditions must be met:
 - The segment selector is not NULL.
 - The selector must denote a descriptor within the bounds of the
descriptor table (GDT or LDT).
 - The selector must denote the descriptor of a code or data segment
(not that of a system segment or gate).
 - For the VERR instruction, the segment must be readable.
 - For the VERW instruction, the segment must be a writable data segment.
 - If the segment is not a conforming code segment, the segment’s DPL
must be greater than...

all seem to suggest that P isn't checked.

If I quote a bit farther than you did:

The validation performed is the same as is performed when a segment
selector is loaded into the DS, ES, FS, or GS
register, and the indicated access (read or write) is performed. The
segment selector's value cannot result in a
protection exception, enabling the software to anticipate possible
segment access problems.

I think the idea is that VERW is supposed to check protection but not
presence, the idea being that a hypothetical non-paged segmented OS
would swap out a segment and mark it not-present, and the resulting
failure would be #NP, which isn't a protection exception.

Did anyone ever write an OS that used this stuff?  The Internet
suggests that OS/2 1.0 on the 286 supported swapping, so I bet it
actually used this mechanism, and woe unto any user (ahem, ring 1-3)
app that used LAR, checked the present bit, and blew up when a segment
was paged out.

--Andy
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC/PATCH 2/3] x86/signal/64: Try to preserve hardware SS across 64-bit signal delivery

2015-08-13 Thread Linus Torvalds
On Thu, Aug 13, 2015 at 2:26 PM, Andy Lutomirski l...@amacapital.net wrote:

 VERW is no good, because it considers non-present segments to be
 writable.  Test cases for the win!

Seriously? That's crazy. I don't think I've actually ever used VERW,
but the documentation certainly says that the segment has to be
writable, and I quote

  The validation performed is the same as is performed when a segment
selector is loaded into the DS, ES, FS, or GS register, and the
indicated access (read or write) is performed

which damn well shouldn't work for non-present segments. Odd.

  Linus
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC/PATCH 2/3] x86/signal/64: Try to preserve hardware SS across 64-bit signal delivery

2015-08-13 Thread Andy Lutomirski
On Thu, Aug 13, 2015 at 1:18 PM, Andy Lutomirski l...@kernel.org wrote:
 Linux used to have nearly useless SS handling for 64-bit signals.  Signal
 delivery to a 64-bit process would preserve the SS selector, but the
 selector wasn't saved in sigcontext.  Sigreturn would then clobber SS.
 If the signal being delivered was due to a bad SS, signal delivery would
 fail and the task would be killed.

 As of Linux 4.1, it's fixed: signal delivery sets up a valid SS in the
 hardware SS selector, saves the old SS in the sigcontext, and restores it
 properly in sigreturn.

 DOSEMU had a curious workaround for the old behavior: it saved the
 hardware SS selector it was given during signal delivery and fudged
 RIP and CS so that sigreturn would return to a trampoline that
 restored the old RIP, CS, and, importantly, SS.

 The upshot is that the change in sigcontext had no effect on DOSEMU
 (DOSEMU doesn't care what was in sigcontext, and the fact that the
 old SS is presented to the trampoline in new kernels is irrelevant
 because the trampoline uses long mode), but the change in signal
 delivery caused DOSEMU's workaround to restore __USER_DS instead of
 the correct pre-signal SS value.

 Do our best to work around it: explicitly check whether the old SS
 is usable and leave it alone during signal delivery if it is.
 Sigreturn is unchanged.

 Reported-by: Stas Sergeev s...@list.ru
 Fixes: c6f2062935c8 (x86/signal/64: Fix SS handling for signals delivered to 
 64-bit programs)
 Signed-off-by: Andy Lutomirski l...@kernel.org
 ---

 +   asm ("lar %[old_ss], %[ar]\n\t"
 +        "jz 1f\n\t"
 +        "xorl %[ar], %[ar]\n\t" /* If invalid, set ar = 0 */
 +        "1:"
 +        : [ar] "=r" (ar)
 +        : [old_ss] "rm" ((u16)regs->ss));
 +

Now that I sent this...

I should learn to think very carefully before doubting Linus'
off-the-cuff intuition about the x86 instruction set.  This can use
VERW after all, as long as it's careful to set RPL==3.

For purposes of testing DOSEMU testing, the version I sent should be
fine.  For purposes of review, please pretend that I was sensible and
used VERW instead of LAR.  This also means that the VMX fixlet is
optional, but I still think it's a good idea.

--Andy
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC/PATCH 2/3] x86/signal/64: Try to preserve hardware SS across 64-bit signal delivery

2015-08-13 Thread Andy Lutomirski
On Thu, Aug 13, 2015 at 1:25 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Thu, Aug 13, 2015 at 1:18 PM, Andy Lutomirski l...@kernel.org wrote:
 Linux used to have nearly useless SS handling for 64-bit signals.  Signal
 delivery to a 64-bit process would preserve the SS selector, but the
 selector wasn't saved in sigcontext.  Sigreturn would then clobber SS.
 If the signal being delivered was due to a bad SS, signal delivery would
 fail and the task would be killed.

 As of Linux 4.1, it's fixed: signal delivery sets up a valid SS in the
 hardware SS selector, saves the old SS in the sigcontext, and restores it
 properly in sigreturn.

 DOSEMU had a curious workaround for the old behavior: it saved the
 hardware SS selector it was given during signal delivery and fudged
 RIP and CS so that sigreturn would return to a trampoline that
 restored the old RIP, CS, and, importantly, SS.

 The upshot is that the change in sigcontext had no effect on DOSEMU
 (DOSEMU doesn't care what was in sigcontext, and the fact that the
 old SS is presented to the trampoline in new kernels is irrelevant
 because the trampoline uses long mode), but the change in signal
 delivery caused DOSEMU's workaround to restore __USER_DS instead of
 the correct pre-signal SS value.

 Do our best to work around it: explicitly check whether the old SS
 is usable and leave it alone during signal delivery if it is.
 Sigreturn is unchanged.

 Reported-by: Stas Sergeev s...@list.ru
 Fixes: c6f2062935c8 (x86/signal/64: Fix SS handling for signals delivered 
 to 64-bit programs)
 Signed-off-by: Andy Lutomirski l...@kernel.org
 ---

 +   asm (lar %[old_ss], %[ar]\n\t
 +jz 1f\n\t
 +xorl %[ar], %[ar]\n\t/* If invalid, set ar = 0 */
 +1:
 +: [ar] =r (ar)
 +: [old_ss] rm ((u16)regs-ss));
 +

 Now that I sent this...

 I should learn to think very carefully before doubting Linus'
 off-the-cuff intuition about the x86 instruction set.  This can use
 VERW after all, as long as it's careful to set RPL==3.

And the corollary to that is: I should also assume that Linus is out
to get me :)

VERW is no good, because it considers non-present segments to be
writable.  Test cases for the win!

--Andy
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC/PATCH 1/3] x86/kvm: Rename VMX's segment access rights defines

2015-08-13 Thread Andy Lutomirski
VMX encodes access rights differently from LAR, and the latter is
most likely what x86 people think of when they think of access
rights.

Rename them to avoid confusion.

Cc: kvm@vger.kernel.org
Signed-off-by: Andy Lutomirski l...@kernel.org
---
 arch/x86/include/asm/vmx.h | 46 +++---
 arch/x86/kvm/vmx.c | 14 +++---
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index da772edd19ab..78e243ae1786 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -367,29 +367,29 @@ enum vmcs_field {
 #define TYPE_PHYSICAL_APIC_EVENT(10 << 12)
 #define TYPE_PHYSICAL_APIC_INST (15 << 12)
 
-/* segment AR */
-#define SEGMENT_AR_L_MASK (1 << 13)
-
-#define AR_TYPE_ACCESSES_MASK 1
-#define AR_TYPE_READABLE_MASK (1 << 1)
-#define AR_TYPE_WRITEABLE_MASK (1 << 2)
-#define AR_TYPE_CODE_MASK (1 << 3)
-#define AR_TYPE_MASK 0x0f
-#define AR_TYPE_BUSY_64_TSS 11
-#define AR_TYPE_BUSY_32_TSS 11
-#define AR_TYPE_BUSY_16_TSS 3
-#define AR_TYPE_LDT 2
-
-#define AR_UNUSABLE_MASK (1 << 16)
-#define AR_S_MASK (1 << 4)
-#define AR_P_MASK (1 << 7)
-#define AR_L_MASK (1 << 13)
-#define AR_DB_MASK (1 << 14)
-#define AR_G_MASK (1 << 15)
-#define AR_DPL_SHIFT 5
-#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
-
-#define AR_RESERVD_MASK 0xfffe0f00
+/* segment AR in VMCS -- these are different from what LAR reports */
+#define VMX_SEGMENT_AR_L_MASK (1 << 13)
+
+#define VMX_AR_TYPE_ACCESSES_MASK 1
+#define VMX_AR_TYPE_READABLE_MASK (1 << 1)
+#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2)
+#define VMX_AR_TYPE_CODE_MASK (1 << 3)
+#define VMX_AR_TYPE_MASK 0x0f
+#define VMX_AR_TYPE_BUSY_64_TSS 11
+#define VMX_AR_TYPE_BUSY_32_TSS 11
+#define VMX_AR_TYPE_BUSY_16_TSS 3
+#define VMX_AR_TYPE_LDT 2
+
+#define VMX_AR_UNUSABLE_MASK (1 << 16)
+#define VMX_AR_S_MASK (1 << 4)
+#define VMX_AR_P_MASK (1 << 7)
+#define VMX_AR_L_MASK (1 << 13)
+#define VMX_AR_DB_MASK (1 << 14)
+#define VMX_AR_G_MASK (1 << 15)
+#define VMX_AR_DPL_SHIFT 5
+#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3)
+
+#define VMX_AR_RESERVD_MASK 0xfffe0f00
 
 #define TSS_PRIVATE_MEMSLOT(KVM_USER_MEM_SLOTS + 0)
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT   (KVM_USER_MEM_SLOTS + 1)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e856dd566f4c..d7ff79a5135b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3423,12 +3423,12 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
vmx_segment_cache_clear(to_vmx(vcpu));
 
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
-   if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
+   if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
pr_debug_ratelimited("%s: tss fixup for long mode. \n",
 __func__);
vmcs_write32(GUEST_TR_AR_BYTES,
-(guest_tr_ar & ~AR_TYPE_MASK)
-| AR_TYPE_BUSY_64_TSS);
+(guest_tr_ar & ~VMX_AR_TYPE_MASK)
+| VMX_AR_TYPE_BUSY_64_TSS);
}
vmx_set_efer(vcpu, vcpu-arch.efer | EFER_LMA);
 }
@@ -3719,7 +3719,7 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu)
return 0;
else {
int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
-   return AR_DPL(ar);
+   return VMX_AR_DPL(ar);
}
 }
 
@@ -3847,11 +3847,11 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
 
if (cs.unusable)
return false;
-   if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
+   if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
return false;
if (!cs.s)
return false;
-   if (cs.type & AR_TYPE_WRITEABLE_MASK) {
+   if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
if (cs.dpl > cs_rpl)
return false;
} else {
@@ -3901,7 +3901,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int 
seg)
return false;
if (!var.present)
return false;
-   if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
+   if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
if (var.dpl < rpl) /* DPL < RPL */
return false;
}
-- 
2.4.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC/PATCH 0/3] x86/signal/64: A better attempt at SS cleanup

2015-08-13 Thread Andy Lutomirski
This is almost certainly not 4.2 material.

This applies to -linux before the sigcontext revert.  If people like
these, I'll rebase them on top of the revert and I'll do something about
the UAPI build issue (if necessary -- renaming __pad0 may actually be
fine).

This should allow new programs to opt in to sane SS handling.  It makes
signal delivery reliable in the face of weird SS values (it was
unreliable before Linux 4.1, and it's unreliable again after the
revert).  Unlike the previous try, it should *not* break DOSEMU's hack
to figure out what SS was at the time of signal delivery, at least in
cases where DOSEMU wouldn't crash outright due to completely bogus SS
values.  Also unlike the previous try, it won't crash DOSEMU when DOSEMU
invalidates the old SS from a signal handler but doesn't know to update
the sigcontext.

The sigreturn_64 selftest is updated to use the new flag.  It passes.
For a real version of these patches, I'll add more tests to make sure
that we get the weird corner cases right.  (There are probably cases
where this isn't quite right on Xen, too, but we might not care.)

Andy Lutomirski (3):
  x86/kvm: Rename VMX's segment access rights defines
  x86/signal/64: Try to preserve hardware SS across 64-bit signal
delivery
  x86/signal/64: Add explicit controls for sigcontext SS handling

 arch/x86/include/asm/desc_defs.h| 23 +++
 arch/x86/include/asm/sighandling.h  |  1 -
 arch/x86/include/asm/vmx.h  | 46 +++---
 arch/x86/include/uapi/asm/ucontext.h| 26 +---
 arch/x86/kernel/signal.c| 70 -
 arch/x86/kvm/vmx.c  | 14 +++
 tools/testing/selftests/x86/sigreturn.c | 26 
 7 files changed, 160 insertions(+), 46 deletions(-)

-- 
2.4.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC/PATCH 3/3] x86/signal/64: Add explicit controls for sigcontext SS handling

2015-08-13 Thread Andy Lutomirski
This adds two new uc_flags flags.  UC_SAVED_SS will be set for all
64-bit signals (including x32).  It indicates that the saved SS field
is valid and that the kernel understands UC_RESTORE_SS.

The kernel will *not* set UC_RESTORE_SS.  User signal handlers can
set UC_RESTORE_SS themselves to indicate that sigreturn should
restore SS from the sigcontext.

64-bit programs that use segmentation are encouraged to check
UC_SAVED_SS and set UC_RESTORE_SS in their signal handlers.  This is
the only straightforward way to cause sigreturn to restore SS.  (The
only non-test program that I know of that uses segmentation in a
64-bit binary is DOSEMU, and DOSEMU currently uses a nasty
trampoline to work around the lack of this mechanism in old kernels.
It could detect UC_RESTORE_SS and use it to avoid needing a
trampoline.)

Cc: Stas Sergeev s...@list.ru
Cc: Linus Torvalds torva...@linux-foundation.org
Cc: Cyrill Gorcunov gorcu...@gmail.com
Cc: Pavel Emelyanov xe...@parallels.com
Signed-off-by: Andy Lutomirski l...@kernel.org
---
 arch/x86/include/asm/sighandling.h  |  1 -
 arch/x86/include/uapi/asm/ucontext.h| 26 +
 arch/x86/kernel/signal.c| 41 ++---
 tools/testing/selftests/x86/sigreturn.c | 26 +
 4 files changed, 80 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/sighandling.h 
b/arch/x86/include/asm/sighandling.h
index 89db46752a8f..452c88b8ad06 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -13,7 +13,6 @@
 X86_EFLAGS_CF | X86_EFLAGS_RF)
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
 int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 struct pt_regs *regs, unsigned long mask);
 
diff --git a/arch/x86/include/uapi/asm/ucontext.h 
b/arch/x86/include/uapi/asm/ucontext.h
index b7c29c8017f2..964bc3b46ff3 100644
--- a/arch/x86/include/uapi/asm/ucontext.h
+++ b/arch/x86/include/uapi/asm/ucontext.h
@@ -1,11 +1,27 @@
 #ifndef _ASM_X86_UCONTEXT_H
 #define _ASM_X86_UCONTEXT_H
 
-#define UC_FP_XSTATE   0x1 /* indicates the presence of extended state
-* information in the memory layout pointed
-* by the fpstate pointer in the ucontext's
-* sigcontext struct (uc_mcontext).
-*/
+/*
+ * indicates the presence of extended state
+ * information in the memory layout pointed
+ * by the fpstate pointer in the ucontext's
+ * sigcontext struct (uc_mcontext).
+ */
+#define UC_FP_XSTATE   0x1
+
+#ifdef __x86_64__
+/*
+ * UC_SAVED_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext.  Kernels that set UC_SAVED_SS
+ * allow signal handlers to set UC_RESTORE_SS; if UC_RESTORE_SS is set,
+ * then sigreturn will restore SS.
+ *
+ * For compatibility with old programs, the kernel will *not* set
+ * UC_RESTORE_SS when delivering signals.
+ */
+#define UC_SAVED_SS0x2
+#define UC_RESTORE_SS  0x4
+#endif
 
 #include <asm-generic/ucontext.h>
 
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 784af1e49fc1..746250e9bce1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,7 +61,9 @@
regs-seg = GET_SEG(seg) | 3;   \
 } while (0)
 
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
+static int restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *sc,
+ unsigned long uc_flags)
 {
void __user *buf;
unsigned int tmpflags;
@@ -94,7 +96,19 @@ int restore_sigcontext(struct pt_regs *regs, struct 
sigcontext __user *sc)
 #endif /* CONFIG_X86_64 */
 
COPY_SEG_CPL3(cs);
+
+#ifdef CONFIG_X86_64
+   /*
+* For the 64-bit ABI, we only restore SS if UC_RESTORE_SS
+* is set.  Otherwise we rely on the fact that regs-ss
+* is already set to __USER_DS by the SYSCALL entry code.
+*/
+   if (uc_flags  UC_RESTORE_SS)
+   COPY_SEG_CPL3(ss);
+#else
+   /* For the 32-bit ABI, we always restore SS. */
COPY_SEG_CPL3(ss);
+#endif
 
get_user_ex(tmpflags, sc-flags);
regs-flags = (regs-flags  ~FIX_EFLAGS) | (tmpflags  
FIX_EFLAGS);
@@ -336,6 +350,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
void __user *restorer;
int err = 0;
void __user *fpstate = NULL;
+   unsigned long flags = 0;
 
frame = get_sigframe(ksig-ka, regs, sizeof(*frame), fpstate);
 
@@ -349,9 +364,12 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
 
/* Create the ucontext.  */
   

Re: [PATCH] book3s_hv_rmhandlers:Pass the correct trap argument to kvmhv_commence_exit

2015-08-13 Thread Sam Bobroff
On Thu, May 21, 2015 at 01:57:04PM +0530, Gautham R. Shenoy wrote:
 In guest_exit_cont we call kvmhv_commence_exit which expects the trap
 number as the argument. However r3 doesn't contain the trap number at
 this point and as a result we would be calling the function with a
 spurious trap number.
 
 Fix this by copying r12 into r3 before calling kvmhv_commence_exit as
 r12 contains the trap number
 
 Signed-off-by: Gautham R. Shenoy e...@linux.vnet.ibm.com

Hi Gautham,

I agree with your logic: r3 is quite clearly corrupted in that path. So:

Reviewed-by: Sam Bobroff sam.bobr...@au1.ibm.com

Just one comment: Do you have a case of this causing some visible problem due
to the corrupted trap number? (I'll test the patch if you do.)

Cheers,
Sam.

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Is it possible to have a pre-format qcow2 container with balloon file?

2015-08-13 Thread laalaa
I wish to have a big qcow2 container, say 1TB, to hold the growing bulk of 
data.  At the beginning it only gets 100GB of data.  To save storage, I 
would create an empty container, say:

# qemu-img create -f qcow2 data.qcow2 1000G

When time pass, some files saved, some files deleted.  The data size may still 
be 100GB, while the qcow2 container may have grown to 400GB.

What I understand is that space freed by deleting inodes inside the VM cannot 
be reclaimed in the qcow2 file due to POSIX limitations.

So, what I do now is:

1. In server, create a qcow2 with 1000G size
# qemu-img create -f qcow2 data.qcow2 1000G

2. Inside VM, format the qcow2 to ext4 (or whatever format)
# mkfs.ext4 /dev/vdb

3. Inside VM, create 8 x 100GB of dummy files (zero.1 to zero.8)
# mount /dev/vdb /mnt/vdb
# for i in 1 2 3 4 5 6 7 8; do dd if=/dev/zero of=/mnt/vdb/zero.$i bs=100 
count=10; done
# umount /dev/vdb

4. In server, compress the qcow2
# qemu-img convert -c -f qcow2 -O qcow2 data.qcow2 data-compressed.qcow2

This effectively limit qcow2 file to a maximum size of 200GB no matter how much 
read/write it got.  In case the data size is growing, I can simply remove 
zero.1 to zero.8 on demand without data migration / partitioning / 
down-time.

The question is: is it possible to have a feature that creates a pre-formatted 
(say, ext4) qcow2 container already filled with balloon files (zero.xxx), so that 
the above steps can be done in one step?  That is, without needing to create the 
balloon files to inflate the qcow2 to its 1TB size and then run the compress 
command to trim it down (which is slow).

  --
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm:powerpc:Fix return statements for wrapper functions in the file book3s_64_mmu_hv.c

2015-08-13 Thread Michael Ellerman
On Wed, 2015-08-12 at 21:06 +0200, Alexander Graf wrote:
 
 On 10.08.15 17:27, Nicholas Krause wrote:
  This fixes the wrapper functions kvm_unmap_hva_hv and the function
  kvm_unmap_hva_range_hv to return the return value of the function
  kvm_handle_hva or kvm_handle_hva_range that they are wrapped to
  call internally rather than always making the caller of these
  wrapper functions think they always run successfully by returning
  the value of zero directly.
  
  Signed-off-by: Nicholas Krause xerofo...@gmail.com
 
 Paul, could you please take on this one?

Paul's away for a while can you take it directly?

cheers


--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm:powerpc:Fix return statements for wrapper functions in the file book3s_64_mmu_hv.c

2015-08-13 Thread Michael Ellerman
On Wed, 2015-08-12 at 21:06 +0200, Alexander Graf wrote:
 
 On 10.08.15 17:27, Nicholas Krause wrote:
  This fixes the wrapper functions kvm_unmap_hva_hv and the function
  kvm_unmap_hva_range_hv to return the return value of the function
  kvm_handle_hva or kvm_handle_hva_range that they are wrapped to
  call internally rather than always making the caller of these
  wrapper functions think they always run successfully by returning
  the value of zero directly.
  
  Signed-off-by: Nicholas Krause xerofo...@gmail.com
 
 Paul, could you please take on this one?

Paul's away for a while can you take it directly?

cheers


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 10/15] KVM: arm64: add data structures to model ITS interrupt translation

2015-08-13 Thread Eric Auger

On 07/10/2015 04:21 PM, Andre Przywara wrote:
 The GICv3 Interrupt Translation Service (ITS) uses tables in memory
 to allow a sophisticated interrupt routing. It features device tables,
 an interrupt table per device and a table connecting collections to
 actual CPUs (aka. redistributors in the GICv3 lingo).
 Since the interrupt numbers for the LPIs are allocated quite sparsely
 and the range can be quite huge (8192 LPIs being the minimum), using
 bitmaps or arrays for storing information is a waste of memory.
 We use linked lists instead, which we iterate linearly. This works
 very well with the actual number of LPIs/MSIs in the guest being
 quite low. Should the number of LPIs exceed the number where iterating
 through lists seems acceptable, we can later revisit this and use more
 efficient data structures.
 
 Signed-off-by: Andre Przywara andre.przyw...@arm.com
 ---
  include/kvm/arm_vgic.h  |  3 +++
  virt/kvm/arm/its-emul.c | 48 
  2 files changed, 51 insertions(+)
 
 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index b432055..1648668 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -25,6 +25,7 @@
  #include <linux/spinlock.h>
  #include <linux/types.h>
  #include <kvm/iodev.h>
 +#include <linux/list.h>
  
  #define VGIC_NR_IRQS_LEGACY  256
  #define VGIC_NR_SGIS 16
 @@ -162,6 +163,8 @@ struct vgic_its {
   u64 cbaser;
   int creadr;
   int cwriter;
 + struct list_headdevice_list;
 + struct list_headcollection_list;
  };
  
  struct vgic_dist {
 diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
 index b498f06..7f217fa 100644
 --- a/virt/kvm/arm/its-emul.c
 +++ b/virt/kvm/arm/its-emul.c
 @@ -21,6 +21,7 @@
  #include <linux/kvm.h>
  #include <linux/kvm_host.h>
  #include <linux/interrupt.h>
 +#include <linux/list.h>
  
  #include <linux/irqchip/arm-gic-v3.h>
  #include <kvm/arm_vgic.h>
 @@ -32,6 +33,25 @@
  #include "vgic.h"
  #include "its-emul.h"
  
 +struct its_device {
 + struct list_head dev_list;
 + struct list_head itt;
 + u32 device_id;
 +};
 +
 +struct its_collection {
 + struct list_head coll_list;
 + u32 collection_id;
 + u32 target_addr;
 +};
 +
 +struct its_itte {
 + struct list_head itte_list;
 + struct its_collection *collection;
 + u32 lpi;
 + u32 event_id;
 +};
 +
  #define BASER_BASE_ADDRESS(x) ((x)  0xf000ULL)
  
  /* The distributor lock is held by the VGIC MMIO handler. */
 @@ -311,6 +331,9 @@ int vits_init(struct kvm *kvm)
  
   spin_lock_init(its-lock);
  
 + INIT_LIST_HEAD(its-device_list);
 + INIT_LIST_HEAD(its-collection_list);
 +
   its-enabled = false;
  
   return -ENXIO;
 @@ -320,11 +343,36 @@ void vits_destroy(struct kvm *kvm)
  {
   struct vgic_dist *dist = kvm-arch.vgic;
   struct vgic_its *its = dist-its;
 + struct its_device *dev;
 + struct its_itte *itte;
 + struct list_head *dev_cur, *dev_temp;
 + struct list_head *cur, *temp;
  
   if (!vgic_has_its(kvm))
   return;
  
 + if (!its-device_list.next)
Why not use list_empty? But I think I would simply remove this since
the empty case is handled below...
 + return;
 +
 + spin_lock(its-lock);
 + list_for_each_safe(dev_cur, dev_temp, its-device_list) {
 + dev = container_of(dev_cur, struct its_device, dev_list);
isn't the usage of list_for_each_entry_safe more synthetic here?
 + list_for_each_safe(cur, temp, dev-itt) {
 + itte = (container_of(cur, struct its_itte, itte_list));
same

Eric
 + list_del(cur);
 + kfree(itte);
 + }
 + list_del(dev_cur);
 + kfree(dev);
 + }
 +
 + list_for_each_safe(cur, temp, its-collection_list) {
 + list_del(cur);
 + kfree(container_of(cur, struct its_collection, coll_list));
 + }
 +
   kfree(dist-pendbaser);
  
   its-enabled = false;
 + spin_unlock(its-lock);
  }
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html