Re: [RFC PATCH 3/6] kvm: arm64: Fix up RELR relocation in hyp code/data

2020-11-24 Thread Marc Zyngier

On 2020-11-19 16:25, David Brazdil wrote:
The arm64 kernel also supports packing of relocation data using the RELR
format. Implement a parser of RELR data and fixup the relocations using
the same infra as RELA relocs.

Signed-off-by: David Brazdil 
---
 arch/arm64/kvm/va_layout.c | 41 ++
 1 file changed, 41 insertions(+)

diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index b80fab974896..7f45a98eacfd 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -145,6 +145,43 @@ static void __fixup_hyp_rela(void)
__fixup_hyp_rel(rel[i].r_offset);
 }

+#ifdef CONFIG_RELR
+static void __fixup_hyp_relr(void)
+{
+   u64 *rel, *end;
+
+   rel = (u64*)(kimage_vaddr + __load_elf_u64(__relr_offset));
+   end = rel + (__load_elf_u64(__relr_size) / sizeof(*rel));
+
+   while (rel < end) {
+   unsigned n;
+   u64 addr = *(rel++);
+
+   /* Address must not have the LSB set. */
+   BUG_ON(addr & BIT(0));
+
+   /* Fix up the first address of the chain. */
+   __fixup_hyp_rel(addr);
+
+   /*
+* Loop over bitmaps, i.e. as long as words' LSB is 1.
+* Each bit (ordered from LSB to MSB) represents one word from
+* the last full address (exclusive). If the corresponding bit
+* is 1, there is a relative relocation on that word.
+*/


What is the endianness of this bitmap? Is it guaranteed to be in
CPU-endian format?


+   for (n = 0; rel < end && (*rel & BIT(0)); n++) {
+   unsigned i;
+   u64 bitmap = *(rel++);


nit: if you change this u64 for an unsigned long...


+
+   for (i = 1; i < 64; ++i) {
+   if ((bitmap & BIT(i)))
+   __fixup_hyp_rel(addr + 8 * (63 * n + i));
+   }


... this can be written as:

i = 1;
for_each_set_bit_from(i, &bitmap, 64)
__fixup_hyp_rel(addr + 8 * (63 * n + i));
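
Putting the two nits together, the enclosing loop could read (a minimal,
untested sketch using the names from the patch, with bitmap switched to
unsigned long):

	for (n = 0; rel < end && (*rel & BIT(0)); n++) {
		/* Bit 0 is the continuation marker, not an offset. */
		unsigned int i = 1;
		unsigned long bitmap = *(rel++);

		for_each_set_bit_from(i, &bitmap, 64)
			__fixup_hyp_rel(addr + 8 * (63 * n + i));
	}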


+   }
+   }
+}
+#endif
+
 /*
  * The kernel relocated pointers to kernel VA. Iterate over relocations in
  * the hypervisor ELF sections and convert them to hyp VA. This avoids the

@@ -156,6 +193,10 @@ __init void kvm_fixup_hyp_relocations(void)
return;

__fixup_hyp_rela();
+
+#ifdef CONFIG_RELR
+   __fixup_hyp_relr();
+#endif
 }

 static u32 compute_instruction(int n, u32 rd, u32 rn)


Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [RFC PATCH 2/6] kvm: arm64: Fix up RELA relocations in hyp code/data

2020-11-24 Thread Marc Zyngier

On 2020-11-19 16:25, David Brazdil wrote:

KVM nVHE code runs under a different VA mapping than the kernel, hence
so far it relied only on PC-relative addressing to avoid accidentally
using a relocated kernel VA from a constant pool (see hyp_symbol_addr).

So as to reduce the possibility of a programmer error, fixup the
relocated addresses instead. Let the kernel relocate them to kernel VA
first, but then iterate over them again, filter those that point to hyp
code/data and convert the kernel VA to hyp VA.

This is done after kvm_compute_layout and before apply_alternatives.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_mmu.h |  1 +
 arch/arm64/kernel/smp.c  |  4 +-
 arch/arm64/kvm/va_layout.c   | 76 
 3 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 5168a0c516ae..e5226f7e4732 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -105,6 +105,7 @@ alternative_cb_end
 void kvm_update_va_mask(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
 void kvm_compute_layout(void);
+void kvm_fixup_hyp_relocations(void);

 static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 {
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 18e9727d3f64..30241afc2c93 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -434,8 +434,10 @@ static void __init hyp_mode_check(void)
   "CPU: CPUs started in inconsistent modes");
else
pr_info("CPU: All CPU(s) started at EL1\n");
-   if (IS_ENABLED(CONFIG_KVM))
+   if (IS_ENABLED(CONFIG_KVM)) {
kvm_compute_layout();
+   kvm_fixup_hyp_relocations();
+   }
 }

 void __init smp_cpus_done(unsigned int max_cpus)
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index d8cc51bd60bf..b80fab974896 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 

@@ -82,6 +83,81 @@ __init void kvm_compute_layout(void)
init_hyp_physvirt_offset();
 }

+#define __load_elf_u64(s)  \
+   ({  \
+   extern u64 s;   \
+   u64 val;\
+   \
+   asm ("ldr %0, =%1" : "=r"(val) : "S"(&s));\
+   val;\
+   })


I'm not sure I get the rationale behind the naming here. None of this
has much to do with ELF, but seems to just load a value from a
constant pool.


+
+static bool __is_within_bounds(u64 addr, char *start, char *end)
+{
+   return start <= (char*)addr && (char*)addr < end;
+}
+
+static bool __is_in_hyp_section(u64 addr)
+{
+   return __is_within_bounds(addr, __hyp_text_start, __hyp_text_end) ||
+	   __is_within_bounds(addr, __hyp_rodata_start, __hyp_rodata_end) ||
+  __is_within_bounds(addr,
+ CHOOSE_NVHE_SYM(__per_cpu_start),
+ CHOOSE_NVHE_SYM(__per_cpu_end));
+}
+
+static void __fixup_hyp_rel(u64 addr)
+{
+   u64 *ptr, kern_va, hyp_va;
+
+   /* Adjust the relocation address taken from ELF for KASLR. */
+   addr += kaslr_offset();
+
+   /* Skip addresses not in any of the hyp sections. */
+   if (!__is_in_hyp_section(addr))
+   return;
+
+   /* Get the LM alias of the relocation address. */
+   ptr = (u64*)kvm_ksym_ref((void*)addr);


Why the casting? We should be perfectly fine without.

nit: we really need to change the name of this helper, it doesn't have
anything to do with symbols anymore. And actually, lm_alias() *is* the
right thing to use here (we don't relocate anything on VHE).


+
+   /*
+* Read the value at the relocation address. It has already been
+* relocated to the actual kernel kimg VA.
+*/
+   kern_va = (u64)kvm_ksym_ref((void*)*ptr);


Same comment.


+
+   /* Convert to hyp VA. */
+   hyp_va = __early_kern_hyp_va(kern_va);
+
+   /* Store hyp VA at the relocation address. */
+   *ptr = __early_kern_hyp_va(kern_va);
+}
+
+static void __fixup_hyp_rela(void)
+{
+   Elf64_Rela *rel;
+   size_t i, n;
+
+   rel = (Elf64_Rela*)(kimage_vaddr + __load_elf_u64(__rela_offset));
+   n = __load_elf_u64(__rela_size) / sizeof(*rel);
+
+   for (i = 0; i < n; ++i)
+   __fixup_hyp_rel(rel[i].r_offset);
+}
+
+/*
+ * The kernel relocated pointers to kernel VA. Iterate over relocations in
+ * the hypervisor ELF sections and convert them to hyp VA. This avoids the
+ * need to only use PC-relative addressing

Re: [PATCH 3/3] pinctrl: qcom: Clear possible pending irq when remuxing GPIOs

2020-11-24 Thread Marc Zyngier

On 2020-11-24 12:43, Maulik Shah wrote:

Hi Marc,

On 11/24/2020 4:45 PM, Marc Zyngier wrote:

On 2020-11-24 10:37, Maulik Shah wrote:

[...]


  static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
    unsigned function,
    unsigned group)
  {
  struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
+    struct gpio_chip *gc = &pctrl->chip;
+    unsigned int irq = irq_find_mapping(gc->irq.domain, group);
  const struct msm_pingroup *g;
  unsigned long flags;
  u32 val, mask;
+    u32 oldval;
+    u32 old_i;
  int i;
    g = &pctrl->soc->groups[group];
@@ -187,15 +215,26 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,

  if (WARN_ON(i == g->nfuncs))
  return -EINVAL;
  -    raw_spin_lock_irqsave(&pctrl->lock, flags);
+    disable_irq(irq);
  -    val = msm_readl_ctl(pctrl, g);
+    raw_spin_lock_irqsave(&pctrl->lock, flags);
+    oldval = val = msm_readl_ctl(pctrl, g);
  val &= ~mask;
  val |= i << g->mux_bit;
  msm_writel_ctl(val, pctrl, g);
-
  raw_spin_unlock_irqrestore(&pctrl->lock, flags);
  +    /*
+ * Clear IRQs if switching to/from GPIO mode since muxing to/from
+ * the GPIO path can cause phantom edges.
+ */
+    old_i = (oldval & mask) >> g->mux_bit;
+    if (old_i != i &&
+    (i == pctrl->soc->gpio_func || old_i == pctrl->soc->gpio_func))

+    msm_pinctrl_clear_pending_irq(pctrl, group, irq);


disable_irq() and enable_irq() should be moved inside this if block, as
the only use for them is to mask the IRQ when switching back to GPIO IRQ
mode?

I also don't think we should leave the IRQ enabled at the end of this
function by default; we probably need to check if the IRQ was already
unmasked before disabling it, and only then call enable_irq().


Why? It looks to me that this reproduces the behaviour of
IRQCHIP_SET_TYPE_MASKED, which is highly desirable. What problem are you
trying to address with this?


Correct, this is trying to reproduce the behaviour of
IRQCHIP_SET_TYPE_MASKED, which I guess is OK once it is moved inside the
if block, as that is the place where it switches to IRQ mode.

But there is a problem with leaving it enabled at the end of the
set_direction callbacks, see below.






+
+    enable_irq(irq);
+
  return 0;
  }
  @@ -456,32 +495,45 @@ static const struct pinconf_ops msm_pinconf_ops = {
  static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset)

  {
  const struct msm_pingroup *g;
+    unsigned int irq = irq_find_mapping(chip->irq.domain, offset);
  struct msm_pinctrl *pctrl = gpiochip_get_data(chip);
  unsigned long flags;
+    u32 oldval;
  u32 val;
    g = &pctrl->soc->groups[offset];
  +    disable_irq(irq);
+
  raw_spin_lock_irqsave(&pctrl->lock, flags);
  -    val = msm_readl_ctl(pctrl, g);
+    oldval = val = msm_readl_ctl(pctrl, g);
  val &= ~BIT(g->oe_bit);
  msm_writel_ctl(val, pctrl, g);
    raw_spin_unlock_irqrestore(&pctrl->lock, flags);
  +    if (oldval != val)
+    msm_pinctrl_clear_pending_irq(pctrl, offset, irq);
+
+    enable_irq(irq);


I do not think we need disable_irq() and enable_irq() here; changing the
direction to input does not mean it is being used for interrupts only, it
may be set up for something like Rx mode in a UART.

The client driver should enable the IRQ when needed.


And the kernel doesn't expect random interrupts to fire. Again, what
are you trying to fix by removing these?


I see that leaving the IRQ enabled here can cause problems. For example,
the qcom_geni_serial.c driver sets the IRQ_NOAUTOEN flag before
requesting the IRQ so that it is not enabled automatically.

See the snippet below:
    irq_set_status_flags(uport->irq, IRQ_NOAUTOEN);
    ret = devm_request_irq(uport->dev, uport->irq, qcom_geni_serial_isr,
    IRQF_TRIGGER_HIGH, port->name, uport);

Later, when devm_request_irq() invokes the .irq_request_resources
callback, it reaches msm_gpio_irq_reqres(), from where
msm_gpio_direction_input() is called, which leaves the IRQ enabled at the
end with enable_irq(), which was not expected by the driver.


No it doesn't. disable_irq()/enable_irq() are designed to nest.
If the interrupt line was disabled before the disable/enable
sequence, it will still be disabled after.
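
For illustration, this is how the depths evolve in the geni case (a
sketch of the semantics only; IRQ_NOAUTOEN makes request_irq() leave the
line at depth 1, i.e. masked):

	/* after devm_request_irq() with IRQ_NOAUTOEN: depth == 1, masked */
	disable_irq(irq);	/* depth 1 -> 2, still masked */
	/* ... remux / change direction ... */
	enable_irq(irq);	/* depth 2 -> 1, *still* masked */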


It will cause an IRQ storm, since the UART geni driver uses the GPIO in
Rx mode when out of suspend. The IRQ mode of the GPIO is enabled on
suspend entry only. During resume the IRQ will again be disabled and the
GPIO will be switched back to Rx mode.


I don't see how this contradicts what is above. If the interrupt was
disabled before hitting this sequence, it will still be disabled after.
Am I missing something? Have you actually seen the problem on HW?

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH 3/3] pinctrl: qcom: Clear possible pending irq when remuxing GPIOs

2020-11-24 Thread Marc Zyngier

On 2020-11-24 10:37, Maulik Shah wrote:

[...]


  static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
  unsigned function,
  unsigned group)
  {
struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
+   struct gpio_chip *gc = &pctrl->chip;
+   unsigned int irq = irq_find_mapping(gc->irq.domain, group);
const struct msm_pingroup *g;
unsigned long flags;
u32 val, mask;
+   u32 oldval;
+   u32 old_i;
int i;
g = &pctrl->soc->groups[group];
@@ -187,15 +215,26 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,

if (WARN_ON(i == g->nfuncs))
return -EINVAL;
  - raw_spin_lock_irqsave(&pctrl->lock, flags);
+   disable_irq(irq);
  - val = msm_readl_ctl(pctrl, g);
+   raw_spin_lock_irqsave(&pctrl->lock, flags);
+   oldval = val = msm_readl_ctl(pctrl, g);
val &= ~mask;
val |= i << g->mux_bit;
msm_writel_ctl(val, pctrl, g);
-
raw_spin_unlock_irqrestore(&pctrl->lock, flags);
  + /*
+* Clear IRQs if switching to/from GPIO mode since muxing to/from
+* the GPIO path can cause phantom edges.
+*/
+   old_i = (oldval & mask) >> g->mux_bit;
+   if (old_i != i &&
+   (i == pctrl->soc->gpio_func || old_i == pctrl->soc->gpio_func))
+   msm_pinctrl_clear_pending_irq(pctrl, group, irq);


disable_irq() and enable_irq() should be moved inside this if block, as
the only use for them is to mask the IRQ when switching back to GPIO IRQ
mode?

I also don't think we should leave the IRQ enabled at the end of this
function by default; we probably need to check if the IRQ was already
unmasked before disabling it, and only then call enable_irq().


Why? It looks to me that this reproduces the behaviour of
IRQCHIP_SET_TYPE_MASKED, which is highly desirable. What problem are you
trying to address with this?




+
+   enable_irq(irq);
+
return 0;
  }
  @@ -456,32 +495,45 @@ static const struct pinconf_ops msm_pinconf_ops = {
  static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset)

  {
const struct msm_pingroup *g;
+   unsigned int irq = irq_find_mapping(chip->irq.domain, offset);
struct msm_pinctrl *pctrl = gpiochip_get_data(chip);
unsigned long flags;
+   u32 oldval;
u32 val;
g = &pctrl->soc->groups[offset];
  + disable_irq(irq);
+
raw_spin_lock_irqsave(&pctrl->lock, flags);
  - val = msm_readl_ctl(pctrl, g);
+   oldval = val = msm_readl_ctl(pctrl, g);
val &= ~BIT(g->oe_bit);
msm_writel_ctl(val, pctrl, g);
raw_spin_unlock_irqrestore(&pctrl->lock, flags);
  + if (oldval != val)
+   msm_pinctrl_clear_pending_irq(pctrl, offset, irq);
+
+   enable_irq(irq);


I do not think we need disable_irq() and enable_irq() here; changing the
direction to input does not mean it is being used for interrupts only, it
may be set up for something like Rx mode in a UART.

The client driver should enable the IRQ when needed.


And the kernel doesn't expect random interrupts to fire. Again, what
are you trying to fix by removing these?

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v15 6/9] arm64/kvm: Add hypercall service for kvm ptp.

2020-11-24 Thread Marc Zyngier

On 2020-11-24 05:20, Jianyong Wu wrote:

Hi Marc,


[...]


> +/* ptp_kvm counter type ID */
> +#define ARM_PTP_VIRT_COUNTER  0
> +#define ARM_PTP_PHY_COUNTER   1
> +#define ARM_PTP_NONE_COUNTER  2

The architecture definitely doesn't have this last counter.


Yeah, this just represents that no counter data is needed from the guest.
Some annotation should be added here.


I'd rather you remove it entirely, or explain why you really cannot
do without a fake counter.

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v15 7/9] ptp: arm/arm64: Enable ptp_kvm for arm/arm64

2020-11-24 Thread Marc Zyngier

Jianyong,

On 2020-11-24 05:37, Jianyong Wu wrote:

Hi Marc,


[...]


> + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
> +   ARM_PTP_NONE_COUNTER, &hvc_res);

I really don't see the need to use a non-architectural counter ID.
Using the virtual counter ID should just be fine, and shouldn't lead to
any issue.




Am I missing something?


In this function, no counter data is needed. If the virtual counter ID
is used here, users may be confused about why we get virtual counter
data and do nothing with it. So I explicitly added a new "NONE" counter
ID to make it clear.

WDYT?


ITIABI (I Think It's A Bad Idea). There are two counters, and the API
allows retrieving the data for either of the two. If the "user" doesn't
want to do anything with the data, that's their problem.

Here, you can just use the virtual counter, and that will give you the
exact same semantics, without inventing non-architectural state.
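
Something like this, in other words (a sketch reusing the counter ID and
result variable already present in the patch):

	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
			     ARM_PTP_VIRT_COUNTER, &hvc_res);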

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH 1/3] irqchip: qcom-pdc: Fix phantom irq when changing between rising/falling

2020-11-24 Thread Marc Zyngier

On 2020-11-24 00:01, Douglas Anderson wrote:

We have a problem if we use gpio-keys and configure wakeups such that
we only want one edge to wake us up.  AKA:
  wakeup-event-action = ;
  wakeup-source;

Specifically we end up with a phantom interrupt that blocks suspend if
the line was already high and we want wakeups on rising edges (AKA we
want the GPIO to go low and then high again before we wake up).  The
opposite is also problematic.

Specifically, here's what's happening today:
1. Normally, gpio-keys configures to look for both edges.  Due to the
   current workaround introduced in commit c3c0c2e18d94 ("pinctrl:
   qcom: Handle broken/missing PDC dual edge IRQs on sc7180"), if the
   line was high we'd configure for falling edges.
2. At suspend time, we change to look for rising edges.
3. After qcom_pdc_gic_set_type() runs, we get a phantom interrupt.

We can solve this by just clearing the phantom interrupt.

NOTE: it is possible that this could cause problems for a client with
very specific needs, but there's not much we can do with this
hardware.  As an example, let's say the interrupt signal is currently
high and the client is looking for falling edges.  The client now
changes to look for rising edges.  The client could possibly expect
that if the line has a short pulse low (and back high) that it would
always be detected.  Specifically no matter when the pulse happened,
it should either have tripped the (old) falling edge trigger or the
(new) rising edge trigger.  We will simply not trip it.  We could
narrow down the race a bit by polling our parent before changing
types, but no matter what we do there will still be a period of time
where we can't tell the difference between a real transition (or more
than one transition) and the phantom.

Signed-off-by: Douglas Anderson 
---

 drivers/irqchip/qcom-pdc.c | 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/irqchip/qcom-pdc.c b/drivers/irqchip/qcom-pdc.c
index bd39e9de6ecf..7d097164aadc 100644
--- a/drivers/irqchip/qcom-pdc.c
+++ b/drivers/irqchip/qcom-pdc.c
@@ -159,6 +159,8 @@ static int qcom_pdc_gic_set_type(struct irq_data *d, unsigned int type)
 {
int pin_out = d->hwirq;
enum pdc_irq_config_bits pdc_type;
+   enum pdc_irq_config_bits old_pdc_type;
+   int ret;

if (pin_out == GPIO_NO_WAKE_IRQ)
return 0;
@@ -187,9 +189,24 @@ static int qcom_pdc_gic_set_type(struct irq_data *d, unsigned int type)
return -EINVAL;
}

+   old_pdc_type = pdc_reg_read(IRQ_i_CFG, pin_out);
pdc_reg_write(IRQ_i_CFG, pin_out, pdc_type);

-   return irq_chip_set_type_parent(d, type);
+   ret = irq_chip_set_type_parent(d, type);
+
+   /*
+* When we change types the PDC can give a phantom interrupt.
+* Clear it.  Specifically the phantom shows up if a line is already
+* high and we change to rising or if a line is already low and we
+* change to falling but let's be consistent and clear it always.
+*
+* Doing this works because we have IRQCHIP_SET_TYPE_MASKED so the
+* interrupt will be cleared before the rest of the system sees it.
+*/
+   if (old_pdc_type != pdc_type)
+   irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, 0);


nit: s/0/false/.

You could also make it conditional on the parent side having been 
successful.

And while we're looking at this: do you need to roll back the PDC state
if the GIC side has failed? It's all very hypothetical, but just in 
case...
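
Something along these lines, maybe (an untested sketch reusing the names
from the patch):

	ret = irq_chip_set_type_parent(d, type);
	if (ret) {
		/* The GIC rejected the trigger: restore the old PDC config */
		pdc_reg_write(IRQ_i_CFG, pin_out, old_pdc_type);
		return ret;
	}

	if (old_pdc_type != pdc_type)
		irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, false);

	return 0;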



+
+   return ret;
 }

 static struct irq_chip qcom_pdc_gic_chip = {


It otherwise looks sensible. Is that a fix for 5.10?

M.
--
Jazz is not dead. It just smells funny...


Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-11-24 Thread Marc Zyngier

On 2020-11-24 08:10, Shenming Lu wrote:

On 2020/11/23 17:27, Marc Zyngier wrote:

On 2020-11-23 06:54, Shenming Lu wrote:

From: Zenghui Yu 

When setting the forwarding path of a VLPI, it is more consistent to


I'm not sure it is more consistent. It is a *new* behaviour, because it
only matters for migration, which has been so far unsupported.


Alright, consistent may not be accurate...
But I doubt whether there is really no need to transfer the pending
states from kvm's vgic to the VPT in set_forwarding regardless of
migration, and similarly for unset_forwarding.


If you have to transfer that state outside of a save/restore, it means
that you have missed the programming of the PCI endpoint. This is an
established restriction that the MSI programming must occur *after* the
translation has been established using MAPI/MAPTI (see the large comment
at the beginning of vgic-v4.c).

If you want to revisit this, fair enough. But you will need a lot more
than just opportunistically transferring the pending state.





also transfer the pending state from irq->pending_latch to VPT (especially
in migration, the pending states of VLPIs are restored into kvm's vgic
first). And we currently send "INT+VSYNC" to trigger a VLPI to pending.


Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index b5fa73c9fd35..cc3ab9cea182 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -418,6 +418,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,

 irq->host_irq    = virq;
 atomic_inc(&map.vpe->vlpi_count);

+    /* Transfer pending state */
+    ret = irq_set_irqchip_state(irq->host_irq,
+    IRQCHIP_STATE_PENDING,
+    irq->pending_latch);
+    WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+    /*
+ * Let it be pruned from ap_list later and don't bother
+ * the List Register.
+ */
+    irq->pending_latch = false;


It occurs to me that calling into irq_set_irqchip_state() for a large
number of interrupts can take a significant amount of time. It is also
odd that you dump the VPT with the VPE unmapped, but rely on the VPE
being mapped for the opposite operation.

Shouldn't these be symmetric, all performed while the VPE is unmapped?
It would also save a lot of ITS traffic.



My thought was to use the existing interface directly without
unmapping...

If you want to unmap the vPE and poke the VPT here, as I said in the
cover letter, set/unset_forwarding might also be called when all devices
are running at normal run time, in which case the unmapping of the vPE
is not allowed...


No, I'm suggesting that you don't do anything here, but do it instead as
a by-product of restoring the ITS tables. What goes wrong if you use the
KVM_DEV_ARM_ITS_RESTORE_TABLES backend instead?


Another possible solution is to add a new dedicated interface for QEMU
to transfer these pending states to HW in the GIC VM state change
handler, corresponding to save_pending_tables?


Userspace has no way to know we use GICv4, and I intend to keep it
completely out of the loop. The API is already pretty tortuous, and
I really don't want to add any extra complexity to it.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [RFC PATCH v1 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2020-11-24 Thread Marc Zyngier

On 2020-11-24 07:40, Shenming Lu wrote:

On 2020/11/23 17:18, Marc Zyngier wrote:

On 2020-11-23 06:54, Shenming Lu wrote:

After pausing all vCPUs and devices capable of interrupting, in order

    ^
See my comment below about this.

to save the information of all interrupts, besides flushing the pending
states in kvm's vgic, we also try to flush the states of VLPIs in the
virtual pending tables into guest RAM, but we need to have GICv4.1 and
safely unmap the vPEs first.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v3.c | 62 +++
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 9cdf39a94a63..e1b3aa4b2b12 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only

 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -356,6 +358,39 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
 return 0;
 }

+/*
+ * With GICv4.1, we can get the VLPI's pending state after unmapping
+ * the vPE. The deactivation of the doorbell interrupt will trigger
+ * the unmapping of the associated vPE.
+ */
+static void get_vlpi_state_pre(struct vgic_dist *dist)
+{
+    struct irq_desc *desc;
+    int i;
+
+    if (!kvm_vgic_global_state.has_gicv4_1)
+    return;
+
+    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+    irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+    }
+}
+
+static void get_vlpi_state_post(struct vgic_dist *dist)


nit: the naming feels a bit... odd. Pre/post what?


My understanding is that the unmapping is a preparation for 
get_vlpi_state...

Maybe just call it unmap/map_all_vpes?


Yes, much better.

[...]


+    if (irq->hw) {
+    WARN_RATELIMIT(irq_get_irqchip_state(irq->host_irq,
+    IRQCHIP_STATE_PENDING, &is_pending),
+   "IRQ %d", irq->host_irq);


Isn't this going to warn like mad on a GICv4.0 system where this, by
definition, will generate an error?


As we have returned an error in save_its_tables if hw && !has_gicv4_1,
we don't have to warn here?


Are you referring to the check in vgic_its_save_itt() that occurs in
patch 4? Fair enough, though I think the use of irq_get_irqchip_state()
isn't quite what we want, as per my comments on patch #1.




+    }
+
+    if (stored == is_pending)
 continue;

-    if (irq->pending_latch)
+    if (is_pending)
 val |= 1 << bit_nr;
 else
 val &= ~(1 << bit_nr);

 ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
 if (ret)
-    return ret;
+    goto out;
 }
-    return 0;
+
+out:
+    get_vlpi_state_post(dist);


This bit worries me: you have unmapped the VPEs, so any interrupt that
has been generated during that phase is now forever lost (the GIC
doesn't have ownership of the pending tables).


In my opinion, during this phase the devices capable of interrupting
should have already been paused (prevented from sending interrupts); the
VFIO migration protocol, for example, has already realized this.


Is that a hard guarantee? Pausing devices *may* be possible for a
limited set of endpoints, but I'm not sure it is universally possible to
restart them and expect a consistent state (you have just dropped a
bunch of network packets on the floor...).

Do you really expect the VM to be restartable from that point? I don't
see how this is possible.



If the migration has encountered an error, the src VM might be
restarted, so we have to map the vPEs back.


As I said above, I doubt it is universally possible to do so, but
after all, this probably isn't worse than restarting on the target...

M.
--
Jazz is not dead. It just smells funny...


Re: [RFC PATCH v1 1/4] irqchip/gic-v4.1: Plumb get_irqchip_state VLPI callback

2020-11-24 Thread Marc Zyngier

On 2020-11-24 07:38, Shenming Lu wrote:

On 2020/11/23 17:01, Marc Zyngier wrote:

On 2020-11-23 06:54, Shenming Lu wrote:

From: Zenghui Yu 

Up to now, the irq_get_irqchip_state() callback of its_irq_chip has been
left unimplemented, since there was no architectural way to get a VLPI's
pending state before GICv4.1. GICv4.1 does have one for VLPIs.

With GICv4.1, after unmapping the vPE, which cleans and invalidates
any caching of the VPT, we can get the VLPI's pending state by


This is a crucial note: without this unmapping and invalidation,
the pending bits are not generally accessible (they could be cached
in a GIC private structure, cache or otherwise).


peeking at the VPT. So we implement the irq_get_irqchip_state()
callback of its_irq_chip to do it.

Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 drivers/irqchip/irq-gic-v3-its.c | 38 
 1 file changed, 38 insertions(+)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 0fec31931e11..287003cacac7 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1695,6 +1695,43 @@ static void its_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
 iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg);
 }

+static bool its_peek_vpt(struct its_vpe *vpe, irq_hw_number_t hwirq)
+{
+    int mask = hwirq % BITS_PER_BYTE;


nit: this isn't a mask, but a shift instead. BIT(hwirq % BPB) would give
you a mask.
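
i.e. the predicate could simply read (a sketch):

	return !!(*pt & BIT(hwirq % BITS_PER_BYTE));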


Ok, I will correct it.




+    void *va;
+    u8 *pt;
+
+    va = page_address(vpe->vpt_page);
+    pt = va + hwirq / BITS_PER_BYTE;
+
+    return !!(*pt & (1U << mask));
+}
+
+static int its_irq_get_irqchip_state(struct irq_data *d,
+ enum irqchip_irq_state which, bool *val)
+{
+    struct its_device *its_dev = irq_data_get_irq_chip_data(d);
+    struct its_vlpi_map *map = get_vlpi_map(d);
+
+    if (which != IRQCHIP_STATE_PENDING)
+    return -EINVAL;
+
+    /* not intended for physical LPI's pending state */
+    if (!map)
+    return -EINVAL;
+
+    /*
+ * In GICv4.1, a VMAPP with {V,Alloc}=={0,1} cleans and invalidates
+ * any caching of the VPT associated with the vPEID held in the GIC.
+ */
+    if (!is_v4_1(its_dev->its) || atomic_read(&map->vpe->vmapp_count))


It isn't clear to me what prevents this from racing against a mapping of
the VPE. Actually, since we only hold the LPI irqdesc lock, I'm pretty
sure nothing prevents it.


Yes, should have the vmovp_lock held?


That's not helping because of the VPE activation.


And is it necessary to also hold this lock in
its_vpe_irq_domain_activate/deactivate?


Well, you'd need that, but that's unnecessary complex AFAICT.






+    return -EACCES;


I can sort of buy EACCES for a VPE that is currently mapped, but a
non-4.1 ITS should definitely return EINVAL.


Alright, EINVAL looks better.




+
+    *val = its_peek_vpt(map->vpe, map->vintid);
+
+    return 0;
+}
+
 static int its_irq_set_irqchip_state(struct irq_data *d,
  enum irqchip_irq_state which,
  bool state)
@@ -1975,6 +2012,7 @@ static struct irq_chip its_irq_chip = {
 .irq_eoi    = irq_chip_eoi_parent,
 .irq_set_affinity    = its_set_affinity,
 .irq_compose_msi_msg    = its_irq_compose_msi_msg,
+    .irq_get_irqchip_state    = its_irq_get_irqchip_state,


My biggest issue with this is that it isn't a reliable interface.
It happens to work in the context of KVM, because you make sure it
is called at the right time, but that doesn't make it safe in general
(anyone with the interrupt number is allowed to call this at any time).


We check the vmapp_count in it to ensure the unmapping of the vPE, and
let the caller do the unmapping (they should know whether it is the
right time). If the unmapping is not done, we just return a failure.


And without guaranteeing mutual exclusion against a concurrent VMAPP,
checking the vmapp_count means nothing (you need the lock described
above, and start sprinkling it around in other places as well).



Is there a problem with poking at the VPT page from the KVM side?
The code should be exactly the same (maybe simpler even), and at least
you'd be guaranteed to be in the correct context.


Yeah, that also seems a good choice.
If you prefer it, we can try to implement it in v2.


I'd certainly prefer that. Let me know if you spot any implementation
issue with that.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v2 00/24] Opt-in always-on nVHE hypervisor

2020-11-23 Thread Marc Zyngier
Hi David,

On Mon, 16 Nov 2020 20:42:54 +,
David Brazdil  wrote:
> 
> As we progress towards being able to keep guest state private to the
> host running nVHE hypervisor, this series allows the hypervisor to
> install itself on newly booted CPUs before the host is allowed to run
> on them.
> 
> All functionality described below is opt-in, guarded by an early param
> 'kvm-arm.protected'. Future patches specific to the new "protected" mode
> should be hidden behind the same param.
> 
> The hypervisor starts trapping host SMCs and intercepting host's PSCI
> CPU_ON/SUSPEND calls. It replaces the host's entry point with its own,
> initializes the EL2 state of the new CPU and installs the nVHE hyp vector
> before ERETing to the host's entry point.
> 
> The kernel checks new cores' features against the finalized system
> capabilities. To avoid the need to move this code/data to EL2, the
> implementation only allows to boot cores that were online at the time of
> KVM initialization and therefore had been checked already.
> 
> Other PSCI SMCs are forwarded to EL3, though only the known set of SMCs
> implemented in the kernel is allowed. Non-PSCI SMCs are also forwarded
> to EL3. Future changes will need to ensure the safety of all SMCs wrt.
> private guests.
> 
> The host is still allowed to reset EL2 back to the stub vector, eg. for
> hibernation or kexec, but will not disable nVHE when there are no VMs.

I have now been through the whole series, and I don't think there is
anything really major (although I haven't tried it yet).

I think it would benefit from being rebased on top of kvmarm/queue, as
it'd give you the opportunity to replace a number of per-CPU state
fields with global function pointers. Another thing is that we seem to
have diverging interpretations of the PSCI spec when it comes to
CPU_SUSPEND.

Finally, please include the PSCI maintainers in your next posting, as
I'll need their Ack to pick the first few patches.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2 08/24] kvm: arm64: Add SMC handler in nVHE EL2

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:43:02 +,
David Brazdil  wrote:
> 
> Add handler of host SMCs in KVM nVHE trap handler. Forward all SMCs to
> EL3 and propagate the result back to EL1. This is done in preparation
> for validating host SMCs in KVM nVHE protected mode.
> 
> The implementation assumes that firmware uses SMCCC v1.2 or older. That
> means x0-x17 can be used both for arguments and results, other GPRs are
> preserved.
> 
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/kvm/hyp/nvhe/host.S | 38 ++
>  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 26 
>  2 files changed, 64 insertions(+)
> 
> diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
> index ed27f06a31ba..52dae5cd5a28 100644
> --- a/arch/arm64/kvm/hyp/nvhe/host.S
> +++ b/arch/arm64/kvm/hyp/nvhe/host.S
> @@ -183,3 +183,41 @@ SYM_CODE_START(__kvm_hyp_host_vector)
>   invalid_host_el1_vect   // FIQ 32-bit EL1
>   invalid_host_el1_vect   // Error 32-bit EL1
>  SYM_CODE_END(__kvm_hyp_host_vector)
> +
> +/*
> + * Forward SMC with arguments in struct kvm_cpu_context, and
> + * store the result into the same struct. Assumes SMCCC 1.2 or older.
> + *
> + * x0: struct kvm_cpu_context*
> + */
> +SYM_CODE_START(__kvm_hyp_host_forward_smc)
> + /*
> +  * Use x18 to keep a pointer to the host context because x18
> +  * is callee-saved SMCCC but not in AAPCS64.
> +  */
> + mov x18, x0
> +
> + ldp x0, x1,   [x18, #CPU_XREG_OFFSET(0)]
> + ldp x2, x3,   [x18, #CPU_XREG_OFFSET(2)]
> + ldp x4, x5,   [x18, #CPU_XREG_OFFSET(4)]
> + ldp x6, x7,   [x18, #CPU_XREG_OFFSET(6)]
> + ldp x8, x9,   [x18, #CPU_XREG_OFFSET(8)]
> + ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)]
> + ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)]
> + ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)]
> + ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)]
> +
> + smc #0
> +
> + stp x0, x1,   [x18, #CPU_XREG_OFFSET(0)]
> + stp x2, x3,   [x18, #CPU_XREG_OFFSET(2)]
> + stp x4, x5,   [x18, #CPU_XREG_OFFSET(4)]
> + stp x6, x7,   [x18, #CPU_XREG_OFFSET(6)]
> + stp x8, x9,   [x18, #CPU_XREG_OFFSET(8)]
> + stp x10, x11, [x18, #CPU_XREG_OFFSET(10)]
> + stp x12, x13, [x18, #CPU_XREG_OFFSET(12)]
> + stp x14, x15, [x18, #CPU_XREG_OFFSET(14)]
> + stp x16, x17, [x18, #CPU_XREG_OFFSET(16)]

This is going to be really good for CPUs that need to use ARCH_WA1 for
their Spectre-v2 mitigation... :-( If that's too expensive, we may
have to reduce the number of save/restored registers, but I'm worried
the battle is already lost by the time we reach this (the host trap
path is already a huge hammer).

Eventually, we'll have to insert the mitigation in the vectors anyway,
just like we have on the guest exit path. Boo.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2 12/24] kvm: arm64: Bootstrap PSCI SMC handler in nVHE EL2

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:43:06 +,
David Brazdil  wrote:
> 
> Add a handler of PSCI SMCs in nVHE hyp code. The handler is initialized
> with the version used by the host's PSCI driver and the function IDs it
> was configured with. If the SMC function ID matches one of the
> configured PSCI calls (for v0.1) or falls into the PSCI function ID
> range (for v0.2+), the SMC is handled by the PSCI handler. For now, all
> SMCs return PSCI_RET_NOT_SUPPORTED.
> 
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/include/asm/kvm_hyp.h |   4 ++
>  arch/arm64/kvm/arm.c |  14 
>  arch/arm64/kvm/hyp/nvhe/Makefile |   2 +-
>  arch/arm64/kvm/hyp/nvhe/hyp-main.c   |   6 +-
>  arch/arm64/kvm/hyp/nvhe/psci-relay.c | 104 +++
>  5 files changed, 128 insertions(+), 2 deletions(-)
>  create mode 100644 arch/arm64/kvm/hyp/nvhe/psci-relay.c
> 
> diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
> index a3289071f3d8..95a2bbbcc7e1 100644
> --- a/arch/arm64/include/asm/kvm_hyp.h
> +++ b/arch/arm64/include/asm/kvm_hyp.h
> @@ -96,6 +96,10 @@ void deactivate_traps_vhe_put(void);
>  
>  u64 __guest_enter(struct kvm_vcpu *vcpu);
>  
> +#ifdef __KVM_NVHE_HYPERVISOR__
> +bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
> +#endif
> +
>  void __noreturn hyp_panic(void);
>  #ifdef __KVM_NVHE_HYPERVISOR__
>  void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index cdd7981ea560..7d2270eeecfb 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -19,6 +19,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #define CREATE_TRACE_POINTS
> @@ -1514,6 +1515,18 @@ static void init_cpu_logical_map(void)
>   CHOOSE_NVHE_SYM(__cpu_logical_map)[cpu] = cpu_logical_map(cpu);
>  }
>  
> +static void init_psci_relay(void)
> +{
> + extern u32 kvm_nvhe_sym(kvm_host_psci_version);
> + extern u32 kvm_nvhe_sym(kvm_host_psci_function_id)[PSCI_FN_MAX];

nit: I'd rather have these outside of the function body.

> + int i;
> +
> + CHOOSE_NVHE_SYM(kvm_host_psci_version) = psci_ops.get_version
> + ? psci_ops.get_version() : PSCI_VERSION(0, 0);

nit: please write this with an if/else construct, it will read a lot
better.

> + for (i = 0; i < PSCI_FN_MAX; ++i)
> + CHOOSE_NVHE_SYM(kvm_host_psci_function_id)[i] = psci_get_function_id(i);

Either pick kvm_nvhe_sym(), or CHOOSE_NVHE_SYM(). Having both used
together is just an annoyance (and in this case there is nothing to
choose, really).
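
Folding those three nits together, something like this (an untested
sketch):

extern u32 kvm_nvhe_sym(kvm_host_psci_version);
extern u32 kvm_nvhe_sym(kvm_host_psci_function_id)[PSCI_FN_MAX];

static void init_psci_relay(void)
{
	int i;

	if (psci_ops.get_version)
		kvm_nvhe_sym(kvm_host_psci_version) = psci_ops.get_version();
	else
		kvm_nvhe_sym(kvm_host_psci_version) = PSCI_VERSION(0, 0);

	for (i = 0; i < PSCI_FN_MAX; ++i)
		kvm_nvhe_sym(kvm_host_psci_function_id)[i] = psci_get_function_id(i);
}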

> +}
> +
>  static int init_common_resources(void)
>  {
>   return kvm_set_ipa_limit();
> @@ -1693,6 +1706,7 @@ static int init_hyp_mode(void)
>   }
>  
>   init_cpu_logical_map();
> + init_psci_relay();
>  
>   return 0;
>  
> diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
> index 2d842e009a40..bf62c8e42ab2 100644
> --- a/arch/arm64/kvm/hyp/nvhe/Makefile
> +++ b/arch/arm64/kvm/hyp/nvhe/Makefile
> @@ -7,7 +7,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__
>  ccflags-y := -D__KVM_NVHE_HYPERVISOR__
>  
>  obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
> -  hyp-main.o hyp-smp.o
> +  hyp-main.o hyp-smp.o psci-relay.o
>  obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
>../fpsimd.o ../hyp-entry.o
>  
> diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> index 71a17af05953..df4acb40dd39 100644
> --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> @@ -120,7 +120,11 @@ static void skip_host_instruction(void)
>  
>  static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
>  {
> - default_host_smc_handler(host_ctxt);
> + bool handled;
> +
> + handled = kvm_host_psci_handler(host_ctxt);
> + if (!handled)
> + default_host_smc_handler(host_ctxt);
>  
>   /*
>* Unlike HVC, the return address of an SMC is the instruction's PC.
> diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
> new file mode 100644
> index ..d75d3f896bfd
> --- /dev/null
> +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
> @@ -0,0 +1,104 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2020 - Google LLC
> + * Author: David Brazdil 
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +/* Config options set by the host. */
> +u32 __ro_after_init kvm_host_psci_version = PSCI_VERSION(0, 0);
> +u32 __ro_after_init kvm_host_psci_function_id[PSCI_FN_MAX];
> +
> +static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt)
> +{
> + return host_ctxt->regs.regs[0];
> +}
> +
> +static bool is_psci_0_1_call(u64 func_id)
> +{
> + unsigned int i;
> +
> + for (i = 0; i <

Re: [PATCH v2 23/24] kvm: arm64: Trap host SMCs in protected mode.

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:43:17 +,
David Brazdil  wrote:
> 
> While protected nVHE KVM is installed, start trapping all host SMCs.
> By default, these are simply forwarded to EL3, but PSCI SMCs are
> validated first.
> 
> Create new constant HCR_HOST_NVHE_PROTECTED_FLAGS with the new set of HCR
> flags to use while the nVHE vector is installed when the kernel was
> booted with the protected flag enabled. Switch back to the default HCR
> flags when switching back to the stub vector.
> 
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/include/asm/kvm_arm.h   |  1 +
>  arch/arm64/kvm/hyp/nvhe/hyp-init.S | 12 
>  arch/arm64/kvm/hyp/nvhe/switch.c   |  5 -
>  3 files changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> index 64ce29378467..4e90c2debf70 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -80,6 +80,7 @@
>HCR_FMO | HCR_IMO | HCR_PTW )
>  #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
>  #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
> +#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
>  #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
>  
>  /* TCR_EL2 Registers bits */
> diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> index 6d8202d2bdfb..8f3602f320ac 100644
> --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> @@ -88,6 +88,12 @@ SYM_CODE_END(__kvm_hyp_init)
>   * x0: struct kvm_nvhe_init_params PA
>   */
>  SYM_CODE_START(___kvm_hyp_init)
> +alternative_if ARM64_PROTECTED_KVM
> + mov_q   x1, HCR_HOST_NVHE_PROTECTED_FLAGS
> + msr hcr_el2, x1
> + isb

Why the ISB? For HCR_TSC to have any effect, you'll have to go via an
ERET to EL1 first, which will have the required synchronisation effect.

> +alternative_else_nop_endif
> +
>   ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
>   msr tpidr_el2, x1
>  
> @@ -224,6 +230,12 @@ reset:
>   msr sctlr_el2, x5
>   isb
>  
> +alternative_if ARM64_PROTECTED_KVM
> + mov_q   x5, HCR_HOST_NVHE_FLAGS
> + msr hcr_el2, x5
> + isb

Same thing here, I believe.

> +alternative_else_nop_endif
> +
>   /* Install stub vectors */
>   adr_l   x5, __hyp_stub_vectors
>   msr vbar_el2, x5
> diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
> index 8ae8160bc93a..e1f8e0797144 100644
> --- a/arch/arm64/kvm/hyp/nvhe/switch.c
> +++ b/arch/arm64/kvm/hyp/nvhe/switch.c
> @@ -96,7 +96,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
>   mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
>  
>   write_sysreg(mdcr_el2, mdcr_el2);
> - write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
> + if (is_protected_kvm_enabled())
> + write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2);
> + else
> + write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
>   write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
>   write_sysreg(__kvm_hyp_host_vector, vbar_el2);
>  }
> -- 
> 2.29.2.299.gdc1121823c-goog
> 
> 

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2 21/24] kvm: arm64: Add kvm-arm.protected early kernel parameter

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:43:15 +,
David Brazdil  wrote:
> 
> Add an early parameter that allows users to opt into protected KVM mode
> when using the nVHE hypervisor. In this mode, guest state will be kept
> private from the host. This will primarily involve enabling stage-2
> address translation for the host, restricting DMA to host memory, and
> filtering host SMCs.
> 
> Capability ARM64_PROTECTED_KVM is set if the param is passed, CONFIG_KVM
> is enabled and the kernel was not booted with VHE.
> 
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/include/asm/cpucaps.h |  3 ++-
>  arch/arm64/include/asm/virt.h|  8 
>  arch/arm64/kernel/cpufeature.c   | 29 +
>  arch/arm64/kvm/arm.c | 10 +-
>  4 files changed, 48 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
> index e7d98997c09c..ac075f70b2e4 100644
> --- a/arch/arm64/include/asm/cpucaps.h
> +++ b/arch/arm64/include/asm/cpucaps.h
> @@ -66,7 +66,8 @@
>  #define ARM64_HAS_TLB_RANGE  56
>  #define ARM64_MTE57
>  #define ARM64_WORKAROUND_1508412 58
> +#define ARM64_PROTECTED_KVM  59
>  
> -#define ARM64_NCAPS  59
> +#define ARM64_NCAPS  60
>  
>  #endif /* __ASM_CPUCAPS_H */
> diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
> index 6069be50baf9..2fde1186b962 100644
> --- a/arch/arm64/include/asm/virt.h
> +++ b/arch/arm64/include/asm/virt.h
> @@ -97,6 +97,14 @@ static __always_inline bool has_vhe(void)
>   return cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN);
>  }
>  
> +static __always_inline bool is_protected_kvm_enabled(void)
> +{
> + if (is_vhe_hyp_code())
> + return false;
> + else
> + return cpus_have_final_cap(ARM64_PROTECTED_KVM);
> +}
> +
>  #endif /* __ASSEMBLY__ */
>  
>  #endif /* ! __ASM__VIRT_H */
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 6f36c4f62f69..dd5bc0f0cf0d 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -1709,6 +1709,29 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
>  }
>  #endif /* CONFIG_ARM64_MTE */
>  
> +#ifdef CONFIG_KVM
> +static bool enable_protected_kvm;
> +
> +static bool has_protected_kvm(const struct arm64_cpu_capabilities *entry, int __unused)
> +{
> + if (!enable_protected_kvm)
> + return false;
> +
> + if (is_kernel_in_hyp_mode()) {
> + pr_warn("Protected KVM not available with VHE\n");
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static int __init early_protected_kvm_cfg(char *buf)
> +{
> + return strtobool(buf, &enable_protected_kvm);
> +}
> +early_param("kvm-arm.protected", early_protected_kvm_cfg);

Please add some documentation to
Documentation/admin-guide/kernel-parameters.txt.
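
Something along these lines, perhaps (the wording is only a sketch):

	kvm-arm.protected=
			[KVM,ARM] Allow KVM to run in the protected nVHE
			mode, keeping guest state private from the host
			kernel. Ignored when the kernel runs in VHE mode.
			Format: <bool>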

> +#endif /* CONFIG_KVM */
> +
>  /* Internal helper functions to match cpu capability type */
>  static bool
>  cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
> @@ -1822,6 +1845,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>   .field_pos = ID_AA64PFR0_EL1_SHIFT,
>   .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT,
>   },
> + {
> + .desc = "Protected KVM",
> + .capability = ARM64_PROTECTED_KVM,
> + .type = ARM64_CPUCAP_SYSTEM_FEATURE,
> + .matches = has_protected_kvm,
> + },
>  #endif
>   {
>   .desc = "Kernel page table isolation (KPTI)",
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index c76a8e5bd19c..49d2474f2a80 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -1796,6 +1796,12 @@ int kvm_arch_init(void *opaque)
>   return -ENODEV;
>   }
>  
> + /* The PROTECTED_KVM cap should not have been enabled for VHE. */
> + if (in_hyp_mode && is_protected_kvm_enabled()) {
> + kvm_pr_unimpl("VHE protected mode unsupported, not initializing\n");
> + return -ENODEV;

How can this happen? Don't we already take care of this?

> + }
> +
>   if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
>   cpus_have_final_cap(ARM64_WORKAROUND_1508412))
>   kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
> @@ -1827,7 +1833,9 @@ int kvm_arch_init(void *opaque)
>   if (err)
>   goto out_hyp;
>  
> - if (in_hyp_mode)
> + if (is_protected_kvm_enabled())
> + kvm_info("Protected nVHE mode initialized successfully\n");
> + else if (in_hyp_mode)
>   kvm_info("VHE mode initialized successfully\n");
>   else
>   kvm_info("Hyp mode initialized successfully\n");
> -- 
> 2.29.2.299.gdc1121823c-goog
> 

Re: [PATCH v2 20/24] kvm: arm64: Intercept host's CPU_SUSPEND PSCI SMCs

2020-11-23 Thread Marc Zyngier
Adding Lorenzo and Sudeep to this one in particular, as there is a bit
of a corner case below.

On Mon, 16 Nov 2020 20:43:14 +,
David Brazdil  wrote:
> 
> Add a handler of CPU_SUSPEND host PSCI SMCs. The SMC can either enter
> a sleep state indistinguishable from a WFI or a deeper sleep state that
> behaves like a CPU_OFF+CPU_ON.
> 
> The handler saves r0,pc of the host and makes the same call to EL3 with
> the hyp CPU entry point. It either returns back to the handler and then
> back to the host, or wakes up into the entry point and initializes EL2
> state before dropping back to EL1.
> 
> There is a simple atomic lock around the reset state struct to protect
> from races with CPU_ON. A well-behaved host should never run CPU_ON
> against an already online core, and the kernel indeed does not allow
> that, so if the core sees its reset state struct locked, it will return
> a non-spec error code PENDING_ON. This protects the hypervisor state and

"non-spec" as in "outside of the PSCI specification"? Err...

> avoids the need for more complicated locking and/or tracking power state
> of individual cores.
> 
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/kvm/hyp/nvhe/psci-relay.c | 39 +++-
>  1 file changed, 38 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
> index 2daf52b59846..313ef42f0eab 100644
> --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
> +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
> @@ -121,6 +121,39 @@ static void release_reset_state(struct kvm_host_psci_state *cpu_state)
>   atomic_set_release(&cpu_state->pending_on, 0);
>  }
>  
> +static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
> +{
> + u64 power_state = host_ctxt->regs.regs[1];
> + unsigned long pc = host_ctxt->regs.regs[2];
> + unsigned long r0 = host_ctxt->regs.regs[3];
> + struct kvm_host_psci_state *cpu_state;
> + struct kvm_nvhe_init_params *cpu_params;
> + int ret;
> +
> + cpu_state = this_cpu_ptr(&kvm_host_psci_state);
> + cpu_params = this_cpu_ptr(&kvm_init_params);
> +
> + /*
> +  * Lock the reset state struct. This fails if the host has concurrently
> +  * called CPU_ON with this CPU as target. The kernel keeps track of
> +  * online CPUs, so that should never happen. If it does anyway, return
> +  * a non-spec error. This avoids the need for spinlocks.
> +  */
> + if (!try_acquire_reset_state(cpu_state, pc, r0))
> + return PSCI_RET_ALREADY_ON;

So that's the core of the problem. I'm definitely not keen on EL2
returning unspecified error codes. But there is something I don't get:

If the CPU is currently booting (reset state is locked), it means that
CPU hasn't reached the EL1 kernel yet. So how can this same CPU issue
a CPU_SUSPEND from EL1? CPU_SUSPEND can't be called for a third party,
only by a CPU for itself.

It looks like this case cannot happen by construction. And if it
happens, it looks like the only course of action should be to panic,
as we have lost track of the running CPUs. Am I missing something
obvious?

> +
> + /*
> +  * Will either return if shallow sleep state, or wake up into the entry
> +  * point if it is a deep sleep state.
> +  */
> + ret = psci_call(func_id, power_state,
> + __hyp_pa(hyp_symbol_addr(__kvm_hyp_cpu_entry)),
> + __hyp_pa(cpu_params));
> +
> + release_reset_state(cpu_state);
> + return ret;
> +}
> +
>  static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt)
>  {
>   u64 mpidr = host_ctxt->regs.regs[1];
> @@ -178,7 +211,9 @@ asmlinkage void __noreturn __kvm_hyp_psci_cpu_entry(void)
>  
>  static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt)
>  {
> - if (func_id == kvm_host_psci_function_id[PSCI_FN_CPU_OFF])
> + if (func_id == kvm_host_psci_function_id[PSCI_FN_CPU_SUSPEND])
> + return psci_cpu_suspend(func_id, host_ctxt);
> + else if (func_id == kvm_host_psci_function_id[PSCI_FN_CPU_OFF])
>   return psci_forward(host_ctxt);
>   else if (func_id == kvm_host_psci_function_id[PSCI_FN_CPU_ON])
>   return psci_cpu_on(func_id, host_ctxt);
> @@ -202,6 +237,8 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_
>   case PSCI_0_2_FN_SYSTEM_RESET:
>   psci_forward_noreturn(host_ctxt);
>   unreachable();
> + case PSCI_0_2_FN64_CPU_SUSPEND:
> + return psci_cpu_suspend(func_id, host_ctxt);
>   case PSCI_0_2_FN64_CPU_ON:
>   return psci_cpu_on(func_id, host_ctxt);
>   default:
> -- 
> 2.29.2.299.gdc1121823c-goog
> 
> 

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2 19/24] kvm: arm64: Intercept host's PSCI_CPU_ON SMCs

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:43:13 +,
David Brazdil  wrote:
> 
> Add a handler of the CPU_ON PSCI call from host. When invoked, it looks
> up the logical CPU ID corresponding to the provided MPIDR and populates
> the state struct of the target CPU with the provided x0, pc. It then
> calls CPU_ON itself, with an entry point in hyp that initializes EL2
> state before returning ERET to the provided PC in EL1.
> 
> There is a simple atomic lock around the reset state struct. If it is
> already locked, CPU_ON will return PENDING_ON error code.
> 
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/include/asm/kvm_asm.h |   8 ++-
>  arch/arm64/kvm/arm.c |   1 +
>  arch/arm64/kvm/hyp/nvhe/psci-relay.c | 104 +++
>  3 files changed, 110 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> index 109867fb76f6..2e36ba4be748 100644
> --- a/arch/arm64/include/asm/kvm_asm.h
> +++ b/arch/arm64/include/asm/kvm_asm.h
> @@ -175,9 +175,11 @@ struct kvm_s2_mmu;
>  DECLARE_KVM_NVHE_SYM(__kvm_hyp_init);
>  DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector);
>  DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
> -#define __kvm_hyp_init   CHOOSE_NVHE_SYM(__kvm_hyp_init)
> -#define __kvm_hyp_host_vector    CHOOSE_NVHE_SYM(__kvm_hyp_host_vector)
> -#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
> +DECLARE_KVM_NVHE_SYM(__kvm_hyp_psci_cpu_entry);
> +#define __kvm_hyp_init   CHOOSE_NVHE_SYM(__kvm_hyp_init)
> +#define __kvm_hyp_host_vector    CHOOSE_NVHE_SYM(__kvm_hyp_host_vector)
> +#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
> +#define __kvm_hyp_psci_cpu_entry CHOOSE_NVHE_SYM(__kvm_hyp_psci_cpu_entry)
>  
>  extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
>  DECLARE_KVM_NVHE_SYM(__per_cpu_start);
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 7d2270eeecfb..c76a8e5bd19c 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -1365,6 +1365,7 @@ static void cpu_init_hyp_mode(void)
>  
> + params->vector_hyp_va = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
> + params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE);
> + params->entry_hyp_va = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_psci_cpu_entry));

It feels really odd to use a per-CPU variable to keep track of
something that is essentially a constant. Why can't we just have an
assembly version of __kimg_hyp_va() and use that to compute the branch
target directly in __kvm_hyp_cpu_entry()? __kvm_hyp_host_vector is
another one.

>   params->pgd_pa = kvm_mmu_get_httbr();
>  
>   /*
> diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
> index 7542de8bd679..2daf52b59846 100644
> --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
> +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
> @@ -9,10 +9,15 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
>  
> +#define INVALID_CPU_ID UINT_MAX
> +
> +extern char __kvm_hyp_cpu_entry[];
> +
>  /* Config options set by the host. */
>  u32 __ro_after_init kvm_host_psci_version = PSCI_VERSION(0, 0);
>  u32 __ro_after_init kvm_host_psci_function_id[PSCI_FN_MAX];
> @@ -20,6 +25,14 @@ s64 __ro_after_init hyp_physvirt_offset;
>  
>  #define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset)
>  
> +struct kvm_host_psci_state {
> + atomic_t pending_on;
> + unsigned long pc;
> + unsigned long r0;
> +};
> +
> +static DEFINE_PER_CPU(struct kvm_host_psci_state, kvm_host_psci_state);
> +
>  static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt)
>  {
>   return host_ctxt->regs.regs[0];
> @@ -76,10 +89,99 @@ static __noreturn unsigned long 
> psci_forward_noreturn(struct kvm_cpu_context *ho
>   hyp_panic(); /* unreachable */
>  }
>  
> +static unsigned int find_cpu_id(u64 mpidr)
> +{
> + int i;

nit: unsigned int?

> +
> + if (mpidr != INVALID_HWID) {

This is a little ugly on the side [(c) FZ], and deserves a comment
("Reject MPIDRs matching the init value of the __cpu_logical_map[]
array"?).

Also, I personally prefer a construct that reduces the nesting:

if (mpidr == INVALID_HWID)
return INVALID_CPU_ID;

> + for (i = 0; i < NR_CPUS; i++) {
> + if (cpu_logical_map(i) == mpidr)
> + return i;
> + }
> + }
> +
> + return INVALID_CPU_ID;
> +}
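
Putting the three remarks together, the function could end up looking
like this (sketch):

static unsigned int find_cpu_id(u64 mpidr)
{
	unsigned int i;

	/* Reject MPIDRs matching the init value of __cpu_logical_map[]. */
	if (mpidr == INVALID_HWID)
		return INVALID_CPU_ID;

	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_logical_map(i) == mpidr)
			return i;
	}

	return INVALID_CPU_ID;
}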
> +
> +static bool try_acquire_reset_state(struct kvm_host_psci_state *cpu_state,
> + unsigned long pc, unsigned long r0)
> +{
> + if (atomic_cmpxchg_acquire(&cpu_state->pending_on, 0, 1) != 0)

What guarantees that this cmpxchg is inlined here? Also, having some
names for 0 and 1 would be nice.
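
For instance (hypothetical names, purely illustrative):

#define CPU_ON_IDLE	0
#define CPU_ON_PENDING	1

	if (atomic_cmpxchg_acquire(&cpu_state->pending_on,
				   CPU_ON_IDLE, CPU_ON_PENDING) != CPU_ON_IDLE)
		return false;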

> + return false;
> +
> + cpu_state->pc = pc;
> + cpu_s

Re: [PATCH v2 15/24] kvm: arm64: Extract parts of el2_setup into a macro

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:43:09 +,
David Brazdil  wrote:
> 
> When a CPU is booted in EL2, the kernel checks for VHE support and
> initializes the CPU core accordingly. For nVHE it also installs the stub
> vectors and drops down to EL1.
> 
> Once KVM gains the ability to boot cores without going through the
> kernel entry point, it will need to initialize the CPU the same way.
> Extract the relevant bits of el2_setup into an init_el2_state macro
> with an argument specifying whether to initialize for VHE or nVHE.
> 
> No functional change. Size of el2_setup increased by 148 bytes due
> to duplication.
> 
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/include/asm/el2_setup.h | 185 +
>  arch/arm64/kernel/head.S   | 144 +++---
>  2 files changed, 201 insertions(+), 128 deletions(-)
>  create mode 100644 arch/arm64/include/asm/el2_setup.h
> 
> diff --git a/arch/arm64/include/asm/el2_setup.h 
> b/arch/arm64/include/asm/el2_setup.h
> new file mode 100644
> index ..e5026e0aa878
> --- /dev/null
> +++ b/arch/arm64/include/asm/el2_setup.h
> @@ -0,0 +1,185 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2012,2013 - ARM Ltd
> + * Author: Marc Zyngier 
> + */
> +
> +#ifndef __ARM_KVM_INIT_H__
> +#define __ARM_KVM_INIT_H__
> +
> +#ifndef __ASSEMBLY__
> +#error Assembly-only header
> +#endif
> +
> +#ifdef CONFIG_ARM_GIC_V3
> +#include 
> +#endif
> +
> +#include 
> +#include 
> +#include 
> +
> +.macro __init_el2_sctlr
> + mov_q   x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
> + msr sctlr_el2, x0
> + isb
> +.endm
> +
> +/*
> + * Allow Non-secure EL1 and EL0 to access physical timer and counter.
> + * This is not necessary for VHE, since the host kernel runs in EL2,
> + * and EL0 accesses are configured in the later stage of boot process.
> + * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout
> + * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined
> + * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1
> + * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in
> + * EL2.
> + */
> +.macro __init_el2_timers mode
> +.ifeqs "\mode", "nvhe"
> + mrs x0, cnthctl_el2
> + orr x0, x0, #3  // Enable EL1 physical timers
> + msr cnthctl_el2, x0
> +.endif
> + msr cntvoff_el2, xzr // Clear virtual offset
> +.endm
> +
> +.macro __init_el2_debug mode
> + mrs x1, id_aa64dfr0_el1
> + sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4
> + cmp x0, #1
> + b.lt 1f  // Skip if no PMU present
> + mrs x0, pmcr_el0 // Disable debug access traps
> + ubfx x0, x0, #11, #5 // to EL2 and allow access to
> +1:
> + csel x2, xzr, x0, lt // all PMU counters from EL1
> +
> + /* Statistical profiling */
> + ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
> + cbz x0, 3f  // Skip if SPE not present
> +
> +.ifeqs "\mode", "nvhe"
> + mrs_s   x0, SYS_PMBIDR_EL1  // If SPE available at EL2,
> + and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
> + cbnzx0, 2f  // then permit sampling of 
> physical
> + mov x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \
> +   1 << SYS_PMSCR_EL2_PA_SHIFT)
> + msr_s   SYS_PMSCR_EL2, x0   // addresses and physical 
> counter
> +2:
> + mov x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
> + orr x2, x2, x0  // If we don't have VHE, then
> + // use EL1&0 translation.
> +.else
> + orr x2, x2, #MDCR_EL2_TPMS  // For VHE, use EL2 translation
> + // and disable access from EL1
> +.endif
> +
> +3:
> + msr mdcr_el2, x2 // Configure debug traps
> +.endm
> +
> +/* LORegions */
> +.macro __init_el2_lor
> + mrs x1, id_aa64mmfr1_el1
> + ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4
> + cbz x0, 1f
> + msr_s   SYS_LORC_EL1, xzr
> +1:
> +.endm
> +
> +/* Stage-2 translation */
> +.macro __init_el2_stage2
> + msr vttbr_el2, xzr
> +.endm
> +
> +/* GICv3 system register access */
> +#ifdef CONFIG_ARM_GIC_V3

nit: this #ifdef isn't relevant anymore and can be dropped throughout
the file.

> +.macro __init_el2_gicv3
> + mrs x0

Re: [PATCH v2 07/24] kvm: arm64: Refactor handle_trap to use a switch

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:43:01 +,
David Brazdil  wrote:
> 
> Small refactor so that nVHE's handle_trap uses a switch on the Exception
> Class value of ESR_EL2 in preparation for adding a handler of SMC32/64.

nit: SMC32 seems to be a leftover from the previous version.

>
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 15 ---
>  1 file changed, 8 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c 
> b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> index 411b0f652417..19332c20fcde 100644
> --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> @@ -16,9 +16,9 @@
>  
>  DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
>  
> -static void handle_host_hcall(unsigned long func_id,
> -   struct kvm_cpu_context *host_ctxt)
> +static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
>  {
> + unsigned long func_id = host_ctxt->regs.regs[0];
>   unsigned long ret = 0;
>  
>   switch (func_id) {
> @@ -109,11 +109,12 @@ static void handle_host_hcall(unsigned long func_id,
>  void handle_trap(struct kvm_cpu_context *host_ctxt)
>  {
>   u64 esr = read_sysreg_el2(SYS_ESR);
> - unsigned long func_id;
>  
> - if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
> + switch (ESR_ELx_EC(esr)) {
> + case ESR_ELx_EC_HVC64:
> + handle_host_hcall(host_ctxt);
> + break;
> + default:
>   hyp_panic();
> -
> - func_id = host_ctxt->regs.regs[0];
> - handle_host_hcall(func_id, host_ctxt);
> + }
>  }
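
For context, the SMC64 arm this prepares for would presumably slot in
as another case (handler name hypothetical, from later in the series):

	case ESR_ELx_EC_SMC64:
		handle_host_smc(host_ctxt);
		break;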
> -- 
> 2.29.2.299.gdc1121823c-goog
> 
> 

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2 06/24] kvm: arm64: Move hyp-init params to a per-CPU struct

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:43:00 +,
David Brazdil  wrote:
> 
> Once we start initializing KVM on newly booted cores before the rest of
> the kernel, parameters to __do_hyp_init will need to be provided by EL2
> rather than EL1. At that point it will not be possible to pass its four
> arguments directly because PSCI_CPU_ON only supports one context
> argument.
> 
> Refactor __do_hyp_init to accept its parameters in a struct. This
> prepares the code for KVM booting cores as well as removes any limits on
> the number of __do_hyp_init arguments.
> 
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/include/asm/kvm_asm.h   |  7 +++
>  arch/arm64/include/asm/kvm_hyp.h   |  4 
>  arch/arm64/kernel/asm-offsets.c|  4 
>  arch/arm64/kvm/arm.c   | 26 ++
>  arch/arm64/kvm/hyp/nvhe/hyp-init.S | 21 ++---
>  arch/arm64/kvm/hyp/nvhe/hyp-main.c |  2 ++
>  6 files changed, 41 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_asm.h 
> b/arch/arm64/include/asm/kvm_asm.h
> index 54387ccd1ab2..01904e88cead 100644
> --- a/arch/arm64/include/asm/kvm_asm.h
> +++ b/arch/arm64/include/asm/kvm_asm.h
> @@ -150,6 +150,13 @@ extern void *__vhe_undefined_symbol;
>  
>  #endif
>  
> +struct kvm_nvhe_init_params {
> + unsigned long tpidr_el2;
> + unsigned long vector_hyp_va;
> + unsigned long stack_hyp_va;
> + phys_addr_t pgd_pa;
> +};
> +
>  /* Translate a kernel address @ptr into its equivalent linear mapping */
>  #define kvm_ksym_ref(ptr)\
>   ({  \
> diff --git a/arch/arm64/include/asm/kvm_hyp.h 
> b/arch/arm64/include/asm/kvm_hyp.h
> index 6b664de5ec1f..a3289071f3d8 100644
> --- a/arch/arm64/include/asm/kvm_hyp.h
> +++ b/arch/arm64/include/asm/kvm_hyp.h
> @@ -15,6 +15,10 @@
>  DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
>  DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
>  
> +#ifdef __KVM_NVHE_HYPERVISOR__
> +DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
> +#endif

I'm not sure we should bother with this #ifdefery. Having the
declaration present at all times doesn't really hurt, since it is only
defined in the HYP code. Cutting down on the conditionals would
certainly help readability.

> +
>  #define read_sysreg_elx(r,nvh,vh)\
>   ({  \
>   u64 reg;\
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index 7d32fc959b1a..4435ad8be938 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -110,6 +110,10 @@ int main(void)
>DEFINE(CPU_APGAKEYLO_EL1,  offsetof(struct kvm_cpu_context, 
> sys_regs[APGAKEYLO_EL1]));
>DEFINE(HOST_CONTEXT_VCPU,  offsetof(struct kvm_cpu_context, 
> __hyp_running_vcpu));
>DEFINE(HOST_DATA_CONTEXT,  offsetof(struct kvm_host_data, host_ctxt));
> +  DEFINE(NVHE_INIT_TPIDR_EL2,offsetof(struct kvm_nvhe_init_params, 
> tpidr_el2));
> +  DEFINE(NVHE_INIT_VECTOR_HYP_VA,offsetof(struct kvm_nvhe_init_params, 
> vector_hyp_va));
> +  DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, 
> stack_hyp_va));
> +  DEFINE(NVHE_INIT_PGD_PA,   offsetof(struct kvm_nvhe_init_params, pgd_pa));
>  #endif
>  #ifdef CONFIG_CPU_PM
>DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp));
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index c0ffb019ca8b..4838556920fb 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -50,6 +50,7 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
>  
>  static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
>  unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
> +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
>  
>  /* The VMID used in the VTTBR */
>  static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
> @@ -1347,10 +1348,7 @@ static int kvm_map_vectors(void)
>  
>  static void cpu_init_hyp_mode(void)
>  {
> - phys_addr_t pgd_ptr;
> - unsigned long hyp_stack_ptr;
> - unsigned long vector_ptr;
> - unsigned long tpidr_el2;
> + struct kvm_nvhe_init_params *params = 
> this_cpu_ptr_nvhe_sym(kvm_init_params);
>   struct arm_smccc_res res;
>  
>   /* Switch from the HYP stub to our own HYP init vector */
> @@ -1361,13 +1359,18 @@ static void cpu_init_hyp_mode(void)
>* kernel's mapping to the linear mapping, and store it in tpidr_el2
>* so that we can use adr_l to access per-cpu variables in EL2.
>*/
> - tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
> - (unsigned 
> long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
> + params->tpidr_el2 = (unsigned 
> long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
> + 

Re: [PATCH 1/2] drm/meson: dw-hdmi: Disable clocks on driver teardown

2020-11-23 Thread Marc Zyngier

On 2020-11-23 14:03, Jerome Brunet wrote:

On Fri 20 Nov 2020 at 10:42, Marc Zyngier  wrote:


The HDMI driver requests clocks early, but never disables them, leaving
the clocks on even when the driver is removed.

Fix it by slightly refactoring the clock code, and registering a devm
action that will eventually disable/unprepare the enabled clocks.

Signed-off-by: Marc Zyngier 
---
 drivers/gpu/drm/meson/meson_dw_hdmi.c | 43 
++-

 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c 
b/drivers/gpu/drm/meson/meson_dw_hdmi.c

index 7f8eea494147..29623b309cb1 100644
--- a/drivers/gpu/drm/meson/meson_dw_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c
@@ -145,8 +145,6 @@ struct meson_dw_hdmi {
struct reset_control *hdmitx_apb;
struct reset_control *hdmitx_ctrl;
struct reset_control *hdmitx_phy;
-   struct clk *hdmi_pclk;
-   struct clk *venci_clk;
struct regulator *hdmi_supply;
u32 irq_stat;
struct dw_hdmi *hdmi;
@@ -946,6 +944,29 @@ static void meson_disable_regulator(void *data)
regulator_disable(data);
 }

+static void meson_disable_clk(void *data)
+{
+   clk_disable_unprepare(data);
+}
+
+static int meson_enable_clk(struct device *dev, char *name)
+{
+   struct clk *clk;
+   int ret;
+
+   clk = devm_clk_get(dev, name);
+   if (IS_ERR(clk)) {
+   dev_err(dev, "Unable to get %s pclk\n", name);
+   return PTR_ERR(clk);
+   }
+
+   ret = clk_prepare_enable(clk);
+   if (!ret)
+   ret = devm_add_action_or_reset(dev, meson_disable_clk, clk);


Thanks for fixing this Marc.

FYI, while it is fine to declare a function to disable the clocks, a
quick cast may avoid it:

devm_add_action_or_reset(dev, (void (*)(void *))clk_disable_unprepare, clk);


While this works for now, a change to the clk_disable_unprepare()
prototype (such as adding a second argument) would now go completely
unnoticed (after all, you've cast the function, it *must* be correct,
right?), and someone would spend a few hours trying to track down memory
corruption or some other interesting results.

Yes, casting C functions can be hilarious.

I can see a few uses of this hack in the tree, and I have my pop-corn
ready.
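
To spell out the failure mode (hypothetical prototype change, not a
real API):

/* Imagine clk_disable_unprepare() grows a second parameter one day: */
void clk_disable_unprepare(struct clk *clk, bool force);

/*
 * The cast below still compiles without a peep, and 'force' ends up
 * being whatever garbage sits in the second argument register:
 */
devm_add_action_or_reset(dev, (void (*)(void *))clk_disable_unprepare, clk);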

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v2 04/24] arm64: Move MAIR_EL1_SET to asm/memory.h

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:42:58 +,
David Brazdil  wrote:
> 
> KVM currently initializes MAIR_EL2 to the value of MAIR_EL1. In
> preparation for initializing MAIR_EL2 before MAIR_EL1, move the constant
> into a shared header file. Since it is used for EL1 and EL2, rename to
> MAIR_ELx_SET.
> 
> Signed-off-by: David Brazdil 
> ---
>  arch/arm64/include/asm/memory.h | 29 ++---
>  arch/arm64/include/asm/sysreg.h | 30 ++
>  arch/arm64/mm/proc.S| 15 +--
>  3 files changed, 45 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
> index cd61239bae8c..8ae8fd883a0c 100644
> --- a/arch/arm64/include/asm/memory.h
> +++ b/arch/arm64/include/asm/memory.h
> @@ -13,6 +13,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  /*
>   * Size of the PCI I/O space. This must remain a power of two so that
> @@ -124,21 +125,6 @@
>   */
>  #define SEGMENT_ALIGN SZ_64K
>  
> -/*
> - * Memory types available.
> - *
> - * IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in
> - * the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note
> - * that protection_map[] only contains MT_NORMAL attributes.
> - */
> -#define MT_NORMAL 0
> -#define MT_NORMAL_TAGGED 1
> -#define MT_NORMAL_NC 2
> -#define MT_NORMAL_WT 3
> -#define MT_DEVICE_nGnRnE 4
> -#define MT_DEVICE_nGnRE  5
> -#define MT_DEVICE_GRE 6
> -
>  /*
>   * Memory types for Stage-2 translation
>   */
> @@ -152,6 +138,19 @@
>  #define MT_S2_FWB_NORMAL 6
>  #define MT_S2_FWB_DEVICE_nGnRE   1
>  
> +/*
> + * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory 
> and
> + * changed during __cpu_setup to Normal Tagged if the system supports MTE.
> + */
> +#define MAIR_ELx_SET \
> + (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) |  \
> +  MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) |\
> +  MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) |\
> +  MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) |  \
> +  MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |\
> +  MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) |  \
> +  MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED))
> +
>  #ifdef CONFIG_ARM64_4K_PAGES
>  #define IOREMAP_MAX_ORDER(PUD_SHIFT)
>  #else
> diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
> index e2ef4c2edf06..24e773414cb4 100644
> --- a/arch/arm64/include/asm/sysreg.h
> +++ b/arch/arm64/include/asm/sysreg.h
> @@ -635,6 +635,34 @@
>  /* Position the attr at the correct index */
>  #define MAIR_ATTRIDX(attr, idx)  ((attr) << ((idx) * 8))
>  
> +/*
> + * Memory types available.
> + *
> + * IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in
> + * the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note
> + * that protection_map[] only contains MT_NORMAL attributes.
> + */
> +#define MT_NORMAL 0
> +#define MT_NORMAL_TAGGED 1
> +#define MT_NORMAL_NC 2
> +#define MT_NORMAL_WT 3
> +#define MT_DEVICE_nGnRnE 4
> +#define MT_DEVICE_nGnRE  5
> +#define MT_DEVICE_GRE 6
> +
> +/*
> + * Default MAIR_ELx. MT_NORMAL_TAGGED is initially mapped as Normal memory 
> and
> + * changed during __cpu_setup to Normal Tagged if the system supports MTE.
> + */
> +#define MAIR_ELx_SET \
> + (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) |  \
> +  MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) |\
> +  MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) |\
> +  MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) |  \
> +  MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |\
> +  MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) |  \
> +  MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED))
> +
>  /* id_aa64isar0 */
>  #define ID_AA64ISAR0_RNDR_SHIFT  60
>  #define ID_AA64ISAR0_TLB_SHIFT   56
> @@ -992,6 +1020,7 @@
>  /* Safe value for MPIDR_EL1: Bit31:RES1, Bit30:U:0, Bit24:MT:0 */
>  #define SYS_MPIDR_SAFE_VAL   (BIT(31))
>  
> +#ifndef LINKER_SCRIPT

This is terribly ugly. Why is this included by the linker script? Does
it actually define __ASSEMBLY__?

>  #ifdef __ASSEMBLY__
>  
>   .irp
> num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
> @@ -1109,5 +1138,6 @@
>  })
>  
>  #endif
> +#endif   /* LINKER_SCRIPT */
>  
>  #endif   /* __ASM_SYSREG_H */
> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> index 23c326a06b2d..e3b9aa372b96 100644
> --- a/arch/arm64/mm/proc.S
> +++ b/arch/arm64/mm

Re: [PATCH v2 02/24] psci: Accessor for configured PSCI function IDs

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:42:56 +,
David Brazdil  wrote:
> 
> Function IDs used by PSCI are configurable for v0.1 via DT/ACPI. If the
> host is using PSCI v0.1, KVM's host PSCI proxy needs to use the same IDs.
> Expose the array holding the information with a read-only accessor.
> 
> Signed-off-by: David Brazdil 
> ---
>  drivers/firmware/psci/psci.c | 14 ++
>  include/linux/psci.h | 10 ++
>  2 files changed, 16 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
> index 213c68418a65..d835f3d8b121 100644
> --- a/drivers/firmware/psci/psci.c
> +++ b/drivers/firmware/psci/psci.c
> @@ -58,16 +58,14 @@ typedef unsigned long (psci_fn)(unsigned long, unsigned 
> long,
>   unsigned long, unsigned long);
>  static psci_fn *invoke_psci_fn;
>  
> -enum psci_function {
> - PSCI_FN_CPU_SUSPEND,
> - PSCI_FN_CPU_ON,
> - PSCI_FN_CPU_OFF,
> - PSCI_FN_MIGRATE,
> - PSCI_FN_MAX,
> -};
> -
>  static u32 psci_function_id[PSCI_FN_MAX];
>  
> +u32 psci_get_function_id(enum psci_function fn)
> +{
> + WARN_ON(fn >= PSCI_FN_MAX);

If we are going to warn on something that is out of bounds, maybe we
shouldn't perform the access at all? And maybe check for lower bound
as well?
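
i.e. something like (sketch):

u32 psci_get_function_id(enum psci_function fn)
{
	if (WARN_ON_ONCE(fn < 0 || fn >= PSCI_FN_MAX))
		return 0;

	return psci_function_id[fn];
}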

> + return psci_function_id[fn];
> +}
> +
>  #define PSCI_0_2_POWER_STATE_MASK\
>   (PSCI_0_2_POWER_STATE_ID_MASK | \
>   PSCI_0_2_POWER_STATE_TYPE_MASK | \
> diff --git a/include/linux/psci.h b/include/linux/psci.h
> index 2a1bfb890e58..5b49a5c82d6f 100644
> --- a/include/linux/psci.h
> +++ b/include/linux/psci.h
> @@ -21,6 +21,16 @@ bool psci_power_state_is_valid(u32 state);
>  int psci_set_osi_mode(bool enable);
>  bool psci_has_osi_support(void);
>  
> +enum psci_function {
> + PSCI_FN_CPU_SUSPEND,
> + PSCI_FN_CPU_ON,
> + PSCI_FN_CPU_OFF,
> + PSCI_FN_MIGRATE,
> + PSCI_FN_MAX,
> +};
> +
> +u32 psci_get_function_id(enum psci_function fn);
> +
>  struct psci_operations {
>   u32 (*get_version)(void);
>   int (*cpu_suspend)(u32 state, unsigned long entry_point);
> -- 
> 2.29.2.299.gdc1121823c-goog
> 
> 

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2 00/24] Opt-in always-on nVHE hypervisor

2020-11-23 Thread Marc Zyngier
On Mon, 16 Nov 2020 20:42:54 +,
David Brazdil  wrote:
> 
> As we progress towards being able to keep guest state private to the
> host running nVHE hypervisor, this series allows the hypervisor to
> install itself on newly booted CPUs before the host is allowed to run
> on them.
> 
> All functionality described below is opt-in, guarded by an early param
> 'kvm-arm.protected'. Future patches specific to the new "protected" mode
> should be hidden behind the same param.
> 
> The hypervisor starts trapping host SMCs and intercepting host's PSCI
> CPU_ON/SUSPEND calls. It replaces the host's entry point with its own,
> initializes the EL2 state of the new CPU and installs the nVHE hyp vector
> before ERETing to the host's entry point.
> 
> The kernel checks new cores' features against the finalized system
> capabilities. To avoid the need to move this code/data to EL2, the
> implementation only allows booting cores that were online at the time of
> KVM initialization and therefore had been checked already.
> 
> Other PSCI SMCs are forwarded to EL3, though only the known set of SMCs
> implemented in the kernel is allowed. Non-PSCI SMCs are also forwarded
> to EL3. Future changes will need to ensure the safety of all SMCs wrt.
> private guests.
> 
> The host is still allowed to reset EL2 back to the stub vector, eg. for
> hibernation or kexec, but will not disable nVHE when there are no VMs.
> 
> Tested on Rock Pi 4b, based on 5.10-rc4.

Adding Lorenzo and Sudeep for the PSCI side of things.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2 1/3] genirq/affinity: Add irq_update_affinity_desc()

2020-11-23 Thread Marc Zyngier

Hi John,

On 2020-11-23 12:54, John Garry wrote:

Hi Marc,


Right, but if the driver is removed then the interrupts should be
deallocated, right?



When removing the driver we just call free_irq(), which removes the
handler and disables the interrupt.

But about the irq_desc, this is created when the mapping is created in
irq_create_fwspec_mapping(), and I don't see this being torn down in
the driver removal, so persistent in that regard.


If the irq_descs are created via the platform_get_irq() calls in
platform_get_irqs_affinity(), I'd expect some equivalent helper to
tear things down as a result, calling irq_dispose_mapping() behind the
scenes.



So this looks lacking in the kernel AFAICS...

So is there a reason for which irq dispose mapping is not a
requirement for drivers when finished with the irq? because of shared
interrupts?


For a bunch of reasons: IRQ number used to be created 1:1 with their
physical counterpart, so there wasn't a need to "get rid" of the
associated data structures. Also, people expected their drivers
to be there for the lifetime of the system (believe it or not,
hotplug devices are pretty new!).

Shared interrupts are just another part of the problem (although we
should be able to work out whether there is any action attached to
the descriptor before blowing it away).




So for pci msi I can see that we free the irq_desc in
pci_disable_msi() -> free_msi_irqs() -> msi_domain_free_irqs() ...

So what I am missing here?


I'm not sure the paths are strictly equivalent. On the PCI side, we
can have something that completely driver agnostic, as it is all
architectural. In your case, only the endpoint driver knows about what
happens, and needs to free things accordingly.

Finally, there is the issue in your driver that everything is
requested using devm_request_irq, which cannot play nicely with an
explicit irq_desc teardown. You'll probably need to provide the
equivalent devm helpers for your driver to safely be taken down.



Yeah, so since we use the devm irq request method, we also need a devm
dispose release method as we can't dispose the irq mapping in the
remove() method, prior to the irq_free() in the later devm release
method.

But it looks like there is more to it than that, which I'm worried is
far from trivial. For example, just calling irq_dispose_mapping()
for removal and then platform_get_irq()->acpi_get_irq() a second time
fails as it looks like more tidy-up is needed for removal...


Most probably. I could imagine things failing if there is any trace
of an existing translation in the ITS or in the platform-MSI layer,
for example, or if the interrupt is still active...

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v15 6/9] arm64/kvm: Add hypercall service for kvm ptp.

2020-11-23 Thread Marc Zyngier

On 2020-11-23 10:44, Marc Zyngier wrote:

On 2020-11-11 06:22, Jianyong Wu wrote:

ptp_kvm will get this service through an SMCCC call.
The service offers wall time and cycle count of host to guest.
The caller must specify whether they want the host cycle count
or the difference between host cycle count and cntvoff.

Signed-off-by: Jianyong Wu 
---
 arch/arm64/kvm/hypercalls.c | 61 
+

 include/linux/arm-smccc.h   | 17 +++
 2 files changed, 78 insertions(+)

diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index b9d8607083eb..f7d189563f3d 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -9,6 +9,51 @@
 #include 
 #include 

+static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
+{
+   struct system_time_snapshot systime_snapshot;
+   u64 cycles = ~0UL;
+   u32 feature;
+
+   /*
+* system time and counter value must be captured at the same
+* time to keep consistency and precision.
+*/
+   ktime_get_snapshot(&systime_snapshot);
+
+   // binding ptp_kvm clocksource to arm_arch_counter
+   if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
+   return;
+
+   val[0] = upper_32_bits(systime_snapshot.real);
+   val[1] = lower_32_bits(systime_snapshot.real);


What is the endianness of these values? I can't see it defined
anywhere, and this is likely not to work if guest and hypervisor
don't align.


Scratch that. This is all passed via registers, so the endianness
of the data is irrelevant. Please discard any comment about endianness
I made in this review.

The documentation aspect still requires to be beefed up.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v15 8/9] doc: add ptp_kvm introduction for arm64 support

2020-11-23 Thread Marc Zyngier

On 2020-11-11 06:22, Jianyong Wu wrote:

The PTP_KVM implementation depends on a hypercall using SMCCC, so we
introduce a new SMCCC service ID. This doc explains how the ID is
defined and how PTP_KVM works on arm/arm64.

Signed-off-by: Jianyong Wu 
---
 Documentation/virt/kvm/api.rst |  9 +++
 Documentation/virt/kvm/arm/index.rst   |  1 +
 Documentation/virt/kvm/arm/ptp_kvm.rst | 29 +
 Documentation/virt/kvm/timekeeping.rst | 35 ++
 4 files changed, 74 insertions(+)
 create mode 100644 Documentation/virt/kvm/arm/ptp_kvm.rst

diff --git a/Documentation/virt/kvm/api.rst 
b/Documentation/virt/kvm/api.rst

index 36d5f1f3c6dd..9843dbcbf770 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6391,3 +6391,12 @@ When enabled, KVM will disable paravirtual
features provided to the
 guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf
 (0x4001). Otherwise, a guest may use the paravirtual features
 regardless of what has actually been exposed through the CPUID leaf.
+
+8.27 KVM_CAP_PTP_KVM
+
+
+:Architectures: arm64
+
+This capability indicates that the KVM virtual PTP service is supported
+in the host. It must be accompanied by the implementation of the KVM
+virtual PTP service in the host, so the VMM can probe for the service
+by checking this capability.

diff --git a/Documentation/virt/kvm/arm/index.rst
b/Documentation/virt/kvm/arm/index.rst
index 3e2b2aba90fc..78a9b670aafe 100644
--- a/Documentation/virt/kvm/arm/index.rst
+++ b/Documentation/virt/kvm/arm/index.rst
@@ -10,3 +10,4 @@ ARM
hyp-abi
psci
pvtime
+   ptp_kvm
diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst
b/Documentation/virt/kvm/arm/ptp_kvm.rst
new file mode 100644
index ..bb1e6cfefe44
--- /dev/null
+++ b/Documentation/virt/kvm/arm/ptp_kvm.rst
@@ -0,0 +1,29 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PTP_KVM support for arm/arm64
+=
+
+PTP_KVM is used for high precision time sync between guest and host.
+It needs to get the wall time and counter value from the host and
+transfer these to the guest via a hypercall service, so one more
+hypercall service has been added.

+
+This new SMCCC hypercall is defined as:
+
+* ARM_SMCCC_HYP_KVM_PTP_FUNC_ID: 0x8601
+
+As both 32 and 64-bit ptp_kvm clients should be supported, we choose the
+SMC32/HVC32 calling convention.
+
+ARM_SMCCC_HYP_KVM_PTP_FUNC_ID:
+
+Function ID:  (uint32)  0x8601
+Arguments:    (uint32)  ARM_PTP_PHY_COUNTER(1) or ARM_PTP_VIRT_COUNTER(0),
+                        which indicate acquiring the physical counter or
+                        the virtual counter respectively.
+Return value: (uint32)  NOT_SUPPORTED(-1), or val0 and val1 representing
+                        the wall clock time and val2 and val3 representing
+                        the counter cycle.

+   counter cycle.


This needs a lot more description:

- Which word contains what part of the data (upper/lower part of the 
64bit data)

- The endianness of the data returned
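
For illustration, based on the register layout described above, the
guest side would presumably reassemble the values along these lines
(sketch; invocation details assumed):

	struct arm_smccc_res res;
	u64 ktime, cycles;

	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
			     ARM_PTP_VIRT_COUNTER, &res);

	ktime  = (u64)res.a0 << 32 | res.a1;	/* wall clock time */
	cycles = (u64)res.a2 << 32 | res.a3;	/* counter cycles */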

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v15 7/9] ptp: arm/arm64: Enable ptp_kvm for arm/arm64

2020-11-23 Thread Marc Zyngier

On 2020-11-11 06:22, Jianyong Wu wrote:
Currently, there is no mechanism to keep time sync between guest and host
in arm/arm64 virtualization environment. Time in guest will drift compared
with host after boot up as they may both use third party time sources
to correct their time respectively. The time deviation will be in order
of milliseconds. But in some scenarios, like in cloud envirenment, we ask


environment


for higher time precision.

kvm ptp clock, which chooses the host clock source as a reference
clock to sync time between guest and host, has been adopted by x86
which takes the time sync order from milliseconds to nanoseconds.

This patch enables kvm ptp clock for arm/arm64 and improves clock sync 
precison


precision


significantly.

Test result comparisons with and without kvm ptp clock in arm/arm64
are as follows. This test is derived from the output of the command
'chronyc sources'. We should pay particular attention to the last sample
column, which shows the offset between the local clock and the source
at the last measurement.


no kvm ptp in guest:
MS Name/IP address   Stratum Poll Reach LastRx Last sample

^* dns1.synet.edu.cn  2   6   37713  +1040us[+1581us] +/-   
21ms
^* dns1.synet.edu.cn  2   6   37721  +1040us[+1581us] +/-   
21ms
^* dns1.synet.edu.cn  2   6   37729  +1040us[+1581us] +/-   
21ms
^* dns1.synet.edu.cn  2   6   37737  +1040us[+1581us] +/-   
21ms
^* dns1.synet.edu.cn  2   6   37745  +1040us[+1581us] +/-   
21ms
^* dns1.synet.edu.cn  2   6   37753  +1040us[+1581us] +/-   
21ms
^* dns1.synet.edu.cn  2   6   37761  +1040us[+1581us] +/-   
21ms
^* dns1.synet.edu.cn  2   6   377 4   -130us[ +796us] +/-   
21ms
^* dns1.synet.edu.cn  2   6   37712   -130us[ +796us] +/-   
21ms
^* dns1.synet.edu.cn  2   6   37720   -130us[ +796us] +/-   
21ms


in host:
MS Name/IP address   Stratum Poll Reach LastRx Last sample

^* 120.25.115.20  2   7   37772   -470us[ -603us] +/-   
18ms
^* 120.25.115.20  2   7   37792   -470us[ -603us] +/-   
18ms
^* 120.25.115.20  2   7   377   112   -470us[ -603us] +/-   
18ms
^* 120.25.115.20  2   7   377 2   +872ns[-6808ns] +/-   
17ms
^* 120.25.115.20  2   7   37722   +872ns[-6808ns] +/-   
17ms
^* 120.25.115.20  2   7   37743   +872ns[-6808ns] +/-   
17ms
^* 120.25.115.20  2   7   37763   +872ns[-6808ns] +/-   
17ms
^* 120.25.115.20  2   7   37783   +872ns[-6808ns] +/-   
17ms
^* 120.25.115.20  2   7   377   103   +872ns[-6808ns] +/-   
17ms
^* 120.25.115.20  2   7   377   123   +872ns[-6808ns] +/-   
17ms


dns1.synet.edu.cn is the network reference clock for the guest and
120.25.115.20 is the network reference clock for the host. We can't get
the clock error between guest and host directly, but a rough estimate
will be in the order of hundreds of us to ms.

with kvm ptp in guest:
chrony has been disabled in the host to remove the disturbance from the
network clock.


MS Name/IP address Stratum Poll Reach LastRx Last sample

* PHC00   3   377 8 -7ns[   +1ns] +/-
3ns
* PHC00   3   377 8 +1ns[  +16ns] +/-
3ns
* PHC00   3   377 6 -4ns[   -0ns] +/-
6ns
* PHC00   3   377 6 -8ns[  -12ns] +/-
5ns
* PHC00   3   377 5 +2ns[   +4ns] +/-
4ns
* PHC00   3   37713 +2ns[   +4ns] +/-
4ns
* PHC00   3   37712 -4ns[   -6ns] +/-
4ns
* PHC00   3   37711 -8ns[  -11ns] +/-
6ns
* PHC00   3   37710-14ns[  -20ns] +/-
4ns
* PHC00   3   377 8 +4ns[   +5ns] +/-
4ns


PHC0 is the ptp clock which chooses the host clock as its source
clock. So we can see that the clock difference between host and guest
is in the order of ns.

Signed-off-by: Jianyong Wu 
---
 drivers/clocksource/arm_arch_timer.c | 28 ++
 drivers/ptp/Kconfig  |  2 +-
 drivers/ptp/Makefile |  1 +
 drivers/ptp/ptp_kvm_arm.c| 44 
 4 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 drivers/ptp/ptp_kvm_arm.c

diff --git a/drivers/clocksource/arm_arch_timer.c
b/drivers/clocksource/arm_arch_timer.c
index d55acffb0b90..b33c5a663d30 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 
 #include 
@@ -1650,3 +1651,30 @@ static int __init arch_timer_acpi_init(struct
acpi_table_header *table)
 }
 TIMER

Re: [PATCH v15 6/9] arm64/kvm: Add hypercall service for kvm ptp.

2020-11-23 Thread Marc Zyngier

On 2020-11-11 06:22, Jianyong Wu wrote:

ptp_kvm will get this service through an SMCCC call.
The service offers wall time and cycle count of host to guest.
The caller must specify whether they want the host cycle count
or the difference between host cycle count and cntvoff.

Signed-off-by: Jianyong Wu 
---
 arch/arm64/kvm/hypercalls.c | 61 +
 include/linux/arm-smccc.h   | 17 +++
 2 files changed, 78 insertions(+)

diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index b9d8607083eb..f7d189563f3d 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -9,6 +9,51 @@
 #include 
 #include 

+static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
+{
+   struct system_time_snapshot systime_snapshot;
+   u64 cycles = ~0UL;
+   u32 feature;
+
+   /*
+* system time and counter value must be captured at the same
+* time to keep consistency and precision.
+*/
+   ktime_get_snapshot(&systime_snapshot);
+
+   // binding ptp_kvm clocksource to arm_arch_counter
+   if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
+   return;
+
+   val[0] = upper_32_bits(systime_snapshot.real);
+   val[1] = lower_32_bits(systime_snapshot.real);


What is the endianness of these values? I can't see it defined
anywhere, and this is likely not to work if guest and hypervisor
don't align.


+
+   /*
+* Whether the virtual counter or the physical counter is
+* being asked for is decided by the r1 value of the SMCCC
+* call. If an invalid r1 value is offered, the default cycle
+* value (-1) will be returned.
+* Note: keep in mind that feature is u32 and smccc_get_arg1
+* will return u64, so need auto cast here.
+*/
+   feature = smccc_get_arg1(vcpu);
+   switch (feature) {
+   case ARM_PTP_VIRT_COUNTER:
+		cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, 
CNTVOFF_EL2);

+   break;
+   case ARM_PTP_PHY_COUNTER:
+   cycles = systime_snapshot.cycles;
+   break;
+   case ARM_PTP_NONE_COUNTER:


What is this "NONE" counter?


+   break;
+   default:
+   val[0] = SMCCC_RET_NOT_SUPPORTED;
+   break;
+   }
+   val[2] = upper_32_bits(cycles);
+   val[3] = lower_32_bits(cycles);


Same problem as above.


+}
+
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
u32 func_id = smccc_get_function(vcpu);
@@ -79,6 +124,22 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
break;
case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_KVM_PTP);
+   break;
+   /*
+* This serves virtual kvm_ptp.
+* Four values will be passed back.
+* reg0 stores high 32-bits of host ktime;
+* reg1 stores low 32-bits of host ktime;
+* For ARM_PTP_VIRT_COUNTER:
+* reg2 stores high 32-bits of difference of host cycles and cntvoff;
+* reg3 stores low 32-bits of difference of host cycles and cntvoff.
+* For ARM_PTP_PHY_COUNTER:
+* reg2 stores the high 32-bits of host cycles;
+* reg3 stores the low 32-bits of host cycles.
+*/
+   case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+   kvm_ptp_get_time(vcpu, val);
break;
default:
return kvm_psci_call(vcpu);
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index d75408141137..a03c5dd409d3 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -103,6 +103,7 @@

 /* KVM "vendor specific" services */
 #define ARM_SMCCC_KVM_FUNC_FEATURES 0
+#define ARM_SMCCC_KVM_FUNC_KVM_PTP 1


I think having KVM once in the name is enough.


 #define ARM_SMCCC_KVM_FUNC_FEATURES_2  127
 #define ARM_SMCCC_KVM_NUM_FUNCS128

@@ -114,6 +115,22 @@

 #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED   1

+/*
+ * ptp_kvm is a feature used for time sync between vm and host.
+ * ptp_kvm module in guest kernel will get service from host using
+ * this hypercall ID.
+ */
+#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID   \
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+  ARM_SMCCC_SMC_32,\
+  ARM_SMCCC_OWNER_VENDOR_HYP,  \
+  ARM_SMCCC_KVM_FUNC_KVM_PTP)
+
+/* ptp_kvm counter type ID */
+#define ARM_PTP_VIRT_COUNTER   0
+#define ARM_PTP_PHY_COUNTER 1
+#define ARM_PTP_NONE_COUNTER   2


The architecture definitely doesn't have this last counter.


+
 /* Paravirtualised time calls (defined by ARM DEN0057A) */
 #define ARM_SMCCC_HV_PV_TIME_FEATURES  \
  

Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-11-23 Thread Marc Zyngier

On 2020-11-23 06:54, Shenming Lu wrote:

From: Zenghui Yu 

When setting the forwarding path of a VLPI, it is more consistent to


I'm not sure it is more consistent. It is a *new* behaviour, because it
only matters for migration, which has so far been unsupported.

also transfer the pending state from irq->pending_latch to VPT (especially
in migration, the pending states of VLPIs are restored into kvm’s vgic
first). And we currently send "INT+VSYNC" to trigger a VLPI to pending.

Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c 
b/arch/arm64/kvm/vgic/vgic-v4.c

index b5fa73c9fd35..cc3ab9cea182 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -418,6 +418,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, 
int virq,

irq->host_irq= virq;
atomic_inc(&map.vpe->vlpi_count);

+   /* Transfer pending state */
+   ret = irq_set_irqchip_state(irq->host_irq,
+   IRQCHIP_STATE_PENDING,
+   irq->pending_latch);
+   WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+   /*
+* Let it be pruned from ap_list later and don't bother
+* the List Register.
+*/
+   irq->pending_latch = false;


It occurs to me that calling into irq_set_irqchip_state() for a large
number of interrupts can take a significant amount of time. It is also
odd that you dump the VPT with the VPE unmapped, but rely on the VPE
being mapped for the opposite operation.

Shouldn't these be symmetric, all performed while the VPE is unmapped?
It would also save a lot of ITS traffic.
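
With the VPE unmapped, the pending bit could be written directly into
the VPT page instead, mirroring the peek in patch 1/4 (sketch):

static void its_set_vpt_pending(struct its_vpe *vpe, irq_hw_number_t hwirq)
{
	u8 *pt = page_address(vpe->vpt_page);

	pt[hwirq / BITS_PER_BYTE] |= BIT(hwirq % BITS_PER_BYTE);
}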


+
 out:
mutex_unlock(&its->its_lock);
return ret;


Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [RFC PATCH v1 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2020-11-23 Thread Marc Zyngier

On 2020-11-23 06:54, Shenming Lu wrote:

After pausing all vCPUs and devices capable of interrupting, in order

^
See my comment below about this.


to save the information of all interrupts, besides flushing the pending
states in kvm’s vgic, we also try to flush the states of VLPIs in the
virtual pending tables into guest RAM, but we need to have GICv4.1 and
safely unmap the vPEs first.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v3.c | 62 +++
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v3.c 
b/arch/arm64/kvm/vgic/vgic-v3.c

index 9cdf39a94a63..e1b3aa4b2b12 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only

 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -356,6 +358,39 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
*kvm, struct vgic_irq *irq)
return 0;
 }

+/*
+ * With GICv4.1, we can get the VLPI's pending state after unmapping
+ * the vPE. The deactivation of the doorbell interrupt will trigger
+ * the unmapping of the associated vPE.
+ */
+static void get_vlpi_state_pre(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   if (!kvm_vgic_global_state.has_gicv4_1)
+   return;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+   }
+}
+
+static void get_vlpi_state_post(struct vgic_dist *dist)


nit: the naming feels a bit... odd. Pre/post what?


+{
+   struct irq_desc *desc;
+   int i;
+
+   if (!kvm_vgic_global_state.has_gicv4_1)
+   return;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+   }
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest 
RAM

  * kvm lock and all vcpu lock must be held
@@ -365,14 +400,17 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq;
gpa_t last_ptr = ~(gpa_t)0;
-   int ret;
+   int ret = 0;
u8 val;

+   get_vlpi_state_pre(dist);
+
list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
int byte_offset, bit_nr;
struct kvm_vcpu *vcpu;
gpa_t pendbase, ptr;
bool stored;
+   bool is_pending = irq->pending_latch;

vcpu = irq->target_vcpu;
if (!vcpu)
@@ -387,24 +425,36 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
if (ptr != last_ptr) {
ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
if (ret)
-   return ret;
+   goto out;
last_ptr = ptr;
}

stored = val & (1U << bit_nr);
-   if (stored == irq->pending_latch)
+
+   /* also flush hw pending state */


This comment looks out of place, as we aren't flushing anything.


+   if (irq->hw) {
+   WARN_RATELIMIT(irq_get_irqchip_state(irq->host_irq,
+   IRQCHIP_STATE_PENDING, 
&is_pending),
+  "IRQ %d", irq->host_irq);


Isn't this going to warn like mad on a GICv4.0 system where this, by
definition, will generate an error?
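
One way out would be to gate the poke on v4.1 support (sketch):

		if (irq->hw && kvm_vgic_global_state.has_gicv4_1) {
			int err;

			err = irq_get_irqchip_state(irq->host_irq,
						    IRQCHIP_STATE_PENDING,
						    &is_pending);
			WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
		}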


+   }
+
+   if (stored == is_pending)
continue;

-   if (irq->pending_latch)
+   if (is_pending)
val |= 1 << bit_nr;
else
val &= ~(1 << bit_nr);

ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
-   return ret;
+   goto out;
}
-   return 0;
+
+out:
+   get_vlpi_state_post(dist);


This bit worries me: you have unmapped the VPEs, so any interrupt that
has been generated during that phase is now forever lost (the GIC
doesn't have ownership of the pending tables).

Do you really expect the VM to be restartable from that point? I don't
see how this is possible.


+
+   return ret;
 }

 /**


Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [RFC PATCH v1 1/4] irqchip/gic-v4.1: Plumb get_irqchip_state VLPI callback

2020-11-23 Thread Marc Zyngier

On 2020-11-23 06:54, Shenming Lu wrote:

From: Zenghui Yu 

Up to now, the irq_get_irqchip_state() callback of its_irq_chip has
been left unimplemented since there is no architectural way to get
the VLPI's pending state before GICv4.1. But v4.1 does provide one
for VLPIs.

With GICv4.1, after unmapping the vPE, which cleans and invalidates
any caching of the VPT, we can get the VLPI's pending state by


This is a crucial note: without this unmapping and invalidation,
the pending bits are not generally accessible (they could be cached
in a GIC private structure, cache or otherwise).


peeking at the VPT. So we implement the irq_get_irqchip_state()
callback of its_irq_chip to do it.

Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 drivers/irqchip/irq-gic-v3-its.c | 38 
 1 file changed, 38 insertions(+)

diff --git a/drivers/irqchip/irq-gic-v3-its.c 
b/drivers/irqchip/irq-gic-v3-its.c

index 0fec31931e11..287003cacac7 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1695,6 +1695,43 @@ static void its_irq_compose_msi_msg(struct
irq_data *d, struct msi_msg *msg)
iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg);
 }

+static bool its_peek_vpt(struct its_vpe *vpe, irq_hw_number_t hwirq)
+{
+   int mask = hwirq % BITS_PER_BYTE;


nit: this isn't a mask, but a shift instead. BIT(hwirq % BPB) would give
you a mask.
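
i.e. (sketch):

static bool its_peek_vpt(struct its_vpe *vpe, irq_hw_number_t hwirq)
{
	u8 *pt = page_address(vpe->vpt_page);

	return !!(pt[hwirq / BITS_PER_BYTE] & BIT(hwirq % BITS_PER_BYTE));
}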


+   void *va;
+   u8 *pt;
+
+   va = page_address(vpe->vpt_page);
+   pt = va + hwirq / BITS_PER_BYTE;
+
+   return !!(*pt & (1U << mask));
+}
+
+static int its_irq_get_irqchip_state(struct irq_data *d,
+enum irqchip_irq_state which, bool *val)
+{
+   struct its_device *its_dev = irq_data_get_irq_chip_data(d);
+   struct its_vlpi_map *map = get_vlpi_map(d);
+
+   if (which != IRQCHIP_STATE_PENDING)
+   return -EINVAL;
+
+   /* not intended for physical LPI's pending state */
+   if (!map)
+   return -EINVAL;
+
+   /*
+* In GICv4.1, a VMAPP with {V,Alloc}=={0,1} cleans and invalidates
+* any caching of the VPT associated with the vPEID held in the GIC.
+*/
+   if (!is_v4_1(its_dev->its) || atomic_read(&map->vpe->vmapp_count))


It isn't clear to me what prevents this from racing against a mapping of
the VPE. Actually, since we only hold the LPI irqdesc lock, I'm pretty
sure nothing prevents it.


+   return -EACCES;


I can sort of buy EACCES for a VPE that is currently mapped, but a
non-4.1 ITS should definitely return EINVAL.


+
+   *val = its_peek_vpt(map->vpe, map->vintid);
+
+   return 0;
+}
+
 static int its_irq_set_irqchip_state(struct irq_data *d,
 enum irqchip_irq_state which,
 bool state)
@@ -1975,6 +2012,7 @@ static struct irq_chip its_irq_chip = {
.irq_eoi= irq_chip_eoi_parent,
.irq_set_affinity   = its_set_affinity,
.irq_compose_msi_msg= its_irq_compose_msi_msg,
+   .irq_get_irqchip_state  = its_irq_get_irqchip_state,


My biggest issue with this is that it isn't a reliable interface.
It happens to work in the context of KVM, because you make sure it
is called at the right time, but that doesn't make it safe in general
(anyone with the interrupt number is allowed to call this at any time).

Is there a problem with poking at the VPT page from the KVM side?
The code should be exactly the same (maybe simpler even), and at least
you'd be guaranteed to be in the correct context.


.irq_set_irqchip_state  = its_irq_set_irqchip_state,
.irq_retrigger  = its_irq_retrigger,
.irq_set_vcpu_affinity  = its_irq_set_vcpu_affinity,


Thanks,

M.
--
Jazz is not dead. It just smells funny...


[GIT PULL] irqchip fixes for 5.10, take #2

2020-11-22 Thread Marc Zyngier
Hi Thomas,

Here a couple of patches from the irqchip department: a fix for a typo
leading to incorrect trigger information being used on the Exiu
driver, and another for save/restore with the GICv3 ITS.

Please pull,

M.

The following changes since commit d95bdca75b3fb41bf185efe164e05aed820081a5:

  irqchip/ti-sci-inta: Add support for unmapped event handling (2020-11-01 
12:00:50 +)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git 
tags/irqchip-fixes-5.10-2

for you to fetch changes up to 74cde1a53368aed4f2b4b54bf7030437f64a534b:

  irqchip/gic-v3-its: Unconditionally save/restore the ITS state on suspend 
(2020-11-22 12:58:35 +)


irqchip fixes for Linux 5.10, take #2

- Fix Exiu driver trigger type when using ACPI
- Fix GICv3 ITS suspend/resume to use the in-kernel path
  at all times, sidestepping braindead firmware support


Chen Baozi (1):
  irqchip/exiu: Fix the index of fwspec for IRQ type

Xu Qiang (1):
  irqchip/gic-v3-its: Unconditionally save/restore the ITS state on suspend

 drivers/irqchip/irq-gic-v3-its.c | 16 +++-
 drivers/irqchip/irq-sni-exiu.c   |  2 +-
 2 files changed, 4 insertions(+), 14 deletions(-)


Re: [PATCH 0/2] arm64: Allow the rescheduling IPI to bypass irq_enter/exit

2020-11-22 Thread Marc Zyngier
On Fri, 20 Nov 2020 14:17:12 +,
Thomas Gleixner  wrote:

Thomas,

> So with the pre 5.8 scheduler IPI we had scheduler_ipi() doing wonderful
> things

[... tons of interesting and helpful stuff ...]

> Hope that clarifies it.

It does in a way, as it maps some of the low-level x86 things onto
generic concepts. It certainly outlines that we have a lot of work
ahead of us, none of which can be expected to be a quick fix. It
requires restructuring a lot of the low-level arm64 code (Mark has
been toying with it for a couple of years now), and rejig it in funny
ways to match the expectations of the core code.

I haven't tried to think about 32bit ARM just yet, and we may have to
sever the IRQ ties that exist between the two architectures if we want
to make forward progress in a reasonable time frame. In general, it
looks that we'll need a new interface from the generic low-level IRQ
entry code into this new common framework.

> > If arm64 has forever been broken, I'd really like to know and fix it.
> 
> Define broken. It kinda works with all the warts and bolts in tracing,
> context tracking and other places. Is it entirely correct? Probably
> not.
>
> The scheduler IPI is trivial compared to the more interesting ones like
> NMI, MCE, breakpoint, debug exceptions etc. We found quite some
> interesting issues with all of that muck during our 'noinstr' chase.

Yup, point taken. However, given the complexity of the above and the
fact that we currently have a measurable performance regression in
5.10, I'll respin the original series with the cleanups you mentioned,
and the added note that we *really* need to sort this.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2 1/3] genirq/affinity: Add irq_update_affinity_desc()

2020-11-22 Thread Marc Zyngier
On Fri, 20 Nov 2020 11:52:09 +,
John Garry  wrote:
> 
> Hi Thomas,
> 
> >> Just mentioning a couple of things here, which could be a clue to what
> >> is going on:
> >> - the device is behind mbigen secondary irq controller
> >> - the flow in the LLDD is to allocate all 128 interrupts during probe,
> >> but we only register handlers for a subset with device managed API
> > Right, but if the driver is removed then the interrupts should be
> > deallocated, right?
> > 
> 
> When removing the driver we just call free_irq(), which removes the
> handler and disables the interrupt.
> 
> But about the irq_desc, this is created when the mapping is created in
> irq_create_fwspec_mapping(), and I don't see this being torn down in
> the driver removal, so persistent in that regard.

If the irq_descs are created via the platform_get_irq() calls in
platform_get_irqs_affinity(), I'd expect some equivalent helper to
tear things down as a result, calling irq_dispose_mapping() behind the
scenes.

> So for pci msi I can see that we free the irq_desc in
> pci_disable_msi() -> free_msi_irqs() -> msi_domain_free_irqs() ...
> 
> So what I am missing here?

I'm not sure the paths are strictly equivalent. On the PCI side, we
can have something that completely driver agnostic, as it is all
architectural. In your case, only the endpoint driver knows about what
happens, and needs to free things accordingly.

Finally, there is the issue in your driver that everything is
requested using devm_request_irq, which cannot play nicely with an
explicit irq_desc teardown. You'll probably need to provide the
equivalent devm helpers for your driver to safely be taken down.
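
Roughly along these lines (sketch; helper names hypothetical):

static void devm_irq_dispose(void *data)
{
	irq_dispose_mapping((unsigned long)data);
}

static int devm_platform_get_irq(struct platform_device *pdev,
				 unsigned int num)
{
	int ret, irq = platform_get_irq(pdev, num);

	if (irq < 0)
		return irq;

	/* Drop the mapping automatically when the device goes away. */
	ret = devm_add_action_or_reset(&pdev->dev, devm_irq_dispose,
				       (void *)(unsigned long)irq);
	if (ret)
		return ret;

	return irq;
}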

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH] irqchip/exiu: Fix the index of fwspec for IRQ type

2020-11-22 Thread Marc Zyngier
On Tue, 17 Nov 2020 11:20:15 +0800, Chen Baozi wrote:
> Since fwspec->param_count of ACPI node is two, the index of IRQ type
> in fwspec->param[] should be 1 rather than 2.

Applied to irq/irqchip-next, thanks!

[1/1] irqchip/exiu: Fix the index of fwspec for IRQ type
  commit: d001e41e1b15716e9b759df5ef00510699f85282

I added Fixes: and Cc: stable tags for a good measure.

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH -next] irq-chip/gic-v3-its: Fixed an issue where the ITS executes the residual commands in the queue again when the ITS wakes up from sleep mode.

2020-11-22 Thread Marc Zyngier
On Sat, 7 Nov 2020 10:42:26 +, Xu Qiang wrote:
> On my platform, ITS_FLAGS_SAVE_SUSPEND_STATE is not set, thus nothing is
> done in the ITS suspend and resume functions. On the other hand, firmware
> stores GITS_CTRL, GITS_CBASER, GITS_CWRITER and GITS_BASER on suspend,
> and restores these registers on resume. As a result, the ITS executes
> the residual commands in the queue.
> 
> Memory corruption may occur in the following scenarios:
> 
> [...]

Applied to irq/irqchip-next, thanks!

[1/1] irqchip/gic-v3-its: Unconditionally save/restore the ITS state on suspend
  commit: a51f7296f38f498c6f186c82ae3aa25ae10bb266

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH v7 00/17] Add support for Clang LTO

2020-11-21 Thread Marc Zyngier

On 2020-11-20 23:53, Nick Desaulniers wrote:

On Fri, Nov 20, 2020 at 3:30 PM Ard Biesheuvel  wrote:


On Fri, 20 Nov 2020 at 21:19, Nick Desaulniers 
 wrote:

>
> On Fri, Nov 20, 2020 at 2:30 AM Ard Biesheuvel  wrote:
> >
> > On Thu, 19 Nov 2020 at 00:42, Nick Desaulniers  
wrote:
> > >
> > > Thanks for continuing to drive this series Sami.  For the series,
> > >
> > > Tested-by: Nick Desaulniers 
> > >
> > > I did virtualized boot tests with the series applied to aarch64
> > > defconfig without CONFIG_LTO, with CONFIG_LTO_CLANG, and a third time
> > > with CONFIG_THINLTO.  If you make changes to the series in follow ups,
> > > please drop my tested by tag from the modified patches and I'll help
> > > re-test.  Some minor feedback on the Kconfig change, but I'll post it
> > > off of that patch.
> > >
> >
> > When you say 'virtualized" do you mean QEMU on x86? Or actual
> > virtualization on an AArch64 KVM host?
>
> aarch64 guest on x86_64 host.  If you have additional configurations
> that are important to you, additional testing help would be
> appreciated.
>

Could you run this on an actual phone? Or does Android already ship
with this stuff?


By `this`, if you mean "the LTO series", it has been shipping on
Android phones for years now, I think it's even required in the latest
release.

If you mean "the LTO series + mainline" on a phone, well there's the
android-mainline of https://android.googlesource.com/kernel/common/,
in which this series was recently removed in order to facilitate
rebasing Android's patches on ToT-mainline until getting the series
landed upstream.  Bit of a chicken and the egg problem there.

If you mean "the LTO series + mainline + KVM" on a phone; I don't know
the precise state of aarch64 KVM and Android (Will or Marc would
know).


If you are lucky enough to have an Android system booting at EL2,
KVM should just works [1], though I haven't tried with this series.


We did experiment recently with RockPIs for aarch64 KVM, IIRC;
I think Android is tricky as it still requires A64+A32/T32 chipsets,


Which is about 100% of the Android systems at the moment (I don't think
any of the asymmetric SoCs are in the wild yet). It doesn't really affect
KVM anyway.

 M.

[1] with the broken firmware gotchas that I believed to be eradicated
8 years ago, but are still prevalent in the Android world: laughable
PSCI implementation, invalid CNTFRQ_EL0...
--
Who you jivin' with that Cosmik Debris?


Re: [PATCH] random: avoid arch_get_random_seed_long() when collecting IRQ randomness

2020-11-20 Thread Marc Zyngier

On 2020-11-05 15:29, Ard Biesheuvel wrote:

When reseeding the CRNG periodically, arch_get_random_seed_long() is
called to obtain entropy from an architecture specific source if one
is implemented. In most cases, these are special instructions, but in
some cases, such as on ARM, we may want to back this using firmware
calls, which are considerably more expensive.

Another call to arch_get_random_seed_long() exists in the CRNG driver,
in add_interrupt_randomness(), which collects entropy by capturing
inter-interrupt timing and relying on interrupt jitter to provide
random bits. This is done by keeping a per-CPU state, and mixing in
the IRQ number, the cycle counter and the return address every time an
interrupt is taken, and mixing this per-CPU state into the entropy pool
every 64 invocations, or at least once per second. The entropy that is
gathered this way is credited as 1 bit of entropy. Every time this
happens, arch_get_random_seed_long() is invoked, and the result is
mixed in as well, and also credited with 1 bit of entropy.

This means that arch_get_random_seed_long() is called at least once
per second on every CPU, which seems excessive, and doesn't really
scale, especially in a virtualization scenario where CPUs may be
oversubscribed: in cases where arch_get_random_seed_long() is backed
by an instruction that actually goes back to a shared hardware entropy
source (such as RNDRRS on ARM), we will end up hitting it hundreds of
times per second.

So let's drop the call to arch_get_random_seed_long() from
add_interrupt_randomness(), and instead, rely on crng_reseed() to call
the arch hook to get random seed material from the platform.

Signed-off-by: Ard Biesheuvel 
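
For reference, the flow being described is roughly the following (a heavily
simplified sketch from memory, not the exact random.c code):

/* Heavily simplified sketch - not the actual driver code */
void add_interrupt_randomness(int irq)
{
	struct fast_pool *fp = this_cpu_ptr(&irq_randomness);
	unsigned long seed;

	fast_mix(fp);	/* folds in IRQ number, cycle counter, return address */

	/* spill into the input pool every 64 events, or once per second */
	if (fp->count < 64 && !time_after(jiffies, fp->last + HZ))
		return;

	fp->last = jiffies;
	fp->count = 0;
	mix_pool_bytes(&input_pool, fp, sizeof(*fp));	/* credited 1 bit */

	/*
	 * The call being dropped by this patch: it hits the (possibly
	 * firmware-backed) arch entropy source on every spill, i.e. at
	 * least once per second on every CPU.
	 */
	if (arch_get_random_seed_long(&seed))
		mix_pool_bytes(&input_pool, &seed, sizeof(seed)); /* +1 bit */
}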


Looks sensible. Having this on the interrupt path looks quite
heavy handed, and my understanding of the above is that it has
an adverse effect on the entropy pool.

Acked-by: Marc Zyngier 

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH 2/2] drm/meson: dw-hdmi: Enable the iahb clock early enough

2020-11-20 Thread Marc Zyngier

On 2020-11-20 10:54, Guillaume Tucker wrote:

On 20/11/2020 09:42, Marc Zyngier wrote:

Instead of moving meson_dw_hdmi_init() around which breaks existing
platform, let's enable the clock meson_dw_hdmi_init() depends on.
This means we don't have to worry about this clock being enabled or
not, depending on the boot-loader features.

Fixes: b33340e33acd ("drm/meson: dw-hdmi: Ensure that clocks are enabled before touching the TOP registers")

Reported-by: Guillaume Tucker 


Although I am triaging kernelci bisections, it was initially
found thanks to our friendly bot.  So if you're OK with this, it
would most definitely appreciate a mention:

  Reported-by: "kernelci.org bot" 


Sure. Neil can add this when (and if) he applies these patches.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200

2020-11-20 Thread Marc Zyngier

On 2020-11-20 09:26, Neil Armstrong wrote:

On 19/11/2020 19:35, Marc Zyngier wrote:
diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c 
b/drivers/gpu/drm/meson/meson_dw_hdmi.c

index 7f8eea494147..52af8ba94311 100644
--- a/drivers/gpu/drm/meson/meson_dw_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c
@@ -146,6 +146,7 @@ struct meson_dw_hdmi {
 struct reset_control *hdmitx_ctrl;
 struct reset_control *hdmitx_phy;
 struct clk *hdmi_pclk;
+    struct clk *iahb_clk;
 struct clk *venci_clk;
 struct regulator *hdmi_supply;
 u32 irq_stat;
@@ -1033,6 +1034,13 @@ static int meson_dw_hdmi_bind(struct device 
*dev, struct device *master,

 }
 clk_prepare_enable(meson_dw_hdmi->hdmi_pclk);

+    meson_dw_hdmi->iahb_clk = devm_clk_get(dev, "iahb");
+    if (IS_ERR(meson_dw_hdmi->iahb_clk)) {
+    dev_err(dev, "Unable to get iahb clk\n");
+    return PTR_ERR(meson_dw_hdmi->iahb_clk);
+    }
+    clk_prepare_enable(meson_dw_hdmi->iahb_clk);



On previous SoCs, iahb was directly the bus clock (clk81), and on recent
SoCs this clock is a gate.

The question is why it is disabled. Maybe a previous failed probe disabled it
in the dw-hdmi probe failure code, and this clock is needed for
meson_dw_hdmi_init(), so yeah, this is the right fix.

Thanks.

Could you send a revert of b33340e33acdfe5ca6a5aa1244709575ae1e0432
and then a proper fix with clk_disable_unprepare() added?


Bah. I missed that email and sent a slightly different resolution.
Hopefully that'll be good enough.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v5 0/2] MTE support for KVM guest

2020-11-20 Thread Marc Zyngier

On 2020-11-20 09:50, Steven Price wrote:

On 19/11/2020 19:11, Marc Zyngier wrote:



Does this sound reasonable?

I'll clean up the set_pte_at() change and post a v6 later today.


Please hold on. I still haven't reviewed your v5, nor have I had time
to read your reply to my comments on v4.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: AMU extension v1 support for cortex A76, A77, A78 CPUs

2020-11-20 Thread Marc Zyngier

On 2020-11-20 09:09, Vladimir Murzin wrote:

On 11/20/20 8:56 AM, Marc Zyngier wrote:

On 2020-11-20 04:30, Neeraj Upadhyay wrote:

Hi,

For ARM cortex A76, A77, A78 cores (which as per TRM, support AMU)
AA64PFR0[47:44] field is not set, and AMU does not get enabled for
them.
Can you please provide support for these CPUs in cpufeature.c?


If that was the case, that'd be an erratum, and it would need to be
documented as such. It could also be that this is an optional feature
for these cores (though the TRM doesn't suggest that).

Can someone at ARM confirm what is the expected behaviour of these CPUs?


Not a confirmation, but IIRC, these are imp def features, while our
cpufeatures catch architected ones.


Ah, good point. So these CPUs implement some sort of AMU, and not *the* AMU.


Yet the register names are the same. Who thought that'd be a good idea?

M.
--
Jazz is not dead. It just smells funny...


[PATCH 2/2] drm/meson: dw-hdmi: Enable the iahb clock early enough

2020-11-20 Thread Marc Zyngier
Instead of moving meson_dw_hdmi_init() around which breaks existing
platform, let's enable the clock meson_dw_hdmi_init() depends on.
This means we don't have to worry about this clock being enabled or
not, depending on the boot-loader features.

Fixes: b33340e33acd ("drm/meson: dw-hdmi: Ensure that clocks are enabled before 
touching the TOP registers")
Reported-by: Guillaume Tucker 
Tested-by: Guillaume Tucker 
Signed-off-by: Marc Zyngier 
---
 drivers/gpu/drm/meson/meson_dw_hdmi.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c 
b/drivers/gpu/drm/meson/meson_dw_hdmi.c
index 29623b309cb1..aad75a22dc33 100644
--- a/drivers/gpu/drm/meson/meson_dw_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c
@@ -1051,6 +1051,10 @@ static int meson_dw_hdmi_bind(struct device *dev, struct 
device *master,
if (ret)
return ret;
 
+   ret = meson_enable_clk(dev, "iahb");
+   if (ret)
+   return ret;
+
ret = meson_enable_clk(dev, "venci");
if (ret)
return ret;
@@ -1086,6 +1090,8 @@ static int meson_dw_hdmi_bind(struct device *dev, struct 
device *master,
 
encoder->possible_crtcs = BIT(0);
 
+   meson_dw_hdmi_init(meson_dw_hdmi);
+
DRM_DEBUG_DRIVER("encoder initialized\n");
 
/* Bridge / Connector */
@@ -1110,8 +1116,6 @@ static int meson_dw_hdmi_bind(struct device *dev, struct 
device *master,
if (IS_ERR(meson_dw_hdmi->hdmi))
return PTR_ERR(meson_dw_hdmi->hdmi);
 
-   meson_dw_hdmi_init(meson_dw_hdmi);
-
next_bridge = of_drm_find_bridge(pdev->dev.of_node);
if (next_bridge)
drm_bridge_attach(encoder, next_bridge,
-- 
2.28.0



[PATCH 0/2] More meson HDMI fixes

2020-11-20 Thread Marc Zyngier
Guillaume reported that my earlier fixes for the meson HDMI driver
broke another set of machines which are now exploding (clock not
enabled).

I have thus reconsidered the approach and came up with an alternative
fix (enable a missing clock), which Guillaume confirmed to be working.
Jerome pointed out that this driver is leaking clock references like a
sieve, so that needed addressing too.

The first patch start by fixing the clock leakage using a devm
facility.

The second patch addresses the earlier crash by reusing the
infrastructure put together in the first patch.

Tested on VIM3l.

Marc Zyngier (2):
  drm/meson: dw-hdmi: Disable clocks on driver teardown
  drm/meson: dw-hdmi: Enable the iahb clock early enough

 drivers/gpu/drm/meson/meson_dw_hdmi.c | 51 ++-
 1 file changed, 35 insertions(+), 16 deletions(-)

-- 
2.28.0



[PATCH 1/2] drm/meson: dw-hdmi: Disable clocks on driver teardown

2020-11-20 Thread Marc Zyngier
The HDMI driver request clocks early, but never disable them, leaving
the clocks on even when the driver is removed.

Fix it by slightly refactoring the clock code, and register a devm
action that will eventually disable/unprepare the enabled clocks.

Signed-off-by: Marc Zyngier 
---
 drivers/gpu/drm/meson/meson_dw_hdmi.c | 43 ++-
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c 
b/drivers/gpu/drm/meson/meson_dw_hdmi.c
index 7f8eea494147..29623b309cb1 100644
--- a/drivers/gpu/drm/meson/meson_dw_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c
@@ -145,8 +145,6 @@ struct meson_dw_hdmi {
struct reset_control *hdmitx_apb;
struct reset_control *hdmitx_ctrl;
struct reset_control *hdmitx_phy;
-   struct clk *hdmi_pclk;
-   struct clk *venci_clk;
struct regulator *hdmi_supply;
u32 irq_stat;
struct dw_hdmi *hdmi;
@@ -946,6 +944,29 @@ static void meson_disable_regulator(void *data)
regulator_disable(data);
 }
 
+static void meson_disable_clk(void *data)
+{
+   clk_disable_unprepare(data);
+}
+
+static int meson_enable_clk(struct device *dev, char *name)
+{
+   struct clk *clk;
+   int ret;
+
+   clk = devm_clk_get(dev, name);
+   if (IS_ERR(clk)) {
+   dev_err(dev, "Unable to get %s pclk\n", name);
+   return PTR_ERR(clk);
+   }
+
+   ret = clk_prepare_enable(clk);
+   if (!ret)
+   ret = devm_add_action_or_reset(dev, meson_disable_clk, clk);
+
+   return ret;
+}
+
 static int meson_dw_hdmi_bind(struct device *dev, struct device *master,
void *data)
 {
@@ -1026,19 +1047,13 @@ static int meson_dw_hdmi_bind(struct device *dev, 
struct device *master,
if (IS_ERR(meson_dw_hdmi->hdmitx))
return PTR_ERR(meson_dw_hdmi->hdmitx);
 
-   meson_dw_hdmi->hdmi_pclk = devm_clk_get(dev, "isfr");
-   if (IS_ERR(meson_dw_hdmi->hdmi_pclk)) {
-   dev_err(dev, "Unable to get HDMI pclk\n");
-   return PTR_ERR(meson_dw_hdmi->hdmi_pclk);
-   }
-   clk_prepare_enable(meson_dw_hdmi->hdmi_pclk);
+   ret = meson_enable_clk(dev, "isfr");
+   if (ret)
+   return ret;
 
-   meson_dw_hdmi->venci_clk = devm_clk_get(dev, "venci");
-   if (IS_ERR(meson_dw_hdmi->venci_clk)) {
-   dev_err(dev, "Unable to get venci clk\n");
-   return PTR_ERR(meson_dw_hdmi->venci_clk);
-   }
-   clk_prepare_enable(meson_dw_hdmi->venci_clk);
+   ret = meson_enable_clk(dev, "venci");
+   if (ret)
+   return ret;
 
dw_plat_data->regm = devm_regmap_init(dev, NULL, meson_dw_hdmi,
  &meson_dw_hdmi_regmap_config);
-- 
2.28.0



Re: [PATCH 0/2] arm64: Allow the rescheduling IPI to bypass irq_enter/exit

2020-11-20 Thread Marc Zyngier

[+ Mark who has been hacking in the same area lately]

Hi Thomas,

On 2020-11-03 20:32, Thomas Gleixner wrote:

On Sun, Nov 01 2020 at 13:14, Marc Zyngier wrote:

Vincent recently reported [1] that 5.10-rc1 showed a significant
regression when running "perf bench sched pipe" on arm64, and
pinpointed it to the recent move to handling IPIs as normal
interrupts.

The culprit is the use of irq_enter/irq_exit around the handling of
the rescheduling IPI, meaning that we enter the scheduler right after
the handling of the IPI instead of deferring it to the next preemption
event. This accounts for most of the overhead introduced.


irq_enter()/exit() does not end up in the scheduler. If it does then
please show the call chain.

Scheduling happens when the IPI returns just before returning into the
low level code (or on ARM in the low level code) when NEED_RESCHED is
set (which is usually the case when the IPI is sent) and:

  the IPI hit user space

or

  the IPI hit in preemptible kernel context and CONFIG_PREEMPT[_RT] is
  enabled.

Not doing so would be a bug. So I really don't understand your reasoning
here.


You are of course right. I somehow associated the overhead of the resched
IPI with the scheduler itself. I stand corrected.




On architectures that have architected IPIs at the CPU level (x86
being the obvious one), the absence of irq_enter/exit is natural.


It's not really architected IPIs. We reserve the top 20ish vectors on
each CPU for IPIs and other per processor interrupts, e.g. the per cpu
timer.

Now lets look at what x86 does:

Interrupts and regular IPIs (function call ...) do

irqentry_enter()   <- handles rcu_irq_enter() or context tracking
  ...
  irq_enter_rcu()
  ...
  irq_exit_rcu()
irqentry_exit() <- handles need_resched()

The scheduler IPI does:

irqentry_enter()   <- handles rcu_irq_enter() or context tracking
  ...
  __irq_enter_raw()
  ...
  __irq_exit_raw()
irqentry_exit() <- handles need_resched()

So we don't invoke irq_enter_rcu() on enter and on exit we skip
irq_exit_rcu(). That's fine because

  - Calling the tick management is pointless because this is going to
schedule anyway or something consumed the need_resched already.

  - The irqtime accounting is silly because it covers only the call and
returns. The time spent in the accounting is more than the time
we are accounting (empty call).

So what your hack fails to invoke is rcu_irq_enter()/exit() in case that
the IPI hits the idle task in an RCU off region. You also fail to tell
lockdep. No cookies!


Who needs cookies when you can have cheese? ;-)

More seriously, it seems to me that we have a bit of a cross-architecture
disconnect here. I have been trying to join the dots between what you
describe above, and the behaviour of arm64 (and probably a large number
of the non-x86 architectures), and I feel massively confused.

Up to 5.9, our handling of the rescheduling IPI was "do as little as
possible": decode the interrupt from the lowest possible level (the
root irqchip), call into arch code, end-up in scheduler_ipi(), the end.

No lockdep, no RCU, no nothing.

What changed? Have we missed some radical change in the way the core
kernel expects the arch code to do things? I'm aware of the
kernel/entry/common.c stuff, which implements most of the goodies you
mention, but this effectively is x86-only at the moment.

If arm64 has forever been broken, I'd really like to know and fix it.


The bad news is that these patches are ugly as sin, and I really don't
like them.


Yes, they are ugly, and the extra conditional in the irq handling path is
not pretty either.


I especially hate that they can give driver authors the idea that they
can make random interrupts "faster".


Just split the guts of irq_modify_status() into __irq_modify_status()
and call that from irq_modify_status().

Reject IRQ_HIDDEN (which should have been IRQ_IPI really) and IRQ_NAKED
(IRQ_RAW perhaps) in irq_modify_status().

Do not export __irq_modify_status() so it can only be used from built-in
code, which takes it away from driver writers.
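
i.e. something along these lines (untested sketch; IRQ_NAKED as proposed
above):

void __irq_modify_status(unsigned int irq, unsigned long clr,
			 unsigned long set)
{
	/* current body of irq_modify_status() moves here, unchanged */
}
/* deliberately not exported */

void irq_modify_status(unsigned int irq, unsigned long clr,
		       unsigned long set)
{
	/* the core-only flags must not be reachable from driver code */
	if (WARN_ON_ONCE((clr | set) & (IRQ_HIDDEN | IRQ_NAKED)))
		return;

	__irq_modify_status(irq, clr, set);
}
EXPORT_SYMBOL_GPL(irq_modify_status);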


Yup, done.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: AMU extension v1 support for cortex A76, A77, A78 CPUs

2020-11-20 Thread Marc Zyngier

On 2020-11-20 04:30, Neeraj Upadhyay wrote:

Hi,

For ARM cortex A76, A77, A78 cores (which as per TRM, support AMU)
AA64PFR0[47:44] field is not set, and AMU does not get enabled for
them.
Can you please provide support for these CPUs in cpufeature.c?


If that was the case, that'd be an erratum, and it would need to be
documented as such. It could also be that this is an optional feature
for these cores (though the TRM doesn't suggest that).

Can someone at ARM confirm what is the expected behaviour of these CPUs?

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v5 0/2] MTE support for KVM guest

2020-11-19 Thread Marc Zyngier

On 2020-11-19 18:42, Andrew Jones wrote:

On Thu, Nov 19, 2020 at 03:45:40PM +, Peter Maydell wrote:
On Thu, 19 Nov 2020 at 15:39, Steven Price  
wrote:

> This series adds support for Arm's Memory Tagging Extension (MTE) to
> KVM, allowing KVM guests to make use of it. This builds on the existing
> user space support already in v5.10-rc1, see [1] for an overview.

> The change to require the VMM to map all guest memory PROT_MTE is
> significant as it means that the VMM has to deal with the MTE tags even
> if it doesn't care about them (e.g. for virtual devices or if the VMM
> doesn't support migration). Also unfortunately because the VMM can
> change the memory layout at any time the check for PROT_MTE/VM_MTE has
> to be done very late (at the point of faulting pages into stage 2).

I'm a bit dubious about requring the VMM to map the guest memory
PROT_MTE unless somebody's done at least a sketch of the design
for how this would work on the QEMU side. Currently QEMU just
assumes the guest memory is guest memory and it can access it
without special precautions...



There are two statements being made here:

1) Requiring the use of PROT_MTE when mapping guest memory may not fit
   QEMU well.

2) New KVM features should be accompanied with supporting QEMU code in
   order to prove that the APIs make sense.

I strongly agree with (2). While kvmtool supports some quick testing, it
doesn't support migration. We must test all new features with a
migration-supporting VMM.

I'm not sure about (1). I don't feel like it should be a major problem,
but (2).

I'd be happy to help with the QEMU prototype, but preferably when there's
hardware available. Has all the current MTE testing just been done on
simulators? And, if so, are there regression tests regularly running on
the simulators too? And can they test migration? If hardware doesn't
show up quickly and simulators aren't used for regression tests, then
all this code will start rotting from day one.


While I agree with the sentiment, the reality is pretty bleak.

I'm pretty sure nobody will ever run a migration on emulation. I also doubt
there is much overlap between MTE users and migration users, unfortunately.


No HW is available today, and when it becomes available, it will be in
the form of a closed system on which QEMU doesn't run, either because
we are locked out of EL2 (as usual), or because migration is not part of
the use case (like KVM on Android, for example).

So we can wait another two (five?) years until general purpose HW becomes
available, or we start merging what we can test today. I'm inclined to
do the latter.

And I think it is absolutely fine for QEMU to say "no MTE support with KVM"
(we can remove all userspace visibility, except for the capability).

M.
--
Jazz is not dead. It just smells funny...


Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200

2020-11-19 Thread Marc Zyngier

On 2020-11-19 18:13, Jerome Brunet wrote:

On Thu 19 Nov 2020 at 19:04, Guillaume Tucker
 wrote:


Hi Marc,

On 19/11/2020 11:58, Marc Zyngier wrote:

On 2020-11-19 10:26, Neil Armstrong wrote:

On 19/11/2020 11:20, Marc Zyngier wrote:

On 2020-11-19 08:50, Guillaume Tucker wrote:

Please see the automated bisection report below about some kernel
errors on meson-gxbb-p200.

Reports aren't automatically sent to the public while we're
trialing new bisection features on kernelci.org, however this one
looks valid.

The bisection started with next-20201118 but the errors are still
present in next-20201119.  Details for this regression:

  https://kernelci.org/test/case/id/5fb6196bfd0127fd68d8d902/

The first error is:

  [   14.757489] Internal error: synchronous external abort: 96000210
[#1] PREEMPT SMP


Looks like yet another clock ordering setup. I guess different Amlogic
platforms have slightly different ordering requirements.

Neil, do you have any idea of which platform requires which ordering?
The variability in DT and platforms is pretty difficult to follow (and
I don't think I have such a board around).


The requirements should be the same, here the init was done before calling
dw_hdmi_probe to be sure the clocks and internal resets were deasserted.
But since you boot from u-boot already enabling these, it's already active.


The solution would be to revert and do some check in meson_dw_hdmi_init()
to check if already enabled and do nothing.


A better fix seems to be this, which makes it explicit that there is
a dependency between some of the registers accessed from meson_dw_hdmi_init()
and the iahb clock.

Guillaume, can you give this a go on your failing box?


I confirm it solves the problem.  Please add this to your fix
patch if it's OK with you:

  Reported-by: "kernelci.org bot" 
  Tested-by: Guillaume Tucker 


For the record, it passed all the tests when applied on top of
the "bad" revision found by the bisection:

  
http://lava.baylibre.com:10080/scheduler/alljobs?search=v5.10-rc3-1021-gb8668a2e5ea1


and the exact same test on the "bad" revision without the fix
consistently showed the error:

  http://lava.baylibre.com:10080/scheduler/job/374176


Thanks,
Guillaume


diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c 
b/drivers/gpu/drm/meson/meson_dw_hdmi.c

index 7f8eea494147..52af8ba94311 100644
--- a/drivers/gpu/drm/meson/meson_dw_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c
@@ -146,6 +146,7 @@ struct meson_dw_hdmi {
 struct reset_control *hdmitx_ctrl;
 struct reset_control *hdmitx_phy;
 struct clk *hdmi_pclk;
+struct clk *iahb_clk;
 struct clk *venci_clk;
 struct regulator *hdmi_supply;
 u32 irq_stat;
@@ -1033,6 +1034,13 @@ static int meson_dw_hdmi_bind(struct device 
*dev, struct device *master,

 }
 clk_prepare_enable(meson_dw_hdmi->hdmi_pclk);

+meson_dw_hdmi->iahb_clk = devm_clk_get(dev, "iahb");
+if (IS_ERR(meson_dw_hdmi->iahb_clk)) {
+dev_err(dev, "Unable to get iahb clk\n");
+return PTR_ERR(meson_dw_hdmi->iahb_clk);
+}
+clk_prepare_enable(meson_dw_hdmi->iahb_clk);


If you guys are going ahead with this fix, this call to
clk_prepare_enable() needs to be balanced with clk_disable_unprepare() somehow.


Yup, good point.

Although this driver *never* disables any clock it enables, and leaves it
to the main DW driver, which I guess makes it leak references.

So all 3 clocks need fixing.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200

2020-11-19 Thread Marc Zyngier

On 2020-11-19 10:26, Neil Armstrong wrote:

On 19/11/2020 11:20, Marc Zyngier wrote:

On 2020-11-19 08:50, Guillaume Tucker wrote:

Please see the automated bisection report below about some kernel
errors on meson-gxbb-p200.

Reports aren't automatically sent to the public while we're
trialing new bisection features on kernelci.org, however this one
looks valid.

The bisection started with next-20201118 but the errors are still
present in next-20201119.  Details for this regression:

  https://kernelci.org/test/case/id/5fb6196bfd0127fd68d8d902/

The first error is:

  [   14.757489] Internal error: synchronous external abort: 96000210
[#1] PREEMPT SMP


Looks like yet another clock ordering setup. I guess different Amlogic
platforms have slightly different ordering requirements.

Neil, do you have any idea of which platform requires which ordering?
The variability in DT and platforms is pretty difficult to follow (and
I don't think I have such a board around).


The requirements should be the same, here the init was done before calling
dw_hdmi_probe to be sure the clocks and internal resets were deasserted.
But since you boot from u-boot already enabling these, it's already active.


The solution would be to revert and do some check in meson_dw_hdmi_init()
to check if already enabled and do nothing.


A better fix seems to be this, which makes it explicit that there is
a dependency between some of the registers accessed from meson_dw_hdmi_init()
and the iahb clock.

Guillaume, can you give this a go on your failing box?

Thanks,

M.

diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c 
b/drivers/gpu/drm/meson/meson_dw_hdmi.c

index 7f8eea494147..52af8ba94311 100644
--- a/drivers/gpu/drm/meson/meson_dw_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c
@@ -146,6 +146,7 @@ struct meson_dw_hdmi {
struct reset_control *hdmitx_ctrl;
struct reset_control *hdmitx_phy;
struct clk *hdmi_pclk;
+   struct clk *iahb_clk;
struct clk *venci_clk;
struct regulator *hdmi_supply;
u32 irq_stat;
@@ -1033,6 +1034,13 @@ static int meson_dw_hdmi_bind(struct device *dev, 
struct device *master,

}
clk_prepare_enable(meson_dw_hdmi->hdmi_pclk);

+   meson_dw_hdmi->iahb_clk = devm_clk_get(dev, "iahb");
+   if (IS_ERR(meson_dw_hdmi->iahb_clk)) {
+   dev_err(dev, "Unable to get iahb clk\n");
+   return PTR_ERR(meson_dw_hdmi->iahb_clk);
+   }
+   clk_prepare_enable(meson_dw_hdmi->iahb_clk);
+
meson_dw_hdmi->venci_clk = devm_clk_get(dev, "venci");
if (IS_ERR(meson_dw_hdmi->venci_clk)) {
dev_err(dev, "Unable to get venci clk\n");
@@ -1071,6 +1079,8 @@ static int meson_dw_hdmi_bind(struct device *dev, 
struct device *master,


encoder->possible_crtcs = BIT(0);

+   meson_dw_hdmi_init(meson_dw_hdmi);
+
DRM_DEBUG_DRIVER("encoder initialized\n");

/* Bridge / Connector */
@@ -1095,8 +1105,6 @@ static int meson_dw_hdmi_bind(struct device *dev, 
struct device *master,

if (IS_ERR(meson_dw_hdmi->hdmi))
return PTR_ERR(meson_dw_hdmi->hdmi);

-   meson_dw_hdmi_init(meson_dw_hdmi);
-
next_bridge = of_drm_find_bridge(pdev->dev.of_node);
if (next_bridge)
drm_bridge_attach(encoder, next_bridge,


--
Jazz is not dead. It just smells funny...


Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200

2020-11-19 Thread Marc Zyngier

On 2020-11-19 10:26, Neil Armstrong wrote:

On 19/11/2020 11:20, Marc Zyngier wrote:

On 2020-11-19 08:50, Guillaume Tucker wrote:

Please see the automated bisection report below about some kernel
errors on meson-gxbb-p200.

Reports aren't automatically sent to the public while we're
trialing new bisection features on kernelci.org, however this one
looks valid.

The bisection started with next-20201118 but the errors are still
present in next-20201119.  Details for this regression:

  https://kernelci.org/test/case/id/5fb6196bfd0127fd68d8d902/

The first error is:

  [   14.757489] Internal error: synchronous external abort: 96000210
[#1] PREEMPT SMP


Looks like yet another clock ordering setup. I guess different Amlogic
platforms have slightly different ordering requirements.

Neil, do you have any idea of which platform requires which ordering?
The variability in DT and platforms is pretty difficult to follow (and
I don't think I have such a board around).


The requirements should be the same, here the init was done before calling
dw_hdmi_probe to be sure the clocks and internal resets were deasserted.
But since you boot from u-boot already enabling these, it's already active.


The crashing platform also boots with u-boot. What is the difference?
No HDMI support?

The solution would be to revert and do some check in meson_dw_hdmi_init()
to check if already enabled and do nothing.


It looks more subtle than that, as it also depends on which DT is provided
(an early meson_dw_hdmi_init() works with the kernel DT, and breaks with the
u-boot DT). So whatever difference there is between the two DTs causes havoc.
u-boot obviously only uses its own DT, so we are looking at a kernel bug here.


Not having this patch also breaks module reinsertion (HDMI clocks are disabled
on unbind), so *something* has to be done late.

So here are my questions:
- What breaks in my config when I boot using u-boot's DT?
- How to detect early that the registers are clocked or not?

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200

2020-11-19 Thread Marc Zyngier

On 2020-11-19 08:50, Guillaume Tucker wrote:

Please see the automated bisection report below about some kernel
errors on meson-gxbb-p200.

Reports aren't automatically sent to the public while we're
trialing new bisection features on kernelci.org, however this one
looks valid.

The bisection started with next-20201118 but the errors are still
present in next-20201119.  Details for this regression:

  https://kernelci.org/test/case/id/5fb6196bfd0127fd68d8d902/

The first error is:

  [   14.757489] Internal error: synchronous external abort: 96000210
[#1] PREEMPT SMP


Looks like yet another clock ordering setup. I guess different Amlogic
platforms have slightly different ordering requirements.

Neil, do you have any idea of which platform requires which ordering?
The variability in DT and platforms is pretty difficult to follow (and
I don't think I have such a board around).

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v3 3/5] irqchip: ocelot: Add support for Luton platforms

2020-11-18 Thread Marc Zyngier

On 2020-11-16 16:24, Gregory CLEMENT wrote:

This patch extends the irqchip driver for ocelot to be used with another
vcoreiii-based platform: Luton.

Based on a larger patch from Lars Povlsen 
Signed-off-by: Gregory CLEMENT 
---
 drivers/irqchip/irq-mscc-ocelot.c | 145 +-
 1 file changed, 123 insertions(+), 22 deletions(-)

diff --git a/drivers/irqchip/irq-mscc-ocelot.c
b/drivers/irqchip/irq-mscc-ocelot.c
index 88143c0b700c..9964800c53c2 100644
--- a/drivers/irqchip/irq-mscc-ocelot.c
+++ b/drivers/irqchip/irq-mscc-ocelot.c
@@ -12,39 +12,115 @@
 #include 
 #include 

-#define ICPU_CFG_INTR_INTR_STICKY  0x10
-#define ICPU_CFG_INTR_INTR_ENA 0x18
-#define ICPU_CFG_INTR_INTR_ENA_CLR 0x1c
-#define ICPU_CFG_INTR_INTR_ENA_SET 0x20
-#define ICPU_CFG_INTR_DST_INTR_IDENT(x)(0x38 + 0x4 * (x))
-#define ICPU_CFG_INTR_INTR_TRIGGER(x)  (0x5c + 0x4 * (x))
-
-#define OCELOT_NR_IRQ 24
+#define ICPU_CFG_INTR_DST_INTR_IDENT(_p, x)	(_p->reg_off_ident + 0x4 * 
(x))
+#define ICPU_CFG_INTR_INTR_TRIGGER(_p, x)	(_p->reg_off_trigger + 0x4 * 
(x))

+
+#define FLAGS_NEED_INIT_ENABLE BIT(0)
+#define FLAGS_FORCE_LUTON_STYLEBIT(1)
+#define FLAGS_HAS_TRIGGER  BIT(2)
+
+struct chip_props {
+   u32 flags;
+   u32 reg_off_sticky;
+   u32 reg_off_ena;
+   u32 reg_off_ena_clr;
+   u32 reg_off_ena_set;
+   u32 reg_off_ident;
+   u32 reg_off_trigger;
+   u32 reg_off_force;
+   u32 reg_off_ena_irq0;


Do all these fields need to be u32s?
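
Given that the largest offset below is 0x5c and n_irq tops out at 28,
something like this would shrink the structure (sketch only, field sizes
to be double-checked):

struct chip_props {
	u8 flags;
	u8 reg_off_sticky;
	u8 reg_off_ena;
	u8 reg_off_ena_clr;
	u8 reg_off_ena_set;
	u8 reg_off_ident;
	u8 reg_off_trigger;
	u8 reg_off_force;
	u8 reg_off_ena_irq0;
	u8 n_irq;
};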


+   u32 n_irq;
+};
+
+static const struct chip_props ocelot_props = {
+   .flags  = FLAGS_HAS_TRIGGER,
+   .reg_off_sticky = 0x10,
+   .reg_off_ena= 0x18,
+   .reg_off_ena_clr= 0x1c,
+   .reg_off_ena_set= 0x20,
+   .reg_off_ident  = 0x38,
+   .reg_off_trigger= 0x5c,
+   .reg_off_force  = 0xc,
+   .n_irq  = 24,
+};
+
+static const struct chip_props luton_props = {
+   .flags  = FLAGS_NEED_INIT_ENABLE |
+ FLAGS_FORCE_LUTON_STYLE,


LUTON_STYLE doesn't quite convey the idea of what this implies.
Please use a name that actually means something (other than the
city north of London). And force what exactly?


+   .reg_off_sticky = 0,
+   .reg_off_ena= 0x4,
+   .reg_off_ena_clr= 0x8,
+   .reg_off_ena_set= 0xc,
+   .reg_off_ident  = 0x18,
+   .reg_off_trigger= 0,


If this field doesn't exist (because the matching flag isn't set), don't
list it.


+   .reg_off_force  = 0x38,
+   .reg_off_ena_irq0   = 0x14,
+   .n_irq  = 28,
+};


Please split this patch in two: rework the Ocelot driver to the "new"
style in one patch, then introduce the new platform in another.



 static void ocelot_irq_unmask(struct irq_data *data)
 {
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
+   struct irq_domain *d = data->domain;
+   struct chip_props *p = d->host_data;
struct irq_chip_type *ct = irq_data_get_chip_type(data);
unsigned int mask = data->mask;
u32 val;

irq_gc_lock(gc);
-   val = irq_reg_readl(gc, ICPU_CFG_INTR_INTR_TRIGGER(0)) |
- irq_reg_readl(gc, ICPU_CFG_INTR_INTR_TRIGGER(1));
-   if (!(val & mask))
-   irq_reg_writel(gc, mask, ICPU_CFG_INTR_INTR_STICKY);
+   if (p->flags & FLAGS_HAS_TRIGGER) {
+   val = irq_reg_readl(gc, ICPU_CFG_INTR_INTR_TRIGGER(p, 0)) |
+   irq_reg_readl(gc, ICPU_CFG_INTR_INTR_TRIGGER(p, 1));
+   if (!(val & mask))
+   irq_reg_writel(gc, mask, p->reg_off_sticky);
+   }

*ct->mask_cache &= ~mask;
-   irq_reg_writel(gc, mask, ICPU_CFG_INTR_INTR_ENA_SET);
+   irq_reg_writel(gc, mask, p->reg_off_ena_set);
irq_gc_unlock(gc);
 }

+static void luton_irq_force(struct irq_data *data,
+   struct irq_chip_generic *gc,
+   struct chip_props *p)
+{
+   int off = p->reg_off_force + (data->hwirq * sizeof(u32));
+   u32 val = irq_reg_readl(gc, off);
+
+   irq_reg_writel(gc, val | BIT(3), off);
+}
+
+static int ocelot_irq_force(struct irq_data *data,
+   enum irqchip_irq_state which, bool state)


Please use a name that matches that of the API (i.e. something like
ocelot_irq_set_irqchip_state()).


+{
+   struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
+   struct irq_domain *d = data->domain;
+   struct chip_props *p = d->host_data;
+   int ret = -EINVAL;
+
+   /* Only supports triggering */
+   if ((which == IRQCHIP_STATE_PENDING ||
+which == IRQCHIP_STATE_ACTIVE) &&


PENDING and ACTIVE are two very different things, and yet you handle
them the same way. Which one does your interrupt controller support?


+   state && p->reg_off_force) {
+   

Re: [RFC PATCH V2] acpi/irq: Add stacked IRQ domain support to PCI interrupt link

2020-11-18 Thread Marc Zyngier

Hi Chen,

On top of Bjorn's comments:

On 2020-11-17 13:42, Chen Baozi wrote:

Some PCIe designs require software to do extra acknowledgements for
legacy INTx interrupts. If the driver is written only for device tree,
things are simple. In that case, a new driver can be written under
driver/pci/controller/ with a DT node of PCIe host written like:

  pcie {
...
interrupt-map = <0 0 0  1  &pcie_intc 0>,
<0 0 0  2  &pcie_intc 1>,
<0 0 0  3  &pcie_intc 2>,
<0 0 0  4  &pcie_intc 3>;

pcie_intc: legacy-interrupt-controller {
  interrupt-controller;
  #interrupt-cells = <1>;
  interrupt-parent = <&gic>;
  interrupts = <0 226 4>;
};
  };

Similar designs can be found on Aardvark, MediaTek Gen2 and Socionext
UniPhier PCIe controller at the moment. Essentially, those designs are
supported by inserting an extra interrupt controller between PCIe host
and GIC and parse the topology in a DT-based PCI controller driver.
As we turn to ACPI, all the PCIe hosts are described with the same ID of
"PNP0A03" and share drivers/acpi/pci_root.c. It becomes a problem
to make this kind of PCI INTx work under ACPI.

Therefore, we introduce stacked IRQ domain support to the PCI interrupt
link for ACPI. With this support, we can populate the ResourceSource
to refer to a device object that describes an interrupt controller.
That would allow us to refer to a dedicated driver which implements
the logic needed to manage the interrupt state. With this patch,
those PCI interrupt links can be supported by describing the INTx
in ACPI table as the following example:

  Device (IXIU) {
...
  }

  Device(LINKA) {
Name(_HID, EISAID("PNP0C0F"))
Name(_PRS, ResourceTemplate(){
  Interrupt(ResourceProducer, Level, ActiveHigh, Exclusive, 0, 
"\\SB.IXIU")

{ 60 }
})
...
  }

  Device(PCI0) {
...
Name(_PRT, Package{
  Package{ 0x, 0, LINKA, 0 }
  ...
})
  }

Signed-off-by: Chen Baozi 
---
 drivers/acpi/irq.c  | 22 +-
 drivers/acpi/pci_irq.c  |  6 --
 drivers/acpi/pci_link.c | 17 +++--
 include/acpi/acpi_drivers.h |  2 +-
 include/linux/acpi.h|  4 
 5 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c
index e209081d644b..e78a44815c44 100644
--- a/drivers/acpi/irq.c
+++ b/drivers/acpi/irq.c
@@ -81,6 +81,25 @@ void acpi_unregister_gsi(u32 gsi)
 }
 EXPORT_SYMBOL_GPL(acpi_unregister_gsi);

+int acpi_register_irq(struct device *dev, u32 irq, int trigger,
+ int polarity, struct fwnode_handle *domain_id)
+{
+   struct irq_fwspec fwspec;
+
+   if (WARN_ON(!domain_id)) {
+   pr_warn("GSI: No registered irqchip, giving up\n");


A fwnode_handle is not an irqchip. It's just an opaque identifier
for a HW block. Furthermore, there is no need to have both a WARN_ON()
and a pr_warn(). Please pick one.

I'd also suggest you rename domain_id to fwnode, which is the commonly
used idiom (yes, I know about the unfortunate precedent in 
acpi_register_gsi()).



+   return -EINVAL;
+   }
+
+   fwspec.fwnode = domain_id;
+   fwspec.param[0] = irq;
+   fwspec.param[1] = acpi_dev_get_irq_type(trigger, polarity);
+   fwspec.param_count = 2;
+
+   return irq_create_fwspec_mapping(&fwspec);
+}
+EXPORT_SYMBOL_GPL(acpi_register_irq);


By the way, this is almost an exact duplicate of acpi_register_gsi().
You definitely want to make this code common.
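
e.g. something like this (untested sketch; acpi_irq_fwspec_map() is a
made-up name), with both acpi_register_gsi() and acpi_register_irq()
becoming thin wrappers around it:

static int acpi_irq_fwspec_map(struct fwnode_handle *fwnode, u32 irq,
			       int trigger, int polarity)
{
	struct irq_fwspec fwspec;

	if (WARN_ON(!fwnode))
		return -EINVAL;

	fwspec.fwnode = fwnode;
	fwspec.param[0] = irq;
	fwspec.param[1] = acpi_dev_get_irq_type(trigger, polarity);
	fwspec.param_count = 2;

	return irq_create_fwspec_mapping(&fwspec);
}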


+
 /**
  * acpi_get_irq_source_fwhandle() - Retrieve fwhandle from IRQ 
resource source.

  * @source: acpi_resource_source to use for the lookup.
@@ -92,7 +111,7 @@ EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
  * Return:
  * The referenced device fwhandle or NULL on failure
  */
-static struct fwnode_handle *
+struct fwnode_handle *
 acpi_get_irq_source_fwhandle(const struct acpi_resource_source 
*source)

 {
struct fwnode_handle *result;
@@ -115,6 +134,7 @@ acpi_get_irq_source_fwhandle(const struct
acpi_resource_source *source)
acpi_bus_put_acpi_device(device);
return result;
 }
+EXPORT_SYMBOL_GPL(acpi_get_irq_source_fwhandle);

 /*
  * Context for the resource walk used to lookup IRQ resources.
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 14ee631cb7cf..19296d70c95c 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -410,6 +410,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
char *link = NULL;
char link_desc[16];
int rc;
+   struct fwnode_handle *irq_domain;


fwnode_handle is most definitely not an IRQ domain.



pin = dev->pin;
if (!pin) {
@@ -438,7 +439,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
gsi = acpi_pci_link_allocate_irq(entry->link,
 entry->index,
  

Re: WARNING: kernel/irq/chip.c:242 __irq_startup+0xa8/0xb0

2020-11-18 Thread Marc Zyngier

Naresh,

On 2020-11-18 06:11, Naresh Kamboju wrote:
On Tue, 13 Oct 2020 at 11:09, Naresh Kamboju 
 wrote:


On stable rc  5.8.15 the following kernel warning was noticed once
while boot and this is hard to reproduce.


This is now reproducible on arm64 NXP ls2088 device


How reproducible? On demand? Once in a while?



[   19.980839] [ cut here ]
[   19.985468] WARNING: CPU: 1 PID: 441 at kernel/irq/chip.c:242
__irq_startup+0x9c/0xa8
[   19.985472] Modules linked in: rfkill lm90 ina2xx crct10dif_ce
qoriq_thermal fuse
[   20.000773] CPU: 1 PID: 441 Comm: (agetty) Not tainted 5.4.78-rc1 #2


Can you please try and reproduce this on mainline?


[   20.000775] Hardware name: Freescale Layerscape 2088A RDB Board (DT)
[   20.000779] pstate: 6085 (nZCv daIf -PAN -UAO)
[   20.018253] pc : __irq_startup+0x9c/0xa8
[   20.018256] lr : irq_startup+0x64/0x130
[   20.018259] sp : 80001122f8e0
[   20.029303] x29: 80001122f8e0 x28: 0082c242d400
[   20.029306] x27: dd0f47234768 x26: 00020902
[   20.029309] x25: dd0f461a6f10 x24: dd0f461a6bc8
[   20.029311] x23:  x22: 0001
[   20.029314] x21: 0001 x20: 0082c22f8780
[   20.029316] x19: 0082c1060800 x18: 0001
[   20.029318] x17:  x16: 8000114a
[   20.029321] x15:  x14: 0082c0e92f90
[   20.071738] x13: 0082c0e93080 x12: 80001146
[   20.071741] x11: dead0100 x10: 0040
[   20.071743] x9 : dd0f47093ba8 x8 : dd0f47093ba0
[   20.087653] x7 : 0082a2b0 x6 : dd0f47074958
[   20.087655] x5 : dd0f47074000 x4 : 80001123
[   20.087657] x3 : 0504 x2 : 0001
[   20.103567] x1 : 03032004 x0 : 0082c1060858
[   20.103570] Call trace:
[   20.103573]  __irq_startup+0x9c/0xa8
[   20.103577]  irq_startup+0x64/0x130
[   20.118359]  __enable_irq+0x7c/0x88
[   20.118362]  enable_irq+0x54/0xa8
[   20.118367]  serial8250_do_startup+0x658/0x718
[   20.118371]  serial8250_startup+0x38/0x48


Looking at the DT:

serial0: serial@21c0500 {
interrupts = <0 32 0x4>; /* Level high type */

serial1: serial@21c0600 {
interrupts = <0 32 0x4>; /* Level high type */

serial2: serial@21d0500 {
interrupts = <0 33 0x4>; /* Level high type */

serial3: serial@21d0600 {
interrupts = <0 33 0x4>; /* Level high type */


The UART interrupt lines are shared. Braindead, 1980 style.

Which UART is agetty trying to use? Is there any other process using
another UART concurrently? We could have a race between the line being
shut down on one device, and activated from the other, but I can't
spot it right away.

If you can reproduce it easily enough, it shouldn't be too hard to trace
what happens around the activation of the shared interrupt (assuming that
this is where the problem is).

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v4 2/2] arm64: kvm: Introduce MTE VCPU feature

2020-11-17 Thread Marc Zyngier

Hi Steven,

On 2020-10-26 15:57, Steven Price wrote:

Add a new VM feature 'KVM_ARM_CAP_MTE' which enables memory tagging
for a VM. This exposes the feature to the guest and automatically tags
memory pages touched by the VM as PG_mte_tagged (and clears the tags
storage) to ensure that the guest cannot see stale tags, and so that the
tags are correctly saved/restored across swap.

Signed-off-by: Steven Price 
Reviewed-by: Andrew Jones 
---
 arch/arm64/include/asm/kvm_emulate.h |  3 +++
 arch/arm64/include/asm/kvm_host.h|  3 +++
 arch/arm64/kvm/arm.c |  9 +
 arch/arm64/kvm/mmu.c | 20 
 arch/arm64/kvm/sys_regs.c|  6 +-
 include/uapi/linux/kvm.h |  1 +
 6 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h
b/arch/arm64/include/asm/kvm_emulate.h
index 5ef2669ccd6c..66c0d9e7c2b4 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -79,6 +79,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu 
*vcpu)

if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
vcpu_el1_is_32bit(vcpu))
vcpu->arch.hcr_el2 |= HCR_TID2;
+
+   if (vcpu->kvm->arch.mte_enabled)


Please add a predicate (vcpu_has_mte() or kvm_has_mte()?) for this.
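
Something like the below (untested sketch) would do, and also lets you
fold in the system_supports_mte() check:

#define kvm_has_mte(kvm)					\
	(system_supports_mte() && (kvm)->arch.mte_enabled)

static inline bool vcpu_has_mte(struct kvm_vcpu *vcpu)
{
	return kvm_has_mte(vcpu->kvm);
}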


+   vcpu->arch.hcr_el2 |= HCR_ATA;
 }

 static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h
b/arch/arm64/include/asm/kvm_host.h
index 95ab7345dcc8..cd993aec0440 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -118,6 +118,9 @@ struct kvm_arch {
 */
unsigned long *pmu_filter;
unsigned int pmuver;
+
+   /* Memory Tagging Extension enabled for the guest */
+   bool mte_enabled;
 };

 struct kvm_vcpu_fault_info {
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index f56122eedffc..7ee93bcac017 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -89,6 +89,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
kvm->arch.return_nisv_io_abort_to_user = true;
break;
+   case KVM_CAP_ARM_MTE:
+   if (!system_supports_mte() || kvm->created_vcpus)
+   return -EINVAL;


You also want to avoid 32bit guests. Also, what is the rationale for
this being a VM capability as opposed to a CPU feature, similar
to SVE and PMU?


+   r = 0;
+   kvm->arch.mte_enabled = true;
+   break;
default:
r = -EINVAL;
break;
@@ -210,6 +216,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, 
long ext)

 */
r = 1;
break;
+   case KVM_CAP_ARM_MTE:
+   r = system_supports_mte();


Same comment about 32bit.


+   break;
case KVM_CAP_STEAL_TIME:
r = kvm_arm_pvtime_supported();
break;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 19aacc7d64de..38fe25310ca1 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -862,6 +862,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu,
phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
   &pfn, &fault_ipa);
+
+   /*
+* The otherwise redundant test for system_supports_mte() allows the
+* code to be compiled out when CONFIG_ARM64_MTE is not present.
+*/
+	if (system_supports_mte() && kvm->arch.mte_enabled && pfn_valid(pfn)) 
{

+   /*
+* VM will be able to see the page's tags, so we must ensure
+* they have been initialised.
+*/
+   struct page *page = pfn_to_page(pfn);
+   long i, nr_pages = compound_nr(page);
+
+   /* if PG_mte_tagged is set, tags have already been initialised 
*/
+   for (i = 0; i < nr_pages; i++, page++) {
+   if (!test_and_set_bit(PG_mte_tagged, &page->flags))
+   mte_clear_page_tags(page_address(page));
+   }
+   }


What are the visibility requirements for the tags, especially if the
guest has its MMU off? Is there any cache management that needs to
occur?

Another thing is device-like memory that is managed by userspace,
such as the QEMU emulated flash, for which there also might be tags.
How is that dealt with? In general, what are the expectations for
tags on memory shared between host and guest? Who owns them?


+
if (writable) {
prot |= KVM_PGTABLE_PROT_W;
kvm_set_pfn_dirty(pfn);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 430e36e1a13d..35a3dc448231 100644
--- a/arch/arm64/kvm

Re: [PATCH v4 1/2] arm64: kvm: Save/restore MTE registers

2020-11-17 Thread Marc Zyngier

Hi Steven,

These patches unfortunately don't apply to -rc4 anymore, as we repainted
quite a bit while working on fixes. I'd be grateful if you could rebase them.


A few other things though:

On 2020-10-26 15:57, Steven Price wrote:

Define the new system registers that MTE introduces and context switch
them. The MTE feature is still hidden from the ID register as it isn't
supported in a VM yet.

Signed-off-by: Steven Price 
Reviewed-by: Andrew Jones 
---
 arch/arm64/include/asm/kvm_host.h  |  4 
 arch/arm64/include/asm/sysreg.h|  3 ++-
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 14 ++
 arch/arm64/kvm/sys_regs.c  | 14 ++
 4 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h
b/arch/arm64/include/asm/kvm_host.h
index 0aecbab6a7fb..95ab7345dcc8 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -134,6 +134,8 @@ enum vcpu_sysreg {
SCTLR_EL1,  /* System Control Register */
ACTLR_EL1,  /* Auxiliary Control Register */
CPACR_EL1,  /* Coprocessor Access Control */
+   RGSR_EL1,   /* Random Allocation Tag Seed Register */
+   GCR_EL1,/* Tag Control Register */
ZCR_EL1,/* SVE Control */
TTBR0_EL1,  /* Translation Table Base Register 0 */
TTBR1_EL1,  /* Translation Table Base Register 1 */
@@ -150,6 +152,8 @@ enum vcpu_sysreg {
TPIDR_EL1,  /* Thread ID, Privileged */
AMAIR_EL1,  /* Aux Memory Attribute Indirection Register */
CNTKCTL_EL1,/* Timer Control Register (EL1) */
+   TFSRE0_EL1, /* Tag Fault Status Register (EL0) */
	TFSR_EL1,   /* Tag Fault Status Register (EL1) */
PAR_EL1,/* Physical Address Register */
MDSCR_EL1,  /* Monitor Debug System Control Register */
MDCCINT_EL1,/* Monitor Debug Comms Channel Interrupt Enable Reg */
diff --git a/arch/arm64/include/asm/sysreg.h 
b/arch/arm64/include/asm/sysreg.h

index d52c1b3ce589..7727df0bc09d 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -565,7 +565,8 @@
 #define SCTLR_ELx_M(BIT(0))

 #define SCTLR_ELx_FLAGS(SCTLR_ELx_M  | SCTLR_ELx_A | SCTLR_ELx_C | \
-SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB)
+SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB | \
+SCTLR_ELx_ITFSB)

 /* SCTLR_EL2 specific flags. */
 #define SCTLR_EL2_RES1	((BIT(4))  | (BIT(5))  | (BIT(11)) | (BIT(16)) 
| \

diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 7a986030145f..a124ffa49ba3 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -18,6 +18,11 @@
 static inline void __sysreg_save_common_state(struct kvm_cpu_context 
*ctxt)

 {
ctxt_sys_reg(ctxt, MDSCR_EL1)   = read_sysreg(mdscr_el1);
+   if (system_supports_mte()) {
+   ctxt_sys_reg(ctxt, RGSR_EL1)= read_sysreg_s(SYS_RGSR_EL1);
+   ctxt_sys_reg(ctxt, GCR_EL1) = read_sysreg_s(SYS_GCR_EL1);
+   ctxt_sys_reg(ctxt, TFSRE0_EL1)  = read_sysreg_s(SYS_TFSRE0_EL1);


As far as I can tell, HCR_EL2.ATA is still clear when running a guest.
So why do we save/restore this state yet?

Also, I wonder whether we should keep these in the C code. If one day
we enable MTE in the kernel, we will have to move them to the assembly
part, much like we do for PAuth. And I fear that "one day" is pretty
soon:

https://lore.kernel.org/linux-arm-kernel/cover.1605046192.git.andreyk...@google.com/



+   }
 }

 static inline void __sysreg_save_user_state(struct kvm_cpu_context 
*ctxt)

@@ -45,6 +50,8 @@ static inline void __sysreg_save_el1_state(struct
kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL);
ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg(par_el1);
ctxt_sys_reg(ctxt, TPIDR_EL1)   = read_sysreg(tpidr_el1);
+   if (system_supports_mte())
+   ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR);

ctxt_sys_reg(ctxt, SP_EL1)  = read_sysreg(sp_el1);
ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR);
@@ -63,6 +70,11 @@ static inline void
__sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
 static inline void __sysreg_restore_common_state(struct 
kvm_cpu_context *ctxt)

 {
write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1),  mdscr_el1);
+   if (system_supports_mte()) {
+   write_sysreg_s(ctxt_sys_reg(ctxt, RGSR_EL1), SYS_RGSR_EL1);
+   write_sysreg_s(ctxt_sys_reg(ctxt, GCR_EL1), SYS_GCR_EL1);
+   write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1);
+   }
 }

 static inline void __sysreg_restore_user_state(struct kvm_cpu_context 
*ctxt)

@@ -106,6 +118,8 @@ static inline void
_

Re: [PATCH] KVM: arm64: vgic-v3: Drop the reporting of GICR_TYPER.Last for userspace

2020-11-17 Thread Marc Zyngier
On Tue, 17 Nov 2020 23:16:29 +0800, Zenghui Yu wrote:
> It was recently reported that if GICR_TYPER is accessed before the RD base
> address is set, we'll suffer from the unset @rdreg dereferencing. Oops...
> 
>   gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
>   (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
> 
> It's "expected" that users will access registers in the redistributor if
> the RD has been properly configured (e.g., the RD base address is set). But
> it hasn't yet been covered by the existing documentation.
> 
> [...]

Applied to kvm-arm64/fixes-5.10, thanks!

[1/1] KVM: arm64: vgic-v3: Drop the reporting of GICR_TYPER.Last for userspace
  commit: 23bde34771f1ea92fb5e6682c0d8c04304d34b3b

I have added a Cc stable, as this definitely wants to be backported.

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH 1/2] KVM: arm64: vgic: Forbid invalid userspace Redistributor accesses

2020-11-17 Thread Marc Zyngier

On 2020-11-17 09:59, Auger Eric wrote:

Hi Marc,

On 11/17/20 9:49 AM, Marc Zyngier wrote:

Hi Zenghui,

On 2020-11-16 14:57, Zenghui Yu wrote:

Hi Marc,

On 2020/11/16 22:10, Marc Zyngier wrote:
My take is that only if the "[Re]Distributor base address" is specified
in the system memory map, will the user-provided kvm_device_attr.offset
make sense. And we can then handle the access to the register which is
defined by "base address + offset".


I'd tend to agree, but it is just that this is a large change at -rc4.
I'd rather have a quick fix for 5.10, and a more invasive change for 5.11,
spanning all the possible vgic devices.


So you prefer fixing it by "return a value that doesn't have the Last
bit set" for v5.10? I'm ok with it and can send v2 for it.


Cool. Thanks for that.

Btw, looking again at the way we handle the user-reading of GICR_TYPER

vgic_mmio_read_v3r_typer(vcpu, addr, len)

it seems that @addr is actually the *offset* of GICR_TYPER (0x0008) and
@addr is unlikely to be equal to last_rdist_typer, which is the *GPA* of
the last RD. Looks like the user-reading of GICR_TYPER.Last is always
broken?


I think you are right. Somehow, we don't seem to track the index of
the RD in the region, so we can never compute the address of the RD
even if the base address is set.

Let's drop the reporting of Last for userspace for now, as it never
worked. If you post a patch addressing that quickly, I'll get it to
Paolo by the end of the week (there's another fix that needs merging).

Eric: do we have any test covering the userspace API?


So as this issue seems related to the changes made when implementing the
multiple RDIST regions, I volunteer to write those KVM selftests :-)


You're on! :D

More seriously, there is scope for fuzzing the device save/restore API,
as we find bugs every time someone changes the "known good" ordering that
is implemented in QEMU.

Maybe it means getting rid of some unnecessary flexibility, as proposed
by Zenghui, if we are confident that no userspace makes use of it.
And in the future, making sure that new APIs are rigid enough to avoid such
bugs.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH 0/4] drm/meson: Module removal fixes

2020-11-17 Thread Marc Zyngier

Hi Neil,

On 2020-11-17 08:49, Neil Armstrong wrote:

Hi Marc,

On 16/11/2020 21:07, Marc Zyngier wrote:

Hi all,

Having recently moved over to a top-of-the-tree u-boot on one of my
VIM3L systems in order to benefit from unrelated improvements
(automatic PCIe detection, EFI...), I faced the issue that my kernel
would hang like this:

[  OK  ] Finished Helper to synchronize boot up for ifupdown.
[  OK  ] Started Rule-based Manager for Device Events and Files.
[7.114516] VDDCPU: supplied by regulator-dummy
[  OK  ] Found device /dev/ttyAML0.
[7.146862] meson-drm ff90.vpu: Queued 2 outputs on vpu
[7.169630] fb0: switching to meson-drm-fb from simple
[7.169944] Console: switching to colour dummy device 80x25
[7.179250] meson-drm ff90.vpu: CVBS Output connector not available


and that's it.

After some poking around, I figured out that it is in the
meson-dw-hdmi module that the CPU was hanging...


I'll be interested in having your kernel config, I never had such a report
since I enabled HDMI support in U-Boot a few years ago.


Yeah, I was pretty surprised too. I have a hunch that this is caused
by u-boot DT exposing an extra MMIO region (dubbed "hhi") that gets
picked up by the kernel driver. *Not* having the region in the DT
(as in the kernel's version of the same DT) makes the driver work
exactly once:

Decompiled u-boot DT:

hdmi-tx@0 {
compatible = "amlogic,meson-g12a-dw-hdmi";
reg = <0x00 0x00 0x00 0x1 0x00 0x3c000 0x00 0x1000>;
[...]
reg-names = "hdmitx\0hhi";

Decompiled kernel DT:

hdmi-tx@0 {
compatible = "amlogic,meson-g12a-dw-hdmi";
reg = <0x00 0x00 0x00 0x1>;

There seem to be some complex interactions between the HDMI driver
and the DRM driver, both using this MMIO region at any given time.
But I admit not having tried very hard to follow the DRM maze of
intricate callbacks. All I needed was this box to reliably boot with
the firmware-provided DT.

You can find a reasonably recent version of my config at [1].

M.

[1] http://www.loen.fr/tmp/Config.full-arm64
--
Jazz is not dead. It just smells funny...


Re: [PATCH 1/2] KVM: arm64: vgic: Forbid invalid userspace Redistributor accesses

2020-11-17 Thread Marc Zyngier

Hi Zenghui,

On 2020-11-16 14:57, Zenghui Yu wrote:

Hi Marc,

On 2020/11/16 22:10, Marc Zyngier wrote:
My take is that only if the "[Re]Distributor base address" is specified
in the system memory map, will the user-provided kvm_device_attr.offset
make sense. And we can then handle the access to the register which is
defined by "base address + offset".


I'd tend to agree, but it is just that this is a large change at -rc4.
I'd rather have a quick fix for 5.10, and a more invasive change for 5.11,
spanning all the possible vgic devices.


So you prefer fixing it by "return a value that doesn't have the Last
bit set" for v5.10? I'm ok with it and can send v2 for it.


Cool. Thanks for that.


Btw, looking again at the way we handle the user-reading of GICR_TYPER

vgic_mmio_read_v3r_typer(vcpu, addr, len)

it seems that @addr is actually the *offset* of GICR_TYPER (0x0008) and
@addr is unlikely to be equal to last_rdist_typer, which is the *GPA* of
the last RD. Looks like the user-reading of GICR_TYPER.Last is always
broken?


I think you are right. Somehow, we don't seem to track the index of
the RD in the region, so we can never compute the address of the RD
even if the base address is set.
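
To illustrate, with such an index (the rd_index below is hypothetical,
it doesn't exist today), the Last bit could be derived as:

/* Hypothetical: rd_index would have to be recorded at RD registration */
gpa_t rd_gpa = rdreg->base + rd_index * KVM_VGIC_V3_REDIST_SIZE;
bool last = (rd_index == rdreg->free_index - 1);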

Let's drop the reporting of Last for userspace for now, as it never
worked. If you post a patch addressing that quickly, I'll get it to
Paolo by the end of the week (there's another fix that needs merging).

Eric: do we have any test covering the userspace API?

Thanks,

M.
--
Jazz is not dead. It just smells funny...


[PATCH 1/4] drm/meson: Free RDMA resources after tearing down DRM

2020-11-16 Thread Marc Zyngier
Removing the meson DRM module results in the following splat:

[ 2179.451346] Hardware name:  , BIOS 2021.01-rc2-00012-gde865f7ee1 11/16/2020
[ 2179.458316] Workqueue: events drm_mode_rmfb_work_fn [drm]
[ 2179.463597] pstate: 80c9 (Nzcv daif +PAN +UAO -TCO BTYPE=--)
[ 2179.469558] pc : meson_rdma_writel_sync+0x44/0xb0 [meson_drm]
[ 2179.475243] lr : meson_g12a_afbcd_reset+0x34/0x60 [meson_drm]
[ 2179.480930] sp : ffc01212bb70
[ 2179.484207] x29: ffc01212bb70 x28: ff8044f66f00
[ 2179.489469] x27: ff8045b13800 x26: 0001
[ 2179.494730] x25:  x24: 0001
[ 2179.41] x23:  x22: 
[ 2179.505252] x21: 0028 x20: 1a01
[ 2179.510513] x19: ff8046029480 x18: 
[ 2179.515775] x17:  x16: 
[ 2179.521036] x15:  x14: 
[ 2179.526297] x13: 00400326 x12: 0309030303260300
[ 2179.531558] x11: 0300054004a0 x10: 0418054004000400
[ 2179.536820] x9 : ffc008fe4914 x8 : ff8040a1adc0
[ 2179.542081] x7 :  x6 : ff8042aa0080
[ 2179.547342] x5 : ff8044f66f00 x4 : ffc008fe5bc8
[ 2179.552603] x3 : 00010101 x2 : 0001
[ 2179.557865] x1 :  x0 : 
[ 2179.563127] Call trace:
[ 2179.565548]  meson_rdma_writel_sync+0x44/0xb0 [meson_drm]
[ 2179.570894]  meson_g12a_afbcd_reset+0x34/0x60 [meson_drm]
[ 2179.576241]  meson_plane_atomic_disable+0x38/0xb0 [meson_drm]
[ 2179.581966]  drm_atomic_helper_commit_planes+0x1e0/0x21c [drm_kms_helper]
[ 2179.588684]  drm_atomic_helper_commit_tail_rpm+0x68/0xb0 [drm_kms_helper]
[ 2179.595410]  commit_tail+0xac/0x190 [drm_kms_helper]
[ 2179.600326]  drm_atomic_helper_commit+0x16c/0x390 [drm_kms_helper]
[ 2179.606484]  drm_atomic_commit+0x58/0x70 [drm]
[ 2179.610880]  drm_framebuffer_remove+0x398/0x434 [drm]
[ 2179.615881]  drm_mode_rmfb_work_fn+0x68/0x8c [drm]
[ 2179.620575]  process_one_work+0x1cc/0x49c
[ 2179.624538]  worker_thread+0x200/0x444
[ 2179.628246]  kthread+0x14c/0x160
[ 2179.631439]  ret_from_fork+0x10/0x38

caused by the fact that the RDMA buffer has already been freed,
resulting in meson_rdma_writel_sync() getting a NULL pointer.

Move the afbcd reset and meson_rdma_free calls after the DRM
unregistration is complete so that the teardown can safely complete.

Signed-off-by: Marc Zyngier 
---
 drivers/gpu/drm/meson/meson_drv.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/meson/meson_drv.c 
b/drivers/gpu/drm/meson/meson_drv.c
index 8b9c8dd788c4..324fa489f1c4 100644
--- a/drivers/gpu/drm/meson/meson_drv.c
+++ b/drivers/gpu/drm/meson/meson_drv.c
@@ -389,15 +389,15 @@ static void meson_drv_unbind(struct device *dev)
meson_canvas_free(priv->canvas, priv->canvas_id_vd1_2);
}
 
-   if (priv->afbcd.ops) {
-   priv->afbcd.ops->reset(priv);
-   meson_rdma_free(priv);
-   }
-
drm_dev_unregister(drm);
drm_irq_uninstall(drm);
drm_kms_helper_poll_fini(drm);
drm_dev_put(drm);
+
+   if (priv->afbcd.ops) {
+   priv->afbcd.ops->reset(priv);
+   meson_rdma_free(priv);
+   }
 }
 
 static const struct component_master_ops meson_drv_master_ops = {
-- 
2.28.0



[PATCH 2/4] drm/meson: Unbind all connectors on module removal

2020-11-16 Thread Marc Zyngier
Removing the meson DRM module results in the following splats:

[   42.689228] WARNING: CPU: 0 PID: 572 at drivers/gpu/drm/drm_irq.c:192 drm_irq_uninstall+0x130/0x160 [drm]
[...]
[   42.812820] Hardware name:  , BIOS 2021.01-rc2-00012-gde865f7ee1 11/16/2020
[   42.819723] pstate: 80400089 (Nzcv daIf +PAN -UAO -TCO BTYPE=--)
[   42.825737] pc : drm_irq_uninstall+0x130/0x160 [drm]
[   42.830647] lr : drm_irq_uninstall+0xc4/0x160 [drm]
[...]
[   42.917614] Call trace:
[   42.920086]  drm_irq_uninstall+0x130/0x160 [drm]
[   42.924612]  meson_drv_unbind+0x68/0xa4 [meson_drm]
[   42.929436]  component_del+0xc0/0x180
[   42.933058]  meson_dw_hdmi_remove+0x28/0x40 [meson_dw_hdmi]
[   42.938576]  platform_drv_remove+0x38/0x60
[   42.942628]  __device_release_driver+0x190/0x23c
[   42.947198]  driver_detach+0xcc/0x160
[   42.950822]  bus_remove_driver+0x68/0xe0
[   42.954702]  driver_unregister+0x3c/0x6c
[   42.958583]  platform_driver_unregister+0x20/0x2c
[   42.963243]  meson_dw_hdmi_platform_driver_exit+0x18/0x4a8 [meson_dw_hdmi]
[   42.970057]  __arm64_sys_delete_module+0x1bc/0x294
[   42.974801]  el0_svc_common.constprop.0+0x80/0x240
[   42.979542]  do_el0_svc+0x30/0xa0
[   42.982821]  el0_svc+0x18/0x50
[   42.985839]  el0_sync_handler+0x198/0x404
[   42.989806]  el0_sync+0x158/0x180

immediately followed by

[   43.002296] WARNING: CPU: 0 PID: 572 at drivers/gpu/drm/drm_mode_config.c:504 drm_mode_config_cleanup+0x2a8/0x304 [drm]
[...]
[   43.128150] Hardware name:  , BIOS 2021.01-rc2-00012-gde865f7ee1 11/16/2020
[   43.135052] pstate: 8049 (Nzcv daif +PAN -UAO -TCO BTYPE=--)
[   43.141062] pc : drm_mode_config_cleanup+0x2a8/0x304 [drm]
[   43.146492] lr : drm_mode_config_cleanup+0xac/0x304 [drm]
[...]
[   43.233979] Call trace:
[   43.236451]  drm_mode_config_cleanup+0x2a8/0x304 [drm]
[   43.241538]  drm_mode_config_init_release+0x1c/0x2c [drm]
[   43.246886]  drm_managed_release+0xa8/0x120 [drm]
[   43.251543]  drm_dev_put+0x94/0xc0 [drm]
[   43.255380]  meson_drv_unbind+0x78/0xa4 [meson_drm]
[   43.260204]  component_del+0xc0/0x180
[   43.263829]  meson_dw_hdmi_remove+0x28/0x40 [meson_dw_hdmi]
[   43.269344]  platform_drv_remove+0x38/0x60
[   43.273398]  __device_release_driver+0x190/0x23c
[   43.277967]  driver_detach+0xcc/0x160
[   43.281590]  bus_remove_driver+0x68/0xe0
[   43.285471]  driver_unregister+0x3c/0x6c
[   43.289352]  platform_driver_unregister+0x20/0x2c
[   43.294011]  meson_dw_hdmi_platform_driver_exit+0x18/0x4a8 [meson_dw_hdmi]
[   43.300826]  __arm64_sys_delete_module+0x1bc/0x294
[   43.305570]  el0_svc_common.constprop.0+0x80/0x240
[   43.310312]  do_el0_svc+0x30/0xa0
[   43.313590]  el0_svc+0x18/0x50
[   43.316608]  el0_sync_handler+0x198/0x404
[   43.320574]  el0_sync+0x158/0x180
[   43.323852] ---[ end trace d796a3072dab01da ]---
[   43.328561] [drm:drm_mode_config_cleanup [drm]] *ERROR* connector HDMI-A-1 leaked!

both triggered by the fact that the HDMI subsystem is still active,
and the DRM removal doesn't result in the connectors being torn down.

Call drm_atomic_helper_shutdown() and component_unbind_all() to safely
tear the module down.

Signed-off-by: Marc Zyngier 
---
 drivers/gpu/drm/meson/meson_drv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/meson/meson_drv.c 
b/drivers/gpu/drm/meson/meson_drv.c
index 324fa489f1c4..3d1de9cbb1c8 100644
--- a/drivers/gpu/drm/meson/meson_drv.c
+++ b/drivers/gpu/drm/meson/meson_drv.c
@@ -390,8 +390,10 @@ static void meson_drv_unbind(struct device *dev)
}
 
drm_dev_unregister(drm);
-   drm_irq_uninstall(drm);
drm_kms_helper_poll_fini(drm);
+   drm_atomic_helper_shutdown(drm);
+   component_unbind_all(dev, drm);
+   drm_irq_uninstall(drm);
drm_dev_put(drm);
 
if (priv->afbcd.ops) {
-- 
2.28.0



[PATCH 0/4] drm/meson: Module removal fixes

2020-11-16 Thread Marc Zyngier
Hi all,

Having recently moved over to a top-of-the-tree u-boot on one of my
VIM3L systems in order to benefit from unrelated improvements
(automatic PCIe detection, EFI...), I faced the issue that my kernel
would hang like this:

[  OK  ] Finished Helper to synchronize boot up for ifupdown.
[  OK  ] Started Rule-based Manager for Device Events and Files.
[7.114516] VDDCPU: supplied by regulator-dummy
[  OK  ] Found device /dev/ttyAML0.
[7.146862] meson-drm ff90.vpu: Queued 2 outputs on vpu
[7.169630] fb0: switching to meson-drm-fb from simple
[7.169944] Console: switching to colour dummy device 80x25
[7.179250] meson-drm ff90.vpu: CVBS Output connector not available

and that's it.

After some poking around, I figured out that it is in the
meson-dw-hdmi module that the CPU was hanging...

Reverting to the kernel DT instead of u-boot's papered over it
somehow, but it turned out that removing the module (modprobe -r)
resulted in a firework. And for every issue I was fixing, another
followed. Much fun for a rainy Monday in the basement!

I ended up with the following 4 patches, which solve all my problems:
I can now boot with the u-boot provided DT, and the hdmi and DRM
drivers can be removed and re-inserted at will.

The first patch is a straightforward use-after-free, causing a NULL
pointer dereference. Moving things around fixes it.

The second patch shows that I have no clue about the DRM subsystem
whatsoever. I mimicked what my Rockchip systems are doing, and the two
warnings disappeared. It can't completely be wrong (famous last
words...).

The third patch fixes a *very* common issue with regulators (I've
fixed at least 3 drivers with a similar issue). I guess the devm
subsystem needs to grow a new helper at some point.
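Something like this, say (sketch only — devm_regulator_enable() doesn't
exist at this point, the name is made up; it is just the pattern patch 3
open-codes, factored out):

static void devm_regulator_disable_action(void *data)
{
	regulator_disable(data);
}

static inline int devm_regulator_enable(struct device *dev,
					struct regulator *reg)
{
	int ret = regulator_enable(reg);

	if (ret)
		return ret;

	/* Have devres disable the regulator when the device is unbound. */
	return devm_add_action_or_reset(dev, devm_regulator_disable_action,
					reg);
}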

The last patch finally fixes the issue I was seeing: the HDMI driver
hangs when accessing a register with clocks disabled, which they are
on module removal. It also fixes my u-boot booting for similar
reasons, I guess.

I went as far as reaching out for a HDMI cable and verifying that I
was getting a working display. Total dedication.

Feedback much appreciated.

M.

Marc Zyngier (4):
  drm/meson: Free RDMA resources after tearing down DRM
  drm/meson: Unbind all connectors on module removal
  drm/meson: dw-hdmi: Register a callback to disable the regulator
  drm/meson: dw-hdmi: Ensure that clocks are enabled before touching the
TOP registers

 drivers/gpu/drm/meson/meson_drv.c | 12 +++-
 drivers/gpu/drm/meson/meson_dw_hdmi.c | 13 +++--
 2 files changed, 18 insertions(+), 7 deletions(-)

-- 
2.28.0



[PATCH 4/4] drm/meson: dw-hdmi: Ensure that clocks are enabled before touching the TOP registers

2020-11-16 Thread Marc Zyngier
Removing the meson-dw-hdmi module and re-inserting it results in a hang
as the driver writes to HDMITX_TOP_SW_RESET. Similar effects can be seen
when booting with mainline u-boot and using the u-boot provided DT (which
is highly desirable).

The reason for the hang seems to be that the clocks are not always
enabled by the time we enter meson_dw_hdmi_init(). Moving this call
*after* dw_hdmi_probe() ensures that the clocks are enabled.

Signed-off-by: Marc Zyngier 
---
 drivers/gpu/drm/meson/meson_dw_hdmi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c 
b/drivers/gpu/drm/meson/meson_dw_hdmi.c
index 68826cf9993f..7f8eea494147 100644
--- a/drivers/gpu/drm/meson/meson_dw_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c
@@ -1073,8 +1073,6 @@ static int meson_dw_hdmi_bind(struct device *dev, struct 
device *master,
 
DRM_DEBUG_DRIVER("encoder initialized\n");
 
-   meson_dw_hdmi_init(meson_dw_hdmi);
-
/* Bridge / Connector */
 
dw_plat_data->priv_data = meson_dw_hdmi;
@@ -1097,6 +1095,8 @@ static int meson_dw_hdmi_bind(struct device *dev, struct 
device *master,
if (IS_ERR(meson_dw_hdmi->hdmi))
return PTR_ERR(meson_dw_hdmi->hdmi);
 
+   meson_dw_hdmi_init(meson_dw_hdmi);
+
next_bridge = of_drm_find_bridge(pdev->dev.of_node);
if (next_bridge)
drm_bridge_attach(encoder, next_bridge,
-- 
2.28.0



[PATCH 3/4] drm/meson: dw-hdmi: Register a callback to disable the regulator

2020-11-16 Thread Marc Zyngier
Removing the meson-dw-hdmi module results in the following splat:

[   43.340509] WARNING: CPU: 0 PID: 572 at drivers/regulator/core.c:2125 _regulator_put.part.0+0x16c/0x174
[...]
[   43.454870] CPU: 0 PID: 572 Comm: modprobe Tainted: GW   E 5.10.0-rc4-00049-gd274813a4de3-dirty #2147
[   43.465042] Hardware name:  , BIOS 2021.01-rc2-00012-gde865f7ee1 11/16/2020
[   43.471945] pstate: 8049 (Nzcv daif +PAN -UAO -TCO BTYPE=--)
[   43.477896] pc : _regulator_put.part.0+0x16c/0x174
[   43.482638] lr : regulator_put+0x44/0x60
[...]
[   43.568715] Call trace:
[   43.571132]  _regulator_put.part.0+0x16c/0x174
[   43.575529]  regulator_put+0x44/0x60
[   43.579067]  devm_regulator_release+0x20/0x2c
[   43.583380]  release_nodes+0x1c8/0x2b4
[   43.587087]  devres_release_all+0x44/0x6c
[   43.591056]  __device_release_driver+0x1a0/0x23c
[   43.595626]  driver_detach+0xcc/0x160
[   43.599249]  bus_remove_driver+0x68/0xe0
[   43.603130]  driver_unregister+0x3c/0x6c
[   43.607011]  platform_driver_unregister+0x20/0x2c
[   43.611678]  meson_dw_hdmi_platform_driver_exit+0x18/0x4a8 [meson_dw_hdmi]
[   43.618485]  __arm64_sys_delete_module+0x1bc/0x294

as the HDMI regulator is still enabled on release.

In order to address this, register a callback that will deal with
the disabling when the driver is unbound, solving the problem.

Signed-off-by: Marc Zyngier 
---
 drivers/gpu/drm/meson/meson_dw_hdmi.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c 
b/drivers/gpu/drm/meson/meson_dw_hdmi.c
index 29a8ff41595d..68826cf9993f 100644
--- a/drivers/gpu/drm/meson/meson_dw_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c
@@ -941,6 +941,11 @@ static void meson_dw_hdmi_init(struct meson_dw_hdmi 
*meson_dw_hdmi)
 
 }
 
+static void meson_disable_regulator(void *data)
+{
+   regulator_disable(data);
+}
+
 static int meson_dw_hdmi_bind(struct device *dev, struct device *master,
void *data)
 {
@@ -989,6 +994,10 @@ static int meson_dw_hdmi_bind(struct device *dev, struct 
device *master,
ret = regulator_enable(meson_dw_hdmi->hdmi_supply);
if (ret)
return ret;
+   ret = devm_add_action_or_reset(dev, meson_disable_regulator,
+  meson_dw_hdmi->hdmi_supply);
+   if (ret)
+   return ret;
}
 
meson_dw_hdmi->hdmitx_apb = devm_reset_control_get_exclusive(dev,
-- 
2.28.0



Re: [PATCH 1/2] KVM: arm64: vgic: Forbid invalid userspace Redistributor accesses

2020-11-16 Thread Marc Zyngier

On 2020-11-16 13:09, Zenghui Yu wrote:

Hi Marc,

On 2020/11/16 1:04, Marc Zyngier wrote:

Hi Zenghui,

On 2020-11-13 14:28, Zenghui Yu wrote:
It's expected that users will access registers in the redistributor *if*
the RD has been initialized properly. Unfortunately userspace can be bogus
enough to access registers before setting the RD base address, and KVM
implicitly allows it (we handle the access anyway, regardless of whether
the base address is set).

Bad thing happens when we're handling the user read of GICR_TYPER. We end
up with an oops when dereferencing the unset rdreg...

gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
	(rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;

Fix this issue by informing userspace what had gone wrong (-ENXIO).


I'm worried about the "implicit" aspect of the access that this patch
now forbids.

The problem is that the existing documentation doesn't cover this case,
and -ENXIO's "Getting or setting this register is not yet supported"
is way too vague.


Indeed. How about changing to

-ENXIO  Getting or setting this register is not yet supported
or VGIC not properly configured (e.g., [Re]Distributor base
address is unknown)


Looks OK to me.




There is a precedent with the ITS, but that's undocumented
as well. Also, how about v2? If that's the way we are going to fix this,
we also need to beef up the documentation.


Sure, I plan to do so and hope it won't break the existing userspace.


Well, at this stage we can only hope.



Of course, the other horrible way to address the issue is to return a value
that doesn't have the Last bit set, since we can't synthesise it. It doesn't
change the userspace API, and I can even find some (admittedly twisted)
logic to it (since there is no base address, there is no last RD...).


I'm fine with it. But I'm afraid that there might be other issues due to
the "unexpected" accesses since I haven't tested with all registers from
userspace.


I have had a look at the weekend, and couldn't see any other other GICR
register that would suffer from rdreg being NULL. I haven't looked at
GICD, but I don't anticipate anything bad on that front.


My take is that only if the "[Re]Distributor base address" is specified
in the system memory map, will the user-provided kvm_device_attr.offset
make sense. And we can then handle the access to the register which is
defined by "base address + offset".


I'd tend to agree, but it is just that this is a large change at -rc4.
I'd rather have a quick fix for 5.10, and a more invasive change for 5.11,
spanning all the possible vgic devices.

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 13/24] kvm: arm64: Add CPU entry point in nVHE hyp

2020-11-16 Thread Marc Zyngier

On 2020-11-16 11:49, David Brazdil wrote:

>  #ifdef CONFIG_CPU_PM
>DEFINE(CPU_CTX_SP,  offsetof(struct cpu_suspend_ctx, sp));
> diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> index 1697d25756e9..f999a35b2c8c 100644
> --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> @@ -6,6 +6,7 @@
>
>  #include 
>  #include 
> +#include 

This should probably be included from the file that provides init_el2_state.


Agreed. This is a workaround for the fact that the arm-gic* headers don't play
nice with each other (define the same constants).


Ah, that...


Including arm-gic-v3.h in
kvm_asm.h will trigger macro redefine warnings in vgic*-v2.c because they
include arm-gic.h.


Boo.


Another option is to create a header just for el2 init. Would that be
preferable? Other ideas?


Having an asm/el2_setup.h file feels like a better option. After all, it
is in no way KVM specific, and having the macros hanging around in
asm/kvm_asm.h feels odd.

M.
--
Jazz is not dead. It just smells funny...


Re: [patch 06/19] arm64: irqstat: Get rid of duplicated declaration

2020-11-16 Thread Marc Zyngier

On 2020-11-13 14:02, Thomas Gleixner wrote:

irq_cpustat_t is exactly the same as the asm-generic one. Define
ack_bad_irq so the generic header does not emit the generic version of it.


Signed-off-by: Thomas Gleixner 
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Marc Zyngier 
Cc: linux-arm-ker...@lists.infradead.org
---
 arch/arm64/include/asm/hardirq.h |7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -13,11 +13,8 @@
 #include 
 #include 

-typedef struct {
-	unsigned int __softirq_pending;
-} ____cacheline_aligned irq_cpustat_t;
-
-#include <linux/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
+#define ack_bad_irq ack_bad_irq
+#include <asm-generic/hardirq.h>

 #define __ARCH_IRQ_EXIT_IRQS_DISABLED  1


Acked-by: Marc Zyngier 

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH 1/2] KVM: arm64: vgic: Forbid invalid userspace Redistributor accesses

2020-11-15 Thread Marc Zyngier

Hi Zenghui,

On 2020-11-13 14:28, Zenghui Yu wrote:
It's expected that users will access registers in the redistributor *if*
the RD has been initialized properly. Unfortunately userspace can be bogus
enough to access registers before setting the RD base address, and KVM
implicitly allows it (we handle the access anyway, regardless of whether
the base address is set).

Bad thing happens when we're handling the user read of GICR_TYPER. We end
up with an oops when dereferencing the unset rdreg...

gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
	(rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;

Fix this issue by informing userspace what had gone wrong (-ENXIO).


I'm worried about the "implicit" aspect of the access that this patch
now forbids.

The problem is that the existing documentation doesn't cover this case,
and -ENXIO's "Getting or setting this register is not yet supported"
is way too vague. There is a precedent with the ITS, but that's undocumented
as well. Also, how about v2? If that's the way we are going to fix this,
we also need to beef up the documentation.

Of course, the other horrible way to address the issue is to return a value
that doesn't have the Last bit set, since we can't synthesise it. It doesn't
change the userspace API, and I can even find some (admittedly twisted)
logic to it (since there is no base address, there is no last RD...).

Thoughts?

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-14 Thread Marc Zyngier

On 2020-11-14 03:37, Alexey Kardashevskiy wrote:


What is the easiest way to get irq-hierarchical hardware?
I have a bunch of powerpc boxes (no good) but also a raspberry pi,
a bunch of 32/64bit orange pi's, an "armada" arm box,
thinkpads - is any of this good for the task?


If your HW doesn't require an interrupt hierarchy, run VMs!
Booting an arm64 guest with virtual PCI devices will result in
hierarchies being created (PCI-MSI -> GIC MSI widget -> GIC).


Absolutely :) But the beauty of ARM is that one can buy an actual ARM
device for 20$, I have "opi one+ allwinner h6 64bit cortex a53 1GB
RAM", is it worth using KVM on this device, or is it too small for
that?


I've run VMs on smaller machines. 256MB of guest RAM is enough to boot
a full blown Debian system with PCI devices, and your AW box should be
up to the task as long as you run a mainline kernel on it. Please don't
add to the pile of junk!


You can use KVM, or even bare QEMU on x86 if you are so inclined.


Have a QEMU command line handy for x86/tcg?


/me digs, as my x86 boxes are overspec'd X terminals these days:

Here you go, courtesy of Will:
http://cdn.kernel.org/pub/linux/kernel/people/will/docs/qemu/qemu-arm64-howto.html

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-13 Thread Marc Zyngier

Hi Alexey,

On 2020-11-09 09:46, Alexey Kardashevskiy wrote:
PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
Device drivers map/unmap hardware interrupts via irq_create_mapping()/
irq_dispose_mapping(). The problem with that is that these interrupts are
shared and when performing hot unplug, we need to unmap the interrupt
only when the last device is released.

This reuses already existing irq_desc::kobj for this purpose.
The refcounter is naturally 1 when the descriptor is allocated already;
this adds kobject_get() in places where already existing mapped virq
is returned.

This reorganizes irq_dispose_mapping() to release the kobj and let
the release callback do the cleanup.

As kobject_put() is called directly now (not via RCU), it can also handle
the early boot case (irq_kobj_base==NULL) with the help of
the kobject::state_in_sysfs flag and without additional irq_sysfs_del().
While at this, clean up the comment at where irq_sysfs_del() was called.


Quick grep shows no sign of irq reference counting in drivers. Drivers
typically request mapping when probing and dispose it when removing;
platforms tend to dispose only if setup failed and the rest seems
calling one dispose per one mapping. Except (at least) PPC/pseries
which needs https://lkml.org/lkml/2020/10/27/259

Cc: Cédric Le Goater 
Cc: Marc Zyngier 
Cc: Michael Ellerman 
Cc: Qian Cai 
Cc: Rob Herring 
Cc: Frederic Barrat 
Cc: Michal Suchánek 
Cc: Thomas Gleixner 
Signed-off-by: Alexey Kardashevskiy 
---

This is what it is fixing for powerpc:

There was a comment about whether hierarchical IRQ domains should
contribute to this reference counter and I need some help here as
I cannot see why.
It is reverse now - IRQs contribute to domain->mapcount and
irq_domain_associate/irq_domain_disassociate take necessary steps to
keep this counter in order. What might be missing is that if we have
cascade of IRQs (as in the IOAPIC example from
Documentation/core-api/irq/irq-domain.rst ), then a parent IRQ should
contribute to the children IRQs and it is up to
irq_domain_ops::alloc/free hooks, and they all seem to be eventually
calling irq_domain_alloc_irqs_xxx/irq_domain_free_irqs_xxx which seems
right.

Documentation/core-api/irq/irq-domain.rst also suggests there is a lot
to see in debugfs about IRQs but on my thinkpad there nothing about
hierarchy.

So I'll ask again :)

What is the easiest way to get irq-hierarchical hardware?
I have a bunch of powerpc boxes (no good) but also a raspberry pi,
a bunch of 32/64bit orange pi's, an "armada" arm box,
thinkpads - is any of this good for the task?


If your HW doesn't require an interrupt hierarchy, run VMs!
Booting an arm64 guest with virtual PCI devices will result in
hierarchies being created (PCI-MSI -> GIC MSI widget -> GIC).
You can use KVM, or even bare QEMU on x86 if you are so inclined.

I'll try to go through this patch over the week-end (or more probably
early next week), and try to understand where our understandings
differ.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: iommu/vt-d: Cure VF irqdomain hickup

2020-11-13 Thread Marc Zyngier

On 2020-11-12 21:34, Thomas Gleixner wrote:

On Thu, Nov 12 2020 at 20:15, Thomas Gleixner wrote:

The recent changes to store the MSI irqdomain pointer in struct device
missed that Intel DMAR does not register virtual function devices. Due to
that a VF device gets the plain PCI-MSI domain assigned and then issues
compat MSI messages which get caught by the interrupt remapping unit.

Cure that by inheriting the irq domain from the physical function
device.

That's a temporary workaround. The correct fix is to inherit the irq domain
from the bus, but that's a larger effort which needs quite some other
changes to the way x86 manages PCI and MSI domains.


Bah, that's not really going to work with the way how irq remapping
works on x86 because at least Intel/DMAR can have more than one DMAR
unit on a bus.

So the alternative solution would be to assign the domain per device,
but the current ordering creates a hen and egg problem. Looking the
domain up in pci_set_msi_domain() does not work because at that point
the device is not registered in the IOMMU. That happens from
device_add().

Marc, is there any problem to reorder the calls in pci_device_add():

  device_add();
  pci_set_msi_domain();


I *think* it works as long as we keep the "match_driver = false" hack.
Otherwise, we risk binding to a driver early, and game over.


That would allow to add a irq_find_matching_fwspec() based lookup to
pci_msi_get_device_domain().


Just so that I understand the issue: is the core of the problem that
there is no 1:1 mapping between a PCI bus and a DMAR unit, and no
firmware topology information to indicate which one to pick?


Though I'm not yet convinced that the outcome would be less horrible
than the hack in the DMAR driver when I'm taking all the other horrors
of x86 (including XEN) into account :)


I tried to follow the notifier into the DMAR driver, ended up in the
IRQ remapping code, and lost the will to live. I have a question though:

In the bus notifier callback, you end-up in dmar_pci_bus_add_dev(),
which calls intel_irq_remap_add_device(), which tries to set the
MSI domain. Why isn't that enough? Are we still missing any information
at that stage?

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH 2/2] irqchip/gic-v3-its: Disable vSGI upon (CPUIF < v4.1) detection

2020-11-12 Thread Marc Zyngier

On 2020-11-12 14:40, Lorenzo Pieralisi wrote:

On Thu, Nov 12, 2020 at 09:36:10AM +, Marc Zyngier wrote:

Hi Lorenzo,

On 2020-11-11 16:28, Lorenzo Pieralisi wrote:
> GIC CPU interfaces versions predating GIC v4.1 were not built to
> accommodate vINTID within the vSGI range; as reported in the GIC
> specifications (8.2 "Changes to the CPU interface"), it is
> CONSTRAINED UNPREDICTABLE to deliver a vSGI to a PE with
> ID_AA64PFR0_EL1.GIC == b0001.

Hmmm. This goes against the very reason v4.1 was designed the way
it is, which was that all existing implementations supporting GICv4.0
would seamlessly let virtual SGIs in, and it would "just work".

If we start enforcing this, I question the very design of the architecture,
because we could have done so much better by changing the CPU interface.


What has changed in two years? Have you spotted a fundamental problem?


Hi Marc,

long story short: there are systems being designed with this
configuration, vSGIs may or may not work on them, to prevent
*potential* misbehaviour I am disabling vSGIs, I am not fixing
anything, it is belt and braces.


My concern is that if we prevent it, we're going to end up with quirks
allowing it anyway, because people will realise that it actually works.


We may wait and fix it *if* this breaks, I would argue though that at
that point it is not a quirk since architecturally we know that vSGIs
may not work in this configuration.


I don't mind either way, as I doubt I'll see this kind of system any time
soon. I'm just mildly annoyed at the missed opportunity to do something
better...




In the meantime, to the meat of the change:

>
> Check the GIC CPUIF version through the arm64 capabilities
> infrastructure and disable vSGIs if a CPUIF version < 4.1 is
> detected to prevent using vSGIs on systems where they may
> misbehave.
>
> Signed-off-by: Lorenzo Pieralisi 
> Cc: Marc Zyngier 
> ---
>  drivers/irqchip/irq-gic-v3-its.c | 20 +++-
>  1 file changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/irqchip/irq-gic-v3-its.c
> b/drivers/irqchip/irq-gic-v3-its.c
> index 0fec31931e11..6ed4ba60ba7e 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -39,6 +39,20 @@
>
>  #include "irq-gic-common.h"
>
> +#ifdef CONFIG_ARM64
> +#include <asm/cpufeature.h>
> +
> +static inline bool gic_cpuif_has_vsgi(void)
> +{
> +  return cpus_have_const_cap(ARM64_HAS_GIC_CPUIF_VSGI);
> +}
> +#else
> +static inline bool gic_cpuif_has_vsgi(void)
> +{
> +  return false;
> +}
> +#endif
> +
>  #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING (1ULL << 0)
>  #define ITS_FLAGS_WORKAROUND_CAVIUM_22375 (1ULL << 1)
>  #define ITS_FLAGS_WORKAROUND_CAVIUM_23144 (1ULL << 2)
> @@ -5415,7 +5429,11 @@ int __init its_init(struct fwnode_handle
> *handle, struct rdists *rdists,
>if (has_v4 & rdists->has_vlpis) {
>const struct irq_domain_ops *sgi_ops;
>
> -  if (has_v4_1)
> +  /*
> +   * Enable vSGIs only if the ITS and the
> +   * GIC CPUIF support them.
> +   */
> +  if (has_v4_1 && gic_cpuif_has_vsgi())
>sgi_ops = &its_sgi_domain_ops;
>else
>sgi_ops = NULL;

Is that enough?


No, I obviously missed the VGIC bits built on top of
GICD_TYPER2.nASSGIcap.


KVM is still going to expose GICD_TYPER2.nASSGIcap, making things even
more confusing for the guest: it will be able to select active-less SGIs
via GICD_CTLR.nASSGIreq, and if I'm not mistaken, we'd still try to switch
to HW-backed SGIs, leading to some *very* unpleasant things in
gic_v4_enable_vsgis().


Yes (AFAICS GICD_TYPER2.nASSGIcap is not in the public specs though,
that's why I missed it while vetting architectural state that is
affecting vSGIs).


You can find it in the errata to the spec (I just checked the October 2020
version). I doubt it is public though, and people have been asking for this
update to be published for a while now.


I should change the logic in vgic_mmio_{uaccess}_write_v3_misc() to
handle it properly - to redefine the logic around

kvm_vgic_global_state.has_gicv4_1

somehow.


You probably need a separate predicate, indicating HW-backed vSGI support.


Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH 1/2] arm64: cpufeature: Add GIC CPUIF v4.1 detection

2020-11-12 Thread Marc Zyngier

On 2020-11-11 16:28, Lorenzo Pieralisi wrote:

GIC v4.1 introduced changes to the GIC CPU interface; systems that
integrate CPUs that do not support GIC v4.1 features (as reported in the
ID_AA64PFR0_EL1.GIC bitfield) and a GIC v4.1 controller must disable
virtual SGI support in software since the CPUIF and GIC controller version
mismatch results in CONSTRAINED UNPREDICTABLE behaviour at the
architectural level.

Add a cpufeature and related capability to detect GIC v4.1 CPUIF
features so that the GIC driver can probe it to detect GIC CPUIF
hardware configuration and take action accordingly.

Signed-off-by: Lorenzo Pieralisi 
Cc: Will Deacon 
Cc: Catalin Marinas 
Cc: Marc Zyngier 
---
 arch/arm64/include/asm/cpucaps.h |  3 ++-
 arch/arm64/kernel/cpufeature.c   | 10 ++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpucaps.h 
b/arch/arm64/include/asm/cpucaps.h

index 42868dbd29fd..35ef0319f422 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -65,7 +65,8 @@
 #define ARM64_HAS_ARMv8_4_TTL  55
 #define ARM64_HAS_TLB_RANGE56
 #define ARM64_MTE  57
+#define ARM64_HAS_GIC_CPUIF_VSGI   58

-#define ARM64_NCAPS58
+#define ARM64_NCAPS59

 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/kernel/cpufeature.c 
b/arch/arm64/kernel/cpufeature.c

index dcc165b3fc04..9eabbaddfe5e 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2136,6 +2136,16 @@ static const struct arm64_cpu_capabilities
arm64_features[] = {
.cpu_enable = cpu_enable_mte,
},
 #endif /* CONFIG_ARM64_MTE */
+   {
+   .desc = "GIC CPUIF virtual SGI",


nit: that's not really what this feature is. It only means that the
sysreg interface complies with v4.1. Which on its own is totally rubbish,
because the sysregs don't change behaviour between 3.0/4.0 and 4.1.


+   .capability = ARM64_HAS_GIC_CPUIF_VSGI,
+   .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
+   .matches = has_cpuid_feature,
+   .sys_reg = SYS_ID_AA64PFR0_EL1,
+   .field_pos = ID_AA64PFR0_GIC_SHIFT,
+   .sign = FTR_UNSIGNED,
+   .min_field_value = 3,
+   },


Do we really need a new cap for that? Or can we rely on simply looking
at the sanitised feature set? I'm not overly keen on advertising a feature
at CPU boot time if we discover later on that we cannot use it because all
we have is a non-4.1 GIC.

Another thing is that we currently assume that *all* CPUs will be the same
at the point where we set up the GIC (we only have a single CPU booted at
that point).

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH 2/2] irqchip/gic-v3-its: Disable vSGI upon (CPUIF < v4.1) detection

2020-11-12 Thread Marc Zyngier

Hi Lorenzo,

On 2020-11-11 16:28, Lorenzo Pieralisi wrote:

GIC CPU interfaces versions predating GIC v4.1 were not built to
accommodate vINTID within the vSGI range; as reported in the GIC
specifications (8.2 "Changes to the CPU interface"), it is
CONSTRAINED UNPREDICTABLE to deliver a vSGI to a PE with
ID_AA64PFR0_EL1.GIC == b0001.


Hmmm. This goes against the very reason v4.1 was designed the way
it is, which was that all existing implementations supporting GICv4.0
would seamlessly let virtual SGIs in, and it would "just work".

If we start enforcing this, I question the very design of the architecture,
because we could have done so much better by changing the CPU interface.

What has changed in two years? Have you spotted a fundamental problem?
My concern is that if we prevent it, we're going to end-up with quirks
allowing it anyway, because people will realise that it actually works.

In the meantime, to the meat of the change:



Check the GIC CPUIF version through the arm64 capabilities
infrastructure and disable vSGIs if a CPUIF version < 4.1 is
detected to prevent using vSGIs on systems where they may
misbehave.

Signed-off-by: Lorenzo Pieralisi 
Cc: Marc Zyngier 
---
 drivers/irqchip/irq-gic-v3-its.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c 
b/drivers/irqchip/irq-gic-v3-its.c

index 0fec31931e11..6ed4ba60ba7e 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -39,6 +39,20 @@

 #include "irq-gic-common.h"

+#ifdef CONFIG_ARM64
+#include <asm/cpufeature.h>
+
+static inline bool gic_cpuif_has_vsgi(void)
+{
+   return cpus_have_const_cap(ARM64_HAS_GIC_CPUIF_VSGI);
+}
+#else
+static inline bool gic_cpuif_has_vsgi(void)
+{
+   return false;
+}
+#endif
+
 #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING  (1ULL << 0)
 #define ITS_FLAGS_WORKAROUND_CAVIUM_22375  (1ULL << 1)
 #define ITS_FLAGS_WORKAROUND_CAVIUM_23144  (1ULL << 2)
@@ -5415,7 +5429,11 @@ int __init its_init(struct fwnode_handle
*handle, struct rdists *rdists,
if (has_v4 & rdists->has_vlpis) {
const struct irq_domain_ops *sgi_ops;

-   if (has_v4_1)
+   /*
+* Enable vSGIs only if the ITS and the
+* GIC CPUIF support them.
+*/
+   if (has_v4_1 && gic_cpuif_has_vsgi())
sgi_ops = &its_sgi_domain_ops;
else
sgi_ops = NULL;


Is that enough?

KVM is still going to expose GICD_TYPER2.nASSGIcap, making things even
more confusing for the guest: it will be able to select active-less SGIs
via GICD_CTLR.nASSGIreq, and if I'm not mistaken, we'd still try to switch
to HW-backed SGIs, leading to some *very* unpleasant things in
gic_v4_enable_vsgis().

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 00/24] Opt-in always-on nVHE hypervisor

2020-11-11 Thread Marc Zyngier

Hi David,

On 2020-11-09 11:32, David Brazdil wrote:

As we progress towards being able to keep guest state private to the
host running nVHE hypervisor, this series allows the hypervisor to
install itself on newly booted CPUs before the host is allowed to run
on them.

All functionality described below is opt-in, guarded by an early param
'kvm-arm.protected'. Future patches specific to the new "protected" mode
should be hidden behind the same param.

The hypervisor starts trapping host SMCs and intercepting host's PSCI
CPU_ON/OFF/SUSPEND calls. It replaces the host's entry point with its
own, initializes the EL2 state of the new CPU and installs the nVHE hyp
vector before ERETing to the host's entry point.

The kernel checks new cores' features against the finalized system
capabilities. To avoid the need to move this code/data to EL2, the
implementation only allows booting cores that were online at the time of
KVM initialization and therefore had been checked already.

Other PSCI SMCs are forwarded to EL3, though only the known set of SMCs
implemented in the kernel is allowed. Non-PSCI SMCs are also forwarded
to EL3. Future changes will need to ensure the safety of all SMCs wrt.
private guests.

The host is still allowed to reset EL2 back to the stub vector, eg. for
hibernation or kexec, but will not disable nVHE when there are no VMs.

Tested on Rock Pi 4b, based on 5.10-rc3.


I think I've gone through most of the patches. When you respin this
series, you may want to do so on top of my host EL2 entry rework [1],
which change a few things you currently rely on.

If anything in there doesn't work for you, please let me know.

Thanks,

M.

[1] https://lore.kernel.org/kvm/20201109175923.445945-1-...@kernel.org/
--
Jazz is not dead. It just smells funny...


Re: [PATCH v2 3/5] gpio: msc313: MStar MSC313 GPIO driver

2020-11-11 Thread Marc Zyngier

On 2020-11-11 14:09, Linus Walleij wrote:

On Tue, Nov 10, 2020 at 3:19 PM Marc Zyngier  wrote:

On 2020-11-10 14:02, Linus Walleij wrote:



>> Probably nothing more than setting the callback to
>> irq_chip_set_affinity_parent,
>
> Hm, is this something all GPIO irqchips used on SMP systems
> should be doing? Or just hierarchical ones?

Probably only the hierarchical ones. I'd expect the non-hierarchical
GPIOs to be muxed behind a single interrupt, which makes it impossible
to move a single GPIO around, and moving the mux interrupt would break
userspace's expectations that interrupts move independently of each
others.


I found two suspects and sent patches. I think I might have some
more candidates down in pinctrl. I do have some hierarchical IRQ
that is on UP systems, I suppose these are not affected.


Yup, they look good. Feel free to add my Ack to them.
And yes, UP systems are naturally oblivious of interrupt affinity.
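For the archives, the whole thing boils down to something like this for a
hierarchical GPIO irqchip (illustrative only — the chip structure below is
made up, borrowing the msc313 name from this thread):

static struct irq_chip msc313_gpio_irqchip = {
	.name			= "GPIO",
	.irq_eoi		= irq_chip_eoi_parent,
	.irq_mask		= irq_chip_mask_parent,
	.irq_unmask		= irq_chip_unmask_parent,
	.irq_set_type		= irq_chip_set_type_parent,
	/* Forward affinity requests to the parent (the GIC) on SMP. */
	.irq_set_affinity	= irq_chip_set_affinity_parent,
};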

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 07/24] kvm: arm64: Create nVHE copy of cpu_logical_map

2020-11-11 Thread Marc Zyngier

On 2020-11-11 13:45, David Brazdil wrote:

On Wed, Nov 11, 2020 at 01:29:29PM +, Marc Zyngier wrote:

On 2020-11-11 13:03, David Brazdil wrote:
> > > +/*
> > > + * nVHE copy of data structures tracking available CPU cores.
> > > + * Only entries for CPUs that were online at KVM init are populated.
> > > + * Other CPUs should not be allowed to boot because their features were
> > > + * not checked against the finalized system capabilities.
> > > + */
> > > +u64 __ro_after_init __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1]
> > > = INVALID_HWID };
> >
> > I'm not sure what __ro_after_init means once we get S2 isolation.
>
> It is stretching the definition of 'init' a bit, I know, but I don't see
> what your worry is about S2? The intention is to mark this read-only for
> .hyp.text at runtime. With S2, the host won't be able to write to it
> after KVM init. Obviously that's currently not the case.

More importantly, EL2 can write to it at any time, which is the bit I'm
worried about, as it makes the annotation misleading.


EL2 can't, at least not accidentally. The hyp memory mapping is PAGE_HYP_RO
(see patch 05).


Ah, I obviously overlooked that. Thanks for setting me straight.

Shouldn't clash with include files. Where fixing the kernel might clash is
all the users of for_each_*_cpu that use an int for the iterator var.


I don't think that's a problem (nobody expects that many CPUs). But if you
are confident that we don't have a problem, no need to change the kernel
itself.

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 16/24] kvm: arm64: Add offset for hyp VA <-> PA conversion

2020-11-11 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

Add a host-initialized constant to KVM nVHE hyp code for converting
between EL2 linear map virtual addresses and physical addresses.
Also add `__hyp_pa` macro that performs the conversion.

Signed-off-by: David Brazdil 
---
 arch/arm64/kvm/arm.c   | 15 +++
 arch/arm64/kvm/hyp/nvhe/psci.c |  3 +++
 2 files changed, 18 insertions(+)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 28e3bc056225..dc7d43d7785a 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1484,6 +1484,20 @@ static inline void hyp_cpu_pm_exit(void)
 }
 #endif

+static void init_hyp_physvirt_offset(void)
+{
+   extern s64 kvm_nvhe_sym(hyp_physvirt_offset);
+   unsigned long kern_vaddr, hyp_vaddr, paddr;
+
+   /* Check that kvm_arm_hyp_percpu_base has been set. */
+   BUG_ON(kvm_arm_hyp_percpu_base[0] == 0);


Why is this dependent on the percpu base? Or is that just a convenient
symbol?


+
+   kern_vaddr = kvm_arm_hyp_percpu_base[0];
+   hyp_vaddr = kern_hyp_va(kern_vaddr);
+   paddr = __pa(kern_vaddr);
+   CHOOSE_NVHE_SYM(hyp_physvirt_offset) = (s64)paddr - (s64)hyp_vaddr;
+}


It feels like this offset could be set at the point where we compute
the hyp_va offset in va_layout.c, couldn't it? It would have the
advantage of keeping all the ugly VA hacks together.
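Something like this, i.e. (untested sketch, reusing names from the patch;
the point is the placement, not the details):

/* In arch/arm64/kvm/va_layout.c, next to the existing hyp VA setup: */
__init void kvm_compute_layout(void)
{
	u64 kern_vaddr = kvm_arm_hyp_percpu_base[0];

	/* ... existing va_mask/tag computation ... */

	CHOOSE_NVHE_SYM(hyp_physvirt_offset) =
		(s64)__pa(kern_vaddr) - (s64)kern_hyp_va(kern_vaddr);
}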


+
 static void init_cpu_logical_map(void)
 {
extern u64 kvm_nvhe_sym(__cpu_logical_map)[NR_CPUS];
@@ -1688,6 +1702,7 @@ static int init_hyp_mode(void)
}
}

+   init_hyp_physvirt_offset();
init_cpu_logical_map();
init_psci();

diff --git a/arch/arm64/kvm/hyp/nvhe/psci.c 
b/arch/arm64/kvm/hyp/nvhe/psci.c

index 82d3b2c89658..b0b5df590ba5 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci.c
@@ -16,6 +16,9 @@
 /* Config options set by the host. */
 u32 kvm_host_psci_version = PSCI_VERSION(0, 0);
 u32 kvm_host_psci_function_id[PSCI_FN_MAX];
+s64 hyp_physvirt_offset;
+
+#define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset)

 static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt)
 {


Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 07/24] kvm: arm64: Create nVHE copy of cpu_logical_map

2020-11-11 Thread Marc Zyngier

On 2020-11-11 13:03, David Brazdil wrote:

> +/*
> + * nVHE copy of data structures tracking available CPU cores.
> + * Only entries for CPUs that were online at KVM init are populated.
> + * Other CPUs should not be allowed to boot because their features were
> + * not checked against the finalized system capabilities.
> + */
> +u64 __ro_after_init __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1]
> = INVALID_HWID };

I'm not sure what __ro_after_init means once we get S2 isolation.


It is stretching the definition of 'init' a bit, I know, but I don't see what
your worry is about S2? The intention is to mark this read-only for .hyp.text
at runtime. With S2, the host won't be able to write to it after KVM init.
Obviously that's currently not the case.


More importantly, EL2 can write to it at any time, which is the bit I'm worried
about, as it makes the annotation misleading.

One thing we might change in the future is marking it RW for some initial
setup in a HVC handler, then marking it RO for the rest of uptime.


That'd be a desirable outcome, and it would be consistent with the rest
of the kernel.





> +
> +u64 cpu_logical_map(int cpu)

nit: is there any reason why "cpu" cannot be unsigned? The thought
of a negative CPU number makes me shiver...


Same here. That's how it's defined in kernel proper, so I went with that.


I'm happy to deviate from the kernel (give the function a different name
if this clashes with existing include files).

We can also fix the rest of the kernel (I've just written the trivial patch).




> +{
> +  if (cpu < 0 || cpu >= ARRAY_SIZE(__cpu_logical_map))
> +  hyp_panic();
> +
> +  return __cpu_logical_map[cpu];
> +}
> +
>  unsigned long __hyp_per_cpu_offset(unsigned int cpu)
>  {
>unsigned long *cpu_base_array;

Overall, this patch would make more sense closer it its use case
(in patch 19). I also don't understand why this lives in percpu.c...


I didn't think it called for adding another C file for this. How about we
rename this file to smp.c? Would that make sense for both?


Make that hyp-smp.c, please!

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 15/24] kvm: arm64: Bootstrap PSCI SMC handler in nVHE EL2

2020-11-11 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

Add a handler of PSCI SMCs in nVHE hyp code. The handler is initialized
with the version used by the host's PSCI driver and the function IDs it
was configured with. If the SMC function ID matches one of the
configured PSCI calls (for v0.1) or falls into the PSCI function ID
range (for v0.2+), the SMC is handled by the PSCI handler. For now, all
SMCs return PSCI_RET_NOT_SUPPORTED.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_hyp.h   |   4 ++
 arch/arm64/kvm/arm.c   |  13 
 arch/arm64/kvm/hyp/nvhe/Makefile   |   2 +-
 arch/arm64/kvm/hyp/nvhe/hyp-main.c |   4 ++
 arch/arm64/kvm/hyp/nvhe/psci.c | 102 +
 include/uapi/linux/psci.h  |   1 +
 6 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kvm/hyp/nvhe/psci.c

diff --git a/arch/arm64/include/asm/kvm_hyp.h 
b/arch/arm64/include/asm/kvm_hyp.h

index a3289071f3d8..95a2bbbcc7e1 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -96,6 +96,10 @@ void deactivate_traps_vhe_put(void);

 u64 __guest_enter(struct kvm_vcpu *vcpu);

+#ifdef __KVM_NVHE_HYPERVISOR__
+bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
+#endif
+
 void __noreturn hyp_panic(void);
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, 
u64 par);

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 1a57b6025937..28e3bc056225 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 

 #define CREATE_TRACE_POINTS
@@ -1498,6 +1499,17 @@ static void init_cpu_logical_map(void)
CHOOSE_NVHE_SYM(__cpu_logical_map)[cpu] = cpu_logical_map(cpu);
 }

+static void init_psci(void)


nit: init_psci_relay?


+{
+   extern u32 kvm_nvhe_sym(kvm_host_psci_version);
+   extern u32 kvm_nvhe_sym(kvm_host_psci_function_id)[PSCI_FN_MAX];
+   int i;
+
+   CHOOSE_NVHE_SYM(kvm_host_psci_version) = psci_driver_version();
+   for (i = 0; i < PSCI_FN_MAX; ++i)
+		CHOOSE_NVHE_SYM(kvm_host_psci_function_id)[i] = 
psci_get_function_id(i);

+}
+
 static int init_common_resources(void)
 {
return kvm_set_ipa_limit();
@@ -1677,6 +1689,7 @@ static int init_hyp_mode(void)
}

init_cpu_logical_map();
+   init_psci();

return 0;

diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile 
b/arch/arm64/kvm/hyp/nvhe/Makefile

index c45f440cce51..647b63337a51 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -7,7 +7,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__
 ccflags-y := -D__KVM_NVHE_HYPERVISOR__

 obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o 
host.o \

-hyp-main.o percpu.o
+hyp-main.o percpu.o psci.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o 
../entry.o \

 ../fpsimd.o ../hyp-entry.o

diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 8661bc7deaa9..69f34d4f2773 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -134,6 +134,10 @@ static void handle_host_smc(struct
kvm_cpu_context *host_ctxt)
 */
skip_host_instruction();

+   /* Try to handle host's PSCI SMCs. */
+   if (kvm_host_psci_handler(host_ctxt))
+   return;
+
/* Forward SMC not handled in EL2 to EL3. */
forward_host_smc(host_ctxt);
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/psci.c 
b/arch/arm64/kvm/hyp/nvhe/psci.c

new file mode 100644
index ..82d3b2c89658
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/psci.c


nit: can we please name this psci-relay.c, or psci-proxy.c?
We already have a psci.c in the tree, and having the same file name messes
with my editor... ;-)


@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: David Brazdil 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* Config options set by the host. */
+u32 kvm_host_psci_version = PSCI_VERSION(0, 0);
+u32 kvm_host_psci_function_id[PSCI_FN_MAX];
+
+static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt)
+{
+   return host_ctxt->regs.regs[0];
+}
+
+static bool is_psci_0_1_call(u64 func_id)
+{
+   unsigned int i;
+
+   for (i = 0; i < ARRAY_SIZE(kvm_host_psci_function_id); ++i) {
+   if (func_id == kvm_host_psci_function_id[i])
+   return true;
+   }
+   return false;
+}
+
+static bool is_psci_0_2_fn_call(u64 func_id)
+{
+   u64 base = func_id & ~PSCI_0_2_FN_ID_MASK;
+
+   return base == PSCI_0_2_FN_BASE || base == PSCI_0_2_FN64_BASE;


I couldn't spot in the spec where PSCI reserves 16 bits' worth of IDs in
each range.
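For context, the bases in question (values from include/uapi/linux/psci.h;
the mask name is the one used in the patch). The full 16-bit function-number
space above each base is the SMCCC standard service call range, not
something PSCI itself carves out — which is presumably what the check
is leaning on:

/* SMC32 and SMC64 standard service call ranges, as defined by SMCCC: */
#define PSCI_0_2_FN_BASE	0x84000000	/* 0x8400_0000 - 0x8400_ffff */
#define PSCI_0_2_FN64_BASE	0xc4000000	/* 0xc400_0000 - 0xc400_ffff */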


+}
+
+static bool is_psci_call(u64 func_id)
+{
+   if (kvm_host_psci_version == PSCI_VERSION(0, 0)

Re: [PATCH v1 06/24] kvm: arm64: Support per_cpu_ptr in nVHE hyp code

2020-11-11 Thread Marc Zyngier

On 2020-11-11 12:32, David Brazdil wrote:

> +
> +  cpu_base_array = kern_hyp_va(&kvm_arm_hyp_percpu_base[0]);

There is no guarantee that this will not generate a PC relative
addressing, resulting in kern_hyp_va() being applied twice.

Consider using hyp_symbol_addr() instead, which always does the right
by forcing a PC relative addressing and not subsequently mangling
the address.

> +  this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
> +  return this_cpu_base - (unsigned long)&__per_cpu_start;

And this is the opposite case: if the compiler generates an absolute
address, you're toast. Yes, this is just as unlikely, but hey...
Same remedy should apply.


Good point, and I'll probably keep forgetting about this in the future. Now
that all .hyp.text is only executed under hyp page tables, should we start
thinking about fixing up the relocations?


Why not, if you can deal with the hypervisor text being mapped at a random
location, and make sure that the kernel doesn't process the relocations
for you. This would certainly save us a lot of runtime offsetting (which
I'm adding to in a separate series).
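For reference, hyp_symbol_addr() as it stood in
arch/arm64/include/asm/kvm_asm.h around this time — the asm forces an
adrp/add pair, so the result is always PC-relative and never mangled:

#define hyp_symbol_addr(s)						\
	({								\
		typeof(s) *addr;					\
		asm("adrp	%0, %1\n"				\
		    "add	%0, %0, :lo12:%1\n"			\
		    : "=r" (addr) : "S" (&s));				\
		addr;							\
	})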

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 11/24] kvm: arm64: Add SMC handler in nVHE EL2

2020-11-11 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

Add handler of host SMCs in KVM nVHE trap handler. Forward all SMCs to
EL3 and propagate the result back to EL1. This is done in preparation
for validating host SMCs in KVM nVHE protected mode.

Signed-off-by: David Brazdil 
---
 arch/arm64/kvm/hyp/nvhe/hyp-main.c | 35 ++
 1 file changed, 35 insertions(+)

diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 19332c20fcde..8661bc7deaa9 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -106,6 +106,38 @@ static void handle_host_hcall(struct
kvm_cpu_context *host_ctxt)
host_ctxt->regs.regs[1] = ret;
 }

+static void skip_host_instruction(void)
+{
+   write_sysreg_el2(read_sysreg_el2(SYS_ELR) + 4, SYS_ELR);
+}
+
+static void forward_host_smc(struct kvm_cpu_context *host_ctxt)
+{
+   struct arm_smccc_res res;
+
+   arm_smccc_1_1_smc(host_ctxt->regs.regs[0], host_ctxt->regs.regs[1],
+ host_ctxt->regs.regs[2], host_ctxt->regs.regs[3],
+ host_ctxt->regs.regs[4], host_ctxt->regs.regs[5],
+ host_ctxt->regs.regs[6], host_ctxt->regs.regs[7],
+ &res);


How do you make sure that EL3 actually supports SMCCC 1.1? If that's not
the case, don't we risk additional registers being corrupted?

What of SMCCC 1.2 calls that extend arguments to up to x17?


+   host_ctxt->regs.regs[0] = res.a0;
+   host_ctxt->regs.regs[1] = res.a1;
+   host_ctxt->regs.regs[2] = res.a2;
+   host_ctxt->regs.regs[3] = res.a3;
+}
+
+static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
+{
+   /*
+* Unlike HVC, the return address of an SMC is the instruction's PC.
+* Move the return address past the instruction.
+*/
+   skip_host_instruction();
+
+   /* Forward SMC not handled in EL2 to EL3. */
+   forward_host_smc(host_ctxt);


nit: In general, we update the PC *after* emulating the instruction that
trapped. Not a big deal, but it makes it easier to reason across the
code base.
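I.e. something like (untested, reusing the helpers from the patch above):

static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
{
	/* Forward SMC not handled in EL2 to EL3. */
	forward_host_smc(host_ctxt);

	/*
	 * Unlike HVC, the return address of an SMC is the instruction's
	 * PC; only skip it once the instruction has been "emulated".
	 */
	skip_host_instruction();
}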


+}
+
 void handle_trap(struct kvm_cpu_context *host_ctxt)
 {
u64 esr = read_sysreg_el2(SYS_ESR);
@@ -114,6 +146,9 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
case ESR_ELx_EC_HVC64:
handle_host_hcall(host_ctxt);
break;
+   case ESR_ELx_EC_SMC64:
+   handle_host_smc(host_ctxt);
+   break;
default:
hyp_panic();
}


Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 13/24] kvm: arm64: Add CPU entry point in nVHE hyp

2020-11-11 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

When nVHE hyp starts intercepting the host's PSCI CPU_ON SMCs, it will need
to install KVM on the newly booted CPU before returning to the host. Add
an entry point which expects the same kvm_nvhe_init_params struct as the
__kvm_hyp_init HVC in the CPU_ON context argument (x0).

The entry point initializes EL2 state with the same init_el2_state macro
used by the kernel's entry point. It then initializes KVM using the same
helper function used in the __kvm_hyp_init HVC.

When done, the entry point branches to a function provided in the init
params.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_asm.h   |  1 +
 arch/arm64/kernel/asm-offsets.c|  1 +
 arch/arm64/kvm/hyp/nvhe/hyp-init.S | 30 ++
 3 files changed, 32 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_asm.h 
b/arch/arm64/include/asm/kvm_asm.h

index 893327d1e449..efb4872bb29f 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -155,6 +155,7 @@ struct kvm_nvhe_init_params {
unsigned long tpidr_el2;
unsigned long hyp_stack_ptr;
unsigned long vector_ptr;
+   unsigned long psci_cpu_entry_fn;
 };

 /* Translate a kernel address @ptr into its equivalent linear mapping 
*/
diff --git a/arch/arm64/kernel/asm-offsets.c 
b/arch/arm64/kernel/asm-offsets.c

index 0cbb86135c7c..ffc84e68ad97 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -114,6 +114,7 @@ int main(void)
   DEFINE(NVHE_INIT_TPIDR_EL2,  offsetof(struct kvm_nvhe_init_params,
tpidr_el2));
   DEFINE(NVHE_INIT_STACK_PTR,  offsetof(struct kvm_nvhe_init_params,
hyp_stack_ptr));
   DEFINE(NVHE_INIT_VECTOR_PTR, offsetof(struct kvm_nvhe_init_params,
vector_ptr));
+  DEFINE(NVHE_INIT_PSCI_CPU_ENTRY_FN,  offsetof(struct
kvm_nvhe_init_params, psci_cpu_entry_fn));
 #endif
 #ifdef CONFIG_CPU_PM
   DEFINE(CPU_CTX_SP,   offsetof(struct cpu_suspend_ctx, sp));
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 1697d25756e9..f999a35b2c8c 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -6,6 +6,7 @@

 #include 
 #include 
+#include 


This should probably be included from the file that provides init_el2_state.




 #include 
 #include 
@@ -159,6 +160,35 @@ alternative_else_nop_endif
ret
 SYM_CODE_END(___kvm_hyp_init)

+SYM_CODE_START(__kvm_hyp_cpu_entry)
+   msr SPsel, #1   // We want to use SP_EL{1,2}
+
+	/*
+	 * Check that the core was booted in EL2. Loop indefinitely if not
+	 * because it cannot be safely given to the host without installing KVM.
+	 */
+	mrs	x1, CurrentEL
+	cmp	x1, #CurrentEL_EL2
+	b.ne	.


This is a bit brutal. Consider using a WFE/WFI loop as we have in other
places already (see __secondary_too_slow for example).
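
i.e. something along these lines (sketch):

	mrs	x1, CurrentEL
	cmp	x1, #CurrentEL_EL2
	b.eq	2f
1:	wfe
	wfi
	b	1b
2: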


+
+   /* Initialize EL2 CPU state to sane values. */
+   mov x29, x0
+   init_el2_state nvhe
+   mov x0, x29
+
+	/*
+	 * Load hyp VA of C entry function. Must do so before switching on the
+	 * MMU because the struct pointer is PA and not identity-mapped in hyp.
+	 */
+   ldr x29, [x0, #NVHE_INIT_PSCI_CPU_ENTRY_FN]
+
+   /* Enable MMU, set vectors and stack. */
+   bl  ___kvm_hyp_init
+
+   /* Leave idmap. */
+   br  x29


To a point I made against an earlier patch: psci_cpu_entry_fn seems to
be a HYP VA, and really needs to be documented as such, because this is
pretty hard to follow otherwise.


+SYM_CODE_END(__kvm_hyp_cpu_entry)
+
 SYM_CODE_START(__kvm_handle_stub_hvc)
cmp x0, #HVC_SOFT_RESTART
	b.ne	1f


Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH] arm64: tegra186: Add missing CPU PMUs

2020-11-10 Thread Marc Zyngier

On 2020-11-10 18:22, Thierry Reding wrote:

On Tue, Nov 10, 2020 at 06:08:31PM +, Marc Zyngier wrote:

On 2020-11-10 17:36, Thierry Reding wrote:
> On Tue, Oct 13, 2020 at 10:58:51AM +0100, Marc Zyngier wrote:
> > Add the description of CPU PMUs for both the Denver and A57 clusters,
> > which enables the perf subsystem.
> >
> > Signed-off-by: Marc Zyngier 

[...]

> >
> > + pmu_denver {
> > + compatible = "nvidia,denver-pmu", "arm,armv8-pmuv3";
>
> checkpatch complains that this isn't documented. Did I miss the DT
> bindings patch or do we not have one for this?

We don't. But I don't think adding a compatible string for each
and every micro-architecture makes much sense unless we have something
useful to add to that compatible string. Such as a full description
of the implementation specific events.


I'm wondering if this isn't going to upset Rob's json-schema bot and
make him mad.


Rob going mad? Never! ;-)

If you *really* want it, I'll respin this patch with the Denver
compatible added to Documentation/devicetree/bindings/arm/pmu.yaml.

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH] arm64: tegra186: Add missing CPU PMUs

2020-11-10 Thread Marc Zyngier

On 2020-11-10 17:36, Thierry Reding wrote:

On Tue, Oct 13, 2020 at 10:58:51AM +0100, Marc Zyngier wrote:

Add the description of CPU PMUs for both the Denver and A57 clusters,
which enables the perf subsystem.

Signed-off-by: Marc Zyngier 


[...]



+   pmu_denver {
+   compatible = "nvidia,denver-pmu", "arm,armv8-pmuv3";


checkpatch complains that this isn't documented. Did I miss the DT
bindings patch or do we not have one for this?


We don't. But I don't think adding a compatible string for each
and every micro-architecture makes much sense unless we have something
useful to add to that compatible string. Such as a full description
of the implementation specific events.

Thanks,

M.


Re: [PATCH v1 10/24] kvm: arm64: Extract parts of el2_setup into a macro

2020-11-10 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

When a CPU is booted in EL2, the kernel checks for VHE support and
initializes the CPU core accordingly. For nVHE it also installs the
stub vectors and drops down to EL1.

Once KVM gains the ability to boot cores without going through the
kernel entry point, it will need to initialize the CPU the same way.
Extract the relevant bits of el2_setup into init_el2_state macro
with an argument specifying whether to initialize for VHE or nVHE.

No functional change. Size of el2_setup increased by 148 bytes due
to duplication.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_asm.h | 128 
 arch/arm64/kernel/head.S | 140 +++
 2 files changed, 141 insertions(+), 127 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index a49a87a186c3..893327d1e449 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -331,6 +331,134 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
msr sp_el0, \tmp
 .endm

+.macro init_el2_state mode
+
+.ifnes "\mode", "vhe"
+.ifnes "\mode", "nvhe"
+.error "Invalid 'mode' argument"
+.endif
+.endif
+
+   mov_q   x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
+   msr sctlr_el2, x0
+   isb
+
+	/*
+	 * Allow Non-secure EL1 and EL0 to access physical timer and counter.
+	 * This is not necessary for VHE, since the host kernel runs in EL2,
+	 * and EL0 accesses are configured in the later stage of boot process.
+	 * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout
+	 * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined
+	 * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1
+	 * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in
+	 * EL2.
+	 */
+.ifeqs "\mode", "nvhe"
+   mrs x0, cnthctl_el2
+   orr x0, x0, #3  // Enable EL1 physical timers
+   msr cnthctl_el2, x0
+.endif
+   msr cntvoff_el2, xzr// Clear virtual offset
+
+#ifdef CONFIG_ARM_GIC_V3
+   /* GICv3 system register access */
+   mrs x0, id_aa64pfr0_el1
+	ubfx	x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4
+   cbz x0, 3f
+
+   mrs_s   x0, SYS_ICC_SRE_EL2
+   orr x0, x0, #ICC_SRE_EL2_SRE// Set ICC_SRE_EL2.SRE==1
+   orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1
+   msr_s   SYS_ICC_SRE_EL2, x0
+   isb // Make sure SRE is now set
+   mrs_s   x0, SYS_ICC_SRE_EL2 // Read SRE back,
+   tbz x0, #0, 3f  // and check that it sticks
+   msr_s   SYS_ICH_HCR_EL2, xzr// Reset ICC_HCR_EL2 to defaults
+3:
+#endif
+
+   /* Populate ID registers. */
+   mrs x0, midr_el1
+   mrs x1, mpidr_el1
+   msr vpidr_el2, x0
+   msr vmpidr_el2, x1


I don't think this has any effect on VHE, and could be lumped
together with the nVHE code.


+
+#ifdef CONFIG_COMPAT
+   msr hstr_el2, xzr   // Disable CP15 traps to EL2
+#endif
+
+   /* EL2 debug */
+	mrs	x1, id_aa64dfr0_el1
+	sbfx	x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4
+	cmp	x0, #1
+	b.lt	4f				// Skip if no PMU present
+	mrs	x0, pmcr_el0			// Disable debug access traps
+	ubfx	x0, x0, #11, #5			// to EL2 and allow access to
+4:
+	csel	x3, xzr, x0, lt			// all PMU counters from EL1
+
+	/* Statistical profiling */
+	ubfx	x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
+	cbz	x0, 7f				// Skip if SPE not present
+.ifeqs "\mode", "nvhe"
+	mrs_s	x4, SYS_PMBIDR_EL1		// If SPE available at EL2,
+	and	x4, x4, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
+	cbnz	x4, 5f				// then permit sampling of physical
+	mov	x4, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \
+		      1 << SYS_PMSCR_EL2_PA_SHIFT)
+	msr_s	SYS_PMSCR_EL2, x4		// addresses and physical counter
+5:
+   mov x1, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
+   orr x3, x3, x1  // If we don't have VHE, then
+   b   7f  // use EL1&0 translation.
+.endif
+   orr x3, x3, #MDCR_EL2_TPMS  // and disable access from EL1


This orr would probably be better placed in an "else" clause next to the
previous macro. And since you are making things "modular", why not
go the extra mile, and define macros for each functionality that
differs between modes? It would certainly make this change more
digestible, and the result more readable.
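
e.g. one helper per feature, specialised on the mode, along these lines
(a sketch reusing the timer setup above):

.macro __init_el2_timers mode
.ifeqs	"\mode", "nvhe"
	mrs	x0, cnthctl_el2
	orr	x0, x0, #3		// Enable EL1 physical timers
	msr	cnthctl_el2, x0
.endif
	msr	cntvoff_el2, xzr	// Clear virtual offset
.endm

.macro init_el2_state mode
	__init_el2_timers	\mode
	__init_el2_debug	\mode
	// ... same pattern for sctlr, GIC, ID regs and SPE
.endm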


+7:
+   msr mdcr_el2, x3// Configure debug traps
+
+  

Re: [PATCH v1 08/24] kvm: arm64: Move hyp-init params to a per-CPU struct

2020-11-10 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

Once we start initializing KVM on newly booted cores before the rest of
the kernel, parameters to __do_hyp_init will need to be provided by EL2
rather than EL1. At that point it will not be possible to pass its four
arguments directly because PSCI_CPU_ON only supports one context
argument.

Refactor __do_hyp_init to accept its parameters in a struct. This
prepares the code for KVM booting cores as well as removes any limits
on the number of __do_hyp_init arguments.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_asm.h   |  7 +++
 arch/arm64/include/asm/kvm_hyp.h   |  4 
 arch/arm64/kernel/asm-offsets.c|  4 
 arch/arm64/kvm/arm.c   | 26 ++
 arch/arm64/kvm/hyp/nvhe/hyp-init.S | 21 ++---
 arch/arm64/kvm/hyp/nvhe/hyp-main.c |  2 ++
 6 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 54387ccd1ab2..a49a87a186c3 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -150,6 +150,13 @@ extern void *__vhe_undefined_symbol;

 #endif

+struct kvm_nvhe_init_params {
+   phys_addr_t pgd_ptr;
+   unsigned long tpidr_el2;
+   unsigned long hyp_stack_ptr;
+   unsigned long vector_ptr;
+};


Please add some documentation here, specially indicating what address
space the values are relative to.
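
e.g. something like this (a sketch; the address-space annotations are
my reading of cpu_init_hyp_mode() below and need checking):

/* Parameters consumed by __do_hyp_init, one copy per CPU. */
struct kvm_nvhe_init_params {
	phys_addr_t pgd_ptr;		/* PA of the hyp page tables */
	unsigned long tpidr_el2;	/* per-CPU offset for hyp per-CPU vars */
	unsigned long hyp_stack_ptr;	/* hyp VA of the stack top */
	unsigned long vector_ptr;	/* hyp VA of the host vector table */
};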


+
 /* Translate a kernel address @ptr into its equivalent linear mapping */

 #define kvm_ksym_ref(ptr)  \
({  \
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 6b664de5ec1f..a3289071f3d8 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -15,6 +15,10 @@
 DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
 DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);

+#ifdef __KVM_NVHE_HYPERVISOR__
+DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
+#endif
+
 #define read_sysreg_elx(r,nvh,vh)  \
({  \
u64 reg;\
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 7d32fc959b1a..0cbb86135c7c 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -110,6 +110,10 @@ int main(void)
   DEFINE(CPU_APGAKEYLO_EL1,	offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1]));
   DEFINE(HOST_CONTEXT_VCPU,	offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
   DEFINE(HOST_DATA_CONTEXT,	offsetof(struct kvm_host_data, host_ctxt));
+  DEFINE(NVHE_INIT_PGD_PTR,	offsetof(struct kvm_nvhe_init_params, pgd_ptr));
+  DEFINE(NVHE_INIT_TPIDR_EL2,	offsetof(struct kvm_nvhe_init_params, tpidr_el2));
+  DEFINE(NVHE_INIT_STACK_PTR,	offsetof(struct kvm_nvhe_init_params, hyp_stack_ptr));
+  DEFINE(NVHE_INIT_VECTOR_PTR,	offsetof(struct kvm_nvhe_init_params, vector_ptr));
 #endif
 #ifdef CONFIG_CPU_PM
   DEFINE(CPU_CTX_SP,   offsetof(struct cpu_suspend_ctx, sp));
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b85b4294b72d..1a57b6025937 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -50,6 +50,7 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
 unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
+DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);


 /* The VMID used in the VTTBR */
 static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
@@ -1331,10 +1332,7 @@ static int kvm_map_vectors(void)

 static void cpu_init_hyp_mode(void)
 {
-   phys_addr_t pgd_ptr;
-   unsigned long hyp_stack_ptr;
-   unsigned long vector_ptr;
-   unsigned long tpidr_el2;
+	struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params);

struct arm_smccc_res res;

/* Switch from the HYP stub to our own HYP init vector */
@@ -1345,13 +1343,18 @@ static void cpu_init_hyp_mode(void)
 * kernel's mapping to the linear mapping, and store it in tpidr_el2
 * so that we can use adr_l to access per-cpu variables in EL2.
 */
-	tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
-		    (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
+	params->tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
+			    (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));

-   pgd_ptr = kvm_mmu_get_httbr();
-   hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
-   hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
-	vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));

+   para

Re: [PATCH v1 07/24] kvm: arm64: Create nVHE copy of cpu_logical_map

2020-11-10 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

When KVM starts validating host's PSCI requests, it will need to map
MPIDR back to the CPU ID. To this end, copy cpu_logical_map into nVHE
hyp memory when KVM is initialized.

Only copy the information for CPUs that are online at the point of KVM
initialization so that KVM rejects CPUs whose features were not checked
against the finalized capabilities.

Signed-off-by: David Brazdil 
---
 arch/arm64/kvm/arm.c | 17 +
 arch/arm64/kvm/hyp/nvhe/percpu.c | 16 
 2 files changed, 33 insertions(+)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 9ba9db2aa7f8..b85b4294b72d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1481,6 +1481,21 @@ static inline void hyp_cpu_pm_exit(void)
 }
 #endif

+static void init_cpu_logical_map(void)
+{
+   extern u64 kvm_nvhe_sym(__cpu_logical_map)[NR_CPUS];
+   int cpu;
+
+   /*
+* Copy the MPIDR <-> logical CPU ID mapping to hyp.
+	 * Only copy the set of online CPUs whose features have been checked
+* against the finalized system capabilities. The hypervisor will not
+* allow any other CPUs from the `possible` set to boot.
+*/
+   for_each_online_cpu(cpu)
+   CHOOSE_NVHE_SYM(__cpu_logical_map)[cpu] = cpu_logical_map(cpu);
+}
+
 static int init_common_resources(void)
 {
return kvm_set_ipa_limit();
@@ -1659,6 +1674,8 @@ static int init_hyp_mode(void)
}
}

+   init_cpu_logical_map();
+
return 0;

 out_err:
diff --git a/arch/arm64/kvm/hyp/nvhe/percpu.c b/arch/arm64/kvm/hyp/nvhe/percpu.c
index 5fd0c5696907..d0b9dbc2df45 100644
--- a/arch/arm64/kvm/hyp/nvhe/percpu.c
+++ b/arch/arm64/kvm/hyp/nvhe/percpu.c
@@ -8,6 +8,22 @@
 #include 
 #include 

+/*
+ * nVHE copy of data structures tracking available CPU cores.
+ * Only entries for CPUs that were online at KVM init are populated.
+ * Other CPUs should not be allowed to boot because their features were
+ * not checked against the finalized system capabilities.
+ */
+u64 __ro_after_init __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };


I'm not sure what __ro_after_init means once we get S2 isolation.


+
+u64 cpu_logical_map(int cpu)


nit: is there any reason why "cpu" cannot be unsigned? The thought
of a negative CPU number makes me shiver...


+{
+   if (cpu < 0 || cpu >= ARRAY_SIZE(__cpu_logical_map))
+   hyp_panic();
+
+   return __cpu_logical_map[cpu];
+}
+
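
With an unsigned parameter the lower bound check also goes away
(sketch):

u64 cpu_logical_map(unsigned int cpu)
{
	if (cpu >= ARRAY_SIZE(__cpu_logical_map))
		hyp_panic();

	return __cpu_logical_map[cpu];
}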
 unsigned long __hyp_per_cpu_offset(unsigned int cpu)
 {
unsigned long *cpu_base_array;


Overall, this patch would make more sense closer it its use case
(in patch 19). I also don't understand why this lives in percpu.c...

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 06/24] kvm: arm64: Support per_cpu_ptr in nVHE hyp code

2020-11-10 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:
When compiling with __KVM_NVHE_HYPERVISOR__ redefine per_cpu_offset() to
__hyp_per_cpu_offset() which looks up the base of the nVHE per-CPU
region of the given cpu and computes its offset from the
.hyp.data..percpu section.

This enables use of per_cpu_ptr() helpers in nVHE hyp code. Until now
only this_cpu_ptr() was supported by setting TPIDR_EL2.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/percpu.h  |  6 ++
 arch/arm64/kernel/image-vars.h   |  3 +++
 arch/arm64/kvm/hyp/nvhe/Makefile |  3 ++-
 arch/arm64/kvm/hyp/nvhe/percpu.c | 22 ++
 4 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kvm/hyp/nvhe/percpu.c

diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 1599e17379d8..8f1661603b78 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -239,6 +239,12 @@ PERCPU_RET_OP(add, add, ldadd)
 #define this_cpu_cmpxchg_8(pcp, o, n)  \
_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)

+#ifdef __KVM_NVHE_HYPERVISOR__
+extern unsigned long __hyp_per_cpu_offset(unsigned int cpu);
+#define __per_cpu_offset
+#define per_cpu_offset(cpu)	__hyp_per_cpu_offset((cpu))
+#endif
+
 #include 

 /* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index c615b285ff5b..78a42a7cdb72 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -103,6 +103,9 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities);
 KVM_NVHE_ALIAS(__start___kvm_ex_table);
 KVM_NVHE_ALIAS(__stop___kvm_ex_table);

+/* Array containing bases of nVHE per-CPU memory regions. */
+KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
+
 #endif /* CONFIG_KVM */

 #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index ddde15fe85f2..c45f440cce51 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -6,7 +6,8 @@
 asflags-y := -D__KVM_NVHE_HYPERVISOR__
 ccflags-y := -D__KVM_NVHE_HYPERVISOR__

-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o
+obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
+	 hyp-main.o percpu.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
	  ../fpsimd.o ../hyp-entry.o

diff --git a/arch/arm64/kvm/hyp/nvhe/percpu.c b/arch/arm64/kvm/hyp/nvhe/percpu.c
new file mode 100644
index ..5fd0c5696907
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/percpu.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: David Brazdil 
+ */
+
+#include 
+#include 
+#include 
+
+unsigned long __hyp_per_cpu_offset(unsigned int cpu)
+{
+   unsigned long *cpu_base_array;
+   unsigned long this_cpu_base;
+
+   if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base))
+   hyp_panic();
+
+   cpu_base_array = kern_hyp_va(&kvm_arm_hyp_percpu_base[0]);


There is no guarantee that this will not generate PC-relative
addressing, resulting in kern_hyp_va() being applied twice.

Consider using hyp_symbol_addr() instead, which always does the right
thing by forcing PC-relative addressing and not subsequently mangling
the address.


+   this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
+   return this_cpu_base - (unsigned long)&__per_cpu_start;


And this is the opposite case: if the compiler generates an absolute
address, you're toast. Yes, this is just as unlikely, but hey...
Same remedy should apply.
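
i.e. something like (sketch):

	/* hyp_symbol_addr() forces PC-relative addressing, yielding hyp VAs */
	cpu_base_array = (unsigned long *)hyp_symbol_addr(kvm_arm_hyp_percpu_base);
	this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);	/* entries are kernel VAs */
	return this_cpu_base - (unsigned long)hyp_symbol_addr(__per_cpu_start);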

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 03/24] arm64: Move MAIR_EL1_SET to asm/memory.h

2020-11-10 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

KVM currently initializes MAIR_EL2 to the value of MAIR_EL1. In
preparation for initializing MAIR_EL2 before MAIR_EL1, move the
constant into a shared header file.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/memory.h | 13 +
 arch/arm64/mm/proc.S| 13 -
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index cd61239bae8c..aca00737e771 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -152,6 +152,19 @@
 #define MT_S2_FWB_NORMAL   6
 #define MT_S2_FWB_DEVICE_nGnRE 1

+/*
+ * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and
+ * changed during __cpu_setup to Normal Tagged if the system supports MTE.
+ */
+#define MAIR_EL1_SET   \


If we are going to use this at EL2 directly, consider renaming it to
MAIR_ELx_SET, as we do for other constants that are shared across
exception levels.


+   (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) |  \


This creates an implicit dependency between sysreg.h and memory.h.
Consider including asm/sysreg.h, assuming this doesn't create any
circular dependency, or even moving it to sysreg.h altogether.


+MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) |\
+MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) |\
+MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) |  \
+MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |\
+MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) |  \
+MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED))
+
 #ifdef CONFIG_ARM64_4K_PAGES
 #define IOREMAP_MAX_ORDER  (PUD_SHIFT)
 #else
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 23c326a06b2d..25ff21b3a1c6 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -45,19 +45,6 @@
 #define TCR_KASAN_FLAGS 0
 #endif

-/*
- * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and
- * changed during __cpu_setup to Normal Tagged if the system supports MTE.
- */
-#define MAIR_EL1_SET   \
-   (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) |  \
-MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) |\
-MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) |\
-MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) |  \
-MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |\
-MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) |  \
-MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED))
-
 #ifdef CONFIG_CPU_PM
 /**
  * cpu_do_suspend - save CPU registers context


Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 01/24] psci: Accessor for configured PSCI version

2020-11-10 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

The version of PSCI that the kernel should use to communicate with
firmware is typically obtained from probing PSCI_VERSION. However, that
doesn't work for PSCI v0.1 where the host gets the information from
DT/ACPI, or if PSCI is not supported / was disabled.

KVM's host PSCI proxy needs to be configured with the same version
used by the host driver. Expose the PSCI version used by the host
with a read-only accessor.

Signed-off-by: David Brazdil 
---
 drivers/firmware/psci/psci.c | 11 +++
 include/linux/psci.h |  8 
 2 files changed, 19 insertions(+)

diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index 00af99b6f97c..bc1b2d60fdbf 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -49,6 +49,13 @@ static int resident_cpu = -1;
 struct psci_operations psci_ops;
 static enum arm_smccc_conduit psci_conduit = SMCCC_CONDUIT_NONE;

+static int driver_version = PSCI_VERSION(0, 0);
+
+int psci_driver_version(void)
+{
+   return driver_version;
+}
+
 bool psci_tos_resident_on(int cpu)
 {
return cpu == resident_cpu;
@@ -461,6 +468,8 @@ static int __init psci_probe(void)
return -EINVAL;
}

+   driver_version = ver;
+
psci_0_2_set_functions();

psci_init_migrate();
@@ -514,6 +523,8 @@ static int __init psci_0_1_init(struct device_node *np)

pr_info("Using PSCI v0.1 Function IDs from DT\n");

+   driver_version = PSCI_VERSION(0, 1);
+
if (!of_property_read_u32(np, "cpu_suspend", &id)) {
psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
psci_ops.cpu_suspend = psci_cpu_suspend;
diff --git a/include/linux/psci.h b/include/linux/psci.h
index 2a1bfb890e58..5b5dcf176aa6 100644
--- a/include/linux/psci.h
+++ b/include/linux/psci.h
@@ -21,6 +21,14 @@ bool psci_power_state_is_valid(u32 state);
 int psci_set_osi_mode(bool enable);
 bool psci_has_osi_support(void);

+/**
+ * The version of the PSCI specification followed by the driver.
+ * This is equivalent to calling PSCI_VERSION except:
+ *   (a) it also works for PSCI v0.1, which does not support PSCI_VERSION, and
+ *   (b) it is set to v0.0 if the PSCI driver was not initialized.
+ */
+int psci_driver_version(void);
+
 struct psci_operations {
u32 (*get_version)(void);
int (*cpu_suspend)(u32 state, unsigned long entry_point);


I still maintain that populating .get_version in all cases instead of
duplicating an existing functionality is a better outcome. PSCI not
supported would be implied by .get_version being NULL.

What breaks?
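
e.g. (sketch):

/* Sketch: let the v0.1 path report its version via the existing hook */
static u32 psci_0_1_get_version(void)
{
	return PSCI_VERSION(0, 1);
}

With psci_0_1_init() setting psci_ops.get_version to it, every caller
can use psci_ops.get_version(), and a NULL hook means no PSCI.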

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v2 3/5] gpio: msc313: MStar MSC313 GPIO driver

2020-11-10 Thread Marc Zyngier

On 2020-11-10 14:02, Linus Walleij wrote:

On Thu, Nov 5, 2020 at 4:43 PM Marc Zyngier  wrote:

On 2020-11-05 15:23, Daniel Palmer wrote:
> On Thu, 5 Nov 2020 at 21:08, Marc Zyngier  wrote:



> >  I see that msc313_gpio_irqchip doesn't have a
>> .irq_set_affinity callback. Is this system UP only?
>
> What is in mainline right now is UP only but there are chips with a
> second cortex A7 that I have working in my tree.
> So I will add that in for v3 if I can work out what I should actually
> do there. :)

Probably nothing more than setting the callback to
irq_chip_set_affinity_parent,


Hm, is this something all GPIO irqchips used on SMP systems
should be doing? Or just hierarchical ones?


Probably only the hierarchical ones. I'd expect the non-hierarchical
GPIOs to be muxed behind a single interrupt, which makes it impossible
to move a single GPIO around, and moving the mux interrupt would break
userspace's expectations that interrupts move independently of each
other.
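
For the hierarchical case, that's typically just (sketch; the other
callbacks are assumed to be what the driver already wires up):

static struct irq_chip msc313_gpio_irqchip = {
	.name			= "GPIO",
	.irq_mask		= irq_chip_mask_parent,
	.irq_unmask		= irq_chip_unmask_parent,
	.irq_set_type		= irq_chip_set_type_parent,
	/* Let the parent interrupt controller handle SMP affinity */
	.irq_set_affinity	= irq_chip_set_affinity_parent,
};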


Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 00/24] Opt-in always-on nVHE hypervisor

2020-11-10 Thread Marc Zyngier

On 2020-11-10 10:15, Christoph Hellwig wrote:

On Mon, Nov 09, 2020 at 11:32:09AM +, David Brazdil wrote:

As we progress towards being able to keep guest state private to the
host running nVHE hypervisor, this series allows the hypervisor to
install itself on newly booted CPUs before the host is allowed to run
on them.


Why?  I thought we were trying to kill nVHE off now that newer CPUs
provide the saner virtualization extensions?


We can't kill nVHE at all, because that is the only game in town.
You can't even buy a decent machine with VHE, no matter how much money
you put on the table.

nVHE is here for the foreseeable future, and we even use its misfeatures
to our advantage in order to offer confidential VMs. See Will's
presentation at KVM Forum a couple of weeks ago for the gory details.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 23/24] kvm: arm64: Trap host SMCs in protected mode.

2020-11-10 Thread Marc Zyngier

On 2020-11-09 11:32, David Brazdil wrote:

While protected nVHE KVM is installed, start trapping all host SMCs.
By default, these are simply forwarded to EL3, but PSCI SMCs are
validated first.

Create new constant HCR_HOST_NVHE_PROTECTED_FLAGS with the new set of
HCR flags to use while the nVHE vector is installed when the kernel was
booted with the protected flag enabled. Switch back to the default HCR
flags when switching back to the stub vector.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_arm.h   |  1 +
 arch/arm64/kernel/image-vars.h |  4 
 arch/arm64/kvm/arm.c   | 35 ++
 arch/arm64/kvm/hyp/nvhe/hyp-init.S |  8 +++
 arch/arm64/kvm/hyp/nvhe/switch.c   |  5 -
 5 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 64ce29378467..4e90c2debf70 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -80,6 +80,7 @@
 HCR_FMO | HCR_IMO | HCR_PTW )
 #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
 #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
+#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
 #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)

 /* TCR_EL2 Registers bits */
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 78a42a7cdb72..75cda51674f4 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -62,9 +62,13 @@ __efistub__ctype = _ctype;
  */

 /* Alternative callbacks for init-time patching of nVHE hyp code. */
+KVM_NVHE_ALIAS(kvm_patch_hcr_flags);
 KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);

+/* Static key enabled when the user opted into nVHE protected mode. */
+KVM_NVHE_ALIAS(kvm_protected_mode);
+
 /* Global kernel state accessed by nVHE hyp code. */
 KVM_NVHE_ALIAS(kvm_vgic_global_state);

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 574aa2d026e6..c09b95cfa00a 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1861,6 +1861,41 @@ void kvm_arch_exit(void)
kvm_perf_teardown();
 }

+static inline u32 __init __gen_mov_hcr_insn(u64 hcr, u32 rd, int i)
+{
+   int shift = 48 - (i * 16);
+   u16 imm = (hcr >> shift) & GENMASK(16, 0);


I really doubt you want to encode 17 bits.
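
movz/movk immediates are 16 bits wide, so presumably (sketch):

	u16 imm = (hcr >> shift) & GENMASK(15, 0);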


+
+   return aarch64_insn_gen_movewide(rd, imm, shift,
+AARCH64_INSN_VARIANT_64BIT,
+(i == 0) ? AARCH64_INSN_MOVEWIDE_ZERO
+ : AARCH64_INSN_MOVEWIDE_KEEP);
+}


I've added a generate_mov_q() helper as part of my host EL2 entry
rework. We can probably share some stuff here.


+
+void __init kvm_patch_hcr_flags(struct alt_instr *alt,
+   __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+   int i;
+   u32 rd;
+
+   BUG_ON(nr_inst != 4);
+
+   /* Skip for VHE and unprotected nVHE modes. */
+   if (!is_kvm_protected_mode())
+   return;
+
+   rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD,
+ le32_to_cpu(origptr[0]));
+
+   for (i = 0; i < nr_inst; i++) {
+   u32 oinsn = __gen_mov_hcr_insn(HCR_HOST_NVHE_FLAGS, rd, i);
+		u32 insn = __gen_mov_hcr_insn(HCR_HOST_NVHE_PROTECTED_FLAGS, rd, i);
+
+   BUG_ON(oinsn != le32_to_cpu(origptr[i]));
+   updptr[i] = cpu_to_le32(insn);
+   }
+}
+
 static int __init early_kvm_protected_cfg(char *buf)
 {
bool val;
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index f999a35b2c8c..bbe6c5f558e0 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -88,6 +88,12 @@ SYM_CODE_END(__kvm_hyp_init)
  * x0: struct kvm_nvhe_init_params PA
  */
 SYM_CODE_START(___kvm_hyp_init)
+alternative_cb kvm_patch_hcr_flags
+   mov_q   x1, HCR_HOST_NVHE_FLAGS


You really want to be careful here: the mov_q macro expands to 2, 3 or 4
instructions, depending on the input data...

It is also odd that you have both a static key and a patching
alternative. Why isn't "protected KVM" a capability that can be
evaluated as a non-patching alternative? In general, I'd like to
reserve patching alternatives to values that cannot be evaluated at
compile time (VM offsets, for example).



+alternative_cb_end
+   msr hcr_el2, x1
+   isb
+
ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
msr tpidr_el2, x1

@@ -220,6 +226,8 @@ reset:
bic x5, x5, x6  // Clear SCTL_M and etc
pre_disable_mmu_workaround
msr sctlr_el2, x5
+   mov_q   x5, HCR_HOST_NVHE_FLAGS
+   msr hcr_el2, x5
isb

/* Install stub vectors */
diff --git a/arch/arm64/kvm/hyp/nvhe

Re: [PATCH] regmap: Properly free allocated name for regmap_config of syscon

2020-11-10 Thread Marc Zyngier

On 2020-11-10 01:35, Kefeng Wang wrote:

On 2020/11/10 1:23, Mark Brown wrote:

On Mon, Nov 09, 2020 at 07:58:16PM +0800, Kefeng Wang wrote:

syscon_config.name in of_syscon_register is allocated using kasprintf,
which should be freed when it is not used after regmap_set_name; fix
the following memory leak.
unreferenced object 0xffe07fe8c150 (size 16):
   comm "swapper/0", pid 1, jiffies 4294892540 (age 68.168s)
   hex dump (first 16 bytes):
 74 65 73 74 40 31 30 30 30 30 30 00 e0 ff ff ff  test@10.

   backtrace:
 [<23d86736>] create_object+0xe8/0x348
 [] kmemleak_alloc+0x20/0x2a

Please think hard before including complete backtraces in upstream
reports, they are very large and contain almost no useful information
relative to their size so often obscure the relevant content in your
message. If part of the backtrace is usefully illustrative (it often is
for search engines if nothing else) then it's usually better to pull
out the relevant sections.


2899872b627e ("regmap: debugfs: Fix memory leak in
regmap_debugfs_init") added a similar backtrace, but the address of the
trace is useless; I will be careful next time.


@@ -601,6 +601,7 @@ static int regmap_set_name(struct regmap *map, const struct regmap_config *config)

if (!name)
return -ENOMEM;
  + kfree_const(config->name);
kfree_const(map->name);
map->name = name;
}
Why would we free the passed in name here?  The name was passed in from
outside regmap in a const configuration struct; we've no idea within
regmap if it was dynamically allocated or not, and it seems very
surprising that we'd go off and free it.  The whole reason we're
duplicating it in regmap_set_name() is that we don't know how long it's
going to be around, so we don't want to reference it after having
returned to the caller.  If the caller has dynamically allocated it
then
then

the caller should deal with freeing it.


Yes, after check it again, this patch is wrong.

Hi Marc,  the regmap debugfs will duplicate a name in 
regmap_set_name(), and


syscon_config.name won't be used in syscon,  so your following patch
doesn't seem

to be necessary,  right ? Please correct me if I'm wrong, thanks.


It was certainly necessary at the time when I wrote the patch, as it
was fixing some obvious memory corruption (use after free).

It is very possible that the flow has been reorganised since, as the
following commit hints at:

commit e15d7f2b81d2e7d93115d46fa931b366c1cdebc2
Author: Suman Anna 
Date:   Mon Jul 27 16:10:08 2020 -0500

mfd: syscon: Use a unique name with regmap_config

The DT node full name is currently being used in regmap_config
which in turn is used to create the regmap debugfs directories.
This name however is not guaranteed to be unique and the regmap
debugfs registration can fail in the cases where the syscon nodes
have the same unit-address but are present in different DT node
hierarchies. Replace this logic using the syscon reg resource
address instead (inspired from logic used while creating platform
devices) to ensure a unique name is given for each syscon.

Signed-off-by: Suman Anna 
Reviewed-by: Arnd Bergmann 
Signed-off-by: Lee Jones 

I suggest you come up with a more complete analysis of the problem
and how it came to be.

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v1 17/24] kvm: arm64: Add __hyp_pa_symbol helper macro

2020-11-09 Thread Marc Zyngier

On 2020-11-09 16:59, Quentin Perret wrote:

Hey David,

On Monday 09 Nov 2020 at 11:32:26 (+), David Brazdil wrote:

Add helper macro for computing the PA of a kernel symbol in nVHE hyp
code. This will be useful for computing the PA of a PSCI CPU_ON entry
point.

Signed-off-by: David Brazdil 
---
 arch/arm64/kvm/hyp/nvhe/psci.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/arm64/kvm/hyp/nvhe/psci.c b/arch/arm64/kvm/hyp/nvhe/psci.c
index b0b5df590ba5..7510b9e174e9 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci.c
@@ -20,6 +20,16 @@ s64 hyp_physvirt_offset;

 #define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset)

+#define __hyp_pa_symbol(sym)   \
+   ({  \
+   extern char sym[];  \
+   unsigned long kern_va;  \
+   \
+   asm volatile("ldr %0, =%1" : "=r" (kern_va) \
+  : "S" (sym));  \
+   kern_va - kimage_voffset;   \
+   })
+


Could this be simplified to __hyp_pa(hyp_symbol_addr(sym))? That would
avoid the dependency on kimage_voffset.
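
i.e. (sketch):

#define __hyp_pa_symbol(sym)	__hyp_pa(hyp_symbol_addr(sym))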


I'm going to move away from evaluating kimage_voffset at runtime anyway,
see [1].

Thanks,

M.

[1] https://lore.kernel.org/r/20201109175923.445945-1-...@kernel.org
--
Jazz is not dead. It just smells funny...


Re: [PATCH -next] irq-chip/gic-v3-its: Fixed an issue where the ITS executes the residual commands in the queue again when the ITS wakes up from sleep mode.

2020-11-09 Thread Marc Zyngier

On 2020-11-09 03:05, xuqiang (M) wrote:

On 2020/11/8 0:54, Marc Zyngier wrote:

[dropping Jason, whose email address has been bouncing for weeks now]

On 2020-11-07 10:42, Xu Qiang wrote:
On my platform, ITS_FLAGS_SAVE_SUSPEND_STATE is not set, thus it does
nothing


Which platform?

Hisi Ascend platform



in its suspend and resume functions. On the other hand, firmware stores
GITS_CTLR, GITS_CBASER, GITS_CWRITER and GITS_BASER on suspend, and
restores these registers on resume. As a result, the ITS executes the
residual commands in the queue.


Which firmware are you using? I just had a look at the trusted firmware
source code, and while it definitely does something that *looks* like
what you are describing, it doesn't re-enable the ITS on resume.

So what are you running?


I am using ATF. Since ITS_FLAGS_SAVE_SUSPEND_STATE is not set, the OS's
ITS driver will not re-enable the ITS on resume. To make the ITS work
properly, we changed the ATF code to re-enable the ITS on resume.


I don't think the words "work properly" apply here.

The kernel didn't do what you wanted, so instead of fixing the kernel,
you introduced a bug that results in memory corruption from the
firmware.
introduced a bug that results in memory corruption from the firmware.

What are you plans to fix your firmware? Because from an upstream ATF
compatibility PoV, all there is to do is to fixup the command queue and
enable the ITS.

M.
--
Jazz is not dead. It just smells funny...

