Re: [RFC PATCH 3/6] kvm: arm64: Fix up RELR relocation in hyp code/data
On 2020-11-19 16:25, David Brazdil wrote: The arm64 kernel also supports packing of relocation data using the RELR format. Implement a parser of RELR data and fixup the relocations using the same infra as RELA relocs. Signed-off-by: David Brazdil --- arch/arm64/kvm/va_layout.c | 41 ++ 1 file changed, 41 insertions(+) diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index b80fab974896..7f45a98eacfd 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -145,6 +145,43 @@ static void __fixup_hyp_rela(void) __fixup_hyp_rel(rel[i].r_offset); } +#ifdef CONFIG_RELR +static void __fixup_hyp_relr(void) +{ + u64 *rel, *end; + + rel = (u64*)(kimage_vaddr + __load_elf_u64(__relr_offset)); + end = rel + (__load_elf_u64(__relr_size) / sizeof(*rel)); + + while (rel < end) { + unsigned n; + u64 addr = *(rel++); + + /* Address must not have the LSB set. */ + BUG_ON(addr & BIT(0)); + + /* Fix up the first address of the chain. */ + __fixup_hyp_rel(addr); + + /* +* Loop over bitmaps, i.e. as long as words' LSB is 1. +* Each bit (ordered from LSB to MSB) represents one word from +* the last full address (exclusive). If the corresponding bit +* is 1, there is a relative relocation on that word. +*/ What is the endianness of this bitmap? Is it guaranteed to be in CPU-endian format? + for (n = 0; rel < end && (*rel & BIT(0)); n++) { + unsigned i; + u64 bitmap = *(rel++); nit: if you change this u64 for an unsigned long... + + for (i = 1; i < 64; ++i) { + if ((bitmap & BIT(i))) + __fixup_hyp_rel(addr + 8 * (63 * n + i)); + } ... this can be written as: i = 1; for_each_set_bit_from(i, &bitmap, 64) __fixup_hyp_rel(addr + 8 * (63 * n + i)); + } + } +} +#endif + /* * The kernel relocated pointers to kernel VA. Iterate over relocations in * the hypervisor ELF sections and convert them to hyp VA. 
This avoids the @@ -156,6 +193,10 @@ __init void kvm_fixup_hyp_relocations(void) return; __fixup_hyp_rela(); + +#ifdef CONFIG_RELR + __fixup_hyp_relr(); +#endif } static u32 compute_instruction(int n, u32 rd, u32 rn) Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [RFC PATCH 2/6] kvm: arm64: Fix up RELA relocations in hyp code/data
On 2020-11-19 16:25, David Brazdil wrote: KVM nVHE code runs under a different VA mapping than the kernel, hence so far it relied only on PC-relative addressing to avoid accidentally using a relocated kernel VA from a constant pool (see hyp_symbol_addr). So as to reduce the possibility of a programmer error, fixup the relocated addresses instead. Let the kernel relocate them to kernel VA first, but then iterate over them again, filter those that point to hyp code/data and convert the kernel VA to hyp VA. This is done after kvm_compute_layout and before apply_alternatives. Signed-off-by: David Brazdil --- arch/arm64/include/asm/kvm_mmu.h | 1 + arch/arm64/kernel/smp.c | 4 +- arch/arm64/kvm/va_layout.c | 76 3 files changed, 80 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 5168a0c516ae..e5226f7e4732 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -105,6 +105,7 @@ alternative_cb_end void kvm_update_va_mask(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void kvm_compute_layout(void); +void kvm_fixup_hyp_relocations(void); static __always_inline unsigned long __kern_hyp_va(unsigned long v) { diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 18e9727d3f64..30241afc2c93 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -434,8 +434,10 @@ static void __init hyp_mode_check(void) "CPU: CPUs started in inconsistent modes"); else pr_info("CPU: All CPU(s) started at EL1\n"); - if (IS_ENABLED(CONFIG_KVM)) + if (IS_ENABLED(CONFIG_KVM)) { kvm_compute_layout(); + kvm_fixup_hyp_relocations(); + } } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index d8cc51bd60bf..b80fab974896 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -82,6 +83,81 @@ __init 
void kvm_compute_layout(void) init_hyp_physvirt_offset(); } +#define __load_elf_u64(s) \ + ({ \ + extern u64 s; \ + u64 val;\ + \ + asm ("ldr %0, =%1" : "=r"(val) : "S"(&s));\ + val;\ + }) I'm not sure I get the rational about the naming here. None of this has much to do with ELF, but seems to just load a value from a constant pool. + +static bool __is_within_bounds(u64 addr, char *start, char *end) +{ + return start <= (char*)addr && (char*)addr < end; +} + +static bool __is_in_hyp_section(u64 addr) +{ + return __is_within_bounds(addr, __hyp_text_start, __hyp_text_end) || + __is_within_bounds(addr, __hyp_rodata_start, __hyp_rodata_end) || + __is_within_bounds(addr, + CHOOSE_NVHE_SYM(__per_cpu_start), + CHOOSE_NVHE_SYM(__per_cpu_end)); +} + +static void __fixup_hyp_rel(u64 addr) +{ + u64 *ptr, kern_va, hyp_va; + + /* Adjust the relocation address taken from ELF for KASLR. */ + addr += kaslr_offset(); + + /* Skip addresses not in any of the hyp sections. */ + if (!__is_in_hyp_section(addr)) + return; + + /* Get the LM alias of the relocation address. */ + ptr = (u64*)kvm_ksym_ref((void*)addr); Why the casting? We should be perfectly fine without. nit: we really need to change the name of this helper, it doesn't have anything to do with symbols anymore. And actually, lm_alias() *is* the right thing to use here (we don't relocate anything on VHE). + + /* +* Read the value at the relocation address. It has already been +* relocated to the actual kernel kimg VA. +*/ + kern_va = (u64)kvm_ksym_ref((void*)*ptr); Same comment. + + /* Convert to hyp VA. */ + hyp_va = __early_kern_hyp_va(kern_va); + + /* Store hyp VA at the relocation address. 
*/ + *ptr = __early_kern_hyp_va(kern_va); +} + +static void __fixup_hyp_rela(void) +{ + Elf64_Rela *rel; + size_t i, n; + + rel = (Elf64_Rela*)(kimage_vaddr + __load_elf_u64(__rela_offset)); + n = __load_elf_u64(__rela_size) / sizeof(*rel); + + for (i = 0; i < n; ++i) + __fixup_hyp_rel(rel[i].r_offset); +} + +/* + * The kernel relocated pointers to kernel VA. Iterate over relocations in + * the hypervisor ELF sections and convert them to hyp VA. This avoids the + * need to only use PC-relative addressing
Re: [PATCH 3/3] pinctrl: qcom: Clear possible pending irq when remuxing GPIOs
On 2020-11-24 12:43, Maulik Shah wrote: Hi Marc, On 11/24/2020 4:45 PM, Marc Zyngier wrote: On 2020-11-24 10:37, Maulik Shah wrote: [...] static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned function, unsigned group) { struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct gpio_chip *gc = &pctrl->chip; + unsigned int irq = irq_find_mapping(gc->irq.domain, group); const struct msm_pingroup *g; unsigned long flags; u32 val, mask; + u32 oldval; + u32 old_i; int i; g = &pctrl->soc->groups[group]; @@ -187,15 +215,26 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev, if (WARN_ON(i == g->nfuncs)) return -EINVAL; - raw_spin_lock_irqsave(&pctrl->lock, flags); + disable_irq(irq); - val = msm_readl_ctl(pctrl, g); + raw_spin_lock_irqsave(&pctrl->lock, flags); + oldval = val = msm_readl_ctl(pctrl, g); val &= ~mask; val |= i << g->mux_bit; msm_writel_ctl(val, pctrl, g); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + /* + * Clear IRQs if switching to/from GPIO mode since muxing to/from + * the GPIO path can cause phantom edges. + */ + old_i = (oldval & mask) >> g->mux_bit; + if (old_i != i && + (i == pctrl->soc->gpio_func || old_i == pctrl->soc->gpio_func)) + msm_pinctrl_clear_pending_irq(pctrl, group, irq); disable_irq() and enable_irq() should be moved inside this if loop. as only use for this is to mask the IRQ when switching back to gpio IRQ mode? i also don't think we should leave IRQ enabled at the end of this function by default, probably need to check if IRQ was already unmasked before disabling it, then only call enable_irq(). Why? It looks to me that this reproduces the behaviour of IRQCHIP_SET_TYPE_MASKED, which is highly desirable. What problem are you trying to address with this? Correct, here trying to reproduce the behaviour of IRQCHIP_SET_TYPE_MASKED which i guess is ok once its moved inside if loop as this is the place its switching to IRQ mode. 
but there is a problem to leave it enabled at the end of set_direction callbacks, see below. + + enable_irq(irq); + return 0; } @@ -456,32 +495,45 @@ static const struct pinconf_ops msm_pinconf_ops = { static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset) { const struct msm_pingroup *g; + unsigned int irq = irq_find_mapping(chip->irq.domain, offset); struct msm_pinctrl *pctrl = gpiochip_get_data(chip); unsigned long flags; + u32 oldval; u32 val; g = &pctrl->soc->groups[offset]; + disable_irq(irq); + raw_spin_lock_irqsave(&pctrl->lock, flags); - val = msm_readl_ctl(pctrl, g); + oldval = val = msm_readl_ctl(pctrl, g); val &= ~BIT(g->oe_bit); msm_writel_ctl(val, pctrl, g); raw_spin_unlock_irqrestore(&pctrl->lock, flags); + if (oldval != val) + msm_pinctrl_clear_pending_irq(pctrl, offset, irq); + + enable_irq(irq); i do not think we need disable_irq() and enable_irq() here, changing direction to input does not mean its being used for interrupt only, it may be set to use something like Rx mode in UART. the client driver should enable IRQ when needed. And the kernel doesn't expect random interrupts to fire. Again, what are you trying to fix by removing these? I see leaving IRQ enabled here can cause problems. For example in qcom_geni_serial.c driver before requesting IRQ, it sets the IRQ_NOAUTOEN flag to not keep it enabled. see the below snippet irq_set_status_flags(uport->irq, IRQ_NOAUTOEN); ret = devm_request_irq(uport->dev, uport->irq, qcom_geni_serial_isr, IRQF_TRIGGER_HIGH, port->name, uport); later when this devm_request_irq() invokes .irq_request_resources callback it will reach msm_gpio_irq_reqres() from where msm_gpio_direction_input() is called which leaves the irq enabled at the end with enable_irq() which was not expected by driver. No it doesn't. disable_irq()/enable_irq() are designed to nest. If the interrupt line was disabled before the disable/enable sequence, it will still be disabled after. 
It will cause is IRQ storm since the UART geni driver uses GPIO in Rx mode when out of suspend. The IRQ mode in GPIO is enabled with suspend entry only. During resume the IRQ will again be disabled and GPIO will be switched to Rx mode. I don't see how this contradicts what is above. If the interrupt was disabled before hitting this sequence, it will still be disabled after. Am I missing something? Have you actually seen the problem on HW? M. -- Jazz is not dead. It just smells funny...
Re: [PATCH 3/3] pinctrl: qcom: Clear possible pending irq when remuxing GPIOs
On 2020-11-24 10:37, Maulik Shah wrote: [...] static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned function, unsigned group) { struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct gpio_chip *gc = &pctrl->chip; + unsigned int irq = irq_find_mapping(gc->irq.domain, group); const struct msm_pingroup *g; unsigned long flags; u32 val, mask; + u32 oldval; + u32 old_i; int i; g = &pctrl->soc->groups[group]; @@ -187,15 +215,26 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev, if (WARN_ON(i == g->nfuncs)) return -EINVAL; - raw_spin_lock_irqsave(&pctrl->lock, flags); + disable_irq(irq); - val = msm_readl_ctl(pctrl, g); + raw_spin_lock_irqsave(&pctrl->lock, flags); + oldval = val = msm_readl_ctl(pctrl, g); val &= ~mask; val |= i << g->mux_bit; msm_writel_ctl(val, pctrl, g); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + /* +* Clear IRQs if switching to/from GPIO mode since muxing to/from +* the GPIO path can cause phantom edges. +*/ + old_i = (oldval & mask) >> g->mux_bit; + if (old_i != i && + (i == pctrl->soc->gpio_func || old_i == pctrl->soc->gpio_func)) + msm_pinctrl_clear_pending_irq(pctrl, group, irq); disable_irq() and enable_irq() should be moved inside this if loop. as only use for this is to mask the IRQ when switching back to gpio IRQ mode? i also don't think we should leave IRQ enabled at the end of this function by default, probably need to check if IRQ was already unmasked before disabling it, then only call enable_irq(). Why? It looks to me that this reproduces the behaviour of IRQCHIP_SET_TYPE_MASKED, which is highly desirable. What problem are you trying to address with this? 
+ + enable_irq(irq); + return 0; } @@ -456,32 +495,45 @@ static const struct pinconf_ops msm_pinconf_ops = { static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset) { const struct msm_pingroup *g; + unsigned int irq = irq_find_mapping(chip->irq.domain, offset); struct msm_pinctrl *pctrl = gpiochip_get_data(chip); unsigned long flags; + u32 oldval; u32 val; g = &pctrl->soc->groups[offset]; + disable_irq(irq); + raw_spin_lock_irqsave(&pctrl->lock, flags); - val = msm_readl_ctl(pctrl, g); + oldval = val = msm_readl_ctl(pctrl, g); val &= ~BIT(g->oe_bit); msm_writel_ctl(val, pctrl, g); raw_spin_unlock_irqrestore(&pctrl->lock, flags); + if (oldval != val) + msm_pinctrl_clear_pending_irq(pctrl, offset, irq); + + enable_irq(irq); i do not think we need disable_irq() and enable_irq() here, changing direction to input does not mean its being used for interrupt only, it may be set to use something like Rx mode in UART. the client driver should enable IRQ when needed. And the kernel doesn't expect random interrupts to fire. Again, what are you trying to fix by removing these? M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v15 6/9] arm64/kvm: Add hypercall service for kvm ptp.
On 2020-11-24 05:20, Jianyong Wu wrote: Hi Marc, [...] > +/* ptp_kvm counter type ID */ > +#define ARM_PTP_VIRT_COUNTER 0 > +#define ARM_PTP_PHY_COUNTER 1 > +#define ARM_PTP_NONE_COUNTER 2 The architecture definitely doesn't have this last counter. Yeah, this just represents no counter data needed from the guest. Some annotation should be added here. I'd rather you remove it entirely, or explain why you really cannot do without a fake counter. M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v15 7/9] ptp: arm/arm64: Enable ptp_kvm for arm/arm64
Jianyong, On 2020-11-24 05:37, Jianyong Wu wrote: Hi Marc, [...] > + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, > + ARM_PTP_NONE_COUNTER, &hvc_res); I really don't see the need to use a non-architectural counter ID. Using the virtual counter ID should just be fine, and shouldn't lead to any issue. Am I missing something? In this function, no counter data is needed. If virtual counter ID is used here, user may be confused with why we get virtual counter data and do nothing with it. So I explicitly add a new "NONE" counter ID to make it clear. WDYT? ITIABI (I Think It's A Bad Idea). There are two counters, and the API allows to retrieve the data for any of these two. If the "user" doesn't want to do anything with the data, that's their problem. Here, you can just use the virtual counter, and that will give you the exact same semantic, without inventing non-architectural state. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH 1/3] irqchip: qcom-pdc: Fix phantom irq when changing between rising/falling
On 2020-11-24 00:01, Douglas Anderson wrote: We have a problem if we use gpio-keys and configure wakeups such that we only want one edge to wake us up. AKA: wakeup-event-action = ; wakeup-source; Specifically we end up with a phantom interrupt that blocks suspend if the line was already high and we want wakeups on rising edges (AKA we want the GPIO to go low and then high again before we wake up). The opposite is also problematic. Specifically, here's what's happening today: 1. Normally, gpio-keys configures to look for both edges. Due to the current workaround introduced in commit c3c0c2e18d94 ("pinctrl: qcom: Handle broken/missing PDC dual edge IRQs on sc7180"), if the line was high we'd configure for falling edges. 2. At suspend time, we change to look for rising edges. 3. After qcom_pdc_gic_set_type() runs, we get a phantom interrupt. We can solve this by just clearing the phantom interrupt. NOTE: it is possible that this could cause problems for a client with very specific needs, but there's not much we can do with this hardware. As an example, let's say the interrupt signal is currently high and the client is looking for falling edges. The client now changes to look for rising edges. The client could possibly expect that if the line has a short pulse low (and back high) that it would always be detected. Specifically no matter when the pulse happened, it should either have tripped the (old) falling edge trigger or the (new) rising edge trigger. We will simply not trip it. We could narrow down the race a bit by polling our parent before changing types, but no matter what we do there will still be a period of time where we can't tell the difference between a real transition (or more than one transition) and the phantom. 
Signed-off-by: Douglas Anderson --- drivers/irqchip/qcom-pdc.c | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/qcom-pdc.c b/drivers/irqchip/qcom-pdc.c index bd39e9de6ecf..7d097164aadc 100644 --- a/drivers/irqchip/qcom-pdc.c +++ b/drivers/irqchip/qcom-pdc.c @@ -159,6 +159,8 @@ static int qcom_pdc_gic_set_type(struct irq_data *d, unsigned int type) { int pin_out = d->hwirq; enum pdc_irq_config_bits pdc_type; + enum pdc_irq_config_bits old_pdc_type; + int ret; if (pin_out == GPIO_NO_WAKE_IRQ) return 0; @@ -187,9 +189,24 @@ static int qcom_pdc_gic_set_type(struct irq_data *d, unsigned int type) return -EINVAL; } + old_pdc_type = pdc_reg_read(IRQ_i_CFG, pin_out); pdc_reg_write(IRQ_i_CFG, pin_out, pdc_type); - return irq_chip_set_type_parent(d, type); + ret = irq_chip_set_type_parent(d, type); + + /* +* When we change types the PDC can give a phantom interrupt. +* Clear it. Specifically the phantom shows up if a line is already +* high and we change to rising or if a line is already low and we +* change to falling but let's be consistent and clear it always. +* +* Doing this works because we have IRQCHIP_SET_TYPE_MASKED so the +* interrupt will be cleared before the rest of the system sees it. +*/ + if (old_pdc_type != pdc_type) + irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, 0); nit: s/0/false/. You could also make it conditional on the parent side having been successful. And while we're looking at this: do you need to rollback the PDC state if the GIC side has failed? It's all very hypothetical, but just in case... + + return ret; } static struct irq_chip qcom_pdc_gic_chip = { It otherwise looks sensible. Is that a fix for 5.10? M. -- Jazz is not dead. It just smells funny...
Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side
On 2020-11-24 08:10, Shenming Lu wrote: On 2020/11/23 17:27, Marc Zyngier wrote: On 2020-11-23 06:54, Shenming Lu wrote: From: Zenghui Yu When setting the forwarding path of a VLPI, it is more consistent to I'm not sure it is more consistent. It is a *new* behaviour, because it only matters for migration, which has been so far unsupported. Alright, consistent may not be accurate... But I have doubt that whether there is really no need to transfer the pending states from kvm'vgic to VPT in set_forwarding regardless of migration, and the similar for unset_forwarding. If you have to transfer that state outside of the a save/restore, it means that you have missed the programming of the PCI endpoint. This is an established restriction that the MSI programming must occur *after* the translation has been established using MAPI/MAPTI (see the large comment at the beginning of vgic-v4.c). If you want to revisit this, fair enough. But you will need a lot more than just opportunistically transfer the pending state. also transfer the pending state from irq->pending_latch to VPT (especially in migration, the pending states of VLPIs are restored into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger a VLPI to pending. Signed-off-by: Zenghui Yu Signed-off-by: Shenming Lu --- arch/arm64/kvm/vgic/vgic-v4.c | 12 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index b5fa73c9fd35..cc3ab9cea182 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -418,6 +418,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, irq->host_irq = virq; atomic_inc(&map.vpe->vlpi_count); + /* Transfer pending state */ + ret = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + irq->pending_latch); + WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq); + + /* + * Let it be pruned from ap_list later and don't bother + * the List Register. 
+ */ + irq->pending_latch = false; It occurs to me that calling into irq_set_irqchip_state() for a large number of interrupts can take a significant amount of time. It is also odd that you dump the VPT with the VPE unmapped, but rely on the VPE being mapped for the opposite operation. Shouldn't these be symmetric, all performed while the VPE is unmapped? It would also save a lot of ITS traffic. My thought was to use the existing interface directly without unmapping... If you want to unmap the vPE and poke the VPT here, as I said in the cover letter, set/unset_forwarding might also be called when all devices are running at normal run time, in which case the unmapping of the vPE is not allowed... No, I'm suggesting that you don't do anything here, but instead as a by-product of restoring the ITS tables. What goes wrong if you use the KVM_DEV_ARM_ITS_RESTORE_TABLE backend instead? Another possible solution is to add a new dedicated interface to QEMU to transfer these pending states to HW in GIC VM state change handler corresponding to save_pending_tables? Userspace has no way to know we use GICv4, and I intend to keep it completely out of the loop. The API is already pretty tortuous, and I really don't want to add any extra complexity to it. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [RFC PATCH v1 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables
On 2020-11-24 07:40, Shenming Lu wrote: On 2020/11/23 17:18, Marc Zyngier wrote: On 2020-11-23 06:54, Shenming Lu wrote: After pausing all vCPUs and devices capable of interrupting, in order ^ See my comment below about this. to save the information of all interrupts, besides flushing the pending states in kvm’s vgic, we also try to flush the states of VLPIs in the virtual pending tables into guest RAM, but we need to have GICv4.1 and safely unmap the vPEs first. Signed-off-by: Shenming Lu --- arch/arm64/kvm/vgic/vgic-v3.c | 62 +++ 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 9cdf39a94a63..e1b3aa4b2b12 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include +#include #include #include #include @@ -356,6 +358,39 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) return 0; } +/* + * With GICv4.1, we can get the VLPI's pending state after unmapping + * the vPE. The deactivation of the doorbell interrupt will trigger + * the unmapping of the associated vPE. + */ +static void get_vlpi_state_pre(struct vgic_dist *dist) +{ + struct irq_desc *desc; + int i; + + if (!kvm_vgic_global_state.has_gicv4_1) + return; + + for (i = 0; i < dist->its_vm.nr_vpes; i++) { + desc = irq_to_desc(dist->its_vm.vpes[i]->irq); + irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); + } +} + +static void get_vlpi_state_post(struct vgic_dist *dist) nit: the naming feels a bit... odd. Pre/post what? My understanding is that the unmapping is a preparation for get_vlpi_state... Maybe just call it unmap/map_all_vpes? Yes, much better. [...] + if (irq->hw) { + WARN_RATELIMIT(irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, &is_pending), + "IRQ %d", irq->host_irq); Isn't this going to warn like mad on a GICv4.0 system where this, by definition, will generate an error? 
As we have returned an error in save_its_tables if hw && !has_gicv4_1, we don't have to warn this here? Are you referring to the check in vgic_its_save_itt() that occurs in patch 4? Fair enough, though I think the use of irq_get_irqchip_state() isn't quite what we want, as per my comments on patch #1. + } + + if (stored == is_pending) continue; - if (irq->pending_latch) + if (is_pending) val |= 1 << bit_nr; else val &= ~(1 << bit_nr); ret = kvm_write_guest_lock(kvm, ptr, &val, 1); if (ret) - return ret; + goto out; } - return 0; + +out: + get_vlpi_state_post(dist); This bit worries me: you have unmapped the VPEs, so any interrupt that has been generated during that phase is now forever lost (the GIC doesn't have ownership of the pending tables). In my opinion, during this phase, the devices capable of interrupting should have already been paused (prevent from sending interrupts), such as VFIO migration protocol has already realized it. Is that a hard guarantee? Pausing devices *may* be possible for a limited set of endpoints, but I'm not sure that is universally possible to restart them and expect a consistent state (you have just dropped a bunch of network packets on the floor...). Do you really expect the VM to be restartable from that point? I don't see how this is possible. If the migration has encountered an error, the src VM might be restarted, so we have to map the vPEs back. As I said above, I doubt it is universally possible to do so, but after all, this probably isn't worse that restarting on the target... M. -- Jazz is not dead. It just smells funny...
Re: [RFC PATCH v1 1/4] irqchip/gic-v4.1: Plumb get_irqchip_state VLPI callback
On 2020-11-24 07:38, Shenming Lu wrote: On 2020/11/23 17:01, Marc Zyngier wrote: On 2020-11-23 06:54, Shenming Lu wrote: From: Zenghui Yu Up to now, the irq_get_irqchip_state() callback of its_irq_chip leaves unimplemented since there is no architectural way to get the VLPI's pending state before GICv4.1. Yeah, there has one in v4.1 for VLPIs. With GICv4.1, after unmapping the vPE, which cleans and invalidates any caching of the VPT, we can get the VLPI's pending state by This is a crucial note: without this unmapping and invalidation, the pending bits are not generally accessible (they could be cached in a GIC private structure, cache or otherwise). peeking at the VPT. So we implement the irq_get_irqchip_state() callback of its_irq_chip to do it. Signed-off-by: Zenghui Yu Signed-off-by: Shenming Lu --- drivers/irqchip/irq-gic-v3-its.c | 38 1 file changed, 38 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0fec31931e11..287003cacac7 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1695,6 +1695,43 @@ static void its_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg); } +static bool its_peek_vpt(struct its_vpe *vpe, irq_hw_number_t hwirq) +{ + int mask = hwirq % BITS_PER_BYTE; nit: this isn't a mask, but a shift instead. BIT(hwirq % BPB) would give you a mask. Ok, I will correct it. 
+ void *va; + u8 *pt; + + va = page_address(vpe->vpt_page); + pt = va + hwirq / BITS_PER_BYTE; + + return !!(*pt & (1U << mask)); +} + +static int its_irq_get_irqchip_state(struct irq_data *d, + enum irqchip_irq_state which, bool *val) +{ + struct its_device *its_dev = irq_data_get_irq_chip_data(d); + struct its_vlpi_map *map = get_vlpi_map(d); + + if (which != IRQCHIP_STATE_PENDING) + return -EINVAL; + + /* not intended for physical LPI's pending state */ + if (!map) + return -EINVAL; + + /* + * In GICv4.1, a VMAPP with {V,Alloc}=={0,1} cleans and invalidates + * any caching of the VPT associated with the vPEID held in the GIC. + */ + if (!is_v4_1(its_dev->its) || atomic_read(&map->vpe->vmapp_count)) It isn't clear to me what prevents this from racing against a mapping of the VPE. Actually, since we only hold the LPI irqdesc lock, I'm pretty sure nothing prevents it. Yes, should have the vmovp_lock held? That's not helping because of the VPE activation. And is it necessary to also hold this lock in its_vpe_irq_domain_activate/deactivate? Well, you'd need that, but that's unnecessary complex AFAICT. + return -EACCES; I can sort of buy EACCESS for a VPE that is currently mapped, but a non-4.1 ITS should definitely return EINVAL. Alright, EINVAL looks better. + + *val = its_peek_vpt(map->vpe, map->vintid); + + return 0; +} + static int its_irq_set_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool state) @@ -1975,6 +2012,7 @@ static struct irq_chip its_irq_chip = { .irq_eoi = irq_chip_eoi_parent, .irq_set_affinity = its_set_affinity, .irq_compose_msi_msg = its_irq_compose_msi_msg, + .irq_get_irqchip_state = its_irq_get_irqchip_state, My biggest issue with this is that it isn't a reliable interface. It happens to work in the context of KVM, because you make sure it is called at the right time, but that doesn't make it safe in general (anyone with the interrupt number is allowed to call this at any time). 
We check the vmapp_count in it to ensure the unmapping of the vPE, and let the caller do the unmapping (they should know whether it is the right time). If the unmapping is not done, just return a failure. And without guaranteeing mutual exclusion against a concurrent VMAPP, checking the vmapp_count means nothing (you need the lock described above, and start sprinkling it around in other places as well). Is there a problem with poking at the VPT page from the KVM side? The code should be exactly the same (maybe simpler even), and at least you'd be guaranteed to be in the correct context. Yeah, that also seems a good choice. If you prefer it, we can try to realize it in v2. I'd certainly prefer that. Let me know if you spot any implementation issue with that. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v2 00/24] Opt-in always-on nVHE hypervisor
Hi David, On Mon, 16 Nov 2020 20:42:54 +, David Brazdil wrote: > > As we progress towards being able to keep guest state private to the > host running nVHE hypervisor, this series allows the hypervisor to > install itself on newly booted CPUs before the host is allowed to run > on them. > > All functionality described below is opt-in, guarded by an early param > 'kvm-arm.protected'. Future patches specific to the new "protected" mode > should be hidden behind the same param. > > The hypervisor starts trapping host SMCs and intercepting host's PSCI > CPU_ON/SUSPEND calls. It replaces the host's entry point with its own, > initializes the EL2 state of the new CPU and installs the nVHE hyp vector > before ERETing to the host's entry point. > > The kernel checks new cores' features against the finalized system > capabilities. To avoid the need to move this code/data to EL2, the > implementation only allows to boot cores that were online at the time of > KVM initialization and therefore had been checked already. > > Other PSCI SMCs are forwarded to EL3, though only the known set of SMCs > implemented in the kernel is allowed. Non-PSCI SMCs are also forwarded > to EL3. Future changes will need to ensure the safety of all SMCs wrt. > private guests. > > The host is still allowed to reset EL2 back to the stub vector, eg. for > hibernation or kexec, but will not disable nVHE when there are no VMs. I have now been through the whole series, and I don't think there is anything really major (although I haven't tried it yet). I think it would benefit from being rebased on top of kvmarm/queue, as it'd give you the opportunity to replace a number of per-CPU state fields with global function pointers. Another thing is that we seem to have diverging interpretations of the PSCI spec when it comes to CPU_SUSPEND. Finally, please include the PSCI maintainers in your next posting, as I'll need their Ack to pick the first few patches. Thanks, M. 
-- Without deviation from the norm, progress is not possible.
Re: [PATCH v2 08/24] kvm: arm64: Add SMC handler in nVHE EL2
On Mon, 16 Nov 2020 20:43:02 +, David Brazdil wrote: > > Add handler of host SMCs in KVM nVHE trap handler. Forward all SMCs to > EL3 and propagate the result back to EL1. This is done in preparation > for validating host SMCs in KVM nVHE protected mode. > > The implementation assumes that firmware uses SMCCC v1.2 or older. That > means x0-x17 can be used both for arguments and results, other GPRs are > preserved. > > Signed-off-by: David Brazdil > --- > arch/arm64/kvm/hyp/nvhe/host.S | 38 ++ > arch/arm64/kvm/hyp/nvhe/hyp-main.c | 26 > 2 files changed, 64 insertions(+) > > diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S > index ed27f06a31ba..52dae5cd5a28 100644 > --- a/arch/arm64/kvm/hyp/nvhe/host.S > +++ b/arch/arm64/kvm/hyp/nvhe/host.S > @@ -183,3 +183,41 @@ SYM_CODE_START(__kvm_hyp_host_vector) > invalid_host_el1_vect // FIQ 32-bit EL1 > invalid_host_el1_vect // Error 32-bit EL1 > SYM_CODE_END(__kvm_hyp_host_vector) > + > +/* > + * Forward SMC with arguments in struct kvm_cpu_context, and > + * store the result into the same struct. Assumes SMCCC 1.2 or older. > + * > + * x0: struct kvm_cpu_context* > + */ > +SYM_CODE_START(__kvm_hyp_host_forward_smc) > + /* > + * Use x18 to keep a pointer to the host context because x18 > + * is callee-saved SMCCC but not in AAPCS64. 
> + */ > + mov x18, x0 > + > + ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)] > + ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)] > + ldp x4, x5, [x18, #CPU_XREG_OFFSET(4)] > + ldp x6, x7, [x18, #CPU_XREG_OFFSET(6)] > + ldp x8, x9, [x18, #CPU_XREG_OFFSET(8)] > + ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)] > + ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)] > + ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)] > + ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)] > + > + smc #0 > + > + stp x0, x1, [x18, #CPU_XREG_OFFSET(0)] > + stp x2, x3, [x18, #CPU_XREG_OFFSET(2)] > + stp x4, x5, [x18, #CPU_XREG_OFFSET(4)] > + stp x6, x7, [x18, #CPU_XREG_OFFSET(6)] > + stp x8, x9, [x18, #CPU_XREG_OFFSET(8)] > + stp x10, x11, [x18, #CPU_XREG_OFFSET(10)] > + stp x12, x13, [x18, #CPU_XREG_OFFSET(12)] > + stp x14, x15, [x18, #CPU_XREG_OFFSET(14)] > + stp x16, x17, [x18, #CPU_XREG_OFFSET(16)] This is going to be really good for CPUs that need to use ARCH_WA1 for their Spectre-v2 mitigation... :-( If that's too expensive, we may have to reduce the number of save/restored registers, but I'm worried the battle is already lost by the time we reach this (the host trap path is already a huge hammer). Eventually, we'll have to insert the mitigation in the vectors anyway, just like we have on the guest exit path. Boo. Thanks, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH v2 12/24] kvm: arm64: Bootstrap PSCI SMC handler in nVHE EL2
On Mon, 16 Nov 2020 20:43:06 +, David Brazdil wrote: > > Add a handler of PSCI SMCs in nVHE hyp code. The handler is initialized > with the version used by the host's PSCI driver and the function IDs it > was configured with. If the SMC function ID matches one of the > configured PSCI calls (for v0.1) or falls into the PSCI function ID > range (for v0.2+), the SMC is handled by the PSCI handler. For now, all > SMCs return PSCI_RET_NOT_SUPPORTED. > > Signed-off-by: David Brazdil > --- > arch/arm64/include/asm/kvm_hyp.h | 4 ++ > arch/arm64/kvm/arm.c | 14 > arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- > arch/arm64/kvm/hyp/nvhe/hyp-main.c | 6 +- > arch/arm64/kvm/hyp/nvhe/psci-relay.c | 104 +++ > 5 files changed, 128 insertions(+), 2 deletions(-) > create mode 100644 arch/arm64/kvm/hyp/nvhe/psci-relay.c > > diff --git a/arch/arm64/include/asm/kvm_hyp.h > b/arch/arm64/include/asm/kvm_hyp.h > index a3289071f3d8..95a2bbbcc7e1 100644 > --- a/arch/arm64/include/asm/kvm_hyp.h > +++ b/arch/arm64/include/asm/kvm_hyp.h > @@ -96,6 +96,10 @@ void deactivate_traps_vhe_put(void); > > u64 __guest_enter(struct kvm_vcpu *vcpu); > > +#ifdef __KVM_NVHE_HYPERVISOR__ > +bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt); > +#endif > + > void __noreturn hyp_panic(void); > #ifdef __KVM_NVHE_HYPERVISOR__ > void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 > par); > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > index cdd7981ea560..7d2270eeecfb 100644 > --- a/arch/arm64/kvm/arm.c > +++ b/arch/arm64/kvm/arm.c > @@ -19,6 +19,7 @@ > #include > #include > #include > +#include > #include > > #define CREATE_TRACE_POINTS > @@ -1514,6 +1515,18 @@ static void init_cpu_logical_map(void) > CHOOSE_NVHE_SYM(__cpu_logical_map)[cpu] = cpu_logical_map(cpu); > } > > +static void init_psci_relay(void) > +{ > + extern u32 kvm_nvhe_sym(kvm_host_psci_version); > + extern u32 kvm_nvhe_sym(kvm_host_psci_function_id)[PSCI_FN_MAX]; nit: I'd rather have these outside of the 
function body. > + int i; > + > + CHOOSE_NVHE_SYM(kvm_host_psci_version) = psci_ops.get_version > + ? psci_ops.get_version() : PSCI_VERSION(0, 0); nit: please write this with an if/else construct, it will read a lot better. > + for (i = 0; i < PSCI_FN_MAX; ++i) > + CHOOSE_NVHE_SYM(kvm_host_psci_function_id)[i] = > psci_get_function_id(i); Either pick kvm_nvhe_sym(), or CHOOSE_NVHE_SYM(). Having both used together is just an annoyance (and in this case there is nothing to choose, really). > +} > + > static int init_common_resources(void) > { > return kvm_set_ipa_limit(); > @@ -1693,6 +1706,7 @@ static int init_hyp_mode(void) > } > > init_cpu_logical_map(); > + init_psci_relay(); > > return 0; > > diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile > b/arch/arm64/kvm/hyp/nvhe/Makefile > index 2d842e009a40..bf62c8e42ab2 100644 > --- a/arch/arm64/kvm/hyp/nvhe/Makefile > +++ b/arch/arm64/kvm/hyp/nvhe/Makefile > @@ -7,7 +7,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ > ccflags-y := -D__KVM_NVHE_HYPERVISOR__ > > obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ > - hyp-main.o hyp-smp.o > + hyp-main.o hyp-smp.o psci-relay.o > obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ >../fpsimd.o ../hyp-entry.o > > diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c > b/arch/arm64/kvm/hyp/nvhe/hyp-main.c > index 71a17af05953..df4acb40dd39 100644 > --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c > +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c > @@ -120,7 +120,11 @@ static void skip_host_instruction(void) > > static void handle_host_smc(struct kvm_cpu_context *host_ctxt) > { > - default_host_smc_handler(host_ctxt); > + bool handled; > + > + handled = kvm_host_psci_handler(host_ctxt); > + if (!handled) > + default_host_smc_handler(host_ctxt); > > /* >* Unlike HVC, the return address of an SMC is the instruction's PC. 
> diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c > b/arch/arm64/kvm/hyp/nvhe/psci-relay.c > new file mode 100644 > index ..d75d3f896bfd > --- /dev/null > +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c > @@ -0,0 +1,104 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2020 - Google LLC > + * Author: David Brazdil > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +/* Config options set by the host. */ > +u32 __ro_after_init kvm_host_psci_version = PSCI_VERSION(0, 0); > +u32 __ro_after_init kvm_host_psci_function_id[PSCI_FN_MAX]; > + > +static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) > +{ > + return host_ctxt->regs.regs[0]; > +} > + > +static bool is_psci_0_1_call(u64 func_id) > +{ > + unsigned int i; > + > + for (i = 0; i <
Re: [PATCH v2 23/24] kvm: arm64: Trap host SMCs in protected mode.
On Mon, 16 Nov 2020 20:43:17 +, David Brazdil wrote: > > While protected nVHE KVM is installed, start trapping all host SMCs. > By default, these are simply forwarded to EL3, but PSCI SMCs are > validated first. > > Create new constant HCR_HOST_NVHE_PROTECTED_FLAGS with the new set of HCR > flags to use while the nVHE vector is installed when the kernel was > booted with the protected flag enabled. Switch back to the default HCR > flags when switching back to the stub vector. > > Signed-off-by: David Brazdil > --- > arch/arm64/include/asm/kvm_arm.h | 1 + > arch/arm64/kvm/hyp/nvhe/hyp-init.S | 12 > arch/arm64/kvm/hyp/nvhe/switch.c | 5 - > 3 files changed, 17 insertions(+), 1 deletion(-) > > diff --git a/arch/arm64/include/asm/kvm_arm.h > b/arch/arm64/include/asm/kvm_arm.h > index 64ce29378467..4e90c2debf70 100644 > --- a/arch/arm64/include/asm/kvm_arm.h > +++ b/arch/arm64/include/asm/kvm_arm.h > @@ -80,6 +80,7 @@ >HCR_FMO | HCR_IMO | HCR_PTW ) > #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) > #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) > +#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) > #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) > > /* TCR_EL2 Registers bits */ > diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S > b/arch/arm64/kvm/hyp/nvhe/hyp-init.S > index 6d8202d2bdfb..8f3602f320ac 100644 > --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S > +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S > @@ -88,6 +88,12 @@ SYM_CODE_END(__kvm_hyp_init) > * x0: struct kvm_nvhe_init_params PA > */ > SYM_CODE_START(___kvm_hyp_init) > +alternative_if ARM64_PROTECTED_KVM > + mov_q x1, HCR_HOST_NVHE_PROTECTED_FLAGS > + msr hcr_el2, x1 > + isb Why the ISB? For HCR_TSC to have any effect, you'll have to go via an ERET to EL1 first, which will have the required synchronisation effect. 
> +alternative_else_nop_endif > + > ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] > msr tpidr_el2, x1 > > @@ -224,6 +230,12 @@ reset: > msr sctlr_el2, x5 > isb > > +alternative_if ARM64_PROTECTED_KVM > + mov_q x5, HCR_HOST_NVHE_FLAGS > + msr hcr_el2, x5 > + isb Same thing here, I believe. > +alternative_else_nop_endif > + > /* Install stub vectors */ > adr_l x5, __hyp_stub_vectors > msr vbar_el2, x5 > diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c > b/arch/arm64/kvm/hyp/nvhe/switch.c > index 8ae8160bc93a..e1f8e0797144 100644 > --- a/arch/arm64/kvm/hyp/nvhe/switch.c > +++ b/arch/arm64/kvm/hyp/nvhe/switch.c > @@ -96,7 +96,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) > mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; > > write_sysreg(mdcr_el2, mdcr_el2); > - write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2); > + if (is_protected_kvm_enabled()) > + write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2); > + else > + write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2); > write_sysreg(CPTR_EL2_DEFAULT, cptr_el2); > write_sysreg(__kvm_hyp_host_vector, vbar_el2); > } > -- > 2.29.2.299.gdc1121823c-goog > > Thanks, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH v2 21/24] kvm: arm64: Add kvm-arm.protected early kernel parameter
On Mon, 16 Nov 2020 20:43:15 +, David Brazdil wrote: > > Add an early parameter that allows users to opt into protected KVM mode > when using the nVHE hypervisor. In this mode, guest state will be kept > private from the host. This will primarily involve enabling stage-2 > address translation for the host, restricting DMA to host memory, and > filtering host SMCs. > > Capability ARM64_PROTECTED_KVM is set if the param is passed, CONFIG_KVM > is enabled and the kernel was not booted with VHE. > > Signed-off-by: David Brazdil > --- > arch/arm64/include/asm/cpucaps.h | 3 ++- > arch/arm64/include/asm/virt.h| 8 > arch/arm64/kernel/cpufeature.c | 29 + > arch/arm64/kvm/arm.c | 10 +- > 4 files changed, 48 insertions(+), 2 deletions(-) > > diff --git a/arch/arm64/include/asm/cpucaps.h > b/arch/arm64/include/asm/cpucaps.h > index e7d98997c09c..ac075f70b2e4 100644 > --- a/arch/arm64/include/asm/cpucaps.h > +++ b/arch/arm64/include/asm/cpucaps.h > @@ -66,7 +66,8 @@ > #define ARM64_HAS_TLB_RANGE 56 > #define ARM64_MTE57 > #define ARM64_WORKAROUND_1508412 58 > +#define ARM64_PROTECTED_KVM 59 > > -#define ARM64_NCAPS 59 > +#define ARM64_NCAPS 60 > > #endif /* __ASM_CPUCAPS_H */ > diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h > index 6069be50baf9..2fde1186b962 100644 > --- a/arch/arm64/include/asm/virt.h > +++ b/arch/arm64/include/asm/virt.h > @@ -97,6 +97,14 @@ static __always_inline bool has_vhe(void) > return cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN); > } > > +static __always_inline bool is_protected_kvm_enabled(void) > +{ > + if (is_vhe_hyp_code()) > + return false; > + else > + return cpus_have_final_cap(ARM64_PROTECTED_KVM); > +} > + > #endif /* __ASSEMBLY__ */ > > #endif /* ! 
__ASM__VIRT_H */ > diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c > index 6f36c4f62f69..dd5bc0f0cf0d 100644 > --- a/arch/arm64/kernel/cpufeature.c > +++ b/arch/arm64/kernel/cpufeature.c > @@ -1709,6 +1709,29 @@ static void cpu_enable_mte(struct > arm64_cpu_capabilities const *cap) > } > #endif /* CONFIG_ARM64_MTE */ > > +#ifdef CONFIG_KVM > +static bool enable_protected_kvm; > + > +static bool has_protected_kvm(const struct arm64_cpu_capabilities *entry, > int __unused) > +{ > + if (!enable_protected_kvm) > + return false; > + > + if (is_kernel_in_hyp_mode()) { > + pr_warn("Protected KVM not available with VHE\n"); > + return false; > + } > + > + return true; > +} > + > +static int __init early_protected_kvm_cfg(char *buf) > +{ > + return strtobool(buf, &enable_protected_kvm); > +} > +early_param("kvm-arm.protected", early_protected_kvm_cfg); Please add some documentation to Documentation/admin-guide/kernel-parameters.txt. > +#endif /* CONFIG_KVM */ > + > /* Internal helper functions to match cpu capability type */ > static bool > cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) > @@ -1822,6 +1845,12 @@ static const struct arm64_cpu_capabilities > arm64_features[] = { > .field_pos = ID_AA64PFR0_EL1_SHIFT, > .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT, > }, > + { > + .desc = "Protected KVM", > + .capability = ARM64_PROTECTED_KVM, > + .type = ARM64_CPUCAP_SYSTEM_FEATURE, > + .matches = has_protected_kvm, > + }, > #endif > { > .desc = "Kernel page table isolation (KPTI)", > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > index c76a8e5bd19c..49d2474f2a80 100644 > --- a/arch/arm64/kvm/arm.c > +++ b/arch/arm64/kvm/arm.c > @@ -1796,6 +1796,12 @@ int kvm_arch_init(void *opaque) > return -ENODEV; > } > > + /* The PROTECTED_KVM cap should not have been enabled for VHE. 
*/ > + if (in_hyp_mode && is_protected_kvm_enabled()) { > + kvm_pr_unimpl("VHE protected mode unsupported, not > initializing\n"); > + return -ENODEV; How can this happen? Don't we already take care of this? > + } > + > if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || > cpus_have_final_cap(ARM64_WORKAROUND_1508412)) > kvm_info("Guests without required CPU erratum workarounds can > deadlock system!\n" \ > @@ -1827,7 +1833,9 @@ int kvm_arch_init(void *opaque) > if (err) > goto out_hyp; > > - if (in_hyp_mode) > + if (is_protected_kvm_enabled()) > + kvm_info("Protected nVHE mode initialized successfully\n"); > + else if (in_hyp_mode) > kvm_info("VHE mode initialized successfully\n"); > else > kvm_info("Hyp mode initialized successfully\n"); > -- > 2.29.2.299.gdc1121823c-goog >
Re: [PATCH v2 20/24] kvm: arm64: Intercept host's CPU_SUSPEND PSCI SMCs
Adding Lorenzo and Sudeep to this one in particular, as there is a bit of a corner case below. On Mon, 16 Nov 2020 20:43:14 +, David Brazdil wrote: > > Add a handler of CPU_SUSPEND host PSCI SMCs. The SMC can either enter > a sleep state indistinguishable from a WFI or a deeper sleep state that > behaves like a CPU_OFF+CPU_ON. > > The handler saves r0,pc of the host and makes the same call to EL3 with > the hyp CPU entry point. It either returns back to the handler and then > back to the host, or wakes up into the entry point and initializes EL2 > state before dropping back to EL1. > > There is a simple atomic lock around the reset state struct to protect > from races with CPU_ON. A well-behaved host should never run CPU_ON > against an already online core, and the kernel indeed does not allow > that, so if the core sees its reset state struct locked, it will return > a non-spec error code PENDING_ON. This protects the hypervisor state and "non-spec" as in "outside of the PSCI specification"? Err... > avoids the need for more complicated locking and/or tracking power state > of individual cores. 
> > Signed-off-by: David Brazdil > --- > arch/arm64/kvm/hyp/nvhe/psci-relay.c | 39 +++- > 1 file changed, 38 insertions(+), 1 deletion(-) > > diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c > b/arch/arm64/kvm/hyp/nvhe/psci-relay.c > index 2daf52b59846..313ef42f0eab 100644 > --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c > +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c > @@ -121,6 +121,39 @@ static void release_reset_state(struct > kvm_host_psci_state *cpu_state) > atomic_set_release(&cpu_state->pending_on, 0); > } > > +static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) > +{ > + u64 power_state = host_ctxt->regs.regs[1]; > + unsigned long pc = host_ctxt->regs.regs[2]; > + unsigned long r0 = host_ctxt->regs.regs[3]; > + struct kvm_host_psci_state *cpu_state; > + struct kvm_nvhe_init_params *cpu_params; > + int ret; > + > + cpu_state = this_cpu_ptr(&kvm_host_psci_state); > + cpu_params = this_cpu_ptr(&kvm_init_params); > + > + /* > + * Lock the reset state struct. This fails if the host has concurrently > + * called CPU_ON with this CPU as target. The kernel keeps track of > + * online CPUs, so that should never happen. If it does anyway, return > + * a non-spec error. This avoids the need for spinlocks. > + */ > + if (!try_acquire_reset_state(cpu_state, pc, r0)) > + return PSCI_RET_ALREADY_ON; So that's the core of the problem. I'm definitely not keen on EL2 returning unspecified error codes. But there is something I don't get: If the CPU is currently booting (reset state is locked), it means that CPU hasn't reached the EL1 kernel yet. So how can this same CPU issue a CPU_SUSPEND from EL1? CPU_SUSPEND can't be called for a third party, only by a CPU for itself. It looks like this case cannot happen by construction. And if it happens, it looks like the only course of action should be to panic, as we have lost track of the running CPUs. Am I missing something obvious? 
> + > + /* > + * Will either return if shallow sleep state, or wake up into the entry > + * point if it is a deep sleep state. > + */ > + ret = psci_call(func_id, power_state, > + __hyp_pa(hyp_symbol_addr(__kvm_hyp_cpu_entry)), > + __hyp_pa(cpu_params)); > + > + release_reset_state(cpu_state); > + return ret; > +} > + > static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt) > { > u64 mpidr = host_ctxt->regs.regs[1]; > @@ -178,7 +211,9 @@ asmlinkage void __noreturn __kvm_hyp_psci_cpu_entry(void) > > static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context > *host_ctxt) > { > - if (func_id == kvm_host_psci_function_id[PSCI_FN_CPU_OFF]) > + if (func_id == kvm_host_psci_function_id[PSCI_FN_CPU_SUSPEND]) > + return psci_cpu_suspend(func_id, host_ctxt); > + else if (func_id == kvm_host_psci_function_id[PSCI_FN_CPU_OFF]) > return psci_forward(host_ctxt); > else if (func_id == kvm_host_psci_function_id[PSCI_FN_CPU_ON]) > return psci_cpu_on(func_id, host_ctxt); > @@ -202,6 +237,8 @@ static unsigned long psci_0_2_handler(u64 func_id, struct > kvm_cpu_context *host_ > case PSCI_0_2_FN_SYSTEM_RESET: > psci_forward_noreturn(host_ctxt); > unreachable(); > + case PSCI_0_2_FN64_CPU_SUSPEND: > + return psci_cpu_suspend(func_id, host_ctxt); > case PSCI_0_2_FN64_CPU_ON: > return psci_cpu_on(func_id, host_ctxt); > default: > -- > 2.29.2.299.gdc1121823c-goog > > Thanks, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH v2 19/24] kvm: arm64: Intercept host's PSCI_CPU_ON SMCs
On Mon, 16 Nov 2020 20:43:13 +, David Brazdil wrote: > > Add a handler of the CPU_ON PSCI call from host. When invoked, it looks > up the logical CPU ID corresponding to the provided MPIDR and populates > the state struct of the target CPU with the provided x0, pc. It then > calls CPU_ON itself, with an entry point in hyp that initializes EL2 > state before returning ERET to the provided PC in EL1. > > There is a simple atomic lock around the reset state struct. If it is > already locked, CPU_ON will return PENDING_ON error code. > > Signed-off-by: David Brazdil > --- > arch/arm64/include/asm/kvm_asm.h | 8 ++- > arch/arm64/kvm/arm.c | 1 + > arch/arm64/kvm/hyp/nvhe/psci-relay.c | 104 +++ > 3 files changed, 110 insertions(+), 3 deletions(-) > > diff --git a/arch/arm64/include/asm/kvm_asm.h > b/arch/arm64/include/asm/kvm_asm.h > index 109867fb76f6..2e36ba4be748 100644 > --- a/arch/arm64/include/asm/kvm_asm.h > +++ b/arch/arm64/include/asm/kvm_asm.h > @@ -175,9 +175,11 @@ struct kvm_s2_mmu; > DECLARE_KVM_NVHE_SYM(__kvm_hyp_init); > DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector); > DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); > -#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) > -#define __kvm_hyp_host_vectorCHOOSE_NVHE_SYM(__kvm_hyp_host_vector) > -#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) > +DECLARE_KVM_NVHE_SYM(__kvm_hyp_psci_cpu_entry); > +#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) > +#define __kvm_hyp_host_vector > CHOOSE_NVHE_SYM(__kvm_hyp_host_vector) > +#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) > +#define __kvm_hyp_psci_cpu_entry > CHOOSE_NVHE_SYM(__kvm_hyp_psci_cpu_entry) > > extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; > DECLARE_KVM_NVHE_SYM(__per_cpu_start); > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > index 7d2270eeecfb..c76a8e5bd19c 100644 > --- a/arch/arm64/kvm/arm.c > +++ b/arch/arm64/kvm/arm.c > @@ -1365,6 +1365,7 @@ static void cpu_init_hyp_mode(void) > > params->vector_hyp_va = 
(unsigned > long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector)); > params->stack_hyp_va = > kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE); > + params->entry_hyp_va = (unsigned > long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_psci_cpu_entry)); It feels really odd to use a per-CPU variable to keep track of something that is essentially a constant. Why can't we just have an assembly version of __kimg_hyp_va() and use that to compute the branch target directly in __kvm_hyp_cpu_entry()? __kvm_hyp_host_vector is another one. > params->pgd_pa = kvm_mmu_get_httbr(); > > /* > diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c > b/arch/arm64/kvm/hyp/nvhe/psci-relay.c > index 7542de8bd679..2daf52b59846 100644 > --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c > +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c > @@ -9,10 +9,15 @@ > #include > #include > #include > +#include > #include > #include > #include > > +#define INVALID_CPU_ID UINT_MAX > + > +extern char __kvm_hyp_cpu_entry[]; > + > /* Config options set by the host. */ > u32 __ro_after_init kvm_host_psci_version = PSCI_VERSION(0, 0); > u32 __ro_after_init kvm_host_psci_function_id[PSCI_FN_MAX]; > @@ -20,6 +25,14 @@ s64 __ro_after_init hyp_physvirt_offset; > > #define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset) > > +struct kvm_host_psci_state { > + atomic_t pending_on; > + unsigned long pc; > + unsigned long r0; > +}; > + > +static DEFINE_PER_CPU(struct kvm_host_psci_state, kvm_host_psci_state); > + > static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) > { > return host_ctxt->regs.regs[0]; > @@ -76,10 +89,99 @@ static __noreturn unsigned long > psci_forward_noreturn(struct kvm_cpu_context *ho > hyp_panic(); /* unreachable */ > } > > +static unsigned int find_cpu_id(u64 mpidr) > +{ > + int i; nit: unsigned int? > + > + if (mpidr != INVALID_HWID) { This is a little ugly on the side [(c) FZ], and deserves a comment ("Reject MPIDRs matching the init value of the __cpu_logical_map[] array"?). 
Also, I personally prefer a construct that reduces the nesting: if (mpidr == INVALID_HWID) return INVALID_CPU_ID; > + for (i = 0; i < NR_CPUS; i++) { > + if (cpu_logical_map(i) == mpidr) > + return i; > + } > + } > + > + return INVALID_CPU_ID; > +} > + > +static bool try_acquire_reset_state(struct kvm_host_psci_state *cpu_state, > + unsigned long pc, unsigned long r0) > +{ > + if (atomic_cmpxchg_acquire(&cpu_state->pending_on, 0, 1) != 0) What guarantees that this cmpxchg is inlined here? Also, having some names for 0 and 1 would be nice. > + return false; > + > + cpu_state->pc = pc; > + cpu_s
Re: [PATCH v2 15/24] kvm: arm64: Extract parts of el2_setup into a macro
On Mon, 16 Nov 2020 20:43:09 +, David Brazdil wrote: > > When the a CPU is booted in EL2, the kernel checks for VHE support and > initializes the CPU core accordingly. For nVHE it also installs the stub > vectors and drops down to EL1. > > Once KVM gains the ability to boot cores without going through the > kernel entry point, it will need to initialize the CPU the same way. > Extract the relevant bits of el2_setup into an init_el2_state macro > with an argument specifying whether to initialize for VHE or nVHE. > > No functional change. Size of el2_setup increased by 148 bytes due > to duplication. > > Signed-off-by: David Brazdil > --- > arch/arm64/include/asm/el2_setup.h | 185 + > arch/arm64/kernel/head.S | 144 +++--- > 2 files changed, 201 insertions(+), 128 deletions(-) > create mode 100644 arch/arm64/include/asm/el2_setup.h > > diff --git a/arch/arm64/include/asm/el2_setup.h > b/arch/arm64/include/asm/el2_setup.h > new file mode 100644 > index ..e5026e0aa878 > --- /dev/null > +++ b/arch/arm64/include/asm/el2_setup.h > @@ -0,0 +1,185 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (C) 2012,2013 - ARM Ltd > + * Author: Marc Zyngier > + */ > + > +#ifndef __ARM_KVM_INIT_H__ > +#define __ARM_KVM_INIT_H__ > + > +#ifndef __ASSEMBLY__ > +#error Assembly-only header > +#endif > + > +#ifdef CONFIG_ARM_GIC_V3 > +#include > +#endif > + > +#include > +#include > +#include > + > +.macro __init_el2_sctlr > + mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) > + msr sctlr_el2, x0 > + isb > +.endm > + > +/* > + * Allow Non-secure EL1 and EL0 to access physical timer and counter. > + * This is not necessary for VHE, since the host kernel runs in EL2, > + * and EL0 accesses are configured in the later stage of boot process. > + * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout > + * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined > + * to access CNTHCTL_EL2. 
This allows the kernel designed to run at EL1 > + * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in > + * EL2. > + */ > +.macro __init_el2_timers mode > +.ifeqs "\mode", "nvhe" > + mrs x0, cnthctl_el2 > + orr x0, x0, #3 // Enable EL1 physical timers > + msr cnthctl_el2, x0 > +.endif > + msr cntvoff_el2, xzr// Clear virtual offset > +.endm > + > +.macro __init_el2_debug mode > + mrs x1, id_aa64dfr0_el1 > + sbfxx0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 > + cmp x0, #1 > + b.lt1f // Skip if no PMU present > + mrs x0, pmcr_el0// Disable debug access traps > + ubfxx0, x0, #11, #5 // to EL2 and allow access to > +1: > + cselx2, xzr, x0, lt // all PMU counters from EL1 > + > + /* Statistical profiling */ > + ubfxx0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 > + cbz x0, 3f // Skip if SPE not present > + > +.ifeqs "\mode", "nvhe" > + mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2, > + and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT) > + cbnzx0, 2f // then permit sampling of > physical > + mov x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \ > + 1 << SYS_PMSCR_EL2_PA_SHIFT) > + msr_s SYS_PMSCR_EL2, x0 // addresses and physical > counter > +2: > + mov x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) > + orr x2, x2, x0 // If we don't have VHE, then > + // use EL1&0 translation. > +.else > + orr x2, x2, #MDCR_EL2_TPMS // For VHE, use EL2 translation > + // and disable access from EL1 > +.endif > + > +3: > + msr mdcr_el2, x2// Configure debug traps > +.endm > + > +/* LORegions */ > +.macro __init_el2_lor > + mrs x1, id_aa64mmfr1_el1 > + ubfxx0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4 > + cbz x0, 1f > + msr_s SYS_LORC_EL1, xzr > +1: > +.endm > + > +/* Stage-2 translation */ > +.macro __init_el2_stage2 > + msr vttbr_el2, xzr > +.endm > + > +/* GICv3 system register access */ > +#ifdef CONFIG_ARM_GIC_V3 nit: this #ifdef isn't relevant anymore and can be dropped throughout the file. > +.macro __init_el2_gicv3 > + mrs x0
Re: [PATCH v2 07/24] kvm: arm64: Refactor handle_trap to use a switch
On Mon, 16 Nov 2020 20:43:01 +, David Brazdil wrote: > > Small refactor so that nVHE's handle_trap uses a switch on the Exception > Class value of ESR_EL2 in preparation for adding a handler of SMC32/64. nit: SMC32 seems to be a leftover from the previous version. > > Signed-off-by: David Brazdil > --- > arch/arm64/kvm/hyp/nvhe/hyp-main.c | 15 --- > 1 file changed, 8 insertions(+), 7 deletions(-) > > diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c > b/arch/arm64/kvm/hyp/nvhe/hyp-main.c > index 411b0f652417..19332c20fcde 100644 > --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c > +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c > @@ -16,9 +16,9 @@ > > DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); > > -static void handle_host_hcall(unsigned long func_id, > - struct kvm_cpu_context *host_ctxt) > +static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) > { > + unsigned long func_id = host_ctxt->regs.regs[0]; > unsigned long ret = 0; > > switch (func_id) { > @@ -109,11 +109,12 @@ static void handle_host_hcall(unsigned long func_id, > void handle_trap(struct kvm_cpu_context *host_ctxt) > { > u64 esr = read_sysreg_el2(SYS_ESR); > - unsigned long func_id; > > - if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64) > + switch (ESR_ELx_EC(esr)) { > + case ESR_ELx_EC_HVC64: > + handle_host_hcall(host_ctxt); > + break; > + default: > hyp_panic(); > - > - func_id = host_ctxt->regs.regs[0]; > - handle_host_hcall(func_id, host_ctxt); > + } > } > -- > 2.29.2.299.gdc1121823c-goog > > Thanks, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH v2 06/24] kvm: arm64: Move hyp-init params to a per-CPU struct
On Mon, 16 Nov 2020 20:43:00 +, David Brazdil wrote: > > Once we start initializing KVM on newly booted cores before the rest of > the kernel, parameters to __do_hyp_init will need to be provided by EL2 > rather than EL1. At that point it will not be possible to pass its four > arguments directly because PSCI_CPU_ON only supports one context > argument. > > Refactor __do_hyp_init to accept its parameters in a struct. This > prepares the code for KVM booting cores as well as removes any limits on > the number of __do_hyp_init arguments. > > Signed-off-by: David Brazdil > --- > arch/arm64/include/asm/kvm_asm.h | 7 +++ > arch/arm64/include/asm/kvm_hyp.h | 4 > arch/arm64/kernel/asm-offsets.c| 4 > arch/arm64/kvm/arm.c | 26 ++ > arch/arm64/kvm/hyp/nvhe/hyp-init.S | 21 ++--- > arch/arm64/kvm/hyp/nvhe/hyp-main.c | 2 ++ > 6 files changed, 41 insertions(+), 23 deletions(-) > > diff --git a/arch/arm64/include/asm/kvm_asm.h > b/arch/arm64/include/asm/kvm_asm.h > index 54387ccd1ab2..01904e88cead 100644 > --- a/arch/arm64/include/asm/kvm_asm.h > +++ b/arch/arm64/include/asm/kvm_asm.h > @@ -150,6 +150,13 @@ extern void *__vhe_undefined_symbol; > > #endif > > +struct kvm_nvhe_init_params { > + unsigned long tpidr_el2; > + unsigned long vector_hyp_va; > + unsigned long stack_hyp_va; > + phys_addr_t pgd_pa; > +}; > + > /* Translate a kernel address @ptr into its equivalent linear mapping */ > #define kvm_ksym_ref(ptr)\ > ({ \ > diff --git a/arch/arm64/include/asm/kvm_hyp.h > b/arch/arm64/include/asm/kvm_hyp.h > index 6b664de5ec1f..a3289071f3d8 100644 > --- a/arch/arm64/include/asm/kvm_hyp.h > +++ b/arch/arm64/include/asm/kvm_hyp.h > @@ -15,6 +15,10 @@ > DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); > DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); > > +#ifdef __KVM_NVHE_HYPERVISOR__ > +DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); > +#endif I'm not sure we should bother with this #ifdefery. 
Having the declaration present at all times doesn't really hurt, since it is only defined in the HYP code. Cutting down on the conditionals would certainly help readability. > + > #define read_sysreg_elx(r,nvh,vh)\ > ({ \ > u64 reg;\ > diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c > index 7d32fc959b1a..4435ad8be938 100644 > --- a/arch/arm64/kernel/asm-offsets.c > +++ b/arch/arm64/kernel/asm-offsets.c > @@ -110,6 +110,10 @@ int main(void) >DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, > sys_regs[APGAKEYLO_EL1])); >DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, > __hyp_running_vcpu)); >DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt)); > + DEFINE(NVHE_INIT_TPIDR_EL2,offsetof(struct kvm_nvhe_init_params, > tpidr_el2)); > + DEFINE(NVHE_INIT_VECTOR_HYP_VA,offsetof(struct kvm_nvhe_init_params, > vector_hyp_va)); > + DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, > stack_hyp_va)); > + DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa)); > #endif > #ifdef CONFIG_CPU_PM >DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > index c0ffb019ca8b..4838556920fb 100644 > --- a/arch/arm64/kvm/arm.c > +++ b/arch/arm64/kvm/arm.c > @@ -50,6 +50,7 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); > > static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); > unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; > +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); > > /* The VMID used in the VTTBR */ > static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); > @@ -1347,10 +1348,7 @@ static int kvm_map_vectors(void) > > static void cpu_init_hyp_mode(void) > { > - phys_addr_t pgd_ptr; > - unsigned long hyp_stack_ptr; > - unsigned long vector_ptr; > - unsigned long tpidr_el2; > + struct kvm_nvhe_init_params *params = > this_cpu_ptr_nvhe_sym(kvm_init_params); > struct arm_smccc_res 
res; > > /* Switch from the HYP stub to our own HYP init vector */ > @@ -1361,13 +1359,18 @@ static void cpu_init_hyp_mode(void) >* kernel's mapping to the linear mapping, and store it in tpidr_el2 >* so that we can use adr_l to access per-cpu variables in EL2. >*/ > - tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - > - (unsigned > long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); > + params->tpidr_el2 = (unsigned > long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - > +
Re: [PATCH 1/2] drm/meson: dw-hdmi: Disable clocks on driver teardown
On 2020-11-23 14:03, Jerome Brunet wrote: On Fri 20 Nov 2020 at 10:42, Marc Zyngier wrote: The HDMI driver request clocks early, but never disable them, leaving the clocks on even when the driver is removed. Fix it by slightly refactoring the clock code, and register a devm action that will eventually disable/unprepare the enabled clocks. Signed-off-by: Marc Zyngier --- drivers/gpu/drm/meson/meson_dw_hdmi.c | 43 ++- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c b/drivers/gpu/drm/meson/meson_dw_hdmi.c index 7f8eea494147..29623b309cb1 100644 --- a/drivers/gpu/drm/meson/meson_dw_hdmi.c +++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c @@ -145,8 +145,6 @@ struct meson_dw_hdmi { struct reset_control *hdmitx_apb; struct reset_control *hdmitx_ctrl; struct reset_control *hdmitx_phy; - struct clk *hdmi_pclk; - struct clk *venci_clk; struct regulator *hdmi_supply; u32 irq_stat; struct dw_hdmi *hdmi; @@ -946,6 +944,29 @@ static void meson_disable_regulator(void *data) regulator_disable(data); } +static void meson_disable_clk(void *data) +{ + clk_disable_unprepare(data); +} + +static int meson_enable_clk(struct device *dev, char *name) +{ + struct clk *clk; + int ret; + + clk = devm_clk_get(dev, name); + if (IS_ERR(clk)) { + dev_err(dev, "Unable to get %s pclk\n", name); + return PTR_ERR(clk); + } + + ret = clk_prepare_enable(clk); + if (!ret) + ret = devm_add_action_or_reset(dev, meson_disable_clk, clk); Thanks for fixing this Marc. FYI, while it is fine to declare a function to disable the clocks, a quick cast may avoid it devm_add_action_or_reset(dev, (void(*)(void *))clk_disable_unprepare, clk); While this works for now, a change to the clk_disable_unprepare() prototype (such as adding a second argument) would now go completely unnoticed (after all, you've cast the function, it *must* be correct, right?), and someone would spend a few hours trying to track down memory corruption or some other interesting results. 
Yes, casting C functions can be hilarious. I can see a few uses of this hack in the tree, and I have my pop-corn ready. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v2 04/24] arm64: Move MAIR_EL1_SET to asm/memory.h
On Mon, 16 Nov 2020 20:42:58 +, David Brazdil wrote: > > KVM currently initializes MAIR_EL2 to the value of MAIR_EL1. In > preparation for initializing MAIR_EL2 before MAIR_EL1, move the constant > into a shared header file. Since it is used for EL1 and EL2, rename to > MAIR_ELx_SET. > > Signed-off-by: David Brazdil > --- > arch/arm64/include/asm/memory.h | 29 ++--- > arch/arm64/include/asm/sysreg.h | 30 ++ > arch/arm64/mm/proc.S| 15 +-- > 3 files changed, 45 insertions(+), 29 deletions(-) > > diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h > index cd61239bae8c..8ae8fd883a0c 100644 > --- a/arch/arm64/include/asm/memory.h > +++ b/arch/arm64/include/asm/memory.h > @@ -13,6 +13,7 @@ > #include > #include > #include > +#include > > /* > * Size of the PCI I/O space. This must remain a power of two so that > @@ -124,21 +125,6 @@ > */ > #define SEGMENT_ALIGNSZ_64K > > -/* > - * Memory types available. > - * > - * IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in > - * the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note > - * that protection_map[] only contains MT_NORMAL attributes. > - */ > -#define MT_NORMAL0 > -#define MT_NORMAL_TAGGED 1 > -#define MT_NORMAL_NC 2 > -#define MT_NORMAL_WT 3 > -#define MT_DEVICE_nGnRnE 4 > -#define MT_DEVICE_nGnRE 5 > -#define MT_DEVICE_GRE6 > - > /* > * Memory types for Stage-2 translation > */ > @@ -152,6 +138,19 @@ > #define MT_S2_FWB_NORMAL 6 > #define MT_S2_FWB_DEVICE_nGnRE 1 > > +/* > + * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory > and > + * changed during __cpu_setup to Normal Tagged if the system supports MTE. 
> + */ > +#define MAIR_ELx_SET \ > + (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ > + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) |\ > + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) |\ > + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ > + MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |\ > + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) | \ > + MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED)) > + > #ifdef CONFIG_ARM64_4K_PAGES > #define IOREMAP_MAX_ORDER(PUD_SHIFT) > #else > diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h > index e2ef4c2edf06..24e773414cb4 100644 > --- a/arch/arm64/include/asm/sysreg.h > +++ b/arch/arm64/include/asm/sysreg.h > @@ -635,6 +635,34 @@ > /* Position the attr at the correct index */ > #define MAIR_ATTRIDX(attr, idx) ((attr) << ((idx) * 8)) > > +/* > + * Memory types available. > + * > + * IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in > + * the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note > + * that protection_map[] only contains MT_NORMAL attributes. > + */ > +#define MT_NORMAL0 > +#define MT_NORMAL_TAGGED 1 > +#define MT_NORMAL_NC 2 > +#define MT_NORMAL_WT 3 > +#define MT_DEVICE_nGnRnE 4 > +#define MT_DEVICE_nGnRE 5 > +#define MT_DEVICE_GRE6 > + > +/* > + * Default MAIR_ELx. MT_NORMAL_TAGGED is initially mapped as Normal memory > and > + * changed during __cpu_setup to Normal Tagged if the system supports MTE. 
> + */ > +#define MAIR_ELx_SET \ > + (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ > + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) |\ > + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) |\ > + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ > + MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |\ > + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) | \ > + MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED)) > + > /* id_aa64isar0 */ > #define ID_AA64ISAR0_RNDR_SHIFT 60 > #define ID_AA64ISAR0_TLB_SHIFT 56 > @@ -992,6 +1020,7 @@ > /* Safe value for MPIDR_EL1: Bit31:RES1, Bit30:U:0, Bit24:MT:0 */ > #define SYS_MPIDR_SAFE_VAL (BIT(31)) > > +#ifndef LINKER_SCRIPT This is terribly ugly. Why is this included by the linker script? Does it actually define __ASSEMBLY__? > #ifdef __ASSEMBLY__ > > .irp > num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 > @@ -1109,5 +1138,6 @@ > }) > > #endif > +#endif /* LINKER_SCRIPT */ > > #endif /* __ASM_SYSREG_H */ > diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S > index 23c326a06b2d..e3b9aa372b96 100644 > --- a/arch/arm64/mm/proc.S > +++ b/arch/arm64/mm
Re: [PATCH v2 02/24] psci: Accessor for configured PSCI function IDs
On Mon, 16 Nov 2020 20:42:56 +, David Brazdil wrote: > > Function IDs used by PSCI are configurable for v0.1 via DT/APCI. If the > host is using PSCI v0.1, KVM's host PSCI proxy needs to use the same IDs. > Expose the array holding the information with a read-only accessor. > > Signed-off-by: David Brazdil > --- > drivers/firmware/psci/psci.c | 14 ++ > include/linux/psci.h | 10 ++ > 2 files changed, 16 insertions(+), 8 deletions(-) > > diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c > index 213c68418a65..d835f3d8b121 100644 > --- a/drivers/firmware/psci/psci.c > +++ b/drivers/firmware/psci/psci.c > @@ -58,16 +58,14 @@ typedef unsigned long (psci_fn)(unsigned long, unsigned > long, > unsigned long, unsigned long); > static psci_fn *invoke_psci_fn; > > -enum psci_function { > - PSCI_FN_CPU_SUSPEND, > - PSCI_FN_CPU_ON, > - PSCI_FN_CPU_OFF, > - PSCI_FN_MIGRATE, > - PSCI_FN_MAX, > -}; > - > static u32 psci_function_id[PSCI_FN_MAX]; > > +u32 psci_get_function_id(enum psci_function fn) > +{ > + WARN_ON(fn >= PSCI_FN_MAX); If we are going to warn on something that is out of bounds, maybe we shouldn't perform the access at all? And maybe check for lower bound as well? 
> + return psci_function_id[fn]; > +} > + > #define PSCI_0_2_POWER_STATE_MASK\ > (PSCI_0_2_POWER_STATE_ID_MASK | \ > PSCI_0_2_POWER_STATE_TYPE_MASK | \ > diff --git a/include/linux/psci.h b/include/linux/psci.h > index 2a1bfb890e58..5b49a5c82d6f 100644 > --- a/include/linux/psci.h > +++ b/include/linux/psci.h > @@ -21,6 +21,16 @@ bool psci_power_state_is_valid(u32 state); > int psci_set_osi_mode(bool enable); > bool psci_has_osi_support(void); > > +enum psci_function { > + PSCI_FN_CPU_SUSPEND, > + PSCI_FN_CPU_ON, > + PSCI_FN_CPU_OFF, > + PSCI_FN_MIGRATE, > + PSCI_FN_MAX, > +}; > + > +u32 psci_get_function_id(enum psci_function fn); > + > struct psci_operations { > u32 (*get_version)(void); > int (*cpu_suspend)(u32 state, unsigned long entry_point); > -- > 2.29.2.299.gdc1121823c-goog > > Thanks, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH v2 00/24] Opt-in always-on nVHE hypervisor
On Mon, 16 Nov 2020 20:42:54 +, David Brazdil wrote: > > As we progress towards being able to keep guest state private to the > host running nVHE hypervisor, this series allows the hypervisor to > install itself on newly booted CPUs before the host is allowed to run > on them. > > All functionality described below is opt-in, guarded by an early param > 'kvm-arm.protected'. Future patches specific to the new "protected" mode > should be hidden behind the same param. > > The hypervisor starts trapping host SMCs and intercepting host's PSCI > CPU_ON/SUSPEND calls. It replaces the host's entry point with its own, > initializes the EL2 state of the new CPU and installs the nVHE hyp vector > before ERETing to the host's entry point. > > The kernel checks new cores' features against the finalized system > capabilities. To avoid the need to move this code/data to EL2, the > implementation only allows to boot cores that were online at the time of > KVM initialization and therefore had been checked already. > > Other PSCI SMCs are forwarded to EL3, though only the known set of SMCs > implemented in the kernel is allowed. Non-PSCI SMCs are also forwarded > to EL3. Future changes will need to ensure the safety of all SMCs wrt. > private guests. > > The host is still allowed to reset EL2 back to the stub vector, eg. for > hibernation or kexec, but will not disable nVHE when there are no VMs. > > Tested on Rock Pi 4b, based on 5.10-rc4. Adding Lorenzo and Sudeep for the PSCI side of things. Thanks, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH v2 1/3] genirq/affinity: Add irq_update_affinity_desc()
Hi John, On 2020-11-23 12:54, John Garry wrote: Hi Marc, Right, but if the driver is removed then the interrupts should be deallocated, right? When removing the driver we just call free_irq(), which removes the handler and disables the interrupt. But about the irq_desc, this is created when the mapping is created in irq_create_fwspec_mapping(), and I don't see this being torn down in the driver removal, so persistent in that regard. If the irq_descs are created via the platform_get_irq() calls in platform_get_irqs_affinity(), I'd expect some equivalent helper to tear things down as a result, calling irq_dispose_mapping() behind the scenes. So this looks lacking in the kernel AFAICS... So is there a reason for which irq dispose mapping is not a requirement for drivers when finished with the irq? Because of shared interrupts? For a bunch of reasons: IRQ numbers used to be created 1:1 with their physical counterpart, so there wasn't a need to "get rid" of the associated data structures. Also, people expected their drivers to be there for the lifetime of the system (believe it or not, hotplug devices are pretty new!). Shared interrupts are just another part of the problem (although we should be able to work out whether there is any action attached to the descriptor before blowing it away). So for pci msi I can see that we free the irq_desc in pci_disable_msi() -> free_msi_irqs() -> msi_domain_free_irqs() ... So what am I missing here? I'm not sure the paths are strictly equivalent. On the PCI side, we can have something that is completely driver agnostic, as it is all architectural. In your case, only the endpoint driver knows about what happens, and needs to free things accordingly. Finally, there is the issue in your driver that everything is requested using devm_request_irq, which cannot play nicely with an explicit irq_desc teardown. You'll probably need to provide the equivalent devm helpers for your driver to safely be taken down. 
Yeah, so since we use the devm irq request method, we also need a devm dispose release method as we can't dispose the irq mapping in the remove() method, prior to the irq_free() in the later devm release method. But it looks like there is more to it than that, which I'm worried is far from trivial. For example, just calling irq_dispose_mapping() for removal and then platform_get_irq()->acpi_get_irq() a second time fails, as it looks like more tidy-up is needed for removal... Most probably. I could imagine things failing if there is any trace of an existing translation in the ITS or in the platform-MSI layer, for example, or if the interrupt is still active... Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v15 6/9] arm64/kvm: Add hypercall service for kvm ptp.
On 2020-11-23 10:44, Marc Zyngier wrote: On 2020-11-11 06:22, Jianyong Wu wrote: ptp_kvm will get this service through SMCC call. The service offers wall time and cycle count of host to guest. The caller must specify whether they want the host cycle count or the difference between host cycle count and cntvoff. Signed-off-by: Jianyong Wu --- arch/arm64/kvm/hypercalls.c | 61 + include/linux/arm-smccc.h | 17 +++ 2 files changed, 78 insertions(+) diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index b9d8607083eb..f7d189563f3d 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -9,6 +9,51 @@ #include #include +static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) +{ + struct system_time_snapshot systime_snapshot; + u64 cycles = ~0UL; + u32 feature; + + /* +* system time and counter value must captured in the same +* time to keep consistency and precision. +*/ + ktime_get_snapshot(&systime_snapshot); + + // binding ptp_kvm clocksource to arm_arch_counter + if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER) + return; + + val[0] = upper_32_bits(systime_snapshot.real); + val[1] = lower_32_bits(systime_snapshot.real); What is the endianness of these values? I can't see it defined anywhere, and this is likely not to work if guest and hypervisor don't align. Scratch that. This is all passed via registers, so the endianness of the data is irrelevant. Please discard any comment about endianness I made in this review. The documentation aspect still requires to be beefed up. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v15 8/9] doc: add ptp_kvm introduction for arm64 support
On 2020-11-11 06:22, Jianyong Wu wrote: PTP_KVM implementation depends on hypercall using SMCCC. So we introduce a new SMCCC service ID. This doc explains how does the ID define and how does PTP_KVM works on arm/arm64. Signed-off-by: Jianyong Wu --- Documentation/virt/kvm/api.rst | 9 +++ Documentation/virt/kvm/arm/index.rst | 1 + Documentation/virt/kvm/arm/ptp_kvm.rst | 29 + Documentation/virt/kvm/timekeeping.rst | 35 ++ 4 files changed, 74 insertions(+) create mode 100644 Documentation/virt/kvm/arm/ptp_kvm.rst diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 36d5f1f3c6dd..9843dbcbf770 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6391,3 +6391,12 @@ When enabled, KVM will disable paravirtual features provided to the guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf (0x4001). Otherwise, a guest may use the paravirtual features regardless of what has actually been exposed through the CPUID leaf. + +8.27 KVM_CAP_PTP_KVM + + +:Architectures: arm64 + +This capability indicates that KVM virtual PTP service is supported in host. +It must company with the implementation of KVM virtual PTP service in host +so VMM can probe if there is the service in host by checking this capability. diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index 3e2b2aba90fc..78a9b670aafe 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -10,3 +10,4 @@ ARM hyp-abi psci pvtime + ptp_kvm diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst new file mode 100644 index ..bb1e6cfefe44 --- /dev/null +++ b/Documentation/virt/kvm/arm/ptp_kvm.rst @@ -0,0 +1,29 @@ +.. SPDX-License-Identifier: GPL-2.0 + +PTP_KVM support for arm/arm64 += + +PTP_KVM is used for time sync between guest and host in a high precision. 
+It needs to get the wall time and counter value from the host and transfer these +to guest via hypercall service. So one more hypercall service has been added. + +This new SMCCC hypercall is defined as: + +* ARM_SMCCC_HYP_KVM_PTP_FUNC_ID: 0x8601 + +As both 32 and 64-bits ptp_kvm client should be supported, we choose SMC32/HVC32 +calling convention. + +ARM_SMCCC_HYP_KVM_PTP_FUNC_ID: + +===== +Function ID: (uint32) 0x8601 +Arguments: (uint32) ARM_PTP_PHY_COUNTER(1) or ARM_PTP_VIRT_COUNTER(0) + which indicate acquiring physical counter or + virtual counter respectively. +return value:(uint32) NOT_SUPPORTED(-1) or val0 and val1 represent + wall clock time and val2 and val3 represent + counter cycle. This needs a lot more description: - Which word contains what part of the data (upper/lower part of the 64bit data) - The endianness of the data returned M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v15 7/9] ptp: arm/arm64: Enable ptp_kvm for arm/arm64
On 2020-11-11 06:22, Jianyong Wu wrote: Currently, there is no mechanism to keep time sync between guest and host in arm/arm64 virtualization environment. Time in guest will drift compared with host after boot up as they may both use third party time sources to correct their time respectively. The time deviation will be in order of milliseconds. But in some scenarios, like in a cloud environment, we ask for higher time precision. kvm ptp clock, which chooses the host clock source as a reference clock to sync time between guest and host, has been adopted by x86 which takes the time sync order from milliseconds to nanoseconds. This patch enables kvm ptp clock for arm/arm64 and improves clock sync precision significantly. Test result comparisons between with kvm ptp clock and without it in arm/arm64 are as follows. This test is derived from the result of command 'chronyc sources'. We should take more care of the last sample column which shows the offset between the local clock and the source at the last measurement. 
no kvm ptp in guest: MS Name/IP address Stratum Poll Reach LastRx Last sample ^* dns1.synet.edu.cn 2 6 37713 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 37721 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 37729 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 37737 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 37745 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 37753 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 37761 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 4 -130us[ +796us] +/- 21ms ^* dns1.synet.edu.cn 2 6 37712 -130us[ +796us] +/- 21ms ^* dns1.synet.edu.cn 2 6 37720 -130us[ +796us] +/- 21ms in host: MS Name/IP address Stratum Poll Reach LastRx Last sample ^* 120.25.115.20 2 7 37772 -470us[ -603us] +/- 18ms ^* 120.25.115.20 2 7 37792 -470us[ -603us] +/- 18ms ^* 120.25.115.20 2 7 377 112 -470us[ -603us] +/- 18ms ^* 120.25.115.20 2 7 377 2 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 37722 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 37743 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 37763 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 37783 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 377 103 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 377 123 +872ns[-6808ns] +/- 17ms The dns1.synet.edu.cn is the network reference clock for guest and 120.25.115.20 is the network reference clock for host. we can't get the clock error between guest and host directly, but a roughly estimated value will be in order of hundreds of us to ms. with kvm ptp in guest: chrony has been disabled in host to remove the disturb by network clock. 
MS Name/IP address Stratum Poll Reach LastRx Last sample * PHC00 3 377 8 -7ns[ +1ns] +/- 3ns * PHC00 3 377 8 +1ns[ +16ns] +/- 3ns * PHC00 3 377 6 -4ns[ -0ns] +/- 6ns * PHC00 3 377 6 -8ns[ -12ns] +/- 5ns * PHC00 3 377 5 +2ns[ +4ns] +/- 4ns * PHC00 3 37713 +2ns[ +4ns] +/- 4ns * PHC00 3 37712 -4ns[ -6ns] +/- 4ns * PHC00 3 37711 -8ns[ -11ns] +/- 6ns * PHC00 3 37710-14ns[ -20ns] +/- 4ns * PHC00 3 377 8 +4ns[ +5ns] +/- 4ns The PHC0 is the ptp clock which choose the host clock as its source clock. So we can see that the clock difference between host and guest is in order of ns. Signed-off-by: Jianyong Wu --- drivers/clocksource/arm_arch_timer.c | 28 ++ drivers/ptp/Kconfig | 2 +- drivers/ptp/Makefile | 1 + drivers/ptp/ptp_kvm_arm.c| 44 4 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 drivers/ptp/ptp_kvm_arm.c diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index d55acffb0b90..b33c5a663d30 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -1650,3 +1651,30 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) } TIMER
Re: [PATCH v15 6/9] arm64/kvm: Add hypercall service for kvm ptp.
On 2020-11-11 06:22, Jianyong Wu wrote: ptp_kvm will get this service through SMCC call. The service offers wall time and cycle count of host to guest. The caller must specify whether they want the host cycle count or the difference between host cycle count and cntvoff. Signed-off-by: Jianyong Wu --- arch/arm64/kvm/hypercalls.c | 61 + include/linux/arm-smccc.h | 17 +++ 2 files changed, 78 insertions(+) diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index b9d8607083eb..f7d189563f3d 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -9,6 +9,51 @@ #include #include +static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) +{ + struct system_time_snapshot systime_snapshot; + u64 cycles = ~0UL; + u32 feature; + + /* +* system time and counter value must captured in the same +* time to keep consistency and precision. +*/ + ktime_get_snapshot(&systime_snapshot); + + // binding ptp_kvm clocksource to arm_arch_counter + if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER) + return; + + val[0] = upper_32_bits(systime_snapshot.real); + val[1] = lower_32_bits(systime_snapshot.real); What is the endianness of these values? I can't see it defined anywhere, and this is likely not to work if guest and hypervisor don't align. + + /* +* which of virtual counter or physical counter being +* asked for is decided by the r1 value of SMCCC +* call. If no invalid r1 value offered, default cycle +* value(-1) will be returned. +* Note: keep in mind that feature is u32 and smccc_get_arg1 +* will return u64, so need auto cast here. +*/ + feature = smccc_get_arg1(vcpu); + switch (feature) { + case ARM_PTP_VIRT_COUNTER: + cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2); + break; + case ARM_PTP_PHY_COUNTER: + cycles = systime_snapshot.cycles; + break; + case ARM_PTP_NONE_COUNTER: What is this "NONE" counter? 
+ break; + default: + val[0] = SMCCC_RET_NOT_SUPPORTED; + break; + } + val[2] = upper_32_bits(cycles); + val[3] = lower_32_bits(cycles); Same problem as above. +} + int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) { u32 func_id = smccc_get_function(vcpu); @@ -79,6 +124,22 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_KVM_PTP); + break; + /* +* This serves virtual kvm_ptp. +* Four values will be passed back. +* reg0 stores high 32-bits of host ktime; +* reg1 stores low 32-bits of host ktime; +* For ARM_PTP_VIRT_COUNTER: +* reg2 stores high 32-bits of difference of host cycles and cntvoff; +* reg3 stores low 32-bits of difference of host cycles and cntvoff. +* For ARM_PTP_PHY_COUNTER: +* reg2 stores the high 32-bits of host cycles; +* reg3 stores the low 32-bits of host cycles. +*/ + case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: + kvm_ptp_get_time(vcpu, val); break; default: return kvm_psci_call(vcpu); diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index d75408141137..a03c5dd409d3 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -103,6 +103,7 @@ /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES0 +#define ARM_SMCCC_KVM_FUNC_KVM_PTP 1 I think having KVM once in the name is enough. #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS128 @@ -114,6 +115,22 @@ #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 +/* + * ptp_kvm is a feature used for time sync between vm and host. + * ptp_kvm module in guest kernel will get service from host using + * this hypercall ID. 
+ */ +#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32,\ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_KVM_PTP) + +/* ptp_kvm counter type ID */ +#define ARM_PTP_VIRT_COUNTER 0 +#define ARM_PTP_PHY_COUNTER1 +#define ARM_PTP_NONE_COUNTER 2 The architecture definitely doesn't have this last counter. + /* Paravirtualised time calls (defined by ARM DEN0057A) */ #define ARM_SMCCC_HV_PV_TIME_FEATURES \
Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side
On 2020-11-23 06:54, Shenming Lu wrote: From: Zenghui Yu When setting the forwarding path of a VLPI, it is more consistent to I'm not sure it is more consistent. It is a *new* behaviour, because it only matters for migration, which has been so far unsupported. also transfer the pending state from irq->pending_latch to VPT (especially in migration, the pending states of VLPIs are restored into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger a VLPI to pending. Signed-off-by: Zenghui Yu Signed-off-by: Shenming Lu --- arch/arm64/kvm/vgic/vgic-v4.c | 12 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index b5fa73c9fd35..cc3ab9cea182 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -418,6 +418,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, irq->host_irq= virq; atomic_inc(&map.vpe->vlpi_count); + /* Transfer pending state */ + ret = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + irq->pending_latch); + WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq); + + /* +* Let it be pruned from ap_list later and don't bother +* the List Register. +*/ + irq->pending_latch = false; It occurs to me that calling into irq_set_irqchip_state() for a large number of interrupts can take a significant amount of time. It is also odd that you dump the VPT with the VPE unmapped, but rely on the VPE being mapped for the opposite operation. Shouldn't these be symmetric, all performed while the VPE is unmapped? It would also save a lot of ITS traffic. + out: mutex_unlock(&its->its_lock); return ret; Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [RFC PATCH v1 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables
On 2020-11-23 06:54, Shenming Lu wrote: After pausing all vCPUs and devices capable of interrupting, in order ^ See my comment below about this. to save the information of all interrupts, besides flushing the pending states in kvm’s vgic, we also try to flush the states of VLPIs in the virtual pending tables into guest RAM, but we need to have GICv4.1 and safely unmap the vPEs first. Signed-off-by: Shenming Lu --- arch/arm64/kvm/vgic/vgic-v3.c | 62 +++ 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 9cdf39a94a63..e1b3aa4b2b12 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include +#include #include #include #include @@ -356,6 +358,39 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) return 0; } +/* + * With GICv4.1, we can get the VLPI's pending state after unmapping + * the vPE. The deactivation of the doorbell interrupt will trigger + * the unmapping of the associated vPE. + */ +static void get_vlpi_state_pre(struct vgic_dist *dist) +{ + struct irq_desc *desc; + int i; + + if (!kvm_vgic_global_state.has_gicv4_1) + return; + + for (i = 0; i < dist->its_vm.nr_vpes; i++) { + desc = irq_to_desc(dist->its_vm.vpes[i]->irq); + irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); + } +} + +static void get_vlpi_state_post(struct vgic_dist *dist) nit: the naming feels a bit... odd. Pre/post what? 
+{ + struct irq_desc *desc; + int i; + + if (!kvm_vgic_global_state.has_gicv4_1) + return; + + for (i = 0; i < dist->its_vm.nr_vpes; i++) { + desc = irq_to_desc(dist->its_vm.vpes[i]->irq); + irq_domain_activate_irq(irq_desc_get_irq_data(desc), false); + } +} + /** * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu lock must be held @@ -365,14 +400,17 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; gpa_t last_ptr = ~(gpa_t)0; - int ret; + int ret = 0; u8 val; + get_vlpi_state_pre(dist); + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { int byte_offset, bit_nr; struct kvm_vcpu *vcpu; gpa_t pendbase, ptr; bool stored; + bool is_pending = irq->pending_latch; vcpu = irq->target_vcpu; if (!vcpu) @@ -387,24 +425,36 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) if (ptr != last_ptr) { ret = kvm_read_guest_lock(kvm, ptr, &val, 1); if (ret) - return ret; + goto out; last_ptr = ptr; } stored = val & (1U << bit_nr); - if (stored == irq->pending_latch) + + /* also flush hw pending state */ This comment looks out of place, as we aren't flushing anything. + if (irq->hw) { + WARN_RATELIMIT(irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, &is_pending), + "IRQ %d", irq->host_irq); Isn't this going to warn like mad on a GICv4.0 system where this, by definition, will generate an error? + } + + if (stored == is_pending) continue; - if (irq->pending_latch) + if (is_pending) val |= 1 << bit_nr; else val &= ~(1 << bit_nr); ret = kvm_write_guest_lock(kvm, ptr, &val, 1); if (ret) - return ret; + goto out; } - return 0; + +out: + get_vlpi_state_post(dist); This bit worries me: you have unmapped the VPEs, so any interrupt that has been generated during that phase is now forever lost (the GIC doesn't have ownership of the pending tables). Do you really expect the VM to be restartable from that point? I don't see how this is possible. 
+ + return ret; } /** Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [RFC PATCH v1 1/4] irqchip/gic-v4.1: Plumb get_irqchip_state VLPI callback
On 2020-11-23 06:54, Shenming Lu wrote: From: Zenghui Yu Up to now, the irq_get_irqchip_state() callback of its_irq_chip leaves unimplemented since there is no architectural way to get the VLPI's pending state before GICv4.1. Yeah, there has one in v4.1 for VLPIs. With GICv4.1, after unmapping the vPE, which cleans and invalidates any caching of the VPT, we can get the VLPI's pending state by This is a crucial note: without this unmapping and invalidation, the pending bits are not generally accessible (they could be cached in a GIC private structure, cache or otherwise). peeking at the VPT. So we implement the irq_get_irqchip_state() callback of its_irq_chip to do it. Signed-off-by: Zenghui Yu Signed-off-by: Shenming Lu --- drivers/irqchip/irq-gic-v3-its.c | 38 1 file changed, 38 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0fec31931e11..287003cacac7 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1695,6 +1695,43 @@ static void its_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg); } +static bool its_peek_vpt(struct its_vpe *vpe, irq_hw_number_t hwirq) +{ + int mask = hwirq % BITS_PER_BYTE; nit: this isn't a mask, but a shift instead. BIT(hwirq % BPB) would give you a mask. + void *va; + u8 *pt; + + va = page_address(vpe->vpt_page); + pt = va + hwirq / BITS_PER_BYTE; + + return !!(*pt & (1U << mask)); +} + +static int its_irq_get_irqchip_state(struct irq_data *d, +enum irqchip_irq_state which, bool *val) +{ + struct its_device *its_dev = irq_data_get_irq_chip_data(d); + struct its_vlpi_map *map = get_vlpi_map(d); + + if (which != IRQCHIP_STATE_PENDING) + return -EINVAL; + + /* not intended for physical LPI's pending state */ + if (!map) + return -EINVAL; + + /* +* In GICv4.1, a VMAPP with {V,Alloc}=={0,1} cleans and invalidates +* any caching of the VPT associated with the vPEID held in the GIC. 
+*/ + if (!is_v4_1(its_dev->its) || atomic_read(&map->vpe->vmapp_count)) It isn't clear to me what prevents this from racing against a mapping of the VPE. Actually, since we only hold the LPI irqdesc lock, I'm pretty sure nothing prevents it. + return -EACCES; I can sort of buy EACCESS for a VPE that is currently mapped, but a non-4.1 ITS should definitely return EINVAL. + + *val = its_peek_vpt(map->vpe, map->vintid); + + return 0; +} + static int its_irq_set_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool state) @@ -1975,6 +2012,7 @@ static struct irq_chip its_irq_chip = { .irq_eoi= irq_chip_eoi_parent, .irq_set_affinity = its_set_affinity, .irq_compose_msi_msg= its_irq_compose_msi_msg, + .irq_get_irqchip_state = its_irq_get_irqchip_state, My biggest issue with this is that it isn't a reliable interface. It happens to work in the context of KVM, because you make sure it is called at the right time, but that doesn't make it safe in general (anyone with the interrupt number is allowed to call this at any time). Is there a problem with poking at the VPT page from the KVM side? The code should be exactly the same (maybe simpler even), and at least you'd be guaranteed to be in the correct context. .irq_set_irqchip_state = its_irq_set_irqchip_state, .irq_retrigger = its_irq_retrigger, .irq_set_vcpu_affinity = its_irq_set_vcpu_affinity, Thanks, M. -- Jazz is not dead. It just smells funny...
[GIT PULL] irqchip fixes for 5.10, take #2
Hi Thomas, Here a couple of patches from the irqchip department: a fix for a typo leading to incorrect trigger information being used on the Exiu driver, and another for save/restore with the GICv3 ITS. Please pull, M. The following changes since commit d95bdca75b3fb41bf185efe164e05aed820081a5: irqchip/ti-sci-inta: Add support for unmapped event handling (2020-11-01 12:00:50 +) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git tags/irqchip-fixes-5.10-2 for you to fetch changes up to 74cde1a53368aed4f2b4b54bf7030437f64a534b: irqchip/gic-v3-its: Unconditionally save/restore the ITS state on suspend (2020-11-22 12:58:35 +) irqchip fixes for Linux 5.10, take #2 - Fix Exiu driver trigger type when using ACPI - Fix GICv3 ITS suspend/resume to use the in-kernel path at all times, sidestepping braindead firmware support Chen Baozi (1): irqchip/exiu: Fix the index of fwspec for IRQ type Xu Qiang (1): irqchip/gic-v3-its: Unconditionally save/restore the ITS state on suspend drivers/irqchip/irq-gic-v3-its.c | 16 +++- drivers/irqchip/irq-sni-exiu.c | 2 +- 2 files changed, 4 insertions(+), 14 deletions(-)
Re: [PATCH 0/2] arm64: Allow the rescheduling IPI to bypass irq_enter/exit
On Fri, 20 Nov 2020 14:17:12 +, Thomas Gleixner wrote: Thomas, > So with the pre 5.8 scheduler IPI we had scheduler_ipi() doing wonderful > things [... tons of interesting and helpful stuff ...] > Hope that clarifies it. It does in a way, as it maps some of the low-level x86 things onto generic concepts. It certainly outlines that we have a lot of work ahead of us, none of which can be expected to be a quick fix. It requires restructuring a lot of the low-level arm64 code (Mark has been toying with it for a couple of years now), and rejig it in funny ways to match the expectations of the core code. I haven't tried to think about 32bit ARM just yet, and we may have to sever the IRQ ties that exist between the two architectures if we want to make forward progress in a reasonable time frame. In general, it looks that we'll need a new interface from the generic low-level IRQ entry code into this new common framework. > > If arm64 has forever been broken, I'd really like to know and fix it. > > Define broken. It kinda works with all the warts and bolts in tracing, > context tracking and other places. Is it entirely correct? Probably > not. > > The scheduler IPI is trivial compared to the more interesting ones like > NMI, MCE, breakpoint, debug exceptions etc. We found quite some > interesting issues with all of that muck during our 'noinstr' chase. Yup, point taken. However, given the complexity of the above and the fact that we currently have a measurable performance regression in 5.10, I'll respin the original series with the cleanups you mentioned, and the added note that we *really* need to sort this. Thanks, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH v2 1/3] genirq/affinity: Add irq_update_affinity_desc()
On Fri, 20 Nov 2020 11:52:09 +, John Garry wrote: > > Hi Thomas, > > >> Just mentioning a couple of things here, which could be a clue to what > >> is going on: > >> - the device is behind mbigen secondary irq controller > >> - the flow in the LLDD is to allocate all 128 interrupts during probe, > >> but we only register handlers for a subset with device managed API > > Right, but if the driver is removed then the interrupts should be > > deallocated, right? > > > > When removing the driver we just call free_irq(), which removes the > handler and disables the interrupt. > > But about the irq_desc, this is created when the mapping is created in > irq_create_fwspec_mapping(), and I don't see this being torn down in > the driver removal, so persistent in that regard. If the irq_descs are created via the platform_get_irq() calls in platform_get_irqs_affinity(), I'd expect some equivalent helper to tear things down as a result, calling irq_dispose_mapping() behind the scenes. > So for pci msi I can see that we free the irq_desc in > pci_disable_msi() -> free_msi_irqs() -> msi_domain_free_irqs() ... > > So what I am missing here? I'm not sure the paths are strictly equivalent. On the PCI side, we can have something that completely driver agnostic, as it is all architectural. In your case, only the endpoint driver knows about what happens, and needs to free things accordingly. Finally, there is the issue in your driver that everything is requested using devm_request_irq, which cannot play nicely with an explicit irq_desc teardown. You'll probably need to provide the equivalent devm helpers for your driver to safely be taken down. Thanks, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH] irqchip/exiu: Fix the index of fwspec for IRQ type
On Tue, 17 Nov 2020 11:20:15 +0800, Chen Baozi wrote: > Since fwspec->param_count of ACPI node is two, the index of IRQ type > in fwspec->param[] should be 1 rather than 2. Applied to irq/irqchip-next, thanks! [1/1] irqchip/exiu: Fix the index of fwspec for IRQ type commit: d001e41e1b15716e9b759df5ef00510699f85282 I added Fixes: and Cc: stable tags for good measure. Cheers, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH -next] irq-chip/gic-v3-its: Fixed an issue where the ITS executes the residual commands in the queue again when the ITS wakes up from sleep mode.
On Sat, 7 Nov 2020 10:42:26 +, Xu Qiang wrote: > On my platform, ITS_FLAGS_SAVE_SUSPEND_STATE is not set, thus do nothing > in its suspend and resume functions. On the other hand, firmware stores > GITS_CTRL, GITS_CBASER, GITS_CWRITER and GITS_BASER in the suspend, > and restores these registers in the resume. As a result, the ITS executes > the residual commands in the queue. > > Memory corruption may occur in the following scenarios: > > [...] Applied to irq/irqchip-next, thanks! [1/1] irqchip/gic-v3-its: Unconditionally save/restore the ITS state on suspend commit: a51f7296f38f498c6f186c82ae3aa25ae10bb266 Cheers, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH v7 00/17] Add support for Clang LTO
On 2020-11-20 23:53, Nick Desaulniers wrote: On Fri, Nov 20, 2020 at 3:30 PM Ard Biesheuvel wrote: On Fri, 20 Nov 2020 at 21:19, Nick Desaulniers wrote: > > On Fri, Nov 20, 2020 at 2:30 AM Ard Biesheuvel wrote: > > > > On Thu, 19 Nov 2020 at 00:42, Nick Desaulniers wrote: > > > > > > Thanks for continuing to drive this series Sami. For the series, > > > > > > Tested-by: Nick Desaulniers > > > > > > I did virtualized boot tests with the series applied to aarch64 > > > defconfig without CONFIG_LTO, with CONFIG_LTO_CLANG, and a third time > > > with CONFIG_THINLTO. If you make changes to the series in follow ups, > > > please drop my tested by tag from the modified patches and I'll help > > > re-test. Some minor feedback on the Kconfig change, but I'll post it > > > off of that patch. > > > > > > > When you say 'virtualized" do you mean QEMU on x86? Or actual > > virtualization on an AArch64 KVM host? > > aarch64 guest on x86_64 host. If you have additional configurations > that are important to you, additional testing help would be > appreciated. > Could you run this on an actual phone? Or does Android already ship with this stuff? By `this`, if you mean "the LTO series", it has been shipping on Android phones for years now, I think it's even required in the latest release. If you mean "the LTO series + mainline" on a phone, well there's the android-mainline of https://android.googlesource.com/kernel/common/, in which this series was recently removed in order to facilitate rebasing Android's patches on ToT-mainline until getting the series landed upstream. Bit of a chicken and the egg problem there. If you mean "the LTO series + mainline + KVM" on a phone; I don't know the precise state of aarch64 KVM and Android (Will or Marc would know). If you are lucky enough to have an Android system booting at EL2, KVM should just works [1], though I haven't tried with this series. 
We did experiment recently with RockPI's for aarch64 KVM, IIRC; I think Android is tricky as it still requires A64+A32/T32 chipsets, which is about 100% of the Android systems at the moment (I don't think any of the asymmetric SoCs are in the wild yet). It doesn't really affect KVM anyway. M. [1] with the broken firmware gotchas that I believed to be eradicated 8 years ago, but are still prevalent in the Android world: laughable PSCI implementation, invalid CNTFRQ_EL0... -- Who you jivin' with that Cosmik Debris?
Re: [PATCH] random: avoid arch_get_random_seed_long() when collecting IRQ randomness
On 2020-11-05 15:29, Ard Biesheuvel wrote: When reseeding the CRNG periodically, arch_get_random_seed_long() is called to obtain entropy from an architecture specific source if one is implemented. In most cases, these are special instructions, but in some cases, such as on ARM, we may want to back this using firmware calls, which are considerably more expensive. Another call to arch_get_random_seed_long() exists in the CRNG driver, in add_interrupt_randomness(), which collects entropy by capturing inter-interrupt timing and relying on interrupt jitter to provide random bits. This is done by keeping a per-CPU state, and mixing in the IRQ number, the cycle counter and the return address every time an interrupt is taken, and mixing this per-CPU state into the entropy pool every 64 invocations, or at least once per second. The entropy that is gathered this way is credited as 1 bit of entropy. Every time this happens, arch_get_random_seed_long() is invoked, and the result is mixed in as well, and also credited with 1 bit of entropy. This means that arch_get_random_seed_long() is called at least once per second on every CPU, which seems excessive, and doesn't really scale, especially in a virtualization scenario where CPUs may be oversubscribed: in cases where arch_get_random_seed_long() is backed by an instruction that actually goes back to a shared hardware entropy source (such as RNDRRS on ARM), we will end up hitting it hundreds of times per second. So let's drop the call to arch_get_random_seed_long() from add_interrupt_randomness(), and instead, rely on crng_reseed() to call the arch hook to get random seed material from the platform. Signed-off-by: Ard Biesheuvel Looks sensible. Having this on the interrupt path looks quite heavy handed, and my understanding of the above is that it has an adverse effect on the entropy pool. Acked-by: Marc Zyngier M. -- Jazz is not dead. It just smells funny...
Re: [PATCH 2/2] drm/meson: dw-hdmi: Enable the iahb clock early enough
On 2020-11-20 10:54, Guillaume Tucker wrote: On 20/11/2020 09:42, Marc Zyngier wrote: Instead of moving meson_dw_hdmi_init() around which breaks existing platform, let's enable the clock meson_dw_hdmi_init() depends on. This means we don't have to worry about this clock being enabled or not, depending on the boot-loader features. Fixes: b33340e33acd ("drm/meson: dw-hdmi: Ensure that clocks are enabled before touching the TOP registers") Reported-by: Guillaume Tucker Although I am triaging kernelci bisections, it was initially found thanks to our friendly bot. So if you're OK with this, it would most definitely appreciate a mention: Reported-by: "kernelci.org bot" Sure. Neil can add this when (and if) he applies these patches. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200
On 2020-11-20 09:26, Neil Armstrong wrote: On 19/11/2020 19:35, Marc Zyngier wrote: diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c b/drivers/gpu/drm/meson/meson_dw_hdmi.c index 7f8eea494147..52af8ba94311 100644 --- a/drivers/gpu/drm/meson/meson_dw_hdmi.c +++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c @@ -146,6 +146,7 @@ struct meson_dw_hdmi { struct reset_control *hdmitx_ctrl; struct reset_control *hdmitx_phy; struct clk *hdmi_pclk; + struct clk *iahb_clk; struct clk *venci_clk; struct regulator *hdmi_supply; u32 irq_stat; @@ -1033,6 +1034,13 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, } clk_prepare_enable(meson_dw_hdmi->hdmi_pclk); + meson_dw_hdmi->iahb_clk = devm_clk_get(dev, "iahb"); + if (IS_ERR(meson_dw_hdmi->iahb_clk)) { + dev_err(dev, "Unable to get iahb clk\n"); + return PTR_ERR(meson_dw_hdmi->iahb_clk); + } + clk_prepare_enable(meson_dw_hdmi->iahb_clk); On previous SoCs, iahb was directly the bus clock (clk81), and on recent socs this clock is a gate. The question is why is it disabled. Maybe a previous failed probe disabled it in the dw-hdmi probe failure code and this clock is needed for meson_dw_hdmi_init(), so yeah this is the right fix. Thanks. Could you send a revert of b33340e33acdfe5ca6a5aa1244709575ae1e0432 and then proper fix with clk_disable_unprepare added ? Bah. I missed that email and sent a slightly different resolution. Hopefully that'll be good enough. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v5 0/2] MTE support for KVM guest
On 2020-11-20 09:50, Steven Price wrote: On 19/11/2020 19:11, Marc Zyngier wrote: Does this sound reasonable? I'll clean up the set_pte_at() change and post a v6 later today. Please hold on. I still haven't reviewed your v5, nor have I had time to read your reply to my comments on v4. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: AMU extension v1 support for cortex A76, A77, A78 CPUs
On 2020-11-20 09:09, Vladimir Murzin wrote: On 11/20/20 8:56 AM, Marc Zyngier wrote: On 2020-11-20 04:30, Neeraj Upadhyay wrote: Hi, For ARM cortex A76, A77, A78 cores (which as per TRM, support AMU) AA64PFR0[47:44] field is not set, and AMU does not get enabled for them. Can you please provide support for these CPUs in cpufeature.c? If that was the case, that'd be an erratum, and it would need to be documented as such. It could also be that this is an optional feature for these cores (though the TRM doesn't suggest that). Can someone at ARM confirm what is the expected behaviour of these CPUs? Not a confirmation, but IIRC, these are imp def features, while our cpufeatures catch architected ones. Ah, good point. So these CPUs implement some sort of AMU, and not *the* AMU. Yet the register names are the same. Who thought that'd be a good idea? M. -- Jazz is not dead. It just smells funny...
[PATCH 2/2] drm/meson: dw-hdmi: Enable the iahb clock early enough
Instead of moving meson_dw_hdmi_init() around which breaks existing platform, let's enable the clock meson_dw_hdmi_init() depends on. This means we don't have to worry about this clock being enabled or not, depending on the boot-loader features. Fixes: b33340e33acd ("drm/meson: dw-hdmi: Ensure that clocks are enabled before touching the TOP registers") Reported-by: Guillaume Tucker Tested-by: Guillaume Tucker Signed-off-by: Marc Zyngier --- drivers/gpu/drm/meson/meson_dw_hdmi.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c b/drivers/gpu/drm/meson/meson_dw_hdmi.c index 29623b309cb1..aad75a22dc33 100644 --- a/drivers/gpu/drm/meson/meson_dw_hdmi.c +++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c @@ -1051,6 +1051,10 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, if (ret) return ret; + ret = meson_enable_clk(dev, "iahb"); + if (ret) + return ret; + ret = meson_enable_clk(dev, "venci"); if (ret) return ret; @@ -1086,6 +1090,8 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, encoder->possible_crtcs = BIT(0); + meson_dw_hdmi_init(meson_dw_hdmi); + DRM_DEBUG_DRIVER("encoder initialized\n"); /* Bridge / Connector */ @@ -1110,8 +1116,6 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, if (IS_ERR(meson_dw_hdmi->hdmi)) return PTR_ERR(meson_dw_hdmi->hdmi); - meson_dw_hdmi_init(meson_dw_hdmi); - next_bridge = of_drm_find_bridge(pdev->dev.of_node); if (next_bridge) drm_bridge_attach(encoder, next_bridge, -- 2.28.0
[PATCH 0/2] More meson HDMI fixes
Guillaume reported that my earlier fixes for the meson HDMI driver broke another set of machines which are now exploding (clock not enabled). I have thus reconsidered the approach and came up with an alternative fix (enable a missing clock), which Guillaume confirmed to be working. Jerome pointed out that this driver is leaking clock references like a sieve, so that needed addressing too. The first patch start by fixing the clock leakage using a devm facility. The second patch addresses the earlier crash by reusing the infrastructure put together in the first patch. Tested on VIM3l. Marc Zyngier (2): drm/meson: dw-hdmi: Disable clocks on driver teardown drm/meson: dw-hdmi: Enable the iahb clock early enough drivers/gpu/drm/meson/meson_dw_hdmi.c | 51 ++- 1 file changed, 35 insertions(+), 16 deletions(-) -- 2.28.0
[PATCH 1/2] drm/meson: dw-hdmi: Disable clocks on driver teardown
The HDMI driver request clocks early, but never disable them, leaving the clocks on even when the driver is removed. Fix it by slightly refactoring the clock code, and register a devm action that will eventually disable/unprepare the enabled clocks. Signed-off-by: Marc Zyngier --- drivers/gpu/drm/meson/meson_dw_hdmi.c | 43 ++- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c b/drivers/gpu/drm/meson/meson_dw_hdmi.c index 7f8eea494147..29623b309cb1 100644 --- a/drivers/gpu/drm/meson/meson_dw_hdmi.c +++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c @@ -145,8 +145,6 @@ struct meson_dw_hdmi { struct reset_control *hdmitx_apb; struct reset_control *hdmitx_ctrl; struct reset_control *hdmitx_phy; - struct clk *hdmi_pclk; - struct clk *venci_clk; struct regulator *hdmi_supply; u32 irq_stat; struct dw_hdmi *hdmi; @@ -946,6 +944,29 @@ static void meson_disable_regulator(void *data) regulator_disable(data); } +static void meson_disable_clk(void *data) +{ + clk_disable_unprepare(data); +} + +static int meson_enable_clk(struct device *dev, char *name) +{ + struct clk *clk; + int ret; + + clk = devm_clk_get(dev, name); + if (IS_ERR(clk)) { + dev_err(dev, "Unable to get %s pclk\n", name); + return PTR_ERR(clk); + } + + ret = clk_prepare_enable(clk); + if (!ret) + ret = devm_add_action_or_reset(dev, meson_disable_clk, clk); + + return ret; +} + static int meson_dw_hdmi_bind(struct device *dev, struct device *master, void *data) { @@ -1026,19 +1047,13 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, if (IS_ERR(meson_dw_hdmi->hdmitx)) return PTR_ERR(meson_dw_hdmi->hdmitx); - meson_dw_hdmi->hdmi_pclk = devm_clk_get(dev, "isfr"); - if (IS_ERR(meson_dw_hdmi->hdmi_pclk)) { - dev_err(dev, "Unable to get HDMI pclk\n"); - return PTR_ERR(meson_dw_hdmi->hdmi_pclk); - } - clk_prepare_enable(meson_dw_hdmi->hdmi_pclk); + ret = meson_enable_clk(dev, "isfr"); + if (ret) + return ret; - meson_dw_hdmi->venci_clk = 
devm_clk_get(dev, "venci"); - if (IS_ERR(meson_dw_hdmi->venci_clk)) { - dev_err(dev, "Unable to get venci clk\n"); - return PTR_ERR(meson_dw_hdmi->venci_clk); - } - clk_prepare_enable(meson_dw_hdmi->venci_clk); + ret = meson_enable_clk(dev, "venci"); + if (ret) + return ret; dw_plat_data->regm = devm_regmap_init(dev, NULL, meson_dw_hdmi, &meson_dw_hdmi_regmap_config); -- 2.28.0
Re: [PATCH 0/2] arm64: Allow the rescheduling IPI to bypass irq_enter/exit
[+ Mark who has been hacking in the same area lately] Hi Thomas, On 2020-11-03 20:32, Thomas Gleixner wrote: On Sun, Nov 01 2020 at 13:14, Marc Zyngier wrote: Vincent recently reported [1] that 5.10-rc1 showed a significant regression when running "perf bench sched pipe" on arm64, and pinpointed it to the recent move to handling IPIs as normal interrupts. The culprit is the use of irq_enter/irq_exit around the handling of the rescheduling IPI, meaning that we enter the scheduler right after the handling of the IPI instead of deferring it to the next preemption event. This accounts for most of the overhead introduced. irq_enter()/exit() does not end up in the scheduler. If it does then please show the call chain. Scheduling happens when the IPI returns just before returning into the low level code (or on ARM in the low level code) when NEED_RESCHED is set (which is usually the case when the IPI is sent) and: the IPI hit user space or the IPI hit in preemptible kernel context and CONFIG_PREEMPT[_RT] is enabled. Not doing so would be a bug. So I really don't understand your reasoning here. You are of course right. I somehow associated the overhead of the resched IPI with the scheduler itself. I stand corrected. On architectures that have architected IPIs at the CPU level (x86 being the obvious one), the absence of irq_enter/exit is natural. It's not really architected IPIs. We reserve the top 20ish vectors on each CPU for IPIs and other per processor interrupts, e.g. the per cpu timer. Now lets look at what x86 does: Interrupts and regular IPIs (function call ) do irqentry_enter() <- handles rcu_irq_enter() or context tracking ... irq_enter_rcu() ... irq_exit_rcu() irqentry_exit() <- handles need_resched() The scheduler IPI does: irqentry_enter() <- handles rcu_irq_enter() or context tracking ... __irq_enter_raw() ... __irq_exit_raw() irqentry_exit() <- handles need_resched() So we don't invoke irq_enter_rcu() on enter and on exit we skip irq_exit_rcu(). 
That's fine because - Calling the tick management is pointless because this is going to schedule anyway or something consumed the need_resched already. - The irqtime accounting is silly because it covers only the call and returns. The time spent in the accounting is more than the time we are accounting (empty call). So what your hack fails to invoke is rcu_irq_enter()/exit() in case that the IPI hits the idle task in an RCU off region. You also fail to tell lockdep. No cookies! Who needs cookies when you can have cheese? ;-) More seriously, it seems to me that we have a bit of a cross-architecture disconnect here. I have been trying to join the dots between what you describe above, and the behaviour of arm64 (and probably a large number of the non-x86 architectures), and I feel massively confused. Up to 5.9, our handling of the rescheduling IPI was "do as little as possible": decode the interrupt from the lowest possible level (the root irqchip), call into arch code, end-up in scheduler_ipi(), the end. No lockdep, no RCU, no nothing. What changed? Have we missed some radical change in the way the core kernel expects the arch code to do thing? I'm aware of the kernel/entry/common.c stuff, which implements most of the goodies you mention, but this effectively is x86-only at the moment. If arm64 has forever been broken, I'd really like to know and fix it. The bad news is that these patches are ugly as sin, and I really don't like them. Yes, they are ugly and the extra conditional in the irq handling path is not pretty either. I specially hate that they can give driver authors the idea that they can make random interrupts "faster". Just split the guts of irq_modify_status() into __irq_modify_status() and call that from irq_modify_status(). Reject IRQ_HIDDEN (which should have been IRQ_IPI really) and IRQ_NAKED (IRQ_RAW perhaps) in irq_modify_status(). Do not export __irq_modify_status() so it can be only used from built-in code which takes it away from driver writers. 
Yup, done. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: AMU extension v1 support for cortex A76, A77, A78 CPUs
On 2020-11-20 04:30, Neeraj Upadhyay wrote: Hi, For ARM cortex A76, A77, A78 cores (which as per TRM, support AMU) AA64PFR0[47:44] field is not set, and AMU does not get enabled for them. Can you please provide support for these CPUs in cpufeature.c? If that was the case, that'd be an erratum, and it would need to be documented as such. It could also be that this is an optional feature for these cores (though the TRM doesn't suggest that). Can someone at ARM confirm what is the expected behaviour of these CPUs? M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v5 0/2] MTE support for KVM guest
On 2020-11-19 18:42, Andrew Jones wrote: On Thu, Nov 19, 2020 at 03:45:40PM +, Peter Maydell wrote: On Thu, 19 Nov 2020 at 15:39, Steven Price wrote: > This series adds support for Arm's Memory Tagging Extension (MTE) to > KVM, allowing KVM guests to make use of it. This builds on the existing > user space support already in v5.10-rc1, see [1] for an overview. > The change to require the VMM to map all guest memory PROT_MTE is > significant as it means that the VMM has to deal with the MTE tags even > if it doesn't care about them (e.g. for virtual devices or if the VMM > doesn't support migration). Also unfortunately because the VMM can > change the memory layout at any time the check for PROT_MTE/VM_MTE has > to be done very late (at the point of faulting pages into stage 2). I'm a bit dubious about requring the VMM to map the guest memory PROT_MTE unless somebody's done at least a sketch of the design for how this would work on the QEMU side. Currently QEMU just assumes the guest memory is guest memory and it can access it without special precautions... There are two statements being made here: 1) Requiring the use of PROT_MTE when mapping guest memory may not fit QEMU well. 2) New KVM features should be accompanied with supporting QEMU code in order to prove that the APIs make sense. I strongly agree with (2). While kvmtool supports some quick testing, it doesn't support migration. We must test all new features with a migration supporting VMM. I'm not sure about (1). I don't feel like it should be a major problem, but (2). I'd be happy to help with the QEMU prototype, but preferably when there's hardware available. Has all the current MTE testing just been done on simulators? And, if so, are there regression tests regularly running on the simulators too? And can they test migration? If hardware doesn't show up quickly and simulators aren't used for regression tests, then all this code will start rotting from day one. 
While I agree with the sentiment, the reality is pretty bleak. I'm pretty sure nobody will ever run a migration on emulation. I also doubt there is much overlap between MTE users and migration users, unfortunately. No HW is available today, and when it becomes available, it will be in the form of a closed system on which QEMU doesn't run, either because we are locked out of EL2 (as usual), or because migration is not part of the use case (like KVM on Android, for example). So we can wait another two (five?) years until general purpose HW becomes available, or we start merging what we can test today. I'm inclined to do the latter. And I think it is absolutely fine for QEMU to say "no MTE support with KVM" (we can remove all userspace visibility, except for the capability). M. -- Jazz is not dead. It just smells funny...
Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200
On 2020-11-19 18:13, Jerome Brunet wrote: On Thu 19 Nov 2020 at 19:04, Guillaume Tucker wrote: Hi Marc, On 19/11/2020 11:58, Marc Zyngier wrote: On 2020-11-19 10:26, Neil Armstrong wrote: On 19/11/2020 11:20, Marc Zyngier wrote: On 2020-11-19 08:50, Guillaume Tucker wrote: Please see the automated bisection report below about some kernel errors on meson-gxbb-p200. Reports aren't automatically sent to the public while we're trialing new bisection features on kernelci.org, however this one looks valid. The bisection started with next-20201118 but the errors are still present in next-20201119. Details for this regression: https://kernelci.org/test/case/id/5fb6196bfd0127fd68d8d902/ The first error is: [ 14.757489] Internal error: synchronous external abort: 96000210 [#1] PREEMPT SMP Looks like yet another clock ordering setup. I guess different Amlogic platforms have slightly different ordering requirements. Neil, do you have any idea of which platform requires which ordering? The variability in DT and platforms is pretty difficult to follow (and I don't think I have such board around). The requirements should be the same, here the init was done before calling dw_hdmi_probe to be sure the clocks and internals resets were deasserted. But since you boot from u-boot already enabling these, it's already active. The solution would be to revert and do some check in meson_dw_hdmi_init() to check if already enabled and do nothing. A better fix seems to be this, which makes it explicit that there is a dependency between some of the registers accessed from meson_dw_hdmi_init() and the iahb clock. Guillaume, can you give this a go on your failing box? I confirm it solves the problem. 
Please add this to your fix patch if it's OK with you: Reported-by: "kernelci.org bot" Tested-by: Guillaume Tucker For the record, it passed all the tests when applied on top of the "bad" revision found by the bisection: http://lava.baylibre.com:10080/scheduler/alljobs?search=v5.10-rc3-1021-gb8668a2e5ea1 and the exact same test on the "bad" revision without the fix consistently showed the error: http://lava.baylibre.com:10080/scheduler/job/374176 Thanks, Guillaume diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c b/drivers/gpu/drm/meson/meson_dw_hdmi.c index 7f8eea494147..52af8ba94311 100644 --- a/drivers/gpu/drm/meson/meson_dw_hdmi.c +++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c @@ -146,6 +146,7 @@ struct meson_dw_hdmi { struct reset_control *hdmitx_ctrl; struct reset_control *hdmitx_phy; struct clk *hdmi_pclk; +struct clk *iahb_clk; struct clk *venci_clk; struct regulator *hdmi_supply; u32 irq_stat; @@ -1033,6 +1034,13 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, } clk_prepare_enable(meson_dw_hdmi->hdmi_pclk); +meson_dw_hdmi->iahb_clk = devm_clk_get(dev, "iahb"); +if (IS_ERR(meson_dw_hdmi->iahb_clk)) { +dev_err(dev, "Unable to get iahb clk\n"); +return PTR_ERR(meson_dw_hdmi->iahb_clk); +} +clk_prepare_enable(meson_dw_hdmi->iahb_clk); If you guys are going ahead with this fix, this call to clk_prepare_enable() needs to be balanced with clk_disable_unprepare() somehow Yup, good point. Although this driver *never* disables any clock it enables, and leaves it to the main DW driver, which I guess makes it leak references. So all 3 clocks need fixing. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200
On 2020-11-19 10:26, Neil Armstrong wrote: On 19/11/2020 11:20, Marc Zyngier wrote: On 2020-11-19 08:50, Guillaume Tucker wrote: Please see the automated bisection report below about some kernel errors on meson-gxbb-p200. Reports aren't automatically sent to the public while we're trialing new bisection features on kernelci.org, however this one looks valid. The bisection started with next-20201118 but the errors are still present in next-20201119. Details for this regression: https://kernelci.org/test/case/id/5fb6196bfd0127fd68d8d902/ The first error is: [ 14.757489] Internal error: synchronous external abort: 96000210 [#1] PREEMPT SMP Looks like yet another clock ordering setup. I guess different Amlogic platforms have slightly different ordering requirements. Neil, do you have any idea of which platform requires which ordering? The variability in DT and platforms is pretty difficult to follow (and I don't think I have such board around). The requirements should be the same, here the init was done before calling dw_hdmi_probe to be sure the clocks and internals resets were deasserted. But since you boot from u-boot already enabling these, it's already active. The solution would be to revert and do some check in meson_dw_hdmi_init() to check if already enabled and do nothing. A better fix seems to be this, which makes it explicit that there is a dependency between some of the registers accessed from meson_dw_hdmi_init() and the iahb clock. Guillaume, can you give this a go on your failing box? Thanks, M. 
diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c b/drivers/gpu/drm/meson/meson_dw_hdmi.c index 7f8eea494147..52af8ba94311 100644 --- a/drivers/gpu/drm/meson/meson_dw_hdmi.c +++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c @@ -146,6 +146,7 @@ struct meson_dw_hdmi { struct reset_control *hdmitx_ctrl; struct reset_control *hdmitx_phy; struct clk *hdmi_pclk; + struct clk *iahb_clk; struct clk *venci_clk; struct regulator *hdmi_supply; u32 irq_stat; @@ -1033,6 +1034,13 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, } clk_prepare_enable(meson_dw_hdmi->hdmi_pclk); + meson_dw_hdmi->iahb_clk = devm_clk_get(dev, "iahb"); + if (IS_ERR(meson_dw_hdmi->iahb_clk)) { + dev_err(dev, "Unable to get iahb clk\n"); + return PTR_ERR(meson_dw_hdmi->iahb_clk); + } + clk_prepare_enable(meson_dw_hdmi->iahb_clk); + meson_dw_hdmi->venci_clk = devm_clk_get(dev, "venci"); if (IS_ERR(meson_dw_hdmi->venci_clk)) { dev_err(dev, "Unable to get venci clk\n"); @@ -1071,6 +1079,8 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, encoder->possible_crtcs = BIT(0); + meson_dw_hdmi_init(meson_dw_hdmi); + DRM_DEBUG_DRIVER("encoder initialized\n"); /* Bridge / Connector */ @@ -1095,8 +1105,6 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, if (IS_ERR(meson_dw_hdmi->hdmi)) return PTR_ERR(meson_dw_hdmi->hdmi); - meson_dw_hdmi_init(meson_dw_hdmi); - next_bridge = of_drm_find_bridge(pdev->dev.of_node); if (next_bridge) drm_bridge_attach(encoder, next_bridge, -- Jazz is not dead. It just smells funny...
Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200
On 2020-11-19 10:26, Neil Armstrong wrote: On 19/11/2020 11:20, Marc Zyngier wrote: On 2020-11-19 08:50, Guillaume Tucker wrote: Please see the automated bisection report below about some kernel errors on meson-gxbb-p200. Reports aren't automatically sent to the public while we're trialing new bisection features on kernelci.org, however this one looks valid. The bisection started with next-20201118 but the errors are still present in next-20201119. Details for this regression: https://kernelci.org/test/case/id/5fb6196bfd0127fd68d8d902/ The first error is: [ 14.757489] Internal error: synchronous external abort: 96000210 [#1] PREEMPT SMP Looks like yet another clock ordering setup. I guess different Amlogic platforms have slightly different ordering requirements. Neil, do you have any idea of which platform requires which ordering? The variability in DT and platforms is pretty difficult to follow (and I don't think I have such board around). The requirements should be the same, here the init was done before calling dw_hdmi_probe to be sure the clocks and internals resets were deasserted. But since you boot from u-boot already enabling these, it's already active. The crashing platform also boots with u-boot. What is the difference? No HDMI support? The solution would be to revert and do some check in meson_dw_hdmi_init() to check if already enabled and do nothing. It looks more subtle than that, as it also depends on which DT is provided (an early meson_dw_hdmi_init() works with the kernel DT, and breaks with the u-boot DT). So whatever difference is between the two DTs causes havoc. u-boot obviously only uses its own DT, so we are looking at a kernel bug here. Not having this patch also breaks module reinsertion (HDMI clocks are disabled on unbind), so *something* has to be done late. So here are my questions: - What breaks in my config when I boot using u-boot's DT? - How to detect early that the registers are clocked or not? Thanks, M. -- Jazz is not dead. 
It just smells funny...
Re: next/master bisection: baseline.dmesg.emerg on meson-gxbb-p200
On 2020-11-19 08:50, Guillaume Tucker wrote: Please see the automated bisection report below about some kernel errors on meson-gxbb-p200. Reports aren't automatically sent to the public while we're trialing new bisection features on kernelci.org, however this one looks valid. The bisection started with next-20201118 but the errors are still present in next-20201119. Details for this regression: https://kernelci.org/test/case/id/5fb6196bfd0127fd68d8d902/ The first error is: [ 14.757489] Internal error: synchronous external abort: 96000210 [#1] PREEMPT SMP Looks like yet another clock ordering setup. I guess different Amlogic platforms have slightly different ordering requirements. Neil, do you have any idea of which platform requires which ordering? The variability in DT and platforms is pretty difficult to follow (and I don't think I have such board around). Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v3 3/5] irqchip: ocelot: Add support for Luton platforms
On 2020-11-16 16:24, Gregory CLEMENT wrote: This patch extends irqchip driver for oceleot to be used with an other vcoreiii base platform: Luton. Based on a larger patch from Lars Povlsen Signed-off-by: Gregory CLEMENT --- drivers/irqchip/irq-mscc-ocelot.c | 145 +- 1 file changed, 123 insertions(+), 22 deletions(-) diff --git a/drivers/irqchip/irq-mscc-ocelot.c b/drivers/irqchip/irq-mscc-ocelot.c index 88143c0b700c..9964800c53c2 100644 --- a/drivers/irqchip/irq-mscc-ocelot.c +++ b/drivers/irqchip/irq-mscc-ocelot.c @@ -12,39 +12,115 @@ #include #include -#define ICPU_CFG_INTR_INTR_STICKY 0x10 -#define ICPU_CFG_INTR_INTR_ENA 0x18 -#define ICPU_CFG_INTR_INTR_ENA_CLR 0x1c -#define ICPU_CFG_INTR_INTR_ENA_SET 0x20 -#define ICPU_CFG_INTR_DST_INTR_IDENT(x)(0x38 + 0x4 * (x)) -#define ICPU_CFG_INTR_INTR_TRIGGER(x) (0x5c + 0x4 * (x)) - -#define OCELOT_NR_IRQ 24 +#define ICPU_CFG_INTR_DST_INTR_IDENT(_p, x) (_p->reg_off_ident + 0x4 * (x)) +#define ICPU_CFG_INTR_INTR_TRIGGER(_p, x) (_p->reg_off_trigger + 0x4 * (x)) + +#define FLAGS_NEED_INIT_ENABLE BIT(0) +#define FLAGS_FORCE_LUTON_STYLEBIT(1) +#define FLAGS_HAS_TRIGGER BIT(2) + +struct chip_props { + u32 flags; + u32 reg_off_sticky; + u32 reg_off_ena; + u32 reg_off_ena_clr; + u32 reg_off_ena_set; + u32 reg_off_ident; + u32 reg_off_trigger; + u32 reg_off_force; + u32 reg_off_ena_irq0; Do all these fields need to be u32s? + u32 n_irq; +}; + +static const struct chip_props ocelot_props = { + .flags = FLAGS_HAS_TRIGGER, + .reg_off_sticky = 0x10, + .reg_off_ena= 0x18, + .reg_off_ena_clr= 0x1c, + .reg_off_ena_set= 0x20, + .reg_off_ident = 0x38, + .reg_off_trigger= 0x5c, + .reg_off_force = 0xc, + .n_irq = 24, +}; + +static const struct chip_props luton_props = { + .flags = FLAGS_NEED_INIT_ENABLE | + FLAGS_FORCE_LUTON_STYLE, LUTON_STYLE doesn't quite convey the idea of what this implies. Please use a name that actually means something (other than the city north of London). And force what exactly? 
+ .reg_off_sticky = 0, + .reg_off_ena= 0x4, + .reg_off_ena_clr= 0x8, + .reg_off_ena_set= 0xc, + .reg_off_ident = 0x18, + .reg_off_trigger= 0, If this field doesn't exist (because the matching flag isn't set), don't list it. + .reg_off_force = 0x38, + .reg_off_ena_irq0 = 0x14, + .n_irq = 28, +}; Please split this patch in two: rework the Ocelot driver to the "new" style in one patch, then introduce new platform in another. static void ocelot_irq_unmask(struct irq_data *data) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); + struct irq_domain *d = data->domain; + struct chip_props *p = d->host_data; struct irq_chip_type *ct = irq_data_get_chip_type(data); unsigned int mask = data->mask; u32 val; irq_gc_lock(gc); - val = irq_reg_readl(gc, ICPU_CFG_INTR_INTR_TRIGGER(0)) | - irq_reg_readl(gc, ICPU_CFG_INTR_INTR_TRIGGER(1)); - if (!(val & mask)) - irq_reg_writel(gc, mask, ICPU_CFG_INTR_INTR_STICKY); + if (p->flags & FLAGS_HAS_TRIGGER) { + val = irq_reg_readl(gc, ICPU_CFG_INTR_INTR_TRIGGER(p, 0)) | + irq_reg_readl(gc, ICPU_CFG_INTR_INTR_TRIGGER(p, 1)); + if (!(val & mask)) + irq_reg_writel(gc, mask, p->reg_off_sticky); + } *ct->mask_cache &= ~mask; - irq_reg_writel(gc, mask, ICPU_CFG_INTR_INTR_ENA_SET); + irq_reg_writel(gc, mask, p->reg_off_ena_set); irq_gc_unlock(gc); } +static void luton_irq_force(struct irq_data *data, + struct irq_chip_generic *gc, + struct chip_props *p) +{ + int off = p->reg_off_force + (data->hwirq * sizeof(u32)); + u32 val = irq_reg_readl(gc, off); + + irq_reg_writel(gc, val | BIT(3), off); +} + +static int ocelot_irq_force(struct irq_data *data, + enum irqchip_irq_state which, bool state) Please use a name that matches the same of the API. 
+{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); + struct irq_domain *d = data->domain; + struct chip_props *p = d->host_data; + int ret = -EINVAL; + + /* Only supports triggering */ + if ((which == IRQCHIP_STATE_PENDING || +which == IRQCHIP_STATE_ACTIVE) && PENDING and ACTIVE are two very different things, and yet you handle them the same way. Which one does your interrupt controller support? + state && p->reg_off_force) { +
Re: [RFC PATCH V2] acpi/irq: Add stacked IRQ domain support to PCI interrupt link
Hi Chen, On top of Bjorn's comments: On 2020-11-17 13:42, Chen Baozi wrote: Some PCIe designs require software to do extra acknowledgements for legacy INTx interrupts. If the driver is written only for device tree, things are simple. In that case, a new driver can be written under driver/pci/controller/ with a DT node of PCIe host written like: pcie { ... interrupt-map = <0 0 0 1 &pcie_intc 0>, <0 0 0 2 &pcie_intc 1>, <0 0 0 3 &pcie_intc 2>, <0 0 0 4 &pcie_intc 3>; pcie_intc: legacy-interrupt-controller { interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&gic>; interrupts = <0 226 4>; }; }; Similar designs can be found on Aardvark, MediaTek Gen2 and Socionext UniPhier PCIe controller at the moment. Essentially, those designs are supported by inserting an extra interrupt controller between PCIe host and GIC and parse the topology in a DT-based PCI controller driver. As we turn to ACPI, All the PCIe hosts are described the same ID of "PNP0A03" and share driver/acpi/pci_root.c. It comes to be a problem to make this kind of PCI INTx work under ACPI. Therefore, we introduce an stacked IRQ domain support to PCI interrupt link for ACPI. With this support, we can populate the ResourceSource to refer to a device object that describes an interrupt controller. That would allow us to refer to a dedicated driver which implements the logic needed to manage the interrupt state. With this patch, those PCI interrupt links can be supported by describing the INTx in ACPI table as the following example: Device (IXIU) { ... } Device(LINKA) { Name(_HID, EISAID("PNP0C0F")) Name(_PRS, ResourceTemplate(){ Interrupt(ResourceProducer, Level, ActiveHigh, Exclusive, 0, "\\SB.IXIU") { 60 } }) ... } Device(PCI0) { ... Name(_PRT, Package{ Package{ 0x, 0, LINKA, 0 } ... 
}) } Signed-off-by: Chen Baozi --- drivers/acpi/irq.c | 22 +- drivers/acpi/pci_irq.c | 6 -- drivers/acpi/pci_link.c | 17 +++-- include/acpi/acpi_drivers.h | 2 +- include/linux/acpi.h| 4 5 files changed, 45 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c index e209081d644b..e78a44815c44 100644 --- a/drivers/acpi/irq.c +++ b/drivers/acpi/irq.c @@ -81,6 +81,25 @@ void acpi_unregister_gsi(u32 gsi) } EXPORT_SYMBOL_GPL(acpi_unregister_gsi); +int acpi_register_irq(struct device *dev, u32 irq, int trigger, + int polarity, struct fwnode_handle *domain_id) +{ + struct irq_fwspec fwspec; + + if (WARN_ON(!domain_id)) { + pr_warn("GSI: No registered irqchip, giving up\n"); A fwnode_handle is not an irqchip. It's just an opaque identifier for a HW block. Furthermore, there is no need to have both a WARN_ON() and a pr_warn(). Please pick one. I'd also suggest you rename domain_id to fwnode, which is the commonly used idiom (yes, I know about the unfortunate precedent in acpi_register_gsi()). + return -EINVAL; + } + + fwspec.fwnode = domain_id; + fwspec.param[0] = irq; + fwspec.param[1] = acpi_dev_get_irq_type(trigger, polarity); + fwspec.param_count = 2; + + return irq_create_fwspec_mapping(&fwspec); +} +EXPORT_SYMBOL_GPL(acpi_register_irq); By the way, this is almost an exact duplicate of acpi_register_gsi(). You definitely want to make this code common. + /** * acpi_get_irq_source_fwhandle() - Retrieve fwhandle from IRQ resource source. * @source: acpi_resource_source to use for the lookup. 
@@ -92,7 +111,7 @@ EXPORT_SYMBOL_GPL(acpi_unregister_gsi); * Return: * The referenced device fwhandle or NULL on failure */ -static struct fwnode_handle * +struct fwnode_handle * acpi_get_irq_source_fwhandle(const struct acpi_resource_source *source) { struct fwnode_handle *result; @@ -115,6 +134,7 @@ acpi_get_irq_source_fwhandle(const struct acpi_resource_source *source) acpi_bus_put_acpi_device(device); return result; } +EXPORT_SYMBOL_GPL(acpi_get_irq_source_fwhandle); /* * Context for the resource walk used to lookup IRQ resources. diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 14ee631cb7cf..19296d70c95c 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -410,6 +410,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev) char *link = NULL; char link_desc[16]; int rc; + struct fwnode_handle *irq_domain; fwnode_handle is most definitely not an IRQ domain. pin = dev->pin; if (!pin) { @@ -438,7 +439,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev) gsi = acpi_pci_link_allocate_irq(entry->link, entry->index,
Re: WARNING: kernel/irq/chip.c:242 __irq_startup+0xa8/0xb0
Naresh, On 2020-11-18 06:11, Naresh Kamboju wrote: On Tue, 13 Oct 2020 at 11:09, Naresh Kamboju wrote: On stable rc 5.8.15 the following kernel warning was noticed once while boot and this is hard to reproduce. This is now reproduciable on arm64 NXP ls2088 device How reproducible? On demand? Once in a while? [ 19.980839] [ cut here ] [ 19.985468] WARNING: CPU: 1 PID: 441 at kernel/irq/chip.c:242 __irq_startup+0x9c/0xa8 [ 19.985472] Modules linked in: rfkill lm90 ina2xx crct10dif_ce qoriq_thermal fuse [ 20.000773] CPU: 1 PID: 441 Comm: (agetty) Not tainted 5.4.78-rc1 #2 Can you please try and reproduce this on mainline? [ 20.000775] Hardware name: Freescale Layerscape 2088A RDB Board (DT) [ 20.000779] pstate: 6085 (nZCv daIf -PAN -UAO) [ 20.018253] pc : __irq_startup+0x9c/0xa8 [ 20.018256] lr : irq_startup+0x64/0x130 [ 20.018259] sp : 80001122f8e0 [ 20.029303] x29: 80001122f8e0 x28: 0082c242d400 [ 20.029306] x27: dd0f47234768 x26: 00020902 [ 20.029309] x25: dd0f461a6f10 x24: dd0f461a6bc8 [ 20.029311] x23: x22: 0001 [ 20.029314] x21: 0001 x20: 0082c22f8780 [ 20.029316] x19: 0082c1060800 x18: 0001 [ 20.029318] x17: x16: 8000114a [ 20.029321] x15: x14: 0082c0e92f90 [ 20.071738] x13: 0082c0e93080 x12: 80001146 [ 20.071741] x11: dead0100 x10: 0040 [ 20.071743] x9 : dd0f47093ba8 x8 : dd0f47093ba0 [ 20.087653] x7 : 0082a2b0 x6 : dd0f47074958 [ 20.087655] x5 : dd0f47074000 x4 : 80001123 [ 20.087657] x3 : 0504 x2 : 0001 [ 20.103567] x1 : 03032004 x0 : 0082c1060858 [ 20.103570] Call trace: [ 20.103573] __irq_startup+0x9c/0xa8 [ 20.103577] irq_startup+0x64/0x130 [ 20.118359] __enable_irq+0x7c/0x88 [ 20.118362] enable_irq+0x54/0xa8 [ 20.118367] serial8250_do_startup+0x658/0x718 [ 20.118371] serial8250_startup+0x38/0x48 Looking at the DT: serial0: serial@21c0500 { interrupts = <0 32 0x4>; /* Level high type */ serial1: serial@21c0600 { interrupts = <0 32 0x4>; /* Level high type */ serial2: serial@21d0500 { interrupts = <0 33 0x4>; /* Level high type */ serial3: serial@21d0600 { 
interrupts = <0 33 0x4>; /* Level high type */ The UART interrupt lines are shared. Braindead, 1980 style. Which UART is agetty trying to use? Is there any other process using another UART concurrently? We could have a race between the line being shut down on one device, and activated from the other, but I can't spot it right away. If you can reproduce it easily enough, it shouldn't be too hard to trace what happens around the activation of the shared interrupt (assuming that this is where the problem is). M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v4 2/2] arm64: kvm: Introduce MTE VCPU feature
Hi Steven, On 2020-10-26 15:57, Steven Price wrote: Add a new VM feature 'KVM_ARM_CAP_MTE' which enables memory tagging for a VM. This exposes the feature to the guest and automatically tags memory pages touched by the VM as PG_mte_tagged (and clears the tags storage) to ensure that the guest cannot see stale tags, and so that the tags are correctly saved/restored across swap. Signed-off-by: Steven Price Reviewed-by: Andrew Jones --- arch/arm64/include/asm/kvm_emulate.h | 3 +++ arch/arm64/include/asm/kvm_host.h| 3 +++ arch/arm64/kvm/arm.c | 9 + arch/arm64/kvm/mmu.c | 20 arch/arm64/kvm/sys_regs.c| 6 +- include/uapi/linux/kvm.h | 1 + 6 files changed, 41 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 5ef2669ccd6c..66c0d9e7c2b4 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -79,6 +79,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) || vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu->kvm->arch.mte_enabled) Please add a predicate (vcpu_has_mte() or kvm_has_mte()?) for this. 
+ vcpu->arch.hcr_el2 |= HCR_ATA; } static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 95ab7345dcc8..cd993aec0440 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -118,6 +118,9 @@ struct kvm_arch { */ unsigned long *pmu_filter; unsigned int pmuver; + + /* Memory Tagging Extension enabled for the guest */ + bool mte_enabled; }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index f56122eedffc..7ee93bcac017 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -89,6 +89,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = 0; kvm->arch.return_nisv_io_abort_to_user = true; break; + case KVM_CAP_ARM_MTE: + if (!system_supports_mte() || kvm->created_vcpus) + return -EINVAL; You also want to avoid 32bit guests. Also, what is the rational for this being a VM capability as opposed to a CPU feature, similar to SVE and PMU? + r = 0; + kvm->arch.mte_enabled = true; + break; default: r = -EINVAL; break; @@ -210,6 +216,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = 1; break; + case KVM_CAP_ARM_MTE: + r = system_supports_mte(); Same comment about 32bit. + break; case KVM_CAP_STEAL_TIME: r = kvm_arm_pvtime_supported(); break; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 19aacc7d64de..38fe25310ca1 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -862,6 +862,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (vma_pagesize == PAGE_SIZE && !force_pte) vma_pagesize = transparent_hugepage_adjust(memslot, hva, &pfn, &fault_ipa); + + /* +* The otherwise redundant test for system_supports_mte() allows the +* code to be compiled out when CONFIG_ARM64_MTE is not present. 
+*/ + if (system_supports_mte() && kvm->arch.mte_enabled && pfn_valid(pfn)) { + /* +* VM will be able to see the page's tags, so we must ensure +* they have been initialised. +*/ + struct page *page = pfn_to_page(pfn); + long i, nr_pages = compound_nr(page); + + /* if PG_mte_tagged is set, tags have already been initialised */ + for (i = 0; i < nr_pages; i++, page++) { + if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + mte_clear_page_tags(page_address(page)); + } + } What are the visibility requirements for the tags, specially if the guest has its MMU off? Is there any cache management that needs to occur? Another thing is device-like memory that is managed by userspace, such as the QEMU emulated flash, for which there also might be tags. How is that dealt with? In general, what are the expectations for tags on memory shared between host and guest? Who owns them? + if (writable) { prot |= KVM_PGTABLE_PROT_W; kvm_set_pfn_dirty(pfn); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 430e36e1a13d..35a3dc448231 100644 --- a/arch/arm64/kvm
Re: [PATCH v4 1/2] arm64: kvm: Save/restore MTE registers
Hi Steven, These patches unfortunately don't apply to -rc4 anymore, as we repainted quite a bit while working on fixes. I'd be grateful if you could rebase them. A few other things though: On 2020-10-26 15:57, Steven Price wrote: Define the new system registers that MTE introduces and context switch them. The MTE feature is still hidden from the ID register as it isn't supported in a VM yet. Signed-off-by: Steven Price Reviewed-by: Andrew Jones --- arch/arm64/include/asm/kvm_host.h | 4 arch/arm64/include/asm/sysreg.h| 3 ++- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 14 ++ arch/arm64/kvm/sys_regs.c | 14 ++ 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0aecbab6a7fb..95ab7345dcc8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -134,6 +134,8 @@ enum vcpu_sysreg { SCTLR_EL1, /* System Control Register */ ACTLR_EL1, /* Auxiliary Control Register */ CPACR_EL1, /* Coprocessor Access Control */ + RGSR_EL1, /* Random Allocation Tag Seed Register */ + GCR_EL1,/* Tag Control Register */ ZCR_EL1,/* SVE Control */ TTBR0_EL1, /* Translation Table Base Register 0 */ TTBR1_EL1, /* Translation Table Base Register 1 */ @@ -150,6 +152,8 @@ enum vcpu_sysreg { TPIDR_EL1, /* Thread ID, Privileged */ AMAIR_EL1, /* Aux Memory Attribute Indirection Register */ CNTKCTL_EL1,/* Timer Control Register (EL1) */ + TFSRE0_EL1, /* Tag Fault Status Register (EL0) */ + TFSR_EL1, /* Tag Fault Stauts Register (EL1) */ PAR_EL1,/* Physical Address Register */ MDSCR_EL1, /* Monitor Debug System Control Register */ MDCCINT_EL1,/* Monitor Debug Comms Channel Interrupt Enable Reg */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d52c1b3ce589..7727df0bc09d 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -565,7 +565,8 @@ #define SCTLR_ELx_M(BIT(0)) #define SCTLR_ELx_FLAGS(SCTLR_ELx_M | SCTLR_ELx_A | 
SCTLR_ELx_C | \ -SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB) +SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB | \ +SCTLR_ELx_ITFSB) /* SCTLR_EL2 specific flags. */ #define SCTLR_EL2_RES1 ((BIT(4)) | (BIT(5)) | (BIT(11)) | (BIT(16)) | \ diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 7a986030145f..a124ffa49ba3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -18,6 +18,11 @@ static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); + if (system_supports_mte()) { + ctxt_sys_reg(ctxt, RGSR_EL1)= read_sysreg_s(SYS_RGSR_EL1); + ctxt_sys_reg(ctxt, GCR_EL1) = read_sysreg_s(SYS_GCR_EL1); + ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1); As far as I can tell, HCR_EL2.ATA is still clear when running a guest. So why, do we save/restore this state yet? Also, I wonder whether we should keep these in the C code. If one day we enable MTE in the kernel, we will have to move them to the assembly part, much like we do for PAuth. 
And I fear that "one day" is pretty soon: https://lore.kernel.org/linux-arm-kernel/cover.1605046192.git.andreyk...@google.com/ + } } static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) @@ -45,6 +50,8 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL); ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg(par_el1); ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1); + if (system_supports_mte()) + ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR); ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1); ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR); @@ -63,6 +70,11 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); + if (system_supports_mte()) { + write_sysreg_s(ctxt_sys_reg(ctxt, RGSR_EL1), SYS_RGSR_EL1); + write_sysreg_s(ctxt_sys_reg(ctxt, GCR_EL1), SYS_GCR_EL1); + write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1); + } } static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) @@ -106,6 +118,8 @@ static inline void _
Re: [PATCH] KVM: arm64: vgic-v3: Drop the reporting of GICR_TYPER.Last for userspace
On Tue, 17 Nov 2020 23:16:29 +0800, Zenghui Yu wrote: > It was recently reported that if GICR_TYPER is accessed before the RD base > address is set, we'll suffer from the unset @rdreg dereferencing. Oops... > > gpa_t last_rdist_typer = rdreg->base + GICR_TYPER + > (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE; > > It's "expected" that users will access registers in the redistributor if > the RD has been properly configured (e.g., the RD base address is set). But > it hasn't yet been covered by the existing documentation. > > [...] Applied to kvm-arm64/fixes-5.10, thanks! [1/1] KVM: arm64: vgic-v3: Drop the reporting of GICR_TYPER.Last for userspace commit: 23bde34771f1ea92fb5e6682c0d8c04304d34b3b I have added a Cc stable, as this definitely wants to be backported. Cheers, M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH 1/2] KVM: arm64: vgic: Forbid invalid userspace Redistributor accesses
On 2020-11-17 09:59, Auger Eric wrote: Hi Marc, On 11/17/20 9:49 AM, Marc Zyngier wrote: Hi Zenghui, On 2020-11-16 14:57, Zenghui Yu wrote: Hi Marc, On 2020/11/16 22:10, Marc Zyngier wrote: My take is that only if the "[Re]Distributor base address" is specified in the system memory map, will the user-provided kvm_device_attr.offset make sense. And we can then handle the access to the register which is defined by "base address + offset". I'd tend to agree, but it is just that this is a large change at -rc4. I'd rather have a quick fix for 5.10, and a more invasive change for 5.11, spanning all the possible vgic devices. So you prefer fixing it by "return a value that doesn't have the Last bit set" for v5.10? I'm ok with it and can send v2 for it. Cool. Thanks for that. Btw, looking again at the way we handle the user-reading of GICR_TYPER vgic_mmio_read_v3r_typer(vcpu, addr, len) it seems that @addr is actually the *offset* of GICR_TYPER (0x0008) and @addr is unlikely to be equal to last_rdist_typer, which is the *GPA* of the last RD. Looks like the user-reading of GICR_TYPER.Last is always broken? I think you are right. Somehow, we don't seem to track the index of the RD in the region, so we can never compute the address of the RD even if the base address is set. Let's drop the reporting of Last for userspace for now, as it never worked. If you post a patch addressing that quickly, I'll get it to Paolo by the end of the week (there's another fix that needs merging). Eric: do we have any test covering the userspace API? So as this issue seems related to the changes made when implementing the multiple RDIST regions, I volunteer to write those KVM selftests :-) You're on! :D More seriously, there is scope for fuzzing the device save/restore API, as we find bugs every time someone changes the "known good" ordering that is implemented in QEMU.
Maybe it means getting rid of some unnecessary flexibility, as proposed by Zenghui, if we are confident that no userspace makes use of it. And in the future, making sure that new APIs are rigid enough to avoid such bugs. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH 0/4] drm/meson: Module removal fixes
Hi Neil, On 2020-11-17 08:49, Neil Armstrong wrote: Hi Marc, On 16/11/2020 21:07, Marc Zyngier wrote: Hi all, Having recently moved over to a top-of-the-tree u-boot on one of my VIM3L systems in order to benefit from unrelated improvements (automatic PCIe detection, EFI...), I faced the issue that my kernel would hang like this: [ OK ] Finished Helper to synchronize boot up for ifupdown. [ OK ] Started Rule-based Manager for Device Events and Files. [7.114516] VDDCPU: supplied by regulator-dummy [ OK ] Found device /dev/ttyAML0. [7.146862] meson-drm ff90.vpu: Queued 2 outputs on vpu [7.169630] fb0: switching to meson-drm-fb from simple [7.169944] Console: switching to colour dummy device 80x25 [7.179250] meson-drm ff90.vpu: CVBS Output connector not available and that's it. After some poking around, I figured out that it is in the meson-dw-hdmi module that the CPU was hanging... I'll be interested in having your kernel config, I never had such report since I enabled HDMI support in U-Boot a few years ago. Yeah, I was pretty surprised too. I have a hunch that this is caused by u-boot DT exposing an extra MMIO region (dubbed "hhi") that gets picked up by the kernel driver. *Not* having the region in the DT (as in the kernel's version of the same DT) makes the driver work exactly once: Decompiled u-boot DT: hdmi-tx@0 { compatible = "amlogic,meson-g12a-dw-hdmi"; reg = <0x00 0x00 0x00 0x1 0x00 0x3c000 0x00 0x1000>; [...] reg-names = "hdmitx\0hhi"; Decompiled kernel DT: hdmi-tx@0 { compatible = "amlogic,meson-g12a-dw-hdmi"; reg = <0x00 0x00 0x00 0x1>; There seem to be some complex interactions between the HDMI driver and the DRM driver, both using this MMIO region at any given time. But I admit not having tried very hard to follow the DRM maze of intricate callbacks. All I needed was this box to reliably boot with the firmware-provided DT. You can find a reasonably recent version of my config at [1]. M. [1] http://www.loen.fr/tmp/Config.full-arm64 -- Jazz is not dead. 
It just smells funny...
Re: [PATCH 1/2] KVM: arm64: vgic: Forbid invalid userspace Redistributor accesses
Hi Zenghui, On 2020-11-16 14:57, Zenghui Yu wrote: Hi Marc, On 2020/11/16 22:10, Marc Zyngier wrote: My take is that only if the "[Re]Distributor base address" is specified in the system memory map, will the user-provided kvm_device_attr.offset make sense. And we can then handle the access to the register which is defined by "base address + offset". I'd tend to agree, but it is just that this is a large change at -rc4. I'd rather have a quick fix for 5.10, and a more invasive change for 5.11, spanning all the possible vgic devices. So you prefer fixing it by "return a value that doesn't have the Last bit set" for v5.10? I'm ok with it and can send v2 for it. Cool. Thanks for that. Btw, looking again at the way we handle the user-reading of GICR_TYPER vgic_mmio_read_v3r_typer(vcpu, addr, len) it seems that @addr is actually the *offset* of GICR_TYPER (0x0008) and @addr is unlikely to be equal to last_rdist_typer, which is the *GPA* of the last RD. Looks like the user-reading of GICR_TYPER.Last is always broken? I think you are right. Somehow, we don't seem to track the index of the RD in the region, so we can never compute the address of the RD even if the base address is set. Let's drop the reporting of Last for userspace for now, as it never worked. If you post a patch addressing that quickly, I'll get it to Paolo by the end of the week (there's another fix that needs merging). Eric: do we have any test covering the userspace API? Thanks, M. -- Jazz is not dead. It just smells funny...
[PATCH 1/4] drm/meson: Free RDMA resources after tearing down DRM
Removing the meson DRM module results in the following splat: [ 2179.451346] Hardware name: , BIOS 2021.01-rc2-00012-gde865f7ee1 11/16/2020 [ 2179.458316] Workqueue: events drm_mode_rmfb_work_fn [drm] [ 2179.463597] pstate: 80c9 (Nzcv daif +PAN +UAO -TCO BTYPE=--) [ 2179.469558] pc : meson_rdma_writel_sync+0x44/0xb0 [meson_drm] [ 2179.475243] lr : meson_g12a_afbcd_reset+0x34/0x60 [meson_drm] [ 2179.480930] sp : ffc01212bb70 [ 2179.484207] x29: ffc01212bb70 x28: ff8044f66f00 [ 2179.489469] x27: ff8045b13800 x26: 0001 [ 2179.494730] x25: x24: 0001 [ 2179.41] x23: x22: [ 2179.505252] x21: 0028 x20: 1a01 [ 2179.510513] x19: ff8046029480 x18: [ 2179.515775] x17: x16: [ 2179.521036] x15: x14: [ 2179.526297] x13: 00400326 x12: 0309030303260300 [ 2179.531558] x11: 0300054004a0 x10: 0418054004000400 [ 2179.536820] x9 : ffc008fe4914 x8 : ff8040a1adc0 [ 2179.542081] x7 : x6 : ff8042aa0080 [ 2179.547342] x5 : ff8044f66f00 x4 : ffc008fe5bc8 [ 2179.552603] x3 : 00010101 x2 : 0001 [ 2179.557865] x1 : x0 : [ 2179.563127] Call trace: [ 2179.565548] meson_rdma_writel_sync+0x44/0xb0 [meson_drm] [ 2179.570894] meson_g12a_afbcd_reset+0x34/0x60 [meson_drm] [ 2179.576241] meson_plane_atomic_disable+0x38/0xb0 [meson_drm] [ 2179.581966] drm_atomic_helper_commit_planes+0x1e0/0x21c [drm_kms_helper] [ 2179.588684] drm_atomic_helper_commit_tail_rpm+0x68/0xb0 [drm_kms_helper] [ 2179.595410] commit_tail+0xac/0x190 [drm_kms_helper] [ 2179.600326] drm_atomic_helper_commit+0x16c/0x390 [drm_kms_helper] [ 2179.606484] drm_atomic_commit+0x58/0x70 [drm] [ 2179.610880] drm_framebuffer_remove+0x398/0x434 [drm] [ 2179.615881] drm_mode_rmfb_work_fn+0x68/0x8c [drm] [ 2179.620575] process_one_work+0x1cc/0x49c [ 2179.624538] worker_thread+0x200/0x444 [ 2179.628246] kthread+0x14c/0x160 [ 2179.631439] ret_from_fork+0x10/0x38 caused by the fact that the RDMA buffer has already been freed, resulting in meson_rdma_writel_sync() getting a NULL pointer. 
Move the afbcd reset and meson_rdma_free calls after the DRM unregistration is complete so that the teardown can safely complete. Signed-off-by: Marc Zyngier --- drivers/gpu/drm/meson/meson_drv.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_drv.c b/drivers/gpu/drm/meson/meson_drv.c index 8b9c8dd788c4..324fa489f1c4 100644 --- a/drivers/gpu/drm/meson/meson_drv.c +++ b/drivers/gpu/drm/meson/meson_drv.c @@ -389,15 +389,15 @@ static void meson_drv_unbind(struct device *dev) meson_canvas_free(priv->canvas, priv->canvas_id_vd1_2); } - if (priv->afbcd.ops) { - priv->afbcd.ops->reset(priv); - meson_rdma_free(priv); - } - drm_dev_unregister(drm); drm_irq_uninstall(drm); drm_kms_helper_poll_fini(drm); drm_dev_put(drm); + + if (priv->afbcd.ops) { + priv->afbcd.ops->reset(priv); + meson_rdma_free(priv); + } } static const struct component_master_ops meson_drv_master_ops = { -- 2.28.0
[PATCH 2/4] drm/meson: Unbind all connectors on module removal
Removing the meson DRM module results in the following splats: [ 42.689228] WARNING: CPU: 0 PID: 572 at drivers/gpu/drm/drm_irq.c:192 drm_irq_uninstall+0x130/0x160 [drm] [...] [ 42.812820] Hardware name: , BIOS 2021.01-rc2-00012-gde865f7ee1 11/16/2020 [ 42.819723] pstate: 80400089 (Nzcv daIf +PAN -UAO -TCO BTYPE=--) [ 42.825737] pc : drm_irq_uninstall+0x130/0x160 [drm] [ 42.830647] lr : drm_irq_uninstall+0xc4/0x160 [drm] [...] [ 42.917614] Call trace: [ 42.920086] drm_irq_uninstall+0x130/0x160 [drm] [ 42.924612] meson_drv_unbind+0x68/0xa4 [meson_drm] [ 42.929436] component_del+0xc0/0x180 [ 42.933058] meson_dw_hdmi_remove+0x28/0x40 [meson_dw_hdmi] [ 42.938576] platform_drv_remove+0x38/0x60 [ 42.942628] __device_release_driver+0x190/0x23c [ 42.947198] driver_detach+0xcc/0x160 [ 42.950822] bus_remove_driver+0x68/0xe0 [ 42.954702] driver_unregister+0x3c/0x6c [ 42.958583] platform_driver_unregister+0x20/0x2c [ 42.963243] meson_dw_hdmi_platform_driver_exit+0x18/0x4a8 [meson_dw_hdmi] [ 42.970057] __arm64_sys_delete_module+0x1bc/0x294 [ 42.974801] el0_svc_common.constprop.0+0x80/0x240 [ 42.979542] do_el0_svc+0x30/0xa0 [ 42.982821] el0_svc+0x18/0x50 [ 42.985839] el0_sync_handler+0x198/0x404 [ 42.989806] el0_sync+0x158/0x180 immediatelly followed by [ 43.002296] WARNING: CPU: 0 PID: 572 at drivers/gpu/drm/drm_mode_config.c:504 drm_mode_config_cleanup+0x2a8/0x304 [drm] [...] [ 43.128150] Hardware name: , BIOS 2021.01-rc2-00012-gde865f7ee1 11/16/2020 [ 43.135052] pstate: 8049 (Nzcv daif +PAN -UAO -TCO BTYPE=--) [ 43.141062] pc : drm_mode_config_cleanup+0x2a8/0x304 [drm] [ 43.146492] lr : drm_mode_config_cleanup+0xac/0x304 [drm] [...] 
[ 43.233979] Call trace: [ 43.236451] drm_mode_config_cleanup+0x2a8/0x304 [drm] [ 43.241538] drm_mode_config_init_release+0x1c/0x2c [drm] [ 43.246886] drm_managed_release+0xa8/0x120 [drm] [ 43.251543] drm_dev_put+0x94/0xc0 [drm] [ 43.255380] meson_drv_unbind+0x78/0xa4 [meson_drm] [ 43.260204] component_del+0xc0/0x180 [ 43.263829] meson_dw_hdmi_remove+0x28/0x40 [meson_dw_hdmi] [ 43.269344] platform_drv_remove+0x38/0x60 [ 43.273398] __device_release_driver+0x190/0x23c [ 43.277967] driver_detach+0xcc/0x160 [ 43.281590] bus_remove_driver+0x68/0xe0 [ 43.285471] driver_unregister+0x3c/0x6c [ 43.289352] platform_driver_unregister+0x20/0x2c [ 43.294011] meson_dw_hdmi_platform_driver_exit+0x18/0x4a8 [meson_dw_hdmi] [ 43.300826] __arm64_sys_delete_module+0x1bc/0x294 [ 43.305570] el0_svc_common.constprop.0+0x80/0x240 [ 43.310312] do_el0_svc+0x30/0xa0 [ 43.313590] el0_svc+0x18/0x50 [ 43.316608] el0_sync_handler+0x198/0x404 [ 43.320574] el0_sync+0x158/0x180 [ 43.323852] ---[ end trace d796a3072dab01da ]--- [ 43.328561] [drm:drm_mode_config_cleanup [drm]] *ERROR* connector HDMI-A-1 leaked! both triggered by the fact that the HDMI subsystem is still active, and the DRM removal doesn't result in the connectors being torn down. Call drm_atomic_helper_shutdown() and component_unbind_all() to safely tear the module down. Signed-off-by: Marc Zyngier --- drivers/gpu/drm/meson/meson_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/meson/meson_drv.c b/drivers/gpu/drm/meson/meson_drv.c index 324fa489f1c4..3d1de9cbb1c8 100644 --- a/drivers/gpu/drm/meson/meson_drv.c +++ b/drivers/gpu/drm/meson/meson_drv.c @@ -390,8 +390,10 @@ static void meson_drv_unbind(struct device *dev) } drm_dev_unregister(drm); - drm_irq_uninstall(drm); drm_kms_helper_poll_fini(drm); + drm_atomic_helper_shutdown(drm); + component_unbind_all(dev, drm); + drm_irq_uninstall(drm); drm_dev_put(drm); if (priv->afbcd.ops) { -- 2.28.0
[PATCH 0/4] drm/meson: Module removal fixes
Hi all, Having recently moved over to a top-of-the-tree u-boot on one of my VIM3L systems in order to benefit from unrelated improvements (automatic PCIe detection, EFI...), I faced the issue that my kernel would hang like this: [ OK ] Finished Helper to synchronize boot up for ifupdown. [ OK ] Started Rule-based Manager for Device Events and Files. [7.114516] VDDCPU: supplied by regulator-dummy [ OK ] Found device /dev/ttyAML0. [7.146862] meson-drm ff90.vpu: Queued 2 outputs on vpu [7.169630] fb0: switching to meson-drm-fb from simple [7.169944] Console: switching to colour dummy device 80x25 [7.179250] meson-drm ff90.vpu: CVBS Output connector not available and that's it. After some poking around, I figured out that it is in the meson-dw-hdmi module that the CPU was hanging... Reverting to the kernel DT instead of u-boot's papered over it somehow, but it turned out that removing the module (modprobe -r) resulted in a firework. And for every issue I was fixing, another followed. Much fun for a rainy Monday in the basement! I ended up with the following 4 patches, which solve all my problems: I can now boot with the u-boot provided DT, and the hdmi and DRM drivers can be removed and re-inserted at will. The first patch is a straightforward use-after-free, causing a NULL pointer dereference. Moving things around fixes it. The second patch shows that I have no clue about the DRM subsystem whatsoever. I mimicked what my Rockchip systems are doing, and the two warnings disappeared. It can't completely be wrong (famous last words...). The third patch fixes a *very* common issue with regulators (I've fixed at least 3 drivers with a similar issue). I guess the devm subsystem needs to grow a new helper at some point. The last patch finally fixes the issue I was seeing: the HDMI driver hangs when accessing a register with clocks disabled, which they are on module removal. It also fixes my u-boot booting for similar reasons, I guess. 
I went as far as reaching out for a HDMI cable and verifying that I was getting a working display. Total dedication. Feedback much appreciated. M. Marc Zyngier (4): drm/meson: Free RDMA resources after tearing down DRM drm/meson: Unbind all connectors on module removal drm/meson: dw-hdmi: Register a callback to disable the regulator drm/meson: dw-hdmi: Ensure that clocks are enabled before touching the TOP registers drivers/gpu/drm/meson/meson_drv.c | 12 +++- drivers/gpu/drm/meson/meson_dw_hdmi.c | 13 +++-- 2 files changed, 18 insertions(+), 7 deletions(-) -- 2.28.0
[PATCH 4/4] drm/meson: dw-hdmi: Ensure that clocks are enabled before touching the TOP registers
Removing the meson-dw-hdmi module and re-inserting it results in a hang as the driver writes to HDMITX_TOP_SW_RESET. Similar effects can be seen when booting with mainline u-boot and using the u-boot provided DT (which is highly desirable). The reason for the hang seems to be that the clocks are not always enabled by the time we enter meson_dw_hdmi_init(). Moving this call *after* dw_hdmi_probe() ensures that the clocks are enabled. Signed-off-by: Marc Zyngier --- drivers/gpu/drm/meson/meson_dw_hdmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c b/drivers/gpu/drm/meson/meson_dw_hdmi.c index 68826cf9993f..7f8eea494147 100644 --- a/drivers/gpu/drm/meson/meson_dw_hdmi.c +++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c @@ -1073,8 +1073,6 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, DRM_DEBUG_DRIVER("encoder initialized\n"); - meson_dw_hdmi_init(meson_dw_hdmi); - /* Bridge / Connector */ dw_plat_data->priv_data = meson_dw_hdmi; @@ -1097,6 +1095,8 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, if (IS_ERR(meson_dw_hdmi->hdmi)) return PTR_ERR(meson_dw_hdmi->hdmi); + meson_dw_hdmi_init(meson_dw_hdmi); + next_bridge = of_drm_find_bridge(pdev->dev.of_node); if (next_bridge) drm_bridge_attach(encoder, next_bridge, -- 2.28.0
[PATCH 3/4] drm/meson: dw-hdmi: Register a callback to disable the regulator
Removing the meson-dw-hdmi module results in the following splat: i[ 43.340509] WARNING: CPU: 0 PID: 572 at drivers/regulator/core.c:2125 _regulator_put.part.0+0x16c/0x174 [...] [ 43.454870] CPU: 0 PID: 572 Comm: modprobe Tainted: GW E 5.10.0-rc4-00049-gd274813a4de3-dirty #2147 [ 43.465042] Hardware name: , BIOS 2021.01-rc2-00012-gde865f7ee1 11/16/2020 [ 43.471945] pstate: 8049 (Nzcv daif +PAN -UAO -TCO BTYPE=--) [ 43.477896] pc : _regulator_put.part.0+0x16c/0x174 [ 43.482638] lr : regulator_put+0x44/0x60 [...] [ 43.568715] Call trace: [ 43.571132] _regulator_put.part.0+0x16c/0x174 [ 43.575529] regulator_put+0x44/0x60 [ 43.579067] devm_regulator_release+0x20/0x2c [ 43.583380] release_nodes+0x1c8/0x2b4 [ 43.587087] devres_release_all+0x44/0x6c [ 43.591056] __device_release_driver+0x1a0/0x23c [ 43.595626] driver_detach+0xcc/0x160 [ 43.599249] bus_remove_driver+0x68/0xe0 [ 43.603130] driver_unregister+0x3c/0x6c [ 43.607011] platform_driver_unregister+0x20/0x2c [ 43.611678] meson_dw_hdmi_platform_driver_exit+0x18/0x4a8 [meson_dw_hdmi] [ 43.618485] __arm64_sys_delete_module+0x1bc/0x294 as the HDMI regulator is still enabled on release. In order to address this, register a callback that will deal with the disabling when the driver is unbound, solving the problem. 
Signed-off-by: Marc Zyngier --- drivers/gpu/drm/meson/meson_dw_hdmi.c | 9 + 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c b/drivers/gpu/drm/meson/meson_dw_hdmi.c index 29a8ff41595d..68826cf9993f 100644 --- a/drivers/gpu/drm/meson/meson_dw_hdmi.c +++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c @@ -941,6 +941,11 @@ static void meson_dw_hdmi_init(struct meson_dw_hdmi *meson_dw_hdmi) } +static void meson_disable_regulator(void *data) +{ + regulator_disable(data); +} + static int meson_dw_hdmi_bind(struct device *dev, struct device *master, void *data) { @@ -989,6 +994,10 @@ static int meson_dw_hdmi_bind(struct device *dev, struct device *master, ret = regulator_enable(meson_dw_hdmi->hdmi_supply); if (ret) return ret; + ret = devm_add_action_or_reset(dev, meson_disable_regulator, + meson_dw_hdmi->hdmi_supply); + if (ret) + return ret; } meson_dw_hdmi->hdmitx_apb = devm_reset_control_get_exclusive(dev, -- 2.28.0
Re: [PATCH 1/2] KVM: arm64: vgic: Forbid invalid userspace Redistributor accesses
On 2020-11-16 13:09, Zenghui Yu wrote: Hi Marc, On 2020/11/16 1:04, Marc Zyngier wrote: Hi Zenghui, On 2020-11-13 14:28, Zenghui Yu wrote: It's expected that users will access registers in the redistributor *if* the RD has been initialized properly. Unfortunately userspace can be bogus enough to access registers before setting the RD base address, and KVM implicitly allows it (we handle the access anyway, regardless of whether the base address is set). Bad thing happens when we're handling the user read of GICR_TYPER. We end up with an oops when deferencing the unset rdreg... gpa_t last_rdist_typer = rdreg->base + GICR_TYPER + (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE; Fix this issue by informing userspace what had gone wrong (-ENXIO). I'm worried about the "implicit" aspect of the access that this patch now forbids. The problem is that the existing documentation doesn't cover this case, > and -ENXIO's "Getting or setting this register is not yet supported" is way too vague. Indeed. How about changing to -ENXIO Getting or setting this register is not yet supported or VGIC not properly configured (e.g., [Re]Distributor base address is unknown) Looks OK to me. There is a precedent with the ITS, but that's undocumented as well. Also, how about v2? If that's the wasy we are going to fix this, we also nned to beef up the documentation. Sure, I plan to do so and hope it won't break the existing userspace. Well, at this stage we can only hope. Of course, the other horrible way to address the issue is to return a value that doesn't have the Last bit set, since we can't synthetise it. It doesn't change the userspace API, and I can even find some (admittedly twisted) logic to it (since there is no base address, there is no last RD...). I'm fine with it. But I'm afraid that there might be other issues due to the "unexpected" accesses since I haven't tested with all registers from userspace. 
I have had a look at the weekend, and couldn't see any other GICR register that would suffer from rdreg being NULL. I haven't looked at GICD, but I don't anticipate anything bad on that front. My take is that only if the "[Re]Distributor base address" is specified in the system memory map, will the user-provided kvm_device_attr.offset make sense. And we can then handle the access to the register which is defined by "base address + offset". I'd tend to agree, but it is just that this is a large change at -rc4. I'd rather have a quick fix for 5.10, and a more invasive change for 5.11, spanning all the possible vgic devices. M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 13/24] kvm: arm64: Add CPU entry point in nVHE hyp
On 2020-11-16 11:49, David Brazdil wrote: > #ifdef CONFIG_CPU_PM >DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); > diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S > b/arch/arm64/kvm/hyp/nvhe/hyp-init.S > index 1697d25756e9..f999a35b2c8c 100644 > --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S > +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S > @@ -6,6 +6,7 @@ > > #include > #include > +#include This should probably be included from the file that provides init_el2_state. Agreed. This is a workaround for the fact that the arm-gic* headers don't play nice with each other (define the same constants). Ah, that... Including arm-gic-v3.h in kvm_asm.h will trigger macro redefine warnings in vgic*-v2.c because they include arm-gic.h. Boo. Another option is to create a header just for el2 init. Would that be preferable? Other ideas? Having an asm/el2_setup.h file feels like a better option. After all, it is in no way KVM specific, and having the macros hanging around in asm/kvm_asm.h feels odd. M. -- Jazz is not dead. It just smells funny...
Re: [patch 06/19] arm64: irqstat: Get rid of duplicated declaration
On 2020-11-13 14:02, Thomas Gleixner wrote: irq_cpustat_t is exactly the same as the asm-generic one. Define ack_bad_irq so the generic header does not emit the generic version of it. Signed-off-by: Thomas Gleixner Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: linux-arm-ker...@lists.infradead.org --- arch/arm64/include/asm/hardirq.h |7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -13,11 +13,8 @@ #include #include -typedef struct { - unsigned int __softirq_pending; -} cacheline_aligned irq_cpustat_t; - -#include /* Standard mappings for irq_cpustat_t above */ +#define ack_bad_irq ack_bad_irq +#include #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 Acked-by: Marc Zyngier M. -- Jazz is not dead. It just smells funny...
Re: [PATCH 1/2] KVM: arm64: vgic: Forbid invalid userspace Redistributor accesses
Hi Zenghui, On 2020-11-13 14:28, Zenghui Yu wrote: It's expected that users will access registers in the redistributor *if* the RD has been initialized properly. Unfortunately userspace can be bogus enough to access registers before setting the RD base address, and KVM implicitly allows it (we handle the access anyway, regardless of whether the base address is set). Bad thing happens when we're handling the user read of GICR_TYPER. We end up with an oops when dereferencing the unset rdreg... gpa_t last_rdist_typer = rdreg->base + GICR_TYPER + (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE; Fix this issue by informing userspace what had gone wrong (-ENXIO). I'm worried about the "implicit" aspect of the access that this patch now forbids. The problem is that the existing documentation doesn't cover this case, and -ENXIO's "Getting or setting this register is not yet supported" is way too vague. There is a precedent with the ITS, but that's undocumented as well. Also, how about v2? If that's the way we are going to fix this, we also need to beef up the documentation. Of course, the other horrible way to address the issue is to return a value that doesn't have the Last bit set, since we can't synthesise it. It doesn't change the userspace API, and I can even find some (admittedly twisted) logic to it (since there is no base address, there is no last RD...). Thoughts? M. -- Jazz is not dead. It just smells funny...
Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs
On 2020-11-14 03:37, Alexey Kardashevskiy wrote: What is the easiest way to get irq-hierarchical hardware? I have a bunch of powerpc boxes (no good) but also a raspberry pi, a bunch of 32/64bit orange pi's, an "armada" arm box, thinkpads - is any of this good for the task? If your HW doesn't require an interrupt hierarchy, run VMs! Booting an arm64 guest with virtual PCI devices will result in hierarchies being created (PCI-MSI -> GIC MSI widget -> GIC). Absolutely :) But the beauty of ARM is that one can buy an actual ARM device for 20$, I have "opi one+ allwinner h6 64bit cortex a53 1GB RAM", is it worth using KVM on this device, or is it too small for that? I've run VMs on smaller machines. 256MB of guest RAM is enough to boot a full blown Debian system with PCI devices, and your AW box should be up to the task as long as you run a mainline kernel on it. Please don't add to the pile of junk! You can use KVM, or even bare QEMU on x86 if you are so inclined. Have a QEMU command line handy for x86/tcg? /me digs, as my x86 boxes are overspec'd X terminals these days: Here you go, courtesy of Will: http://cdn.kernel.org/pub/linux/kernel/people/will/docs/qemu/qemu-arm64-howto.html M. -- Jazz is not dead. It just smells funny...
Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs
Hi Alexey, On 2020-11-09 09:46, Alexey Kardashevskiy wrote: PCI devices share 4 legacy INTx interrupts from the same PCI host bridge. Device drivers map/unmap hardware interrupts via irq_create_mapping()/ irq_dispose_mapping(). The problem with that these interrupts are shared and when performing hot unplug, we need to unmap the interrupt only when the last device is released. This reuses already existing irq_desc::kobj for this purpose. The refcounter is naturally 1 when the descriptor is allocated already; this adds kobject_get() in places where already existing mapped virq is returned. This reorganizes irq_dispose_mapping() to release the kobj and let the release callback do the cleanup. As kobject_put() is called directly now (not via RCU), it can also handle the early boot case (irq_kobj_base==NULL) with the help of the kobject::state_in_sysfs flag and without additional irq_sysfs_del(). While at this, clean up the comment at where irq_sysfs_del() was called. Quick grep shows no sign of irq reference counting in drivers. Drivers typically request mapping when probing and dispose it when removing; platforms tend to dispose only if setup failed and the rest seems calling one dispose per one mapping. Except (at least) PPC/pseries which needs https://lkml.org/lkml/2020/10/27/259 Cc: Cédric Le Goater Cc: Marc Zyngier Cc: Michael Ellerman Cc: Qian Cai Cc: Rob Herring Cc: Frederic Barrat Cc: Michal Suchánek Cc: Thomas Gleixner Signed-off-by: Alexey Kardashevskiy --- This is what it is fixing for powerpc: There was a comment about whether hierarchical IRQ domains should contribute to this reference counter and I need some help here as I cannot see why. It is reverse now - IRQs contribute to domain->mapcount and irq_domain_associate/irq_domain_disassociate take necessary steps to keep this counter in order. 
What might be missing is that if we have a cascade of IRQs (as in the IOAPIC example from Documentation/core-api/irq/irq-domain.rst ), then a parent IRQ should contribute to the children IRQs and it is up to irq_domain_ops::alloc/free hooks, and they all seem to be eventually calling irq_domain_alloc_irqs_xxx/irq_domain_free_irqs_xxx which seems right. Documentation/core-api/irq/irq-domain.rst also suggests there is a lot to see in debugfs about IRQs but on my thinkpad there is nothing about hierarchy. So I'll ask again :) What is the easiest way to get irq-hierarchical hardware? I have a bunch of powerpc boxes (no good) but also a raspberry pi, a bunch of 32/64bit orange pi's, an "armada" arm box, thinkpads - is any of this good for the task? If your HW doesn't require an interrupt hierarchy, run VMs! Booting an arm64 guest with virtual PCI devices will result in hierarchies being created (PCI-MSI -> GIC MSI widget -> GIC). You can use KVM, or even bare QEMU on x86 if you are so inclined. I'll try to go through this patch over the week-end (or more probably early next week), and try to understand where our understandings differ. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: iommu/vt-d: Cure VF irqdomain hickup
On 2020-11-12 21:34, Thomas Gleixner wrote: On Thu, Nov 12 2020 at 20:15, Thomas Gleixner wrote: The recent changes to store the MSI irqdomain pointer in struct device missed that Intel DMAR does not register virtual function devices. Due to that a VF device gets the plain PCI-MSI domain assigned and then issues compat MSI messages which get caught by the interrupt remapping unit. Cure that by inheriting the irq domain from the physical function device. That's a temporary workaround. The correct fix is to inherit the irq domain from the bus, but that's a larger effort which needs quite some other changes to the way how x86 manages PCI and MSI domains. Bah, that's not really going to work with the way how irq remapping works on x86 because at least Intel/DMAR can have more than one DMAR unit on a bus. So the alternative solution would be to assign the domain per device, but the current ordering creates a hen and egg problem. Looking the domain up in pci_set_msi_domain() does not work because at that point the device is not registered in the IOMMU. That happens from device_add(). Marc, is there any problem to reorder the calls in pci_device_add(): device_add(); pci_set_msi_domain(); I *think* it works as long as we keep the "match_driver = false" hack. Otherwise, we risk binding to a driver early, and game over. That would allow to add a irq_find_matching_fwspec() based lookup to pci_msi_get_device_domain(). Just so that I understand the issue: is the core of the problem that there is no 1:1 mapping between a PCI bus and a DMAR unit, and no firmware topology information to indicate which one to pick? Though I'm not yet convinced that the outcome would be less horrible than the hack in the DMAR driver when I'm taking all the other horrors of x86 (including XEN) into account :) I tried to follow the notifier into the DMAR driver, ended up in the IRQ remapping code, and lost the will to live. 
I have a question though: In the bus notifier callback, you end-up in dmar_pci_bus_add_dev(), which calls intel_irq_remap_add_device(), which tries to set the MSI domain. Why isn't that enough? Are we still missing any information at that stage? Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH 2/2] irqchip/gic-v3-its: Disable vSGI upon (CPUIF < v4.1) detection
On 2020-11-12 14:40, Lorenzo Pieralisi wrote: On Thu, Nov 12, 2020 at 09:36:10AM +, Marc Zyngier wrote: Hi Lorenzo, On 2020-11-11 16:28, Lorenzo Pieralisi wrote: > GIC CPU interfaces versions predating GIC v4.1 were not built to > accommodate vINTID within the vSGI range; as reported in the GIC > specifications (8.2 "Changes to the CPU interface"), it is > CONSTRAINED UNPREDICTABLE to deliver a vSGI to a PE with > ID_AA64PFR0_EL1.GIC == b0001. Hmmm. This goes against the very reason v4.1 was designed the way it is, which was that all existing implementation supporting GICv4.0 would seamlessly let virtual SGIs in, and it would "just work". If we start enforcing this, I question the very design of the architecture, because we could have done so much better by changing the CPU interface. What has changed in two years? Have you spotted a fundamental problem? Hi Marc, long story short: there are systems being designed with this configuration, vSGIs may or may not work on them, to prevent *potential* misbehaviour I am disabling vSGIs, I am not fixing anything, it is belt and braces. My concern is that if we prevent it, we're going to end-up with quirks allowing it anyway, because people will realise that it actually works. We may wait and fix it *if* this breaks, I would argue though that at that point it is not a quirk since architecturally we know that vSGIs may not work in this configuration. I don't mind either way, as I doubt I'll see this kind of system any time soon. I'm just mildly annoyed at the missed opportunity to do something better... In the meantime, to the meat of the change: > > Check the GIC CPUIF version through the arm64 capabilities > infrastructure and disable vSGIs if a CPUIF version < 4.1 is > detected to prevent using vSGIs on systems where they may > misbehave. 
> > Signed-off-by: Lorenzo Pieralisi > Cc: Marc Zyngier > --- > drivers/irqchip/irq-gic-v3-its.c | 20 +++- > 1 file changed, 19 insertions(+), 1 deletion(-) > > diff --git a/drivers/irqchip/irq-gic-v3-its.c > b/drivers/irqchip/irq-gic-v3-its.c > index 0fec31931e11..6ed4ba60ba7e 100644 > --- a/drivers/irqchip/irq-gic-v3-its.c > +++ b/drivers/irqchip/irq-gic-v3-its.c > @@ -39,6 +39,20 @@ > > #include "irq-gic-common.h" > > +#ifdef CONFIG_ARM64 > +#include > + > +static inline bool gic_cpuif_has_vsgi(void) > +{ > + return cpus_have_const_cap(ARM64_HAS_GIC_CPUIF_VSGI); > +} > +#else > +static inline bool gic_cpuif_has_vsgi(void) > +{ > + return false; > +} > +#endif > + > #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING (1ULL << 0) > #define ITS_FLAGS_WORKAROUND_CAVIUM_22375 (1ULL << 1) > #define ITS_FLAGS_WORKAROUND_CAVIUM_23144 (1ULL << 2) > @@ -5415,7 +5429,11 @@ int __init its_init(struct fwnode_handle > *handle, struct rdists *rdists, >if (has_v4 & rdists->has_vlpis) { >const struct irq_domain_ops *sgi_ops; > > - if (has_v4_1) > + /* > + * Enable vSGIs only if the ITS and the > + * GIC CPUIF support them. > + */ > + if (has_v4_1 && gic_cpuif_has_vsgi()) >sgi_ops = &its_sgi_domain_ops; >else >sgi_ops = NULL; Is that enough? No, I obviously missed the VGIC bits built on top of GICD_TYPER2.nASSGIcap. KVM is still going to expose GICD_TYPER2.nASSGIcap, making things even more confusing for the guest: it will be able to select active-less SGIs via GICD_CTLR.nASSGIreq, and if I'm not mistaken, we'd still try to switch to HW-backed SGIs, leading to some *very* unpleasant things in gic_v4_enable_vsgis(). Yes (AFAICS GICD_TYPER2.nASSGIcap is not in the public specs though, that's why I missed it while vetting architectural state that is affecting vSGIs). You can find it in the errata to the spec (I just checked the October 2020 version). I doubt it is public though, and people have been asking for this update to be published for a while now. 
I should change the logic in vgic_mmio_{uaccess}_write_v3_misc() to handle it properly - to redefine the logic around kvm_vgic_global_state.has_gicv4_1 somehow. You probably need a separate predicate, indicating HW-baked vSGI support. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH 1/2] arm64: cpufeature: Add GIC CPUIF v4.1 detection
On 2020-11-11 16:28, Lorenzo Pieralisi wrote: GIC v4.1 introduced changes to the GIC CPU interface; systems that integrate CPUs that do not support GIC v4.1 features (as reported in the ID_AA64PFR0_EL1.GIC bitfield) and a GIC v4.1 controller must disable in software virtual SGIs support since the CPUIF and GIC controller version mismatch results in CONSTRAINED UNPREDICTABLE behaviour at architectural level. Add a cpufeature and related capability to detect GIC v4.1 CPUIF features so that the GIC driver can probe it to detect GIC CPUIF hardware configuration and take action accordingly. Signed-off-by: Lorenzo Pieralisi Cc: Will Deacon Cc: Catalin Marinas Cc: Marc Zyngier --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/kernel/cpufeature.c | 10 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 42868dbd29fd..35ef0319f422 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -65,7 +65,8 @@ #define ARM64_HAS_ARMv8_4_TTL 55 #define ARM64_HAS_TLB_RANGE56 #define ARM64_MTE 57 +#define ARM64_HAS_GIC_CPUIF_VSGI 58 -#define ARM64_NCAPS58 +#define ARM64_NCAPS59 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index dcc165b3fc04..9eabbaddfe5e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2136,6 +2136,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_mte, }, #endif /* CONFIG_ARM64_MTE */ + { + .desc = "GIC CPUIF virtual SGI", nit: that's not really what this feature is. It only means that the sysreg interface complies to v4.1. Which on its own is totally rubbish, because the sysreg don't change behaviour between 3.0/4.0 and 4.1. 
+ .capability = ARM64_HAS_GIC_CPUIF_VSGI, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .field_pos = ID_AA64PFR0_GIC_SHIFT, + .sign = FTR_UNSIGNED, + .min_field_value = 3, + }, Do we really need a new cap for that? Or can we rely on simply looking at the sanitised feature set? I'm not overly keen on advertising a feature at CPU boot time if we discover later on that we cannot use it because all we have is a non-4.1 GIC. Another thing is that we currently assume that *all* CPUs will be the same at the point where we set up the GIC (we only have a single CPU booted at that point). M. -- Jazz is not dead. It just smells funny...
Re: [PATCH 2/2] irqchip/gic-v3-its: Disable vSGI upon (CPUIF < v4.1) detection
Hi Lorenzo, On 2020-11-11 16:28, Lorenzo Pieralisi wrote: GIC CPU interfaces versions predating GIC v4.1 were not built to accommodate vINTID within the vSGI range; as reported in the GIC specifications (8.2 "Changes to the CPU interface"), it is CONSTRAINED UNPREDICTABLE to deliver a vSGI to a PE with ID_AA64PFR0_EL1.GIC == b0001. Hmmm. This goes against the very reason v4.1 was designed the way it is, which was that all existing implementation supporting GICv4.0 would seamlessly let virtual SGIs in, and it would "just work". If we start enforcing this, I question the very design of the architecture, because we could have done so much better by changing the CPU interface. What has changed in two years? Have you spotted a fundamental problem? My concern is that if we prevent it, we're going to end-up with quirks allowing it anyway, because people will realise that it actually works. In the meantime, to the meat of the change: Check the GIC CPUIF version through the arm64 capabilities infrastructure and disable vSGIs if a CPUIF version < 4.1 is detected to prevent using vSGIs on systems where they may misbehave. 
Signed-off-by: Lorenzo Pieralisi Cc: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 20 +++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0fec31931e11..6ed4ba60ba7e 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -39,6 +39,20 @@ #include "irq-gic-common.h" +#ifdef CONFIG_ARM64 +#include + +static inline bool gic_cpuif_has_vsgi(void) +{ + return cpus_have_const_cap(ARM64_HAS_GIC_CPUIF_VSGI); +} +#else +static inline bool gic_cpuif_has_vsgi(void) +{ + return false; +} +#endif + #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING (1ULL << 0) #define ITS_FLAGS_WORKAROUND_CAVIUM_22375 (1ULL << 1) #define ITS_FLAGS_WORKAROUND_CAVIUM_23144 (1ULL << 2) @@ -5415,7 +5429,11 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, if (has_v4 & rdists->has_vlpis) { const struct irq_domain_ops *sgi_ops; - if (has_v4_1) + /* +* Enable vSGIs only if the ITS and the +* GIC CPUIF support them. +*/ + if (has_v4_1 && gic_cpuif_has_vsgi()) sgi_ops = &its_sgi_domain_ops; else sgi_ops = NULL; Is that enough? KVM is still going to expose GICD_TYPER2.nASSGIcap, making things even more confusing for the guest: it will be able to select active-less SGIs via GICD_CTLR.nASSGIreq, and if I'm not mistaken, we'd still try to switch to HW-backed SGIs, leading to some *very* unpleasant things in gic_v4_enable_vsgis(). Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 00/24] Opt-in always-on nVHE hypervisor
Hi David, On 2020-11-09 11:32, David Brazdil wrote: As we progress towards being able to keep guest state private to the host running nVHE hypervisor, this series allows the hypervisor to install itself on newly booted CPUs before the host is allowed to run on them. All functionality described below is opt-in, guarded by an early param 'kvm-arm.protected'. Future patches specific to the new "protected" mode should be hidden behind the same param. The hypervisor starts trapping host SMCs and intercepting host's PSCI CPU_ON/OFF/SUSPEND calls. It replaces the host's entry point with its own, initializes the EL2 state of the new CPU and installs the nVHE hyp vector before ERETing to the host's entry point. The kernel checks new cores' features against the finalized system capabilities. To avoid the need to move this code/data to EL2, the implementation only allows to boot cores that were online at the time of KVM initialization and therefore had been checked already. Other PSCI SMCs are forwarded to EL3, though only the known set of SMCs implemented in the kernel is allowed. Non-PSCI SMCs are also forwarded to EL3. Future changes will need to ensure the safety of all SMCs wrt. private guests. The host is still allowed to reset EL2 back to the stub vector, eg. for hibernation or kexec, but will not disable nVHE when there are no VMs. Tested on Rock Pi 4b, based on 5.10-rc3. I think I've gone through most of the patches. When you respin this series, you may want to do so on top of my host EL2 entry rework [1], which change a few things you currently rely on. If anything in there doesn't work for you, please let me know. Thanks, M. [1] https://lore.kernel.org/kvm/20201109175923.445945-1-...@kernel.org/ -- Jazz is not dead. It just smells funny...
Re: [PATCH v2 3/5] gpio: msc313: MStar MSC313 GPIO driver
On 2020-11-11 14:09, Linus Walleij wrote: On Tue, Nov 10, 2020 at 3:19 PM Marc Zyngier wrote: On 2020-11-10 14:02, Linus Walleij wrote: >> Probably nothing more than setting the callback to >> irq_chip_set_affinity_parent, > > Hm, is this something all GPIO irqchips used on SMP systems > should be doing? Or just hierarchical ones? Probably only the hierarchical ones. I'd expect the non-hierarchical GPIOs to be muxed behind a single interrupt, which makes it impossible to move a single GPIO around, and moving the mux interrupt would break userspace's expectations that interrupts move independently of each others. I found two suspects and sent patches. I think I might have some more candidates down in pinctrl. I do have some hierarchical IRQ that is on UP systems, I suppose these are not affected. Yup, they look good. Feel free to add my Ack to them. And yes, UP systems are naturally oblivious of interrupt affinity. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 07/24] kvm: arm64: Create nVHE copy of cpu_logical_map
On 2020-11-11 13:45, David Brazdil wrote: On Wed, Nov 11, 2020 at 01:29:29PM +, Marc Zyngier wrote: On 2020-11-11 13:03, David Brazdil wrote: > > > +/* > > > + * nVHE copy of data structures tracking available CPU cores. > > > + * Only entries for CPUs that were online at KVM init are populated. > > > + * Other CPUs should not be allowed to boot because their features were > > > + * not checked against the finalized system capabilities. > > > + */ > > > +u64 __ro_after_init __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] > > > = INVALID_HWID }; > > > > I'm not sure what __ro_after_init means once we get S2 isolation. > > It is stretching the definition of 'init' a bit, I know, but I don't see > what > your worry is about S2? The intention is to mark this read-only for > .hyp.text > at runtime. With S2, the host won't be able to write to it after KVM > init. > Obviously that's currently not the case. More importantly, EL2 can write to it at any time, which is the bit I'm worried about, as it makes the annotation misleading. EL2 can't, at least not accidentally. The hyp memory mapping is PAGE_HYP_RO (see patch 05). Ah, I obviously overlooked that. Thanks for setting me straight. Shouldn't clash with include files. Where fixing the kernel might clash is all the users of for_each_*_cpu that use an int for the iterator var. I don't think that's a problem (nobody expects that many CPUs). But if you are confident that we don't have a problem, no need to change the kernel itself. M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 16/24] kvm: arm64: Add offset for hyp VA <-> PA conversion
On 2020-11-09 11:32, David Brazdil wrote: Add a host-initialized constant to KVM nVHE hyp code for converting between EL2 linear map virtual addresses and physical addresses. Also add `__hyp_pa` macro that performs the conversion. Signed-off-by: David Brazdil --- arch/arm64/kvm/arm.c | 15 +++ arch/arm64/kvm/hyp/nvhe/psci.c | 3 +++ 2 files changed, 18 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 28e3bc056225..dc7d43d7785a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1484,6 +1484,20 @@ static inline void hyp_cpu_pm_exit(void) } #endif +static void init_hyp_physvirt_offset(void) +{ + extern s64 kvm_nvhe_sym(hyp_physvirt_offset); + unsigned long kern_vaddr, hyp_vaddr, paddr; + + /* Check that kvm_arm_hyp_percpu_base has been set. */ + BUG_ON(kvm_arm_hyp_percpu_base[0] == 0); Why is this dependent on the percpu base? Or is that just a convenient symbol? + + kern_vaddr = kvm_arm_hyp_percpu_base[0]; + hyp_vaddr = kern_hyp_va(kern_vaddr); + paddr = __pa(kern_vaddr); + CHOOSE_NVHE_SYM(hyp_physvirt_offset) = (s64)paddr - (s64)hyp_vaddr; +} It feels like this offset could be set at the point where we compute the hyp_va offset in va_layout.c, couldn't it? It would have the advantage of keeping all the ugly VA hacks together. + static void init_cpu_logical_map(void) { extern u64 kvm_nvhe_sym(__cpu_logical_map)[NR_CPUS]; @@ -1688,6 +1702,7 @@ static int init_hyp_mode(void) } } + init_hyp_physvirt_offset(); init_cpu_logical_map(); init_psci(); diff --git a/arch/arm64/kvm/hyp/nvhe/psci.c b/arch/arm64/kvm/hyp/nvhe/psci.c index 82d3b2c89658..b0b5df590ba5 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci.c +++ b/arch/arm64/kvm/hyp/nvhe/psci.c @@ -16,6 +16,9 @@ /* Config options set by the host. 
*/ u32 kvm_host_psci_version = PSCI_VERSION(0, 0); u32 kvm_host_psci_function_id[PSCI_FN_MAX]; +s64 hyp_physvirt_offset; + +#define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset) static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) { Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 07/24] kvm: arm64: Create nVHE copy of cpu_logical_map
On 2020-11-11 13:03, David Brazdil wrote: > +/* > + * nVHE copy of data structures tracking available CPU cores. > + * Only entries for CPUs that were online at KVM init are populated. > + * Other CPUs should not be allowed to boot because their features were > + * not checked against the finalized system capabilities. > + */ > +u64 __ro_after_init __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] > = INVALID_HWID }; I'm not sure what __ro_after_init means once we get S2 isolation. It is stretching the definition of 'init' a bit, I know, but I don't see what your worry is about S2? The intention is to mark this read-only for .hyp.text at runtime. With S2, the host won't be able to write to it after KVM init. Obviously that's currently not the case. More importantly, EL2 can write to it at any time, which is the bit I'm worried about, as it makes the annotation misleading. One thing we might change in the future is marking it RW for some initial setup in a HVC handler, then marking it RO for the rest of uptime. That'd be a desirable outcome, and it would be consistent with the rest of the kernel. > + > +u64 cpu_logical_map(int cpu) nit: is there any reason why "cpu" cannot be unsigned? The thought of a negative CPU number makes me shiver... Same here. That's how it's defined in kernel proper, so I went with that. I'm happy to deviate from the kernel (give the function a different name if this clashes with existing include files). We can also fix the rest of the kernel (I've just written the trivial patch). > +{ > + if (cpu < 0 || cpu >= ARRAY_SIZE(__cpu_logical_map)) > + hyp_panic(); > + > + return __cpu_logical_map[cpu]; > +} > + > unsigned long __hyp_per_cpu_offset(unsigned int cpu) > { >unsigned long *cpu_base_array; Overall, this patch would make more sense closer to its use case (in patch 19). I also don't understand why this lives in percpu.c... I didn't think it called for adding another C file for this. How about we rename this file to smp.c?
Would that make sense for both? Make that hyp-smp.c, please! M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 15/24] kvm: arm64: Bootstrap PSCI SMC handler in nVHE EL2
On 2020-11-09 11:32, David Brazdil wrote: Add a handler of PSCI SMCs in nVHE hyp code. The handler is initialized with the version used by the host's PSCI driver and the function IDs it was configured with. If the SMC function ID matches one of the configured PSCI calls (for v0.1) or falls into the PSCI function ID range (for v0.2+), the SMC is handled by the PSCI handler. For now, all SMCs return PSCI_RET_NOT_SUPPORTED. Signed-off-by: David Brazdil --- arch/arm64/include/asm/kvm_hyp.h | 4 ++ arch/arm64/kvm/arm.c | 13 arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 4 ++ arch/arm64/kvm/hyp/nvhe/psci.c | 102 + include/uapi/linux/psci.h | 1 + 6 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kvm/hyp/nvhe/psci.c diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index a3289071f3d8..95a2bbbcc7e1 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -96,6 +96,10 @@ void deactivate_traps_vhe_put(void); u64 __guest_enter(struct kvm_vcpu *vcpu); +#ifdef __KVM_NVHE_HYPERVISOR__ +bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt); +#endif + void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1a57b6025937..28e3bc056225 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -1498,6 +1499,17 @@ static void init_cpu_logical_map(void) CHOOSE_NVHE_SYM(__cpu_logical_map)[cpu] = cpu_logical_map(cpu); } +static void init_psci(void) nit: init_psci_relay? 
+{ + extern u32 kvm_nvhe_sym(kvm_host_psci_version); + extern u32 kvm_nvhe_sym(kvm_host_psci_function_id)[PSCI_FN_MAX]; + int i; + + CHOOSE_NVHE_SYM(kvm_host_psci_version) = psci_driver_version(); + for (i = 0; i < PSCI_FN_MAX; ++i) + CHOOSE_NVHE_SYM(kvm_host_psci_function_id)[i] = psci_get_function_id(i); +} + static int init_common_resources(void) { return kvm_set_ipa_limit(); @@ -1677,6 +1689,7 @@ static int init_hyp_mode(void) } init_cpu_logical_map(); + init_psci(); return 0; diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index c45f440cce51..647b63337a51 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -7,7 +7,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ -hyp-main.o percpu.o +hyp-main.o percpu.o psci.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 8661bc7deaa9..69f34d4f2773 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -134,6 +134,10 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) */ skip_host_instruction(); + /* Try to handle host's PSCI SMCs. */ + if (kvm_host_psci_handler(host_ctxt)) + return; + /* Forward SMC not handled in EL2 to EL3. */ forward_host_smc(host_ctxt); } diff --git a/arch/arm64/kvm/hyp/nvhe/psci.c b/arch/arm64/kvm/hyp/nvhe/psci.c new file mode 100644 index ..82d3b2c89658 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/psci.c nit: can we please name this psci-relay.c, or psci-proxy.c? We already have a psci.c in the tree, and having the same file name messes with my editor... 
;-) @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google LLC + * Author: David Brazdil + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Config options set by the host. */ +u32 kvm_host_psci_version = PSCI_VERSION(0, 0); +u32 kvm_host_psci_function_id[PSCI_FN_MAX]; + +static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) +{ + return host_ctxt->regs.regs[0]; +} + +static bool is_psci_0_1_call(u64 func_id) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(kvm_host_psci_function_id); ++i) { + if (func_id == kvm_host_psci_function_id[i]) + return true; + } + return false; +} + +static bool is_psci_0_2_fn_call(u64 func_id) +{ + u64 base = func_id & ~PSCI_0_2_FN_ID_MASK; + + return base == PSCI_0_2_FN_BASE || base == PSCI_0_2_FN64_BASE; I couldn't spot in the spec where PSCI reserves 16bit worth of IDs in each range. +} + +static bool is_psci_call(u64 func_id) +{ + if (kvm_host_psci_version == PSCI_VERSION(0, 0)
Re: [PATCH v1 06/24] kvm: arm64: Support per_cpu_ptr in nVHE hyp code
On 2020-11-11 12:32, David Brazdil wrote: > + > + cpu_base_array = kern_hyp_va(&kvm_arm_hyp_percpu_base[0]); There is no guarantee that this will not generate a PC relative addressing, resulting in kern_hyp_va() being applied twice. Consider using hyp_symbol_addr() instead, which always does the right by forcing a PC relative addressing and not subsequently mangling the address. > + this_cpu_base = kern_hyp_va(cpu_base_array[cpu]); > + return this_cpu_base - (unsigned long)&__per_cpu_start; And this is the opposite case: if the compiler generates an absolute address, you're toast. Yes, this is just as unlikely, but hey... Same remedy should apply. Good point, and I'll probably keep forgetting about this in the future. Now that all .hyp.text is only executed under hyp page tables, should we start thinking about fixing up the relocations? Why not, if you can deal with the hypervisor text being mapped at a random location, and make sure that the kernel doesn't process the relocations for you. This would certainly save us a lot of runtime offsetting (which I'm adding to in a separate series). M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 11/24] kvm: arm64: Add SMC handler in nVHE EL2
On 2020-11-09 11:32, David Brazdil wrote: Add handler of host SMCs in KVM nVHE trap handler. Forward all SMCs to EL3 and propagate the result back to EL1. This is done in preparation for validating host SMCs in KVM nVHE protected mode. Signed-off-by: David Brazdil --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 35 ++ 1 file changed, 35 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 19332c20fcde..8661bc7deaa9 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -106,6 +106,38 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) host_ctxt->regs.regs[1] = ret; } +static void skip_host_instruction(void) +{ + write_sysreg_el2(read_sysreg_el2(SYS_ELR) + 4, SYS_ELR); +} + +static void forward_host_smc(struct kvm_cpu_context *host_ctxt) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(host_ctxt->regs.regs[0], host_ctxt->regs.regs[1], + host_ctxt->regs.regs[2], host_ctxt->regs.regs[3], + host_ctxt->regs.regs[4], host_ctxt->regs.regs[5], + host_ctxt->regs.regs[6], host_ctxt->regs.regs[7], + &res); How do you make sure that EL3 actually supports SMCCC 1.1? If that's not the case, don't we risk additional registers being corrupted? What of SMCCC 1.2 calls that extend arguments to up to x17? + host_ctxt->regs.regs[0] = res.a0; + host_ctxt->regs.regs[1] = res.a1; + host_ctxt->regs.regs[2] = res.a2; + host_ctxt->regs.regs[3] = res.a3; +} + +static void handle_host_smc(struct kvm_cpu_context *host_ctxt) +{ + /* +* Unlike HVC, the return address of an SMC is the instruction's PC. +* Move the return address past the instruction. +*/ + skip_host_instruction(); + + /* Forward SMC not handled in EL2 to EL3. */ + forward_host_smc(host_ctxt); nit: In general, we update the PC *after* emulating the instruction that trapped. Not a big deal, but it makes it easier to reason across the code base. 
+} + void handle_trap(struct kvm_cpu_context *host_ctxt) { u64 esr = read_sysreg_el2(SYS_ESR); @@ -114,6 +146,9 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_HVC64: handle_host_hcall(host_ctxt); break; + case ESR_ELx_EC_SMC64: + handle_host_smc(host_ctxt); + break; default: hyp_panic(); } Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 13/24] kvm: arm64: Add CPU entry point in nVHE hyp
On 2020-11-09 11:32, David Brazdil wrote: When nVHE hyp starts interception host's PSCI CPU_ON SMCs, it will need to install KVM on the newly booted CPU before returning to the host. Add an entry point which expects the same kvm_nvhe_init_params struct as the __kvm_hyp_init HVC in the CPU_ON context argument (x0). The entry point initializes EL2 state with the same init_el2_state macro used by the kernel's entry point. It then initializes KVM using the same helper function used in the __kvm_hyp_init HVC. When done, the entry point branches to a function provided in the init params. Signed-off-by: David Brazdil --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kernel/asm-offsets.c| 1 + arch/arm64/kvm/hyp/nvhe/hyp-init.S | 30 ++ 3 files changed, 32 insertions(+) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 893327d1e449..efb4872bb29f 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -155,6 +155,7 @@ struct kvm_nvhe_init_params { unsigned long tpidr_el2; unsigned long hyp_stack_ptr; unsigned long vector_ptr; + unsigned long psci_cpu_entry_fn; }; /* Translate a kernel address @ptr into its equivalent linear mapping */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 0cbb86135c7c..ffc84e68ad97 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -114,6 +114,7 @@ int main(void) DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2)); DEFINE(NVHE_INIT_STACK_PTR, offsetof(struct kvm_nvhe_init_params, hyp_stack_ptr)); DEFINE(NVHE_INIT_VECTOR_PTR, offsetof(struct kvm_nvhe_init_params, vector_ptr)); + DEFINE(NVHE_INIT_PSCI_CPU_ENTRY_FN, offsetof(struct kvm_nvhe_init_params, psci_cpu_entry_fn)); #endif #ifdef CONFIG_CPU_PM DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 1697d25756e9..f999a35b2c8c 100644 --- 
a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -6,6 +6,7 @@ #include #include +#include This should probably be included from the file that provides init_el2_state. #include #include @@ -159,6 +160,35 @@ alternative_else_nop_endif ret SYM_CODE_END(___kvm_hyp_init) +SYM_CODE_START(__kvm_hyp_cpu_entry) + msr SPsel, #1 // We want to use SP_EL{1,2} + + /* +* Check that the core was booted in EL2. Loop indefinitely if not + * because it cannot be safely given to the host without installing KVM. +*/ + mrs x1, CurrentEL + cmp x1, #CurrentEL_EL2 + b.ne. This is a bit brutal. Consider using a WFE/WFI loop as we have in other places already (see __secondary_too_slow for example). + + /* Initialize EL2 CPU state to sane values. */ + mov x29, x0 + init_el2_state nvhe + mov x0, x29 + + /* + * Load hyp VA of C entry function. Must do so before switching on the + * MMU because the struct pointer is PA and not identity-mapped in hyp. +*/ + ldr x29, [x0, #NVHE_INIT_PSCI_CPU_ENTRY_FN] + + /* Enable MMU, set vectors and stack. */ + bl ___kvm_hyp_init + + /* Leave idmap. */ + br x29 To a point I made against an earlier patch: psci_cpu_entry_fn seems to be a HYP VA, and really needs to be documented as such, because this is pretty hard to follow otherwise. +SYM_CODE_END(__kvm_hyp_cpu_entry) + SYM_CODE_START(__kvm_handle_stub_hvc) cmp x0, #HVC_SOFT_RESTART b.ne1f Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH] arm64: tegra186: Add missing CPU PMUs
On 2020-11-10 18:22, Thierry Reding wrote: On Tue, Nov 10, 2020 at 06:08:31PM +, Marc Zyngier wrote: On 2020-11-10 17:36, Thierry Reding wrote: > On Tue, Oct 13, 2020 at 10:58:51AM +0100, Marc Zyngier wrote: > > Add the description of CPU PMUs for both the Denver and A57 clusters, > > which enables the perf subsystem. > > > > Signed-off-by: Marc Zyngier [...] > > > > + pmu_denver { > > + compatible = "nvidia,denver-pmu", "arm,armv8-pmuv3"; > > checkpatch complains that this isn't documented. Did I miss the DT > bindings patch or do we not have one for this? We don't. But I don't think adding a compatible string for each and every micro-architecture makes much sense unless we have something useful to add to that compatible string. Such as a full description of the implementation specific events. I'm wondering if this isn't going to upset Rob's json-schema bot and make him mad. Rob going mad? Never! ;-) If you *really* want it, I'll respin this patch with the Denver compatible added to Documentation/devicetree/bindings/arm/pmu.yaml. M. -- Jazz is not dead. It just smells funny...
Re: [PATCH] arm64: tegra186: Add missing CPU PMUs
On 2020-11-10 17:36, Thierry Reding wrote: On Tue, Oct 13, 2020 at 10:58:51AM +0100, Marc Zyngier wrote: Add the description of CPU PMUs for both the Denver and A57 clusters, which enables the perf subsystem. Signed-off-by: Marc Zyngier [...] + pmu_denver { + compatible = "nvidia,denver-pmu", "arm,armv8-pmuv3"; checkpatch complains that this isn't documented. Did I miss the DT bindings patch or do we not have one for this? We don't. But I don't think adding a compatible string for each and every micro-architecture makes much sense unless we have something useful to add to that compatible string. Such as a full description of the implementation specific events. Thanks, M.
Re: [PATCH v1 10/24] kvm: arm64: Extract parts of el2_setup into a macro
On 2020-11-09 11:32, David Brazdil wrote: When the a CPU is booted in EL2, the kernel checks for VHE support and initializes the CPU core accordingly. For nVHE it also installs the stub vectors and drops down to EL1. Once KVM gains the ability to boot cores without going through the kernel entry point, it will need to initialize the CPU the same way. Extract the relevant bits of el2_setup into init_el2_state macro with an argument specifying whether to initialize for VHE or nVHE. No functional change. Size of el2_setup increased by 148 bytes due to duplication. Signed-off-by: David Brazdil --- arch/arm64/include/asm/kvm_asm.h | 128 arch/arm64/kernel/head.S | 140 +++ 2 files changed, 141 insertions(+), 127 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index a49a87a186c3..893327d1e449 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -331,6 +331,134 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; msr sp_el0, \tmp .endm +.macro init_el2_state mode + +.ifnes "\mode", "vhe" +.ifnes "\mode", "nvhe" +.error "Invalid 'mode' argument" +.endif +.endif + + mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) + msr sctlr_el2, x0 + isb + + /* +* Allow Non-secure EL1 and EL0 to access physical timer and counter. +* This is not necessary for VHE, since the host kernel runs in EL2, + * and EL0 accesses are configured in the later stage of boot process. + * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout + * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined + * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1 +* to transparently mess with the EL0 bits via CNTKCTL_EL1 access in +* EL2. 
+*/ +.ifeqs "\mode", "nvhe" + mrs x0, cnthctl_el2 + orr x0, x0, #3 // Enable EL1 physical timers + msr cnthctl_el2, x0 +.endif + msr cntvoff_el2, xzr// Clear virtual offset + +#ifdef CONFIG_ARM_GIC_V3 + /* GICv3 system register access */ + mrs x0, id_aa64pfr0_el1 + ubfxx0, x0, #ID_AA64PFR0_GIC_SHIFT, #4 + cbz x0, 3f + + mrs_s x0, SYS_ICC_SRE_EL2 + orr x0, x0, #ICC_SRE_EL2_SRE// Set ICC_SRE_EL2.SRE==1 + orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1 + msr_s SYS_ICC_SRE_EL2, x0 + isb // Make sure SRE is now set + mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back, + tbz x0, #0, 3f // and check that it sticks + msr_s SYS_ICH_HCR_EL2, xzr// Reset ICC_HCR_EL2 to defaults +3: +#endif + + /* Populate ID registers. */ + mrs x0, midr_el1 + mrs x1, mpidr_el1 + msr vpidr_el2, x0 + msr vmpidr_el2, x1 I don't think this has any effect on VHE, and could be lumped together with the nVHE code. + +#ifdef CONFIG_COMPAT + msr hstr_el2, xzr // Disable CP15 traps to EL2 +#endif + + /* EL2 debug */ + mrs x1, id_aa64dfr0_el1 + sbfxx0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 + cmp x0, #1 + b.lt4f // Skip if no PMU present + mrs x0, pmcr_el0// Disable debug access traps + ubfxx0, x0, #11, #5 // to EL2 and allow access to +4: + cselx3, xzr, x0, lt // all PMU counters from EL1 + + /* Statistical profiling */ + ubfxx0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 + cbz x0, 7f // Skip if SPE not present +.ifeqs "\mode", "nvhe" + mrs_s x4, SYS_PMBIDR_EL1 // If SPE available at EL2, + and x4, x4, #(1 << SYS_PMBIDR_EL1_P_SHIFT) + cbnzx4, 5f // then permit sampling of physical + mov x4, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \ + 1 << SYS_PMSCR_EL2_PA_SHIFT) + msr_s SYS_PMSCR_EL2, x4 // addresses and physical counter +5: + mov x1, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) + orr x3, x3, x1 // If we don't have VHE, then + b 7f // use EL1&0 translation. +.endif + orr x3, x3, #MDCR_EL2_TPMS // and disable access from EL1 This orr would probably be better placed in an "else" close to the previous macro. 
And since you are making things "modular", why not go the extra mile, and define macros for each functionality that differ between modes? It would certainly make this change more digestible, and the result more readable. +7: + msr mdcr_el2, x3// Configure debug traps + +
Re: [PATCH v1 08/24] kvm: arm64: Move hyp-init params to a per-CPU struct
On 2020-11-09 11:32, David Brazdil wrote: Once we start initializing KVM on newly booted cores before the rest of the kernel, parameters to __do_hyp_init will need to be provided by EL2 rather than EL1. At that point it will not be possible to pass its four arguments directly because PSCI_CPU_ON only supports one context argument. Refactor __do_hyp_init to accept its parameters in a struct. This prepares the code for KVM booting cores as well as removes any limits on the number of __do_hyp_init arguments. Signed-off-by: David Brazdil --- arch/arm64/include/asm/kvm_asm.h | 7 +++ arch/arm64/include/asm/kvm_hyp.h | 4 arch/arm64/kernel/asm-offsets.c| 4 arch/arm64/kvm/arm.c | 26 ++ arch/arm64/kvm/hyp/nvhe/hyp-init.S | 21 ++--- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 2 ++ 6 files changed, 41 insertions(+), 23 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 54387ccd1ab2..a49a87a186c3 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -150,6 +150,13 @@ extern void *__vhe_undefined_symbol; #endif +struct kvm_nvhe_init_params { + phys_addr_t pgd_ptr; + unsigned long tpidr_el2; + unsigned long hyp_stack_ptr; + unsigned long vector_ptr; +}; Please add some documentation here, specially indicating what address space the values are relative to. 
+ /* Translate a kernel address @ptr into its equivalent linear mapping */ #define kvm_ksym_ref(ptr) \ ({ \ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 6b664de5ec1f..a3289071f3d8 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -15,6 +15,10 @@ DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); +#ifdef __KVM_NVHE_HYPERVISOR__ +DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); +#endif + #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg;\ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7d32fc959b1a..0cbb86135c7c 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -110,6 +110,10 @@ int main(void) DEFINE(CPU_APGAKEYLO_EL1,offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1])); DEFINE(HOST_CONTEXT_VCPU,offsetof(struct kvm_cpu_context, __hyp_running_vcpu)); DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt)); + DEFINE(NVHE_INIT_PGD_PTR, offsetof(struct kvm_nvhe_init_params, pgd_ptr)); + DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2)); + DEFINE(NVHE_INIT_STACK_PTR, offsetof(struct kvm_nvhe_init_params, hyp_stack_ptr)); + DEFINE(NVHE_INIT_VECTOR_PTR, offsetof(struct kvm_nvhe_init_params, vector_ptr)); #endif #ifdef CONFIG_CPU_PM DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b85b4294b72d..1a57b6025937 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -50,6 +50,7 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); @@ -1331,10 +1332,7 @@ static int 
kvm_map_vectors(void) static void cpu_init_hyp_mode(void) { - phys_addr_t pgd_ptr; - unsigned long hyp_stack_ptr; - unsigned long vector_ptr; - unsigned long tpidr_el2; + struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params); struct arm_smccc_res res; /* Switch from the HYP stub to our own HYP init vector */ @@ -1345,13 +1343,18 @@ static void cpu_init_hyp_mode(void) * kernel's mapping to the linear mapping, and store it in tpidr_el2 * so that we can use adr_l to access per-cpu variables in EL2. */ - tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - - (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); + params->tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - + (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); - pgd_ptr = kvm_mmu_get_httbr(); - hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; - hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr); - vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector)); + para
Re: [PATCH v1 07/24] kvm: arm64: Create nVHE copy of cpu_logical_map
On 2020-11-09 11:32, David Brazdil wrote: When KVM starts validating host's PSCI requests, it will need to map MPIDR back to the CPU ID. To this end, copy cpu_logical_map into nVHE hyp memory when KVM is initialized. Only copy the information for CPUs that are online at the point of KVM initialization so that KVM rejects CPUs whose features were not checked against the finalized capabilities. Signed-off-by: David Brazdil --- arch/arm64/kvm/arm.c | 17 + arch/arm64/kvm/hyp/nvhe/percpu.c | 16 2 files changed, 33 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9ba9db2aa7f8..b85b4294b72d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1481,6 +1481,21 @@ static inline void hyp_cpu_pm_exit(void) } #endif +static void init_cpu_logical_map(void) +{ + extern u64 kvm_nvhe_sym(__cpu_logical_map)[NR_CPUS]; + int cpu; + + /* +* Copy the MPIDR <-> logical CPU ID mapping to hyp. +* Only copy the set of online CPUs whose features have been chacked +* against the finalized system capabilities. The hypervisor will not +* allow any other CPUs from the `possible` set to boot. +*/ + for_each_online_cpu(cpu) + CHOOSE_NVHE_SYM(__cpu_logical_map)[cpu] = cpu_logical_map(cpu); +} + static int init_common_resources(void) { return kvm_set_ipa_limit(); @@ -1659,6 +1674,8 @@ static int init_hyp_mode(void) } } + init_cpu_logical_map(); + return 0; out_err: diff --git a/arch/arm64/kvm/hyp/nvhe/percpu.c b/arch/arm64/kvm/hyp/nvhe/percpu.c index 5fd0c5696907..d0b9dbc2df45 100644 --- a/arch/arm64/kvm/hyp/nvhe/percpu.c +++ b/arch/arm64/kvm/hyp/nvhe/percpu.c @@ -8,6 +8,22 @@ #include #include +/* + * nVHE copy of data structures tracking available CPU cores. + * Only entries for CPUs that were online at KVM init are populated. + * Other CPUs should not be allowed to boot because their features were + * not checked against the finalized system capabilities. + */ +u64 __ro_after_init __cpu_logical_map[NR_CPUS] = { [0 ... 
NR_CPUS-1] = INVALID_HWID }; I'm not sure what __ro_after_init means once we get S2 isolation. + +u64 cpu_logical_map(int cpu) nit: is there any reason why "cpu" cannot be unsigned? The thought of a negative CPU number makes me shiver... +{ + if (cpu < 0 || cpu >= ARRAY_SIZE(__cpu_logical_map)) + hyp_panic(); + + return __cpu_logical_map[cpu]; +} + unsigned long __hyp_per_cpu_offset(unsigned int cpu) { unsigned long *cpu_base_array; Overall, this patch would make more sense closer it its use case (in patch 19). I also don't understand why this lives in percpu.c... Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 06/24] kvm: arm64: Support per_cpu_ptr in nVHE hyp code
On 2020-11-09 11:32, David Brazdil wrote: When compiling with __KVM_NVHE_HYPERVISOR__ redefine per_cpu_offset() to __hyp_per_cpu_offset() which looks up the base of the nVHE per-CPU region of the given cpu and computes its offset from the .hyp.data..percpu section. This enables use of per_cpu_ptr() helpers in nVHE hyp code. Until now only this_cpu_ptr() was supported by setting TPIDR_EL2. Signed-off-by: David Brazdil --- arch/arm64/include/asm/percpu.h | 6 ++ arch/arm64/kernel/image-vars.h | 3 +++ arch/arm64/kvm/hyp/nvhe/Makefile | 3 ++- arch/arm64/kvm/hyp/nvhe/percpu.c | 22 ++ 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kvm/hyp/nvhe/percpu.c diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 1599e17379d8..8f1661603b78 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -239,6 +239,12 @@ PERCPU_RET_OP(add, add, ldadd) #define this_cpu_cmpxchg_8(pcp, o, n) \ _pcp_protect_return(cmpxchg_relaxed, pcp, o, n) +#ifdef __KVM_NVHE_HYPERVISOR__ +extern unsigned long __hyp_per_cpu_offset(unsigned int cpu); +#define __per_cpu_offset +#define per_cpu_offset(cpu)__hyp_per_cpu_offset((cpu)) +#endif + #include /* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index c615b285ff5b..78a42a7cdb72 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -103,6 +103,9 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities); KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); +/* Array containing bases of nVHE per-CPU memory regions. 
*/ +KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); + #endif /* CONFIG_KVM */ #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index ddde15fe85f2..c45f440cce51 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -6,7 +6,8 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o +obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ +hyp-main.o percpu.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o diff --git a/arch/arm64/kvm/hyp/nvhe/percpu.c b/arch/arm64/kvm/hyp/nvhe/percpu.c new file mode 100644 index ..5fd0c5696907 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/percpu.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google LLC + * Author: David Brazdil + */ + +#include +#include +#include + +unsigned long __hyp_per_cpu_offset(unsigned int cpu) +{ + unsigned long *cpu_base_array; + unsigned long this_cpu_base; + + if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base)) + hyp_panic(); + + cpu_base_array = kern_hyp_va(&kvm_arm_hyp_percpu_base[0]); There is no guarantee that this will not generate a PC relative addressing, resulting in kern_hyp_va() being applied twice. Consider using hyp_symbol_addr() instead, which always does the right by forcing a PC relative addressing and not subsequently mangling the address. + this_cpu_base = kern_hyp_va(cpu_base_array[cpu]); + return this_cpu_base - (unsigned long)&__per_cpu_start; And this is the opposite case: if the compiler generates an absolute address, you're toast. Yes, this is just as unlikely, but hey... Same remedy should apply. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 03/24] arm64: Move MAIR_EL1_SET to asm/memory.h
On 2020-11-09 11:32, David Brazdil wrote: KVM currently initializes MAIR_EL2 to the value of MAIR_EL1. In preparation for initializing MAIR_EL2 before MAIR_EL1, move the constant into a shared header file. Signed-off-by: David Brazdil --- arch/arm64/include/asm/memory.h | 13 + arch/arm64/mm/proc.S| 13 - 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index cd61239bae8c..aca00737e771 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -152,6 +152,19 @@ #define MT_S2_FWB_NORMAL 6 #define MT_S2_FWB_DEVICE_nGnRE 1 +/* + * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and + * changed during __cpu_setup to Normal Tagged if the system supports MTE. + */ +#define MAIR_EL1_SET \ If we are going to use this at EL2 directly, consider renaming it to MAIR_ELx_SET, as we do for other constants that are shared across exception levels. + (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ This creates an implicit dependency between sysreg.h and memory.h. Consider including asm/sysreg.h, assuming this doesn't create any circular dependency, or even move it to sysreg.h altogether. +MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) |\ +MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) |\ +MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ +MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |\ +MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) | \ +MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED)) + #ifdef CONFIG_ARM64_4K_PAGES #define IOREMAP_MAX_ORDER (PUD_SHIFT) #else diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 23c326a06b2d..25ff21b3a1c6 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -45,19 +45,6 @@ #define TCR_KASAN_FLAGS 0 #endif -/* - * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and - * changed during __cpu_setup to Normal Tagged if the system supports MTE. 
- */ -#define MAIR_EL1_SET \ - (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ -MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) |\ -MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) |\ -MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ -MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |\ -MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) | \ -MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED)) - #ifdef CONFIG_CPU_PM /** * cpu_do_suspend - save CPU registers context Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 01/24] psci: Accessor for configured PSCI version
On 2020-11-09 11:32, David Brazdil wrote: The version of PSCI that the kernel should use to communicate with firmware is typically obtained from probing PSCI_VERSION. However, that doesn't work for PSCI v0.1 where the host gets the information from DT/ACPI, or if PSCI is not supported / was disabled. KVM's host PSCI proxy needs to be configured with the same version used by the host driver. Expose the PSCI version used by the host with a read-only accessor. Signed-off-by: David Brazdil --- drivers/firmware/psci/psci.c | 11 +++ include/linux/psci.h | 8 2 files changed, 19 insertions(+) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 00af99b6f97c..bc1b2d60fdbf 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -49,6 +49,13 @@ static int resident_cpu = -1; struct psci_operations psci_ops; static enum arm_smccc_conduit psci_conduit = SMCCC_CONDUIT_NONE; +static int driver_version = PSCI_VERSION(0, 0); + +int psci_driver_version(void) +{ + return driver_version; +} + bool psci_tos_resident_on(int cpu) { return cpu == resident_cpu; @@ -461,6 +468,8 @@ static int __init psci_probe(void) return -EINVAL; } + driver_version = ver; + psci_0_2_set_functions(); psci_init_migrate(); @@ -514,6 +523,8 @@ static int __init psci_0_1_init(struct device_node *np) pr_info("Using PSCI v0.1 Function IDs from DT\n"); + driver_version = PSCI_VERSION(0, 1); + if (!of_property_read_u32(np, "cpu_suspend", &id)) { psci_function_id[PSCI_FN_CPU_SUSPEND] = id; psci_ops.cpu_suspend = psci_cpu_suspend; diff --git a/include/linux/psci.h b/include/linux/psci.h index 2a1bfb890e58..5b5dcf176aa6 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -21,6 +21,14 @@ bool psci_power_state_is_valid(u32 state); int psci_set_osi_mode(bool enable); bool psci_has_osi_support(void); +/** + * The version of the PSCI specification followed by the driver. 
+ * This is equivalent to calling PSCI_VERSION except: + * (a) it also works for PSCI v0.1, which does not support PSCI_VERSION, and + * (b) it is set to v0.0 if the PSCI driver was not initialized. + */ +int psci_driver_version(void); + struct psci_operations { u32 (*get_version)(void); int (*cpu_suspend)(u32 state, unsigned long entry_point); I still maintain that populating .get_version in all cases instead of duplicating an existing functionality is a better outcome. PSCI not supported would be implied by .get_version being NULL. What breaks? M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v2 3/5] gpio: msc313: MStar MSC313 GPIO driver
On 2020-11-10 14:02, Linus Walleij wrote: On Thu, Nov 5, 2020 at 4:43 PM Marc Zyngier wrote: On 2020-11-05 15:23, Daniel Palmer wrote: > On Thu, 5 Nov 2020 at 21:08, Marc Zyngier wrote: > > I see that msc313_gpio_irqchip doesn't have a >> .irq_set_affinity callback. Is this system UP only? > > What is in mainline right now is UP only but there are chips with a > second cortex A7 that I have working in my tree. > So I will add that in for v3 if I can work out what I should actually > do there. :) Probably nothing more than setting the callback to irq_chip_set_affinity_parent, Hm, is this something all GPIO irqchips used on SMP systems should be doing? Or just hierarchical ones? Probably only the hierarchical ones. I'd expect the non-hierarchical GPIOs to be muxed behind a single interrupt, which makes it impossible to move a single GPIO around, and moving the mux interrupt would break userspace's expectations that interrupts move independently of each others. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 00/24] Opt-in always-on nVHE hypervisor
On 2020-11-10 10:15, Christoph Hellwig wrote: On Mon, Nov 09, 2020 at 11:32:09AM +, David Brazdil wrote: As we progress towards being able to keep guest state private to the host running nVHE hypervisor, this series allows the hypervisor to install itself on newly booted CPUs before the host is allowed to run on them. Why? I thought we were trying to kill nVHE off now that newer CPUs provide the saner virtualization extensions? We can't kill nVHE at all, because that is the only game in town. You can't even buy a decent machine with VHE, no matter how much money you put on the table. nVHE is here for the foreseeable future, and we even use its misfeatures to our advantage in order to offer confidential VMs. See Will's presentation at KVM forum a couple of weeks ago for the gory details. Thanks, M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 23/24] kvm: arm64: Trap host SMCs in protected mode.
On 2020-11-09 11:32, David Brazdil wrote: While protected nVHE KVM is installed, start trapping all host SMCs. By default, these are simply forwarded to EL3, but PSCI SMCs are validated first. Create new constant HCR_HOST_NVHE_PROTECTED_FLAGS with the new set of HCR flags to use while the nVHE vector is installed when the kernel was booted with the protected flag enabled. Switch back to the default HCR flags when switching back to the stub vector. Signed-off-by: David Brazdil --- arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/kernel/image-vars.h | 4 arch/arm64/kvm/arm.c | 35 ++ arch/arm64/kvm/hyp/nvhe/hyp-init.S | 8 +++ arch/arm64/kvm/hyp/nvhe/switch.c | 5 - 5 files changed, 52 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 64ce29378467..4e90c2debf70 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -80,6 +80,7 @@ HCR_FMO | HCR_IMO | HCR_PTW ) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) +#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) /* TCR_EL2 Registers bits */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 78a42a7cdb72..75cda51674f4 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -62,9 +62,13 @@ __efistub__ctype = _ctype; */ /* Alternative callbacks for init-time patching of nVHE hyp code. */ +KVM_NVHE_ALIAS(kvm_patch_hcr_flags); KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); +/* Static key enabled when the user opted into nVHE protected mode. */ +KVM_NVHE_ALIAS(kvm_protected_mode); + /* Global kernel state accessed by nVHE hyp code. 
*/ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 574aa2d026e6..c09b95cfa00a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1861,6 +1861,41 @@ void kvm_arch_exit(void) kvm_perf_teardown(); } +static inline u32 __init __gen_mov_hcr_insn(u64 hcr, u32 rd, int i) +{ + int shift = 48 - (i * 16); + u16 imm = (hcr >> shift) & GENMASK(16, 0); I really doubt you want to encode 17 bits. + + return aarch64_insn_gen_movewide(rd, imm, shift, +AARCH64_INSN_VARIANT_64BIT, +(i == 0) ? AARCH64_INSN_MOVEWIDE_ZERO + : AARCH64_INSN_MOVEWIDE_KEEP); +} I've added a generate_mov_q() helper as part of my host EL2 entry rework. We can probably share some stuff here. + +void __init kvm_patch_hcr_flags(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + int i; + u32 rd; + + BUG_ON(nr_inst != 4); + + /* Skip for VHE and unprotected nVHE modes. */ + if (!is_kvm_protected_mode()) + return; + + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, + le32_to_cpu(origptr[0])); + + for (i = 0; i < nr_inst; i++) { + u32 oinsn = __gen_mov_hcr_insn(HCR_HOST_NVHE_FLAGS, rd, i); + u32 insn = __gen_mov_hcr_insn(HCR_HOST_NVHE_PROTECTED_FLAGS, rd, i); + + BUG_ON(oinsn != le32_to_cpu(origptr[i])); + updptr[i] = cpu_to_le32(insn); + } +} + static int __init early_kvm_protected_cfg(char *buf) { bool val; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index f999a35b2c8c..bbe6c5f558e0 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -88,6 +88,12 @@ SYM_CODE_END(__kvm_hyp_init) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START(___kvm_hyp_init) +alternative_cb kvm_patch_hcr_flags + mov_q x1, HCR_HOST_NVHE_FLAGS You really want to be careful here: the mov_q macro expands to 2, 3 or 4 instructions, depending on the input data... It is also odd that you have both a static key and a patching alternative. 
Why isn't "protected KVM" a capability that can be evaluated as a non-patching alternative? In general, I'd like to reserve patching alternatives to values that cannot be evaluated at compile time (VM offsets, for example). +alternative_cb_end + msr hcr_el2, x1 + isb + ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] msr tpidr_el2, x1 @@ -220,6 +226,8 @@ reset: bic x5, x5, x6 // Clear SCTL_M and etc pre_disable_mmu_workaround msr sctlr_el2, x5 + mov_q x5, HCR_HOST_NVHE_FLAGS + msr hcr_el2, x5 isb /* Install stub vectors */ diff --git a/arch/arm64/kvm/hyp/nvhe
Re: [PATCH] regmap: Properly free allocated name for regmap_config of syscon
On 2020-11-10 01:35, Kefeng Wang wrote: On 2020/11/10 1:23, Mark Brown wrote: On Mon, Nov 09, 2020 at 07:58:16PM +0800, Kefeng Wang wrote: syscon_config.name in of_syscon_register is allocated using kasprintf, which should be freed when it is not used after regmap_set_name, fix the following memory leak. unreferenced object 0xffe07fe8c150 (size 16): comm "swapper/0", pid 1, jiffies 4294892540 (age 68.168s) hex dump (first 16 bytes): 74 65 73 74 40 31 30 30 30 30 30 00 e0 ff ff ff test@10. backtrace: [<23d86736>] create_object+0xe8/0x348 [] kmemleak_alloc+0x20/0x2a Please think hard before including complete backtraces in upstream reports, they are very large and contain almost no useful information relative to their size so often obscure the relevant content in your message. If part of the backtrace is usefully illustrative (it often is for search engines if nothing else) then it's usually better to pull out the relevant sections. 2899872b627e "regmap: debugfs: Fix memory leak in regmap_debugfs_init" add a similar backtrack, but the address of the trace is useless, will be careful next time. @@ -601,6 +601,7 @@ static int regmap_set_name(struct regmap *map, const struct regmap_config *confi if (!name) return -ENOMEM; + kfree_const(config->name); kfree_const(map->name); map->name = name; } Why would we free the passed in name here? The name wes passed in from outside regmap in a const configuration struct, we've no idea within regmap if it was dynamically allocted or not and it seems very surprising that we'd go off and free it. The whole reason we're duplicating it in regmap_set_name() is that we don't know how long it's going to be around so we don't want to reference it after having returned to the caller. If the caller has dynamically allocated it then the caller should deal with freeing it. Yes, after check it again, this patch is wrong. 
Hi Marc, the regmap debugfs will duplicate a name in regmap_set_name(), and syscon_config.name won't be used in syscon, so your following patch doesn't seem to be necessary, right ? Please correct me if I'm wrong, thanks. It was certainly necessary at the time when I wrote the patch, as it was fixing some obvious memory corruption (use after free). It is very possible that the flow has been reorganised since, as the following commit hints at: commit e15d7f2b81d2e7d93115d46fa931b366c1cdebc2 Author: Suman Anna Date: Mon Jul 27 16:10:08 2020 -0500 mfd: syscon: Use a unique name with regmap_config The DT node full name is currently being used in regmap_config which in turn is used to create the regmap debugfs directories. This name however is not guaranteed to be unique and the regmap debugfs registration can fail in the cases where the syscon nodes have the same unit-address but are present in different DT node hierarchies. Replace this logic using the syscon reg resource address instead (inspired from logic used while creating platform devices) to ensure a unique name is given for each syscon. Signed-off-by: Suman Anna Reviewed-by: Arnd Bergmann Signed-off-by: Lee Jones I suggest you come up with a more complete analysis of the problem and how it came to be. M. -- Jazz is not dead. It just smells funny...
Re: [PATCH v1 17/24] kvm: arm64: Add __hyp_pa_symbol helper macro
On 2020-11-09 16:59, Quentin Perret wrote: Hey David, On Monday 09 Nov 2020 at 11:32:26 (+), David Brazdil wrote: Add helper macro for computing the PA of a kernel symbol in nVHE hyp code. This will be useful for computing the PA of a PSCI CPU_ON entry point. Signed-off-by: David Brazdil --- arch/arm64/kvm/hyp/nvhe/psci.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/psci.c b/arch/arm64/kvm/hyp/nvhe/psci.c index b0b5df590ba5..7510b9e174e9 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci.c +++ b/arch/arm64/kvm/hyp/nvhe/psci.c @@ -20,6 +20,16 @@ s64 hyp_physvirt_offset; #define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset) +#define __hyp_pa_symbol(sym) \ + ({ \ + extern char sym[]; \ + unsigned long kern_va; \ + \ + asm volatile("ldr %0, =%1" : "=r" (kern_va) \ + : "S" (sym)); \ + kern_va - kimage_voffset; \ + }) + Could this be simplified to __hyp_pa(hyp_symbol_addr(sym))? That would avoid the dependency on kimage_voffset. I'm going to move away from evaluating kimage_voffset at runtime anyway, see [1]. Thanks, M. [1] https://lore.kernel.org/r/20201109175923.445945-1-...@kernel.org -- Jazz is not dead. It just smells funny...
Re: [PATCH -next] irq-chip/gic-v3-its: Fixed an issue where the ITS executes the residual commands in the queue again when the ITS wakes up from sleep mode.
On 2020-11-09 03:05, xuqiang (M) wrote: 在 2020/11/8 0:54, Marc Zyngier 写道: [dropping Jason, whose email address has been bouncing for weeks now] On 2020-11-07 10:42, Xu Qiang wrote: On my platform, ITS_FLAGS_SAVE_SUSPEND_STATE is not set,thus do nothing Which platform? Hisi Ascend platform in its suspend and resume function.On the other hand,firmware stores GITS_CTRL,GITS_CBASER,GITS_CWRITER and GITS_BASER in the suspend, and restores these registers in the resume. As a result, the ITS executes the residual commands in the queue. Which firmware are you using? I just had a look at the trusted firmware source code, and while it definitely does something that *looks* like what you are describing, it doesn't re-enable the ITS on resume. So what are you running? I am using ATF. Since ITS_FLAGS_SAVE_SUSPEND_STATE is not set,ITS driver of OS will not re-enable ITS in the resume. To make ITS work properly, we changed the ATF code to re-enable ITS on resume. I don't think the words "work properly" apply here. The kernel didn't do what you wanted, so instead of fixing the kernel, you introduced a bug that results in memory corruption from the firmware. What are your plans to fix your firmware? Because from an upstream ATF compatibility PoV, all there is to do is to fixup the command queue and enable the ITS. M. -- Jazz is not dead. It just smells funny...