[PATCH kvmtool v2 1/3] vfio-pci: Release INTx's unmask eventfd properly
The PCI device INTx uses event fd 'unmask_fd' to signal the deassertion of the line from guest to host; but this eventfd isn't released properly when disabling INTx. This patch first adds field 'unmask_fd' in struct vfio_pci_device for storing the unmask eventfd and closes it when disabling INTx. Signed-off-by: Leo Yan --- include/kvm/vfio.h | 1 + vfio/pci.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h index 60e6c54..28223cf 100644 --- a/include/kvm/vfio.h +++ b/include/kvm/vfio.h @@ -74,6 +74,7 @@ struct vfio_pci_device { unsigned long irq_modes; int intx_fd; + int unmask_fd; unsigned int intx_gsi; struct vfio_pci_msi_common msi; struct vfio_pci_msi_common msix; diff --git a/vfio/pci.c b/vfio/pci.c index 03de3c1..5224fee 100644 --- a/vfio/pci.c +++ b/vfio/pci.c @@ -1008,6 +1008,7 @@ static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev) irq__del_irqfd(kvm, gsi, pdev->intx_fd); close(pdev->intx_fd); + close(pdev->unmask_fd); } static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev) @@ -1095,6 +1096,7 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev) } pdev->intx_fd = trigger_fd; + pdev->unmask_fd = unmask_fd; /* Guest is going to ovewrite our irq_line... */ pdev->intx_gsi = gsi; -- 2.19.1 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH kvmtool v2 3/3] vfio-pci: Re-enable INTx mode when disable MSI/MSIX
Since PCI forbids enabling INTx, MSI or MSIX at the same time, INTx mode is disabled by default when MSI/MSIX mode is enabled; but this logic is easily broken if the guest PCI driver detects that MSI/MSIX cannot work as expected and tries to roll back to INTx mode. The INTx mode has been disabled and it has no chance to be enabled again, thus neither INTx mode nor MSI/MSIX mode will be enabled in vfio for this case. Below shows the detailed flow for introducing this issue: vfio_pci_configure_dev_irqs() `-> vfio_pci_enable_intx() vfio_pci_enable_msis() `-> vfio_pci_disable_intx() vfio_pci_disable_msis() => Guest PCI driver disables MSI To fix this issue, when disabling MSI/MSIX we need to check whether INTx mode is available for this device or not; if the device can support INTx then we need to re-enable it so the device can fall back to using it. In this patch, note two minor changes: - vfio_pci_disable_intx() may be called multiple times (each time the guest enables one MSI vector). This patch changes to use 'intx_fd == -1' to denote that INTx is disabled; vfio_pci_disable_intx() and vfio_pci_enable_intx() will directly bail out when they detect INTx has already been disabled or enabled respectively. - Since pci_device_header will be corrupted after PCI configuration, all irq related info will be lost. Before re-enabling INTx mode, this patch restores the 'irq_pin' and 'irq_line' fields in struct pci_device_header. 
Signed-off-by: Leo Yan --- vfio/pci.c | 59 -- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/vfio/pci.c b/vfio/pci.c index d025581..ba971eb 100644 --- a/vfio/pci.c +++ b/vfio/pci.c @@ -28,6 +28,7 @@ struct vfio_irq_eventfd { msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY) static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev); +static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev); static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev, bool msix) @@ -50,17 +51,14 @@ static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev, if (!msi_is_enabled(msis->virt_state)) return 0; - if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) { - /* -* PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same -* time. Since INTx has to be enabled from the start (we don't -* have a reliable way to know when the user starts using it), -* disable it now. -*/ + /* +* PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same +* time. Since INTx has to be enabled from the start (after enabling +* 'pdev->intx_fd' will be assigned to an eventfd and doesn't equal +* to the init value -1), disable it now. +*/ + if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) vfio_pci_disable_intx(kvm, vdev); - /* Permanently disable INTx */ - pdev->irq_modes &= ~VFIO_PCI_IRQ_MODE_INTX; - } eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set); @@ -162,7 +160,34 @@ static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev, msi_set_enabled(msis->phys_state, false); msi_set_empty(msis->phys_state, true); - return 0; + /* +* When MSI or MSIX is disabled, this might be called when +* PCI driver detects the MSI interrupt failure and wants to +* rollback to INTx mode. Thus enable INTx if the device +* supports INTx mode in this case. 
+*/ + if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) { + /* +* Struct pci_device_header is not only used for header, +* it also is used for PCI configuration; and in the function +* vfio_pci_cfg_write() it firstly writes configuration space +* and then read back the configuration space data into the +* header structure; thus 'irq_pin' and 'irq_line' in the +* header will be overwritten. +* +* If want to enable INTx mode properly, firstly needs to +* restore 'irq_pin' and 'irq_line' values; we can simply set 1 +* to 'irq_pin', and 'pdev->intx_gsi' keeps gsi value when +* enable INTx mode previously so we can simply use it to +* recover irq line number by adding offset KVM_IRQ_OFFSET. +*/ + pdev->hdr.irq_pin = 1; + pdev->hdr.irq_line = pdev->intx_gsi + KVM_IRQ_OFFSET; + + ret = vfio_pci_enable_intx(kvm, vdev); + } + + return ret >= 0 ? 0 : ret; } static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev, @@ -1002,6 +1027,10 @@ static void vfio_pci_disable_intx(st
[PATCH kvmtool v2 0/3] vfio-pci: Support INTx mode re-enabling
When enable vfio-pci mode for NIC driver on Juno board, the IRQ is failed to forward properly from host to guest, finally root caused this issue is related with kvmtool cannot re-enable INTx mode properly. So the basic working flow to reproduce this issue is as below: Host Guest - INTx mode MSI enable failed in NIC driver MSI disable in NIC driver Switch back to INTx mode --> kvmtool doesn't support So this patch is to support INTx mode re-enabling; 0001/0002 patches are only minor fixing up for eventfd releasing and remove useless FDs reservation for INTx. 0003 patch is the core patch for supporting INTx mode re-enabling, when kvmtool detects MSI is disabled it rollbacks to INTx mode. This patch set has been tested on Juno-r2 board. Leo Yan (3): vfio-pci: Release INTx's unmask eventfd properly vfio-pci: Remove useless FDs reservation in vfio_pci_enable_intx() vfio-pci: Re-enable INTx mode when disable MSI/MSIX include/kvm/vfio.h | 1 + vfio/pci.c | 61 +- 2 files changed, 50 insertions(+), 12 deletions(-) -- 2.19.1 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH kvmtool v2 2/3] vfio-pci: Remove useless FDs reservation in vfio_pci_enable_intx()
Since INTx only uses 2 FDs, it's not particularly useful to reserve FDs in function vfio_pci_enable_intx(); so this patch removes the FDs reservation from this function. Signed-off-by: Leo Yan --- vfio/pci.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/vfio/pci.c b/vfio/pci.c index 5224fee..d025581 100644 --- a/vfio/pci.c +++ b/vfio/pci.c @@ -1025,8 +1025,6 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev) .index = VFIO_PCI_INTX_IRQ_INDEX, }; - vfio_pci_reserve_irq_fds(2); - ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); if (ret || irq_info.count == 0) { vfio_dev_err(vdev, "no INTx reported by VFIO"); -- 2.19.1 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
Re: [PATCH kvmtool v1 2/2] vfio-pci: Fallback to INTx mode when disable MSI/MSIX
Hi Jean-Philippe, On Fri, Mar 15, 2019 at 02:20:07PM +, Jean-Philippe Brucker wrote: > On 15/03/2019 08:33, Leo Yan wrote: > > Since PCI forbids enabling INTx, MSI or MSIX at the same time, it's by > > default to disable INTx mode when enable MSI/MSIX mode; but this logic is > > easily broken if the guest PCI driver detects the MSI/MSIX cannot work as > > expected and tries to rollback to use INTx mode. The INTx mode has been > > disabled and it has no chance to be enabled again, thus both INTx mode > > and MSI/MSIX mode will not be enabled in vfio for this case. > > > > Below shows the detailed flow for introducing this issue: > > > > vfio_pci_configure_dev_irqs() > > `-> vfio_pci_enable_intx() > > > > vfio_pci_enable_msis() > > `-> vfio_pci_disable_intx() > > > > vfio_pci_disable_msis() => Guest PCI driver disables MSI > > > > To fix this issue, when disable MSI/MSIX we need to check if INTx mode > > is available for this device or not; if the device can support INTx then > > we need to re-enable it so the device can fallback to use it. 
> > > > Signed-off-by: Leo Yan > > --- > > vfio/pci.c | 17 - > > 1 file changed, 12 insertions(+), 5 deletions(-) > > > > diff --git a/vfio/pci.c b/vfio/pci.c > > index c0683f6..44727bb 100644 > > --- a/vfio/pci.c > > +++ b/vfio/pci.c > > @@ -28,6 +28,7 @@ struct vfio_irq_eventfd { > > msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY) > > > > static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device > > *vdev); > > +static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev); > > > > static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev, > > bool msix) > > @@ -50,7 +51,7 @@ static int vfio_pci_enable_msis(struct kvm *kvm, struct > > vfio_device *vdev, > > if (!msi_is_enabled(msis->virt_state)) > > return 0; > > > > - if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) { > > + if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) > > /* > > * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same > > * time. Since INTx has to be enabled from the start (we don't > > @@ -58,9 +59,6 @@ static int vfio_pci_enable_msis(struct kvm *kvm, struct > > vfio_device *vdev, > > * disable it now. > > */ > > vfio_pci_disable_intx(kvm, vdev); > > - /* Permanently disable INTx */ > > - pdev->irq_modes &= ~VFIO_PCI_IRQ_MODE_INTX; > > As a result vfio_pci_disable_intx() may be called multiple times (each > time the guest enables one MSI vector). Could you make > vfio_pci_disable_intx() safe against that (maybe use intx_fd == -1 to > denote the INTx state)? Will do this. 
> > - } > > > > eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set); > > > > @@ -162,7 +160,16 @@ static int vfio_pci_disable_msis(struct kvm *kvm, > > struct vfio_device *vdev, > > msi_set_enabled(msis->phys_state, false); > > msi_set_empty(msis->phys_state, true); > > > > - return 0; > > + if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) > > + /* > > +* When MSI or MSIX is disabled, this might be called when > > +* PCI driver detects the MSI interrupt failure and wants to > > +* rollback to INTx mode. Thus enable INTx if the device > > +* supports INTx mode in this case. > > +*/ > > + ret = vfio_pci_enable_intx(kvm, vdev); > > Let's remove vfio_pci_reserve_irq_fds(2) from vfio_pci_enable_intx(), it > should only called once per run, and isn't particularly useful here > since INTx only uses 2 fds. It's used to bump the fd rlimit when a > device needs ~2048 file descriptors for MSI-X. Understand; will do it. Thanks a lot for very detailed suggestions. > > + > > + return ret >= 0 ? 0 : ret; > > } > > > > static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device > > *vdev, > > > ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[kvmarm:5.1-fixes 6/6] arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1154:35: error: 'S2_PUD_MASK' undeclared; did you mean 'S2_PMD_MASK'?
tree: https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git 5.1-fixes head: 4c6b77dd0036d42268cc1e6c5ab5a9e9d98cae65 commit: 4c6b77dd0036d42268cc1e6c5ab5a9e9d98cae65 [6/6] KVM: arm/arm64: Fix handling of stage2 huge mappings config: arm-axm55xx_defconfig (attached as .config) compiler: arm-linux-gnueabi-gcc (Debian 7.2.0-11) 7.2.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross git checkout 4c6b77dd0036d42268cc1e6c5ab5a9e9d98cae65 # save the attached .config to linux build tree GCC_VERSION=7.2.0 make.cross ARCH=arm All errors (new ones prefixed by >>): arch/arm/kvm/../../../virt/kvm/arm/mmu.c: In function 'stage2_set_pud_huge': >> arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1154:35: error: 'S2_PUD_MASK' >> undeclared (first use in this function); did you mean 'S2_PMD_MASK'? unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); ^~~ S2_PMD_MASK arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1154:35: note: each undeclared identifier is reported only once for each function it appears in >> arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1154:48: error: 'S2_PUD_SIZE' >> undeclared (first use in this function); did you mean 'S2_PMD_SIZE'? unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); ^~~ S2_PMD_SIZE In file included from arch/arm/include/asm/bug.h:60:0, from include/linux/bug.h:5, from include/linux/mmdebug.h:5, from include/linux/mm.h:9, from include/linux/mman.h:5, from arch/arm/kvm/../../../virt/kvm/arm/mmu.c:19: >> arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1157:17: error: implicit >> declaration of function 'pud_pfn'; did you mean 'pmd_pfn'? 
>> [-Werror=implicit-function-declaration] WARN_ON_ONCE(pud_pfn(old_pud) != pud_pfn(*new_pudp)); ^ include/asm-generic/bug.h:148:27: note: in definition of macro 'WARN_ON_ONCE' int __ret_warn_once = !!(condition); \ ^ cc1: some warnings being treated as errors vim +1154 arch/arm/kvm/../../../virt/kvm/arm/mmu.c 1128 1129 static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1130 phys_addr_t addr, const pud_t *new_pudp) 1131 { 1132 pud_t *pudp, old_pud; 1133 1134 retry: 1135 pudp = stage2_get_pud(kvm, cache, addr); 1136 VM_BUG_ON(!pudp); 1137 1138 old_pud = *pudp; 1139 1140 /* 1141 * A large number of vcpus faulting on the same stage 2 entry, 1142 * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). 1143 * Skip updating the page tables if there is no change. 1144 */ 1145 if (pud_val(old_pud) == pud_val(*new_pudp)) 1146 return 0; 1147 1148 if (stage2_pud_present(kvm, old_pud)) { 1149 /* 1150 * If we already have table level mapping for this block, unmap 1151 * the range for this block and retry. 1152 */ 1153 if (!stage2_pud_huge(kvm, old_pud)) { > 1154 unmap_stage2_range(kvm, addr & S2_PUD_MASK, > S2_PUD_SIZE); 1155 goto retry; 1156 } else { > 1157 WARN_ON_ONCE(pud_pfn(old_pud) != > pud_pfn(*new_pudp)); 1158 stage2_pud_clear(kvm, pudp); 1159 kvm_tlb_flush_vmid_ipa(kvm, addr); 1160 } 1161 } else { 1162 get_page(virt_to_page(pudp)); 1163 } 1164 1165 kvm_set_pud(pudp, *new_pudp); 1166 return 0; 1167 } 1168 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
Re: [PATCH v5 00/26] KVM: arm64: SVE guest support
On Tue, Mar 19, 2019 at 05:51:51PM +, Dave Martin wrote: [...] Yes, I messed up the subject line in the cover letter :P The prefix should read "[PATCH v6 00/27]" Cheers ---Dave ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 27/27] KVM: arm64/sve: Document KVM API extensions for SVE
This patch adds sections to the KVM API documentation describing the extensions for supporting the Scalable Vector Extension (SVE) in guests. Signed-off-by: Dave Martin --- Changes since v5: * Document KVM_ARM_VCPU_FINALIZE and its interactions with SVE. --- Documentation/virtual/kvm/api.txt | 132 +- 1 file changed, 129 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index cd920dd..68509de 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1873,6 +1873,7 @@ Parameters: struct kvm_one_reg (in) Returns: 0 on success, negative value on failure Errors: ENOENT: no such register + EPERM: register access forbidden for architecture-dependent reasons EINVAL: other errors, such as bad size encoding for a known register struct kvm_one_reg { @@ -2127,13 +2128,20 @@ Specifically: 0x6030 0010 004c SPSR_UND64 spsr[KVM_SPSR_UND] 0x6030 0010 004e SPSR_IRQ64 spsr[KVM_SPSR_IRQ] 0x6060 0010 0050 SPSR_FIQ64 spsr[KVM_SPSR_FIQ] - 0x6040 0010 0054 V0 128 fp_regs.vregs[0] - 0x6040 0010 0058 V1 128 fp_regs.vregs[1] + 0x6040 0010 0054 V0 128 fp_regs.vregs[0](*) + 0x6040 0010 0058 V1 128 fp_regs.vregs[1](*) ... - 0x6040 0010 00d0 V31128 fp_regs.vregs[31] + 0x6040 0010 00d0 V31128 fp_regs.vregs[31] (*) 0x6020 0010 00d4 FPSR32 fp_regs.fpsr 0x6020 0010 00d5 FPCR32 fp_regs.fpcr +(*) These encodings are not accepted for SVE-enabled vcpus. See +KVM_ARM_VCPU_INIT. + +The equivalent register content can be accessed via bits [127:0] of +the corresponding SVE Zn registers instead for vcpus that have SVE +enabled (see below). 
+ arm64 CCSIDR registers are demultiplexed by CSSELR value: 0x6020 0011 00 @@ -2143,6 +2151,61 @@ arm64 system registers have the following id bit patterns: arm64 firmware pseudo-registers have the following bit pattern: 0x6030 0014 +arm64 SVE registers have the following bit patterns: + 0x6080 0015 00 Zn bits[2048*slice + 2047 : 2048*slice] + 0x6050 0015 04 Pn bits[256*slice + 255 : 256*slice] + 0x6050 0015 060 FFR bits[256*slice + 255 : 256*slice] + 0x6060 0015 KVM_REG_ARM64_SVE_VLS pseudo-register + +Access to slices beyond the maximum vector length configured for the +vcpu (i.e., where 16 * slice >= max_vq (**)) will fail with ENOENT. + +These registers are only accessible on vcpus for which SVE is enabled. +See KVM_ARM_VCPU_INIT for details. + +In addition, except for KVM_REG_ARM64_SVE_VLS, these registers are not +accessible until the vcpu's SVE configuration has been finalized +using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). See KVM_ARM_VCPU_INIT +and KVM_ARM_VCPU_FINALIZE for more information about this procedure. + +KVM_REG_ARM64_SVE_VLS is a pseudo-register that allows the set of vector +lengths supported by the vcpu to be discovered and configured by +userspace. When transferred to or from user memory via KVM_GET_ONE_REG +or KVM_SET_ONE_REG, the value of this register is of type __u64[8], and +encodes the set of vector lengths as follows: + +__u64 vector_lengths[8]; + +if (vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX && +((vector_lengths[(vq - 1) / 64] >> ((vq - 1) % 64)) & 1)) + /* Vector length vq * 16 bytes supported */ +else + /* Vector length vq * 16 bytes not supported */ + +(**) The maximum value vq for which the above condition is true is +max_vq. This is the maximum vector length available to the guest on +this vcpu, and determines which register slices are visible through +this ioctl interface. + +(See Documentation/arm64/sve.txt for an explanation of the "vq" +nomenclature.) + +KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT. 
+KVM_ARM_VCPU_INIT initialises it to the best set of vector lengths that +the host supports. + +Userspace may subsequently modify it if desired until the vcpu's SVE +configuration is finalized using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). + +Apart from simply removing all vector lengths from the host set that +exceed some value, support for arbitrarily chosen sets of vector lengths +is hardware-dependent and may not be available. Attempting to configure +an invalid set of vector lengths via KVM_SET_ONE_REG will fail with +EINVAL. + +After the vcpu's SVE configuration is finalized, further attempts to +write this register will fail with EPERM. + MIPS registers are mapped using the lower 32 bits. The upper 16 of that is the register group type: @@ -2197,6 +2260,7 @@ Parameters: struct kvm_one_reg (in and out) Returns: 0 on success, negative value on failure Errors: ENOENT: no such register + EPERM: register access forbidden for architecture-dependent reasons EINVAL: other errors, such as
[PATCH v6 23/27] KVM: arm64/sve: Add pseudo-register for the guest's vector lengths
This patch adds a new pseudo-register KVM_REG_ARM64_SVE_VLS to allow userspace to set and query the set of vector lengths visible to the guest. In the future, multiple register slices per SVE register may be visible through the ioctl interface. Once the set of slices has been determined we would not be able to allow the vector length set to be changed any more, in order to avoid userspace seeing inconsistent sets of registers. For this reason, this patch adds support for explicit finalization of the SVE configuration via the KVM_ARM_VCPU_FINALIZE ioctl. Finalization is the proper place to allocate the SVE register state storage in vcpu->arch.sve_state, so this patch adds that as appropriate. The data is freed via kvm_arch_vcpu_uninit(), which was previously a no-op on arm64. To simplify the logic for determining what vector lengths can be supported, some code is added to KVM init to work this out, in the kvm_arm_init_arch_resources() hook. The KVM_REG_ARM64_SVE_VLS pseudo-register is not exposed yet. Subsequent patches will allow SVE to be turned on for guest vcpus, making it visible. Signed-off-by: Dave Martin --- Changes since v5: * [Julien Thierry] Delete overzealous BUILD_BUG_ON() checks. It also turns out that these could cause kernel build failures in some configurations, even though the checked condition is compile- time constant. Because of the way the affected functions are called, the checks are superfluous, so the simplest option is simply to get rid of them. * [Julien Thierry] Free vcpu->arch.sve_state (if any) in kvm_arch_vcpu_uninit() (which is currently a no-op). This was accidentally lost during a previous rebase. * Add kvm_arm_init_arch_resources() hook, and use it to probe SVE configuration for KVM, to avoid duplicating the logic elsewhere. We only need to do this once. * Move sve_state buffer allocation to kvm_arm_vcpu_finalize(). 
As well as making the code more straightforward, this avoids the need to allocate memory in kvm_reset_vcpu(), the meat of which is non-preemptible since commit 358b28f09f0a ("arm/arm64: KVM: Allow a VCPU to fully reset itself"). The refactoring means that if this has not been done by the time we hit KVM_RUN, then this allocation will happen on the kvm_arm_first_run_init() path, where preemption remains enabled. * Add a couple of comments in {get,set}_sve_reg() to make the handling of the KVM_REG_ARM64_SVE_VLS special case a little clearer. * Move mis-split rework to avoid put_user() being the correct size by accident in KVM_GET_REG_LIST to KVM: arm64: Enumerate SVE register indices for KVM_GET_REG_LIST. * Fix wrong WARN_ON() check sense when checking whether the implementation may needs move SVE register slices than KVM can support. * Fix erroneous setting of vcpu->arch.sve_max_vl based on stale loop control veriable vq. * Move definition of KVM_ARM_VCPU_SVE from KVM: arm64/sve: Allow userspace to enable SVE for vcpus. * Migrate to explicit finalization of the SVE configuration, using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). 
--- arch/arm64/include/asm/kvm_host.h | 15 +++-- arch/arm64/include/uapi/asm/kvm.h | 5 ++ arch/arm64/kvm/guest.c| 114 +- arch/arm64/kvm/reset.c| 89 + 4 files changed, 215 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 98658f7..5475cc4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -23,7 +23,6 @@ #define __ARM64_KVM_HOST_H__ #include -#include #include #include #include @@ -50,6 +49,7 @@ #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS +/* Will be incremented when KVM_ARM_VCPU_SVE is fully implemented: */ #define KVM_VCPU_MAX_FEATURES 4 #define KVM_REQ_SLEEP \ @@ -59,10 +59,12 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); -static inline int kvm_arm_init_arch_resources(void) { return 0; } +extern unsigned int kvm_sve_max_vl; +int kvm_arm_init_arch_resources(void); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); @@ -353,6 +355,7 @@ struct kvm_vcpu_arch { #define KVM_ARM64_HOST_SVE_IN_USE (1 << 3) /* backup for host TIF_SVE */ #define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ #define KVM_ARM64_GUEST_HAS_SVE(1 << 5) /* SVE exposed to guest */ +#define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */ #define vcpu_has_sve(vcpu) (system_supports_sve() && \ ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE)) @@ -525,7 +528,6 @@ static inline bool kvm_arch_requires_vhe(voi
[PATCH v6 24/27] KVM: arm64/sve: Allow userspace to enable SVE for vcpus
Now that all the pieces are in place, this patch offers a new flag KVM_ARM_VCPU_SVE that userspace can pass to KVM_ARM_VCPU_INIT to turn on SVE for the guest, on a per-vcpu basis. As part of this, support for initialisation and reset of the SVE vector length set and registers is added in the appropriate places, as well as finally setting the KVM_ARM64_GUEST_HAS_SVE vcpu flag, to turn on the SVE support code. Allocation of the SVE register storage in vcpu->arch.sve_state is deferred until the SVE configuration is finalized, by which time the size of the registers is known. Setting the vector lengths supported by the vcpu is considered configuration of the emulated hardware rather than runtime configuration, so no support is offered for changing the vector lengths available to an existing vcpu across reset. Signed-off-by: Dave Martin --- Changes since v5: * Refactored to make the code flow clearer and clarify responsiblity for the various initialisation phases/checks. In place of the previous, confusingly dual-purpose kvm_reset_sve(), enabling and resetting of SVE are split into separate functions and called as appropriate from kvm_reset_vcpu(). To avoid interactions with preempt_disable(), memory allocation is done in the kvm_vcpu_first_fun_init() path instead. To achieve this, the SVE memory allocation is moved to kvm_arm_vcpu_finalize(), which now takes on the role of actually doing deferred setup instead of just setting a flag to indicate that the setup was done. * Add has_vhe() sanity-check into kvm_vcpu_enable_sve(), since it makes more sense here than when resetting the vcpu. * When checking for SVE finalization in kvm_reset_vcpu(), call the new SVE-specific function kvm_arm_vcpu_sve_finalized(). The new generic check kvm_arm_vcpu_is_finalized() is unnecessarily broad here: using the appropriate specific check makes the code more self-describing. 
* Definition of KVM_ARM_VCPU_SVE moved to KVM: arm64/sve: Add pseudo- register for the guest's vector lengths (which needs it for the KVM_ARM_VCPU_FINALIZE ioctl). --- arch/arm64/include/asm/kvm_host.h | 3 +-- arch/arm64/kvm/reset.c| 45 ++- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5475cc4..9d57cf8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -49,8 +49,7 @@ #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS -/* Will be incremented when KVM_ARM_VCPU_SVE is fully implemented: */ -#define KVM_VCPU_MAX_FEATURES 4 +#define KVM_VCPU_MAX_FEATURES 5 #define KVM_REQ_SLEEP \ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e7f9c06..4f04dbf 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -20,10 +20,12 @@ */ #include +#include #include #include #include #include +#include #include #include @@ -37,6 +39,7 @@ #include #include #include +#include /* Maximum phys_shift supported for any VM on this host */ static u32 kvm_ipa_limit; @@ -130,6 +133,27 @@ int kvm_arm_init_arch_resources(void) return 0; } +static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) +{ + if (!system_supports_sve()) + return -EINVAL; + + /* Verify that KVM startup enforced this when SVE was detected: */ + if (WARN_ON(!has_vhe())) + return -EINVAL; + + vcpu->arch.sve_max_vl = kvm_sve_max_vl; + + /* +* Userspace can still customize the vector lengths by writing +* KVM_REG_ARM64_SVE_VLS. Allocation is deferred until +* kvm_arm_vcpu_finalize(), which freezes the configuration. +*/ + vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE; + + return 0; +} + /* * Finalize vcpu's maximum SVE vector length, allocating * vcpu->arch.sve_state as necessary. 
@@ -188,13 +212,20 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kfree(vcpu->arch.sve_state); } +static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_sve(vcpu)) + memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); +} + /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer * * This function finds the right table above and sets the registers on * the virtual CPU struct to their architecturally defined reset - * values. + * values, except for registers whose reset is deferred until + * kvm_arm_vcpu_finalize(). * * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT * ioctl or as part of handling a request issued by another VCPU in the PSCI @@ -217,6 +248,18 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) if (loaded) kvm_arch_vcpu_put(vcpu); + if (!kvm_arm_vcpu_sve_finalized(vcpu)) { +
[PATCH v6 22/27] KVM: arm/arm64: Add KVM_ARM_VCPU_FINALIZE ioctl
Some aspects of vcpu configuration may be too complex to be completed inside KVM_ARM_VCPU_INIT. Thus, there may be a requirement for userspace to do some additional configuration before various other ioctls will work in a consistent way. In particular this will be the case for SVE, where userspace will need to negotiate the set of vector lengths to be made available to the guest before the vcpu becomes fully usable. In order to provide an explicit way for userspace to confirm that it has finished setting up a particular vcpu feature, this patch adds a new ioctl KVM_ARM_VCPU_FINALIZE. When userspace has opted into a feature that requires finalization, typically by means of a feature flag passed to KVM_ARM_VCPU_INIT, a matching call to KVM_ARM_VCPU_FINALIZE is now required before KVM_RUN or KVM_GET_REG_LIST is allowed. Individual features may impose additional restrictions where appropriate. No existing vcpu features are affected by this, so current userspace implementations will continue to work exactly as before, with no need to issue KVM_ARM_VCPU_FINALIZE. As implemented in this patch, KVM_ARM_VCPU_FINALIZE is currently a placeholder: no finalizable features exist yet, so ioctl is not required and will always yield EINVAL. Subsequent patches will add the finalization logic to make use of this ioctl for SVE. No functional change for existing userspace. Signed-off-by: Dave Martin --- Changes since v5: * Commit message, including subject line, rewritten. This patch is a rework of "KVM: arm/arm64: Add hook to finalize the vcpu configuration". The old subject line and commit message no longer accurately described what the patch does. However, the code is an evolution of the previous patch rather than a wholesale rewrite. * Added an explicit KVM_ARM_VCPU_FINALIZE ioctl, rather than just providing internal hooks in the kernel to finalize the vcpu configuration implicitly. 
This allows userspace to confirm exactly when it has finished configuring the vcpu and is ready to use it. This results in simpler (and hopefully more maintainable) ioctl ordering rules. --- arch/arm/include/asm/kvm_host.h | 4 arch/arm64/include/asm/kvm_host.h | 4 include/uapi/linux/kvm.h | 3 +++ virt/kvm/arm/arm.c| 18 ++ 4 files changed, 29 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index a49ee01..e80cfc1 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -19,6 +19,7 @@ #ifndef __ARM_KVM_HOST_H__ #define __ARM_KVM_HOST_H__ +#include #include #include #include @@ -411,4 +412,7 @@ static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) return 0; } +#define kvm_arm_vcpu_finalize(vcpu, what) (-EINVAL) +#define kvm_arm_vcpu_is_finalized(vcpu) true + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3e89509..98658f7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -23,6 +23,7 @@ #define __ARM64_KVM_HOST_H__ #include +#include #include #include #include @@ -625,4 +626,7 @@ void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); +#define kvm_arm_vcpu_finalize(vcpu, what) (-EINVAL) +#define kvm_arm_vcpu_is_finalized(vcpu) true + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dc77a5a..c3b8e7a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1441,6 +1441,9 @@ struct kvm_enc_region { /* Available with KVM_CAP_HYPERV_CPUID */ #define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2) +/* Available with KVM_CAP_ARM_SVE */ +#define KVM_ARM_VCPU_FINALIZE_IOW(KVMIO, 0xc2, int) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ diff --git a/virt/kvm/arm/arm.c 
b/virt/kvm/arm/arm.c index c69e137..9edbf0f 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -545,6 +545,9 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) if (likely(vcpu->arch.has_run_once)) return 0; + if (!kvm_arm_vcpu_is_finalized(vcpu)) + return -EPERM; + vcpu->arch.has_run_once = true; if (likely(irqchip_in_kernel(kvm))) { @@ -1116,6 +1119,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (unlikely(!kvm_vcpu_initialized(vcpu))) break; + r = -EPERM; + if (!kvm_arm_vcpu_is_finalized(vcpu)) + break; + r = -EFAULT; if (copy_from_user(®_list, user_list, sizeof(reg_list))) break; @@ -1169,6 +1176,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return kvm_arm_vcpu_set_events(vc
[PATCH v6 25/27] KVM: arm64: Add a capability to advertise SVE support
To provide a uniform way to check for KVM SVE support amongst other features, this patch adds a suitable capability KVM_CAP_ARM_SVE, and reports it as present when SVE is available. Signed-off-by: Dave Martin --- Changes since v5: * [Julien Thierry] Strip out has_vhe() sanity-check, which wasn't in the most logical place, and anyway doesn't really belong in this patch. Moved to KVM: arm64/sve: Allow userspace to enable SVE for vcpus instead. --- arch/arm64/kvm/reset.c | 3 +++ include/uapi/linux/kvm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 4f04dbf..180d7a5 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -98,6 +98,9 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_VM_IPA_SIZE: r = kvm_ipa_limit; break; + case KVM_CAP_ARM_SVE: + r = system_supports_sve(); + break; default: r = 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c3b8e7a..1d56444 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_VM_IPA_SIZE 165 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 #define KVM_CAP_HYPERV_CPUID 167 +#define KVM_CAP_ARM_SVE 168 #ifdef KVM_CAP_IRQ_ROUTING -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 21/27] KVM: arm/arm64: Add hook for arch-specific KVM initialisation
This patch adds a kvm_arm_init_arch_resources() hook to perform subarch-specific initialisation when starting up KVM. This will be used in a subsequent patch for global SVE-related setup on arm64. No functional change. Signed-off-by: Dave Martin --- arch/arm/include/asm/kvm_host.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 2 ++ virt/kvm/arm/arm.c| 4 3 files changed, 8 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 770d732..a49ee01 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -53,6 +53,8 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); +static inline int kvm_arm_init_arch_resources(void) { return 0; } + u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 205438a..3e89509 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -58,6 +58,8 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); +static inline int kvm_arm_init_arch_resources(void) { return 0; } + int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 99c3738..c69e137 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1664,6 +1664,10 @@ int kvm_arch_init(void *opaque) if (err) return err; + err = kvm_arm_init_arch_resources(); + if (err) + return err; + if (!in_hyp_mode) { err = init_hyp_mode(); if (err) -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 26/27] KVM: Document errors for KVM_GET_ONE_REG and KVM_SET_ONE_REG
KVM_GET_ONE_REG and KVM_SET_ONE_REG return some error codes that are not documented (but hopefully not surprising either). To give an indication of what these may mean, this patch adds brief documentation. Signed-off-by: Dave Martin --- Documentation/virtual/kvm/api.txt | 6 ++ 1 file changed, 6 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 2d4f7ce..cd920dd 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1871,6 +1871,9 @@ Architectures: all Type: vcpu ioctl Parameters: struct kvm_one_reg (in) Returns: 0 on success, negative value on failure +Errors: + ENOENT: no such register + EINVAL: other errors, such as bad size encoding for a known register struct kvm_one_reg { __u64 id; @@ -2192,6 +2195,9 @@ Architectures: all Type: vcpu ioctl Parameters: struct kvm_one_reg (in and out) Returns: 0 on success, negative value on failure +Errors: + ENOENT: no such register + EINVAL: other errors, such as bad size encoding for a known register This ioctl allows to receive the value of a single register implemented in a vcpu. The register to read is indicated by the "id" field of the -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 20/27] arm64/sve: In-kernel vector length availability query interface
KVM will need to interrogate the set of SVE vector lengths available on the system. This patch exposes the relevant bits to the kernel, along with a sve_vq_available() helper to check whether a particular vector length is supported. __vq_to_bit() and __bit_to_vq() are not intended for use outside these functions: now that these are exposed outside fpsimd.c, they are prefixed with __ in order to provide an extra hint that they are not intended for general-purpose use. Signed-off-by: Dave Martin Reviewed-by: Alex Bennée --- arch/arm64/include/asm/fpsimd.h | 29 + arch/arm64/kernel/fpsimd.c | 35 --- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index df7a143..ad6d2e4 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -24,10 +24,13 @@ #ifndef __ASSEMBLY__ +#include #include +#include #include #include #include +#include #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* Masks for extracting the FPSR and FPCR from the FPSCR */ @@ -89,6 +92,32 @@ extern u64 read_zcr_features(void); extern int __ro_after_init sve_max_vl; extern int __ro_after_init sve_max_virtualisable_vl; +/* Set of available vector lengths, as vq_to_bit(vq): */ +extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); + +/* + * Helpers to translate bit indices in sve_vq_map to VQ values (and + * vice versa). This allows find_next_bit() to be used to find the + * _maximum_ VQ not exceeding a certain value. 
+ */ +static inline unsigned int __vq_to_bit(unsigned int vq) +{ + return SVE_VQ_MAX - vq; +} + +static inline unsigned int __bit_to_vq(unsigned int bit) +{ + if (WARN_ON(bit >= SVE_VQ_MAX)) + bit = SVE_VQ_MAX - 1; + + return SVE_VQ_MAX - bit; +} + +/* Ensure vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX before calling this function */ +static inline bool sve_vq_available(unsigned int vq) +{ + return test_bit(__vq_to_bit(vq), sve_vq_map); +} #ifdef CONFIG_ARM64_SVE diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 8a93afa..577296b 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -136,7 +136,7 @@ static int sve_default_vl = -1; int __ro_after_init sve_max_vl = SVE_VL_MIN; int __ro_after_init sve_max_virtualisable_vl = SVE_VL_MIN; /* Set of available vector lengths, as vq_to_bit(vq): */ -static __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); +__ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); /* Set of vector lengths present on at least one cpu: */ static __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX); static void __percpu *efi_sve_state; @@ -270,25 +270,6 @@ void fpsimd_save(void) } /* - * Helpers to translate bit indices in sve_vq_map to VQ values (and - * vice versa). This allows find_next_bit() to be used to find the - * _maximum_ VQ not exceeding a certain value. - */ - -static unsigned int vq_to_bit(unsigned int vq) -{ - return SVE_VQ_MAX - vq; -} - -static unsigned int bit_to_vq(unsigned int bit) -{ - if (WARN_ON(bit >= SVE_VQ_MAX)) - bit = SVE_VQ_MAX - 1; - - return SVE_VQ_MAX - bit; -} - -/* * All vector length selection from userspace comes through here. * We're on a slow path, so some sanity-checks are included. 
* If things go wrong there's a bug somewhere, but try to fall back to a @@ -309,8 +290,8 @@ static unsigned int find_supported_vector_length(unsigned int vl) vl = max_vl; bit = find_next_bit(sve_vq_map, SVE_VQ_MAX, - vq_to_bit(sve_vq_from_vl(vl))); - return sve_vl_from_vq(bit_to_vq(bit)); + __vq_to_bit(sve_vq_from_vl(vl))); + return sve_vl_from_vq(__bit_to_vq(bit)); } #ifdef CONFIG_SYSCTL @@ -648,7 +629,7 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1); /* self-syncing */ vl = sve_get_vl(); vq = sve_vq_from_vl(vl); /* skip intervening lengths */ - set_bit(vq_to_bit(vq), map); + set_bit(__vq_to_bit(vq), map); } } @@ -717,7 +698,7 @@ int sve_verify_vq_map(void) * Mismatches above sve_max_virtualisable_vl are fine, since * no guest is allowed to configure ZCR_EL2.LEN to exceed this: */ - if (sve_vl_from_vq(bit_to_vq(b)) <= sve_max_virtualisable_vl) { + if (sve_vl_from_vq(__bit_to_vq(b)) <= sve_max_virtualisable_vl) { pr_warn("SVE: cpu%d: Unsupported vector length(s) present\n", smp_processor_id()); return -EINVAL; @@ -801,8 +782,8 @@ void __init sve_setup(void) * so sve_vq_map must have at least SVE_VQ_MIN set. * If something went wrong, at least try to patch it up: */ - if (WARN_ON(!tes
[PATCH v6 17/27] KVM: arm64: Reject ioctl access to FPSIMD V-regs on SVE vcpus
In order to avoid the pointless complexity of maintaining two ioctl register access views of the same data, this patch blocks ioctl access to the FPSIMD V-registers on vcpus that support SVE. This will make it more straightforward to add SVE register access support. Since SVE is an opt-in feature for userspace, this will not affect existing users. Signed-off-by: Dave Martin --- (Julien Thierry's Reviewed-by dropped due to non-trivial refactoring) Changes since v5: * Refactored to cope with the removal of core_reg_size_from_offset() (which was added by another series which will now be handled independently). This leaves some duplication in that we still filter the V-regs out in two places, but this no worse than other existing code in guest.c. I plan to tidy this up independently later on. --- arch/arm64/kvm/guest.c | 48 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index a391a61..756d0d6 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -54,12 +54,19 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +static bool core_reg_offset_is_vreg(u64 off) +{ + return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) && + off < KVM_REG_ARM_CORE_REG(fp_regs.fpsr); +} + static u64 core_reg_offset_from_id(u64 id) { return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); } -static int validate_core_offset(const struct kvm_one_reg *reg) +static int validate_core_offset(const struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) { u64 off = core_reg_offset_from_id(reg->id); int size; @@ -91,11 +98,19 @@ static int validate_core_offset(const struct kvm_one_reg *reg) return -EINVAL; } - if (KVM_REG_SIZE(reg->id) == size && - IS_ALIGNED(off, size / sizeof(__u32))) - return 0; + if (KVM_REG_SIZE(reg->id) != size || + !IS_ALIGNED(off, size / sizeof(__u32))) + return -EINVAL; - return -EINVAL; + /* +* The KVM_REG_ARM64_SVE regs must be used instead of +* KVM_REG_ARM_CORE for 
accessing the FPSIMD V-registers on +* SVE-enabled vcpus: +*/ + if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(off)) + return -EINVAL; + + return 0; } static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) @@ -117,7 +132,7 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; - if (validate_core_offset(reg)) + if (validate_core_offset(vcpu, reg)) return -EINVAL; if (copy_to_user(uaddr, ((u32 *)regs) + off, KVM_REG_SIZE(reg->id))) @@ -142,7 +157,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; - if (validate_core_offset(reg)) + if (validate_core_offset(vcpu, reg)) return -EINVAL; if (KVM_REG_SIZE(reg->id) > sizeof(tmp)) @@ -195,13 +210,22 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return -EINVAL; } -static int kvm_arm_copy_core_reg_indices(u64 __user *uindices) +static int copy_core_reg_indices(const struct kvm_vcpu *vcpu, +u64 __user *uindices) { unsigned int i; int n = 0; const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE; for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { + /* +* The KVM_REG_ARM64_SVE regs must be used instead of +* KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on +* SVE-enabled vcpus: +*/ + if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(i)) + continue; + if (uindices) { if (put_user(core_reg | i, uindices)) return -EFAULT; @@ -214,9 +238,9 @@ static int kvm_arm_copy_core_reg_indices(u64 __user *uindices) return n; } -static unsigned long num_core_regs(void) +static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) { - return kvm_arm_copy_core_reg_indices(NULL); + return copy_core_reg_indices(vcpu, NULL); } /** @@ -281,7 +305,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) { unsigned long res = 0; - res += 
num_core_regs(); + res += num_core_regs(vcpu); res += kvm_arm_num_sys_reg_descs(vcpu); res += kvm_arm_get_fw_num_regs(vcpu); res += NUM_TIMER_REGS; @@ -298,7 +322,7 @@ int kvm_arm_copy_reg_indices(struc
[PATCH v6 13/27] KVM: arm64/sve: Context switch the SVE registers
In order to give each vcpu its own view of the SVE registers, this patch adds context storage via a new sve_state pointer in struct vcpu_arch. An additional member sve_max_vl is also added for each vcpu, to determine the maximum vector length visible to the guest and thus the value to be configured in ZCR_EL2.LEN while the vcpu is active. This also determines the layout and size of the storage in sve_state, which is read and written by the same backend functions that are used for context-switching the SVE state for host tasks. On SVE-enabled vcpus, SVE access traps are now handled by switching in the vcpu's SVE context and disabling the trap before returning to the guest. On other vcpus, the trap is not handled and an exit back to the host occurs, where the handle_sve() fallback path reflects an undefined instruction exception back to the guest, consistently with the behaviour of non-SVE-capable hardware (as was done unconditionally prior to this patch). No SVE handling is added on non-VHE-only paths, since VHE is an architectural and Kconfig prerequisite of SVE. Signed-off-by: Dave Martin --- Changes since v5: * [Julien Thierry, Julien Grall] Commit message typo fixes * [Mark Rutland] Rename trap_class to hsr_ec, for consistency with existing code. * [Mark Rutland] Simplify condition for refusing to handle an FPSIMD/SVE trap, using multiple if () statements for clarity. The previous condition was a bit tortuous, and how that the static_key checks have been hoisted out, it makes little difference to the compiler how we express the condition here. 
--- arch/arm64/include/asm/kvm_host.h | 6 arch/arm64/kvm/fpsimd.c | 5 +-- arch/arm64/kvm/hyp/switch.c | 75 +-- 3 files changed, 66 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 22cf484..4fabfd2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -228,6 +228,8 @@ struct vcpu_reset_state { struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; + void *sve_state; + unsigned int sve_max_vl; /* HYP configuration */ u64 hcr_el2; @@ -323,6 +325,10 @@ struct kvm_vcpu_arch { bool sysregs_loaded_on_cpu; }; +/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ +#define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \ + sve_ffr_offset((vcpu)->arch.sve_max_vl))) + /* vcpu_arch flags field values: */ #define KVM_ARM64_DEBUG_DIRTY (1 << 0) #define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */ diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 7053bf4..6e3c9c8 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -87,10 +87,11 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.gp_regs.fp_regs, -NULL, SVE_VL_MIN); +vcpu->arch.sve_state, +vcpu->arch.sve_max_vl); clear_thread_flag(TIF_FOREIGN_FPSTATE); - clear_thread_flag(TIF_SVE); + update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu)); } } diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 9d46066..5444b9c 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -100,7 +100,10 @@ static void activate_traps_vhe(struct kvm_vcpu *vcpu) val = read_sysreg(cpacr_el1); val |= CPACR_EL1_TTA; val &= ~CPACR_EL1_ZEN; - if (!update_fp_enabled(vcpu)) { + if (update_fp_enabled(vcpu)) { + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } else { val &= ~CPACR_EL1_FPEN; __activate_traps_fpsimd32(vcpu); } @@ -317,16 +320,48 @@ static 
bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } -static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) +/* Check for an FPSIMD/SVE trap and handle as appropriate */ +static bool __hyp_text __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) { - struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state; + bool vhe, sve_guest, sve_host; + u8 hsr_ec; - if (has_vhe()) - write_sysreg(read_sysreg(cpacr_el1) | CPACR_EL1_FPEN, -cpacr_el1); - else + if (!system_supports_fpsimd()) + return false; + + if (system_supports_sve()) { + sve_guest = vcpu_has_sve(vcpu); + sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE; + vhe = true; + } else { + sve_guest = false; + sve_
[PATCH v6 18/27] KVM: arm64/sve: Add SVE support to register access ioctl interface
This patch adds the following registers for access via the KVM_{GET,SET}_ONE_REG interface: * KVM_REG_ARM64_SVE_ZREG(n, i) (n = 0..31) (in 2048-bit slices) * KVM_REG_ARM64_SVE_PREG(n, i) (n = 0..15) (in 256-bit slices) * KVM_REG_ARM64_SVE_FFR(i) (in 256-bit slices) In order to adapt gracefully to future architectural extensions, the registers are logically divided up into slices as noted above: the i parameter denotes the slice index. This allows us to reserve space in the ABI for future expansion of these registers. However, as of today the architecture does not permit registers to be larger than a single slice, so no code is needed in the kernel to expose additional slices, for now. The code can be extended later as needed to expose them up to a maximum of 32 slices (as carved out in the architecture itself) if they really exist someday. The registers are only visible for vcpus that have SVE enabled. They are not enumerated by KVM_GET_REG_LIST on vcpus that do not have SVE. Accesses to the FPSIMD registers via KVM_REG_ARM_CORE is not allowed for SVE-enabled vcpus: SVE-aware userspace can use the KVM_REG_ARM64_SVE_ZREG() interface instead to access the same register state. This avoids some complex and pointless emulation in the kernel to convert between the two views of these aliased registers. Signed-off-by: Dave Martin --- Changes since v5: * [Julien Thierry] rename sve_reg_region() to sve_reg_to_region() to make its purpose a bit clearer. * [Julien Thierry] rename struct sve_state_region to sve_state_reg_region to make it clearer this this struct only describes the bounds of (part of) a single register within sve_state. * [Julien Thierry] Add a comment to clarify the purpose of struct sve_state_reg_region. 
--- arch/arm64/include/asm/kvm_host.h | 14 arch/arm64/include/uapi/asm/kvm.h | 17 + arch/arm64/kvm/guest.c| 139 ++ 3 files changed, 158 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4fabfd2..205438a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -329,6 +329,20 @@ struct kvm_vcpu_arch { #define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \ sve_ffr_offset((vcpu)->arch.sve_max_vl))) +#define vcpu_sve_state_size(vcpu) ({ \ + size_t __size_ret; \ + unsigned int __vcpu_vq; \ + \ + if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) { \ + __size_ret = 0; \ + } else {\ + __vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl);\ + __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq); \ + } \ + \ + __size_ret; \ +}) + /* vcpu_arch flags field values: */ #define KVM_ARM64_DEBUG_DIRTY (1 << 0) #define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 97c3478..ced760c 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -226,6 +226,23 @@ struct kvm_vcpu_events { KVM_REG_ARM_FW | ((r) & 0x)) #define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0) +/* SVE registers */ +#define KVM_REG_ARM64_SVE (0x15 << KVM_REG_ARM_COPROC_SHIFT) + +/* Z- and P-regs occupy blocks at the following offsets within this range: */ +#define KVM_REG_ARM64_SVE_ZREG_BASE0 +#define KVM_REG_ARM64_SVE_PREG_BASE0x400 + +#define KVM_REG_ARM64_SVE_ZREG(n, i) (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | \ +KVM_REG_ARM64_SVE_ZREG_BASE | \ +KVM_REG_SIZE_U2048 | \ +((n) << 5) | (i)) +#define KVM_REG_ARM64_SVE_PREG(n, i) (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | \ +KVM_REG_ARM64_SVE_PREG_BASE | \ +KVM_REG_SIZE_U256 |\ +((n) << 5) | (i)) +#define KVM_REG_ARM64_SVE_FFR(i) KVM_REG_ARM64_SVE_PREG(16, i) + /* Device Control API: ARM VGIC */ #define KVM_DEV_ARM_VGIC_GRP_ADDR 0 #define 
KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 756d0d6..736d8cb 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/
[PATCH v6 12/27] KVM: arm64/sve: System register context switch and access support
This patch adds the necessary support for context switching ZCR_EL1 for each vcpu. ZCR_EL1 is trapped alongside the FPSIMD/SVE registers, so it makes sense for it to be handled as part of the guest FPSIMD/SVE context for context switch purposes instead of handling it as a general system register. This means that it can be switched in lazily at the appropriate time. No effort is made to track host context for this register, since SVE requires VHE: thus the hosts's value for this register lives permanently in ZCR_EL2 and does not alias the guest's value at any time. The Hyp switch and fpsimd context handling code is extended appropriately. Accessors are added in sys_regs.c to expose the SVE system registers and ID register fields. Because these need to be conditionally visible based on the guest configuration, they are implemented separately for now rather than by use of the generic system register helpers. This may be abstracted better later on when/if there are more features requiring this model. ID_AA64ZFR0_EL1 is RO-RAZ for MRS/MSR when SVE is disabled for the guest, but for compatibility with non-SVE aware KVM implementations the register should not be enumerated at all for KVM_GET_REG_LIST in this case. For consistency we also reject ioctl access to the register. This ensures that a non-SVE-enabled guest looks the same to userspace, irrespective of whether the kernel KVM implementation supports SVE. Signed-off-by: Dave Martin --- Changes since v5: * Port to the renamed visibility() framework. * Swap visiblity() helpers so that they appear by the relevant accessor functions. * [Julien Grall] With the visibility() checks, {get,set}_zcr_el1() degenerate to doing exactly what the common code does, so drop them. The ID_AA64ZFR0_EL1 handlers are still needed to provide contitional RAZ behaviour. This could be moved to the common code too, but since this is a one-off case I don't do this for now. We can address this later if other regs need to follow the same pattern. 
* [Julien Thierry] Reset ZCR_EL1 to a fixed value using reset_val instead of relying on reset_unknown() honouring set bits in val as RES0. Most of the bits in ZCR_EL1 are RES0 anyway, and many implementations of SVE will support larger vectors than 128 bits, so 0 seems as good a value as any to expose to guests that forget to initialise this register properly.
*/ #define SCTLR_ELx_DSSBS(_BITUL(44)) #define SCTLR_ELx_ENIA (_BITUL(31)) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 1cf4f02..7053bf4 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -103,14 +103,21 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) { unsigned long flags; + bool host_has_sve = system_supports_sve(); + bool guest_has_sve = vcpu_has_sve(vcpu); local_irq_save(flags); if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { + u64 *guest_zcr = &vcpu->arch.ctxt.sys_regs[ZCR_EL1]; + /* Clean guest FP state to memory and invalidate cpu view */ fpsimd_save(); fpsimd_flush_cpu_state(); - } else if (system_supports_sve()) { + + if (guest_has_sve) + *guest_zcr = read_sysreg_s(SYS_ZCR_EL12); + } else if (host_has_sve) { /* * The FPSIMD/SVE state in the CPU has not been touched, and we * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2)
[PATCH v6 16/27] KVM: arm64: Factor out core register ID enumeration
In preparation for adding logic to filter out some KVM_REG_ARM_CORE registers from the KVM_GET_REG_LIST output, this patch factors out the core register enumeration into a separate function and rebuilds num_core_regs() on top of it. This may be a little more expensive (depending on how good a job the compiler does of specialising the code), but KVM_GET_REG_LIST is not a hot path. This will make it easier to consolidate ID filtering code in one place. No functional change. Signed-off-by: Dave Martin --- Changes since v5: * New patch. This reimplements part of the separately-posted patch "KVM: arm64: Factor out KVM_GET_REG_LIST core register enumeration", minus aspects that potentially break the ABI. As a result, the opportunity to truly consolidate all the ID reg filtering in one place is deliberately left on the floor, for now. This will be addressed in a separate series later on. --- arch/arm64/kvm/guest.c | 33 + 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 3e38eb2..a391a61 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -194,9 +195,28 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return -EINVAL; } +static int kvm_arm_copy_core_reg_indices(u64 __user *uindices) +{ + unsigned int i; + int n = 0; + const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE; + + for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { + if (uindices) { + if (put_user(core_reg | i, uindices)) + return -EFAULT; + uindices++; + } + + n++; + } + + return n; +} + static unsigned long num_core_regs(void) { - return sizeof(struct kvm_regs) / sizeof(__u32); + return kvm_arm_copy_core_reg_indices(NULL); } /** @@ -276,15 +296,12 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - 
unsigned int i; - const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE; int ret; - for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { - if (put_user(core_reg | i, uindices)) - return -EFAULT; - uindices++; - } + ret = kvm_arm_copy_core_reg_indices(uindices); + if (ret) + return ret; + uindices += ret; ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); if (ret) -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 19/27] KVM: arm64: Enumerate SVE register indices for KVM_GET_REG_LIST
This patch includes the SVE register IDs in the list returned by KVM_GET_REG_LIST, as appropriate. On a non-SVE-enabled vcpu, no new IDs are added. On an SVE-enabled vcpu, IDs for the FPSIMD V-registers are removed from the list, since userspace is required to access the Z- registers instead in order to access the V-register content. For the variably-sized SVE registers, the appropriate set of slice IDs are enumerated, depending on the maximum vector length for the vcpu. As it currently stands, the SVE architecture never requires more than one slice to exist per register, so this patch adds no explicit support for enumerating multiple slices. The code can be extended straightforwardly to support this in the future, if needed. Signed-off-by: Dave Martin --- Changes since v5: (Dropped Julien Thierry's Reviewed-by due to non-trivial rebasing) * Move mis-split reword to prevent put_user()s being accidentally the correct size from KVM: arm64/sve: Add pseudo-register for the guest's vector lengths. --- arch/arm64/kvm/guest.c | 56 ++ 1 file changed, 56 insertions(+) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 736d8cb..585c31e5 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -411,6 +411,56 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? 
-EFAULT : 0; } +static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) +{ + /* Only the first slice ever exists, for now */ + const unsigned int slices = 1; + + if (!vcpu_has_sve(vcpu)) + return 0; + + return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */); +} + +static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + /* Only the first slice ever exists, for now */ + const unsigned int slices = 1; + u64 reg; + unsigned int i, n; + int num_regs = 0; + + if (!vcpu_has_sve(vcpu)) + return 0; + + for (i = 0; i < slices; i++) { + for (n = 0; n < SVE_NUM_ZREGS; n++) { + reg = KVM_REG_ARM64_SVE_ZREG(n, i); + if (put_user(reg, uindices++)) + return -EFAULT; + + num_regs++; + } + + for (n = 0; n < SVE_NUM_PREGS; n++) { + reg = KVM_REG_ARM64_SVE_PREG(n, i); + if (put_user(reg, uindices++)) + return -EFAULT; + + num_regs++; + } + + reg = KVM_REG_ARM64_SVE_FFR(i); + if (put_user(reg, uindices++)) + return -EFAULT; + + num_regs++; + } + + return num_regs; +} + /** * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG * @@ -421,6 +471,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) unsigned long res = 0; res += num_core_regs(vcpu); + res += num_sve_regs(vcpu); res += kvm_arm_num_sys_reg_descs(vcpu); res += kvm_arm_get_fw_num_regs(vcpu); res += NUM_TIMER_REGS; @@ -442,6 +493,11 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return ret; uindices += ret; + ret = copy_sve_reg_indices(vcpu, uindices); + if (ret) + return ret; + uindices += ret; + ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); if (ret) return ret; -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 09/27] KVM: arm64: Add a vcpu flag to control SVE visibility for the guest
Since SVE will be enabled or disabled on a per-vcpu basis, a flag is needed in order to track which vcpus have it enabled. This patch adds a suitable flag and a helper for checking it. Signed-off-by: Dave Martin Reviewed-by: Alex Bennée --- arch/arm64/include/asm/kvm_host.h | 4 1 file changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 6d10100..ad4f7f0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -328,6 +328,10 @@ struct kvm_vcpu_arch { #define KVM_ARM64_FP_HOST (1 << 2) /* host FP regs loaded */ #define KVM_ARM64_HOST_SVE_IN_USE (1 << 3) /* backup for host TIF_SVE */ #define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ +#define KVM_ARM64_GUEST_HAS_SVE(1 << 5) /* SVE exposed to guest */ + +#define vcpu_has_sve(vcpu) (system_supports_sve() && \ + ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE)) #define vcpu_gp_regs(v)(&(v)->arch.ctxt.gp_regs) -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 08/27] arm64/sve: Enable SVE state tracking for non-task contexts
The current FPSIMD/SVE context handling support for non-task (i.e., KVM vcpu) contexts does not take SVE into account. This means that only task contexts can safely use SVE at present. In preparation for enabling KVM guests to use SVE, it is necessary to keep track of SVE state for non-task contexts too. This patch adds the necessary support, removing assumptions from the context switch code about the location of the SVE context storage. When binding a vcpu context, its vector length is arbitrarily specified as SVE_VL_MIN for now. In any case, because TIF_SVE is presently cleared at vcpu context bind time, the specified vector length will not be used for anything yet. In later patches TIF_SVE will be set here as appropriate, and the appropriate maximum vector length for the vcpu will be passed when binding. Signed-off-by: Dave Martin Reviewed-by: Alex Bennée Reviewed-by: Julien Grall --- arch/arm64/include/asm/fpsimd.h | 3 ++- arch/arm64/kernel/fpsimd.c | 20 +++- arch/arm64/kvm/fpsimd.c | 5 - 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 964adc9..df7a143 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -56,7 +56,8 @@ extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); extern void fpsimd_bind_task_to_cpu(void); -extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state); +extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, +void *sve_state, unsigned int sve_vl); extern void fpsimd_flush_task_state(struct task_struct *target); extern void fpsimd_flush_cpu_state(void); diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index b219796a..8a93afa 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -121,6 +121,8 @@ */ struct fpsimd_last_state_struct { struct user_fpsimd_state *st; + void *sve_state; + 
unsigned int sve_vl; }; static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); @@ -241,14 +243,15 @@ static void task_fpsimd_load(void) */ void fpsimd_save(void) { - struct user_fpsimd_state *st = __this_cpu_read(fpsimd_last_state.st); + struct fpsimd_last_state_struct const *last = + this_cpu_ptr(&fpsimd_last_state); /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ WARN_ON(!in_softirq() && !irqs_disabled()); if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { if (system_supports_sve() && test_thread_flag(TIF_SVE)) { - if (WARN_ON(sve_get_vl() != current->thread.sve_vl)) { + if (WARN_ON(sve_get_vl() != last->sve_vl)) { /* * Can't save the user regs, so current would * re-enter user with corrupt state. @@ -258,9 +261,11 @@ void fpsimd_save(void) return; } - sve_save_state(sve_pffr(¤t->thread), &st->fpsr); + sve_save_state((char *)last->sve_state + + sve_ffr_offset(last->sve_vl), + &last->st->fpsr); } else - fpsimd_save_state(st); + fpsimd_save_state(last->st); } } @@ -1034,6 +1039,8 @@ void fpsimd_bind_task_to_cpu(void) this_cpu_ptr(&fpsimd_last_state); last->st = ¤t->thread.uw.fpsimd_state; + last->sve_state = current->thread.sve_state; + last->sve_vl = current->thread.sve_vl; current->thread.fpsimd_cpu = smp_processor_id(); if (system_supports_sve()) { @@ -1047,7 +1054,8 @@ void fpsimd_bind_task_to_cpu(void) } } -void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st) +void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, + unsigned int sve_vl) { struct fpsimd_last_state_struct *last = this_cpu_ptr(&fpsimd_last_state); @@ -1055,6 +1063,8 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st) WARN_ON(!in_softirq() && !irqs_disabled()); last->st = st; + last->sve_state = sve_state; + last->sve_vl = sve_vl; } /* diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index aac7808..1cf4f02 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -9,6 +9,7 @@ #include #include 
#include +#include #include #include #include @@ -85,7 +86,9 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) WARN_ON_ONCE(!irqs_disabled()); if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { - fpsimd_bind_stat
[PATCH v6 14/27] KVM: Allow 2048-bit register access via ioctl interface
The Arm SVE architecture defines registers that are up to 2048 bits in size (with some possibility of further future expansion). In order to avoid the need for an excessively large number of ioctls when saving and restoring a vcpu's registers, this patch adds a #define to make support for individual 2048-bit registers through the KVM_{GET,SET}_ONE_REG ioctl interface official. This will allow each SVE register to be accessed in a single call. There are sufficient spare bits in the register id size field for this change, so there is no ABI impact, providing that KVM_GET_REG_LIST does not enumerate any 2048-bit register unless userspace explicitly opts in to the relevant architecture-specific features. Signed-off-by: Dave Martin Reviewed-by: Alex Bennée --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6d4ea4b..dc77a5a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1145,6 +1145,7 @@ struct kvm_dirty_tlb { #define KVM_REG_SIZE_U256 0x0050ULL #define KVM_REG_SIZE_U512 0x0060ULL #define KVM_REG_SIZE_U1024 0x0070ULL +#define KVM_REG_SIZE_U2048 0x0080ULL struct kvm_reg_list { __u64 n; /* number of regs */ -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 15/27] KVM: arm64: Add missing #include of in guest.c
arch/arm64/kvm/guest.c uses the string functions, but the corresponding header is not included. We seem to get away with this for now, but for completeness this patch adds the #include, in preparation for adding yet more memset() calls. Signed-off-by: Dave Martin --- arch/arm64/kvm/guest.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 62514cb..3e38eb2 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 04/27] KVM: arm64: Refactor kvm_arm_num_regs() for easier maintenance
kvm_arm_num_regs() adds together various partial register counts in a freeform sum expression, which makes it harder than necessary to read diffs that add, modify or remove a single term in the sum (which is expected to be the common case under maintenance). This patch refactors the code to add the terms one per line, for maximum readability. Signed-off-by: Dave Martin Reviewed-by: Alex Bennée --- arch/arm64/kvm/guest.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index dd436a5..62514cb 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -258,8 +258,14 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) */ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) { - return num_core_regs() + kvm_arm_num_sys_reg_descs(vcpu) - + kvm_arm_get_fw_num_regs(vcpu) + NUM_TIMER_REGS; + unsigned long res = 0; + + res += num_core_regs(); + res += kvm_arm_num_sys_reg_descs(vcpu); + res += kvm_arm_get_fw_num_regs(vcpu); + res += NUM_TIMER_REGS; + + return res; } /** -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 11/27] KVM: arm64: Support runtime sysreg visibility filtering
Some optional features of the Arm architecture add new system registers that are not present in the base architecture. Where these features are optional for the guest, the visibility of these registers may need to depend on some runtime configuration, such as a flag passed to KVM_ARM_VCPU_INIT. For example, ZCR_EL1 and ID_AA64ZFR0_EL1 need to be hidden if SVE is not enabled for the guest, even though these registers may be present in the hardware and visible to the host at EL2. Adding special-case checks all over the place for individual registers is going to get messy as the number of conditionally- visible registers grows. In order to help solve this problem, this patch adds a new sysreg method visibility() that can be used to hook in any needed runtime visibility checks. This method can currently return REG_HIDDEN_USER to inhibit enumeration and ioctl access to the register for userspace, and REG_HIDDEN_GUEST to inhibit runtime access by the guest using MSR/MRS. Wrappers are added to allow these flags to be conveniently queried. This approach allows a conditionally modified view of individual system registers such as the CPU ID registers, in addition to completely hiding registers where appropriate. Signed-off-by: Dave Martin --- Changes since v5: * Rename the visibility override flags, add some comments, and rename/ introduce helpers to make the purpose of this code clearer. 
--- arch/arm64/kvm/sys_regs.c | 24 +--- arch/arm64/kvm/sys_regs.h | 25 + 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a5d14b5..c86a7b0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1927,6 +1927,12 @@ static void perform_access(struct kvm_vcpu *vcpu, { trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); + /* Check for regs disabled by runtime config */ + if (sysreg_hidden_from_guest(vcpu, r)) { + kvm_inject_undefined(vcpu); + return; + } + /* * Not having an accessor means that we have configured a trap * that we don't know how to handle. This certainly qualifies @@ -2438,6 +2444,10 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (!r) return get_invariant_sys_reg(reg->id, uaddr); + /* Check for regs disabled by runtime config */ + if (sysreg_hidden_from_user(vcpu, r)) + return -ENOENT; + if (r->get_user) return (r->get_user)(vcpu, r, reg, uaddr); @@ -2459,6 +2469,10 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (!r) return set_invariant_sys_reg(reg->id, uaddr); + /* Check for regs disabled by runtime config */ + if (sysreg_hidden_from_user(vcpu, r)) + return -ENOENT; + if (r->set_user) return (r->set_user)(vcpu, r, reg, uaddr); @@ -2515,7 +2529,8 @@ static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) return true; } -static int walk_one_sys_reg(const struct sys_reg_desc *rd, +static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 __user **uind, unsigned int *total) { @@ -2526,6 +2541,9 @@ static int walk_one_sys_reg(const struct sys_reg_desc *rd, if (!(rd->reg || rd->get_user)) return 0; + if (sysreg_hidden_from_user(vcpu, rd)) + return 0; + if (!copy_reg_to_user(rd, uind)) return -EFAULT; @@ -2554,9 +2572,9 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) int cmp = cmp_sys_reg(i1, i2); /* 
target-specific overrides generic entry. */ if (cmp <= 0) - err = walk_one_sys_reg(i1, &uind, &total); + err = walk_one_sys_reg(vcpu, i1, &uind, &total); else - err = walk_one_sys_reg(i2, &uind, &total); + err = walk_one_sys_reg(vcpu, i2, &uind, &total); if (err) return err; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 3b1bc7f..2be9950 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -64,8 +64,15 @@ struct sys_reg_desc { const struct kvm_one_reg *reg, void __user *uaddr); int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr); + + /* Return mask of REG_* runtime visibility overrides */ + unsigned int (*visibility)(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd); }; +#define REG_HIDDEN_USER
[PATCH v6 10/27] KVM: arm64: Propagate vcpu into read_id_reg()
Architecture features that are conditionally visible to the guest will require run-time checks in the ID register accessor functions. In particular, read_id_reg() will need to perform checks in order to generate the correct emulated value for certain ID register fields such as ID_AA64PFR0_EL1.SVE for example. This patch propagates vcpu into read_id_reg() so that future patches can add run-time checks on the guest configuration here. For now, there is no functional change. Signed-off-by: Dave Martin Reviewed-by: Alex Bennée --- arch/arm64/kvm/sys_regs.c | 23 +-- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 539feec..a5d14b5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1044,7 +1044,8 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, } /* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(struct sys_reg_desc const *r, bool raz) +static u64 read_id_reg(const struct kvm_vcpu *vcpu, + struct sys_reg_desc const *r, bool raz) { u32 id = sys_reg((u32)r->Op0, (u32)r->Op1, (u32)r->CRn, (u32)r->CRm, (u32)r->Op2); @@ -1078,7 +1079,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_id_reg(r, raz); + p->regval = read_id_reg(vcpu, r, raz); return true; } @@ -1107,16 +1108,18 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); * are stored, and for set_id_reg() we don't allow the effective value * to be changed. 
*/ -static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, +static int __get_id_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, void __user *uaddr, bool raz) { const u64 id = sys_reg_to_index(rd); - const u64 val = read_id_reg(rd, raz); + const u64 val = read_id_reg(vcpu, rd, raz); return reg_to_user(uaddr, &val, id); } -static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, +static int __set_id_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, void __user *uaddr, bool raz) { const u64 id = sys_reg_to_index(rd); @@ -1128,7 +1131,7 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, return err; /* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(rd, raz)) + if (val != read_id_reg(vcpu, rd, raz)) return -EINVAL; return 0; @@ -1137,25 +1140,25 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(rd, uaddr, false); + return __get_id_reg(vcpu, rd, uaddr, false); } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(rd, uaddr, false); + return __set_id_reg(vcpu, rd, uaddr, false); } static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(rd, uaddr, true); + return __get_id_reg(vcpu, rd, uaddr, true); } static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(rd, uaddr, true); + return __set_id_reg(vcpu, rd, uaddr, true); } static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu 
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 07/27] arm64/sve: Check SVE virtualisability
Due to the way the effective SVE vector length is controlled and trapped at different exception levels, certain mismatches in the sets of vector lengths supported by different physical CPUs in the system may prevent straightforward virtualisation of SVE at parity with the host. This patch analyses the extent to which SVE can be virtualised safely without interfering with migration of vcpus between physical CPUs, and rejects late secondary CPUs that would erode the situation further. It is left up to KVM to decide what to do with this information. Signed-off-by: Dave Martin Reviewed-by: Julien Thierry --- QUESTION: The final structure of this code makes it quite natural to clamp the vector length for KVM guests to the maximum value supportable across all CPUs; such a value is guaranteed to exist, but may be surprisingly small on a given hardware platform. Should we be stricter and actually refuse to support KVM at all on such hardware? This may help to force people to fix Linux (or the architecture) if/when this issue comes up. For now, I stick with pr_warn() and make do with a limited SVE vector length for guests. Changes since v5: * pr_info() about the presence of unvirtualisable vector lengths in sve_setup() upgrade to pr_warn(), for consistency with sve_verify_vq_map(). 
--- arch/arm64/include/asm/fpsimd.h | 1 + arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/kernel/fpsimd.c | 86 ++--- 3 files changed, 73 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index dd1ad39..964adc9 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -87,6 +87,7 @@ extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused); extern u64 read_zcr_features(void); extern int __ro_after_init sve_max_vl; +extern int __ro_after_init sve_max_virtualisable_vl; #ifdef CONFIG_ARM64_SVE diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e24e94d..4b1cda5 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1862,7 +1862,7 @@ static void verify_sve_features(void) unsigned int len = zcr & ZCR_ELx_LEN_MASK; if (len < safe_len || sve_verify_vq_map()) { - pr_crit("CPU%d: SVE: required vector length(s) missing\n", + pr_crit("CPU%d: SVE: vector length support mismatch\n", smp_processor_id()); cpu_die_early(); } diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index f59ea67..b219796a 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -48,6 +49,7 @@ #include #include #include +#include #define FPEXC_IOF (1 << 0) #define FPEXC_DZF (1 << 1) @@ -130,14 +132,18 @@ static int sve_default_vl = -1; /* Maximum supported vector length across all CPUs (initially poisoned) */ int __ro_after_init sve_max_vl = SVE_VL_MIN; +int __ro_after_init sve_max_virtualisable_vl = SVE_VL_MIN; /* Set of available vector lengths, as vq_to_bit(vq): */ static __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); +/* Set of vector lengths present on at least one cpu: */ +static __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX); static void __percpu *efi_sve_state; #else /* ! 
CONFIG_ARM64_SVE */ /* Dummy declaration for code that will be optimised out: */ extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); +extern __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX); extern void __percpu *efi_sve_state; #endif /* ! CONFIG_ARM64_SVE */ @@ -623,12 +629,6 @@ int sve_get_current_vl(void) return sve_prctl_status(0); } -/* - * Bitmap for temporary storage of the per-CPU set of supported vector lengths - * during secondary boot. - */ -static DECLARE_BITMAP(sve_secondary_vq_map, SVE_VQ_MAX); - static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) { unsigned int vq, vl; @@ -654,6 +654,7 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) void __init sve_init_vq_map(void) { sve_probe_vqs(sve_vq_map); + bitmap_copy(sve_vq_partial_map, sve_vq_map, SVE_VQ_MAX); } /* @@ -663,8 +664,11 @@ void __init sve_init_vq_map(void) */ void sve_update_vq_map(void) { - sve_probe_vqs(sve_secondary_vq_map); - bitmap_and(sve_vq_map, sve_vq_map, sve_secondary_vq_map, SVE_VQ_MAX); + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + + sve_probe_vqs(tmp_map); + bitmap_and(sve_vq_map, sve_vq_map, tmp_map, SVE_VQ_MAX); + bitmap_or(sve_vq_partial_map, sve_vq_partial_map, tmp_map, SVE_VQ_MAX); } /* @@ -673,18 +677,48 @@ void sve_update_vq_map(void) */ int sve_verify_vq_map(void) { - int ret = 0; + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + unsigned long b; - sve_prob
[PATCH v6 06/27] arm64/sve: Clarify role of the VQ map maintenance functions
The roles of sve_init_vq_map(), sve_update_vq_map() and sve_verify_vq_map() are highly non-obvious to anyone who has not dug through cpufeatures.c in detail. Since the way these functions interact with each other is more important here than a full understanding of the cpufeatures code, this patch adds comments to make the functions' roles clearer. No functional change. Signed-off-by: Dave Martin Reviewed-by: Julien Thierry Reviewed-by: Julien Grall --- Changes since v5: * [Julien Thierry] This patch is useful for explaining the previous patch (as per the v5 ordering), and is anyway non-functional. Swapped it with the previous patch to provide a more logical reading order for the series. --- arch/arm64/kernel/fpsimd.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 62c37f0..f59ea67 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -647,6 +647,10 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) } } +/* + * Initialise the set of known supported VQs for the boot CPU. + * This is called during kernel boot, before secondary CPUs are brought up. + */ void __init sve_init_vq_map(void) { sve_probe_vqs(sve_vq_map); @@ -655,6 +659,7 @@ void __init sve_init_vq_map(void) /* * If we haven't committed to the set of supported VQs yet, filter out * those not supported by the current CPU. + * This function is called during the bring-up of early secondary CPUs only. */ void sve_update_vq_map(void) { @@ -662,7 +667,10 @@ void sve_update_vq_map(void) bitmap_and(sve_vq_map, sve_vq_map, sve_secondary_vq_map, SVE_VQ_MAX); } -/* Check whether the current CPU supports all VQs in the committed set */ +/* + * Check whether the current CPU supports all VQs in the committed set. + * This function is called during the bring-up of late secondary CPUs only. 
+ */ int sve_verify_vq_map(void) { int ret = 0; -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 05/27] KVM: arm64: Add missing #includes to kvm_host.h
kvm_host.h uses some declarations from other headers that are currently included by accident, without an explicit #include. This patch adds a few #includes that are clearly missing. Although the header builds without them today, this should help to avoid future surprises. Signed-off-by: Dave Martin Acked-by: Mark Rutland --- Changes since v5: * [Mark Rutland] Add additional missing #includes , , , while we're about it. Commit message reworded to match. --- arch/arm64/include/asm/kvm_host.h | 4 1 file changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a01fe087..6d10100 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -22,9 +22,13 @@ #ifndef __ARM64_KVM_HOST_H__ #define __ARM64_KVM_HOST_H__ +#include #include +#include #include +#include #include +#include #include #include #include -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 02/27] arm64: fpsimd: Always set TIF_FOREIGN_FPSTATE on task state flush
This patch updates fpsimd_flush_task_state() to mirror the new semantics of fpsimd_flush_cpu_state() introduced by commit d8ad71fa38a9 ("arm64: fpsimd: Fix TIF_FOREIGN_FPSTATE after invalidating cpu regs"). Both functions now implicitly set TIF_FOREIGN_FPSTATE to indicate that the task's FPSIMD state is not loaded into the cpu. As a side-effect, fpsimd_flush_task_state() now sets TIF_FOREIGN_FPSTATE even for non-running tasks. In the case of non-running tasks this is not useful but also harmless, because the flag is live only while the corresponding task is running. This function is not called from fast paths, so special-casing this for the task == current case is not really worth it. Compiler barriers previously present in restore_sve_fpsimd_context() are pulled into fpsimd_flush_task_state() so that it can be safely called with preemption enabled if necessary. Explicit calls to set TIF_FOREIGN_FPSTATE that accompany fpsimd_flush_task_state() calls and are now redundant are removed as appropriate. fpsimd_flush_task_state() is used to get exclusive access to the representation of the task's state via task_struct, for the purpose of replacing the state. Thus, the call to this function should happen before manipulating fpsimd_state or sve_state etc. in task_struct. Anomalous cases are reordered appropriately in order to make the code more consistent, although there should be no functional difference since these cases are protected by local_bh_disable() anyway. 
Signed-off-by: Dave Martin Reviewed-by: Alex Bennée Reviewed-by: Julien Grall --- arch/arm64/kernel/fpsimd.c | 25 +++-- arch/arm64/kernel/signal.c | 5 - 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 5ebe73b..62c37f0 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -550,7 +550,6 @@ int sve_set_vector_length(struct task_struct *task, local_bh_disable(); fpsimd_save(); - set_thread_flag(TIF_FOREIGN_FPSTATE); } fpsimd_flush_task_state(task); @@ -816,12 +815,11 @@ asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs) local_bh_disable(); fpsimd_save(); - fpsimd_to_sve(current); /* Force ret_to_user to reload the registers: */ fpsimd_flush_task_state(current); - set_thread_flag(TIF_FOREIGN_FPSTATE); + fpsimd_to_sve(current); if (test_and_set_thread_flag(TIF_SVE)) WARN_ON(1); /* SVE access shouldn't have trapped */ @@ -894,9 +892,9 @@ void fpsimd_flush_thread(void) local_bh_disable(); + fpsimd_flush_task_state(current); memset(¤t->thread.uw.fpsimd_state, 0, sizeof(current->thread.uw.fpsimd_state)); - fpsimd_flush_task_state(current); if (system_supports_sve()) { clear_thread_flag(TIF_SVE); @@ -933,8 +931,6 @@ void fpsimd_flush_thread(void) current->thread.sve_vl_onexec = 0; } - set_thread_flag(TIF_FOREIGN_FPSTATE); - local_bh_enable(); } @@ -1043,12 +1039,29 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) /* * Invalidate live CPU copies of task t's FPSIMD state + * + * This function may be called with preemption enabled. The barrier() + * ensures that the assignment to fpsimd_cpu is visible to any + * preemption/softirq that could race with set_tsk_thread_flag(), so + * that TIF_FOREIGN_FPSTATE cannot be spuriously re-cleared. + * + * The final barrier ensures that TIF_FOREIGN_FPSTATE is seen set by any + * subsequent code. 
*/ void fpsimd_flush_task_state(struct task_struct *t) { t->thread.fpsimd_cpu = NR_CPUS; + + barrier(); + set_tsk_thread_flag(t, TIF_FOREIGN_FPSTATE); + + barrier(); } +/* + * Invalidate any task's FPSIMD state that is present on this cpu. + * This function must be called with softirqs disabled. + */ void fpsimd_flush_cpu_state(void) { __this_cpu_write(fpsimd_last_state.st, NULL); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 867a7ce..a9b0485 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -296,11 +296,6 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) */ fpsimd_flush_task_state(current); - barrier(); - /* From now, fpsimd_thread_switch() won't clear TIF_FOREIGN_FPSTATE */ - - set_thread_flag(TIF_FOREIGN_FPSTATE); - barrier(); /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ sve_alloc(current); -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 03/27] KVM: arm64: Delete orphaned declaration for __fpsimd_enabled()
__fpsimd_enabled() no longer exists, but a dangling declaration has survived in kvm_hyp.h. This patch gets rid of it. Signed-off-by: Dave Martin Reviewed-by: Alex Bennée --- arch/arm64/include/asm/kvm_hyp.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 4da765f..ef8b839 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -149,7 +149,6 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu); void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); -bool __fpsimd_enabled(void); void activate_traps_vhe_load(struct kvm_vcpu *vcpu); void deactivate_traps_vhe_put(void); -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v6 01/27] KVM: Documentation: Document arm64 core registers in detail
Since the sizes of individual members of the core arm64 registers vary, the list of register encodings that make sense is not a simple linear sequence. To clarify which encodings to use, this patch adds a brief list to the documentation. Signed-off-by: Dave Martin Reviewed-by: Julien Grall Reviewed-by: Peter Maydell --- Documentation/virtual/kvm/api.txt | 24 1 file changed, 24 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 7de9eee..2d4f7ce 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2107,6 +2107,30 @@ contains elements ranging from 32 to 128 bits. The index is a 32bit value in the kvm_regs structure seen as a 32bit array. 0x60x0 0010 +Specifically: +EncodingRegister Bits kvm_regs member + + 0x6030 0010 X0 64 regs.regs[0] + 0x6030 0010 0002 X1 64 regs.regs[1] +... + 0x6030 0010 003c X30 64 regs.regs[30] + 0x6030 0010 003e SP 64 regs.sp + 0x6030 0010 0040 PC 64 regs.pc + 0x6030 0010 0042 PSTATE 64 regs.pstate + 0x6030 0010 0044 SP_EL1 64 sp_el1 + 0x6030 0010 0046 ELR_EL1 64 elr_el1 + 0x6030 0010 0048 SPSR_EL164 spsr[KVM_SPSR_EL1] (alias SPSR_SVC) + 0x6030 0010 004a SPSR_ABT64 spsr[KVM_SPSR_ABT] + 0x6030 0010 004c SPSR_UND64 spsr[KVM_SPSR_UND] + 0x6030 0010 004e SPSR_IRQ64 spsr[KVM_SPSR_IRQ] + 0x6060 0010 0050 SPSR_FIQ64 spsr[KVM_SPSR_FIQ] + 0x6040 0010 0054 V0 128 fp_regs.vregs[0] + 0x6040 0010 0058 V1 128 fp_regs.vregs[1] +... + 0x6040 0010 00d0 V31128 fp_regs.vregs[31] + 0x6020 0010 00d4 FPSR32 fp_regs.fpsr + 0x6020 0010 00d5 FPCR32 fp_regs.fpcr + arm64 CCSIDR registers are demultiplexed by CSSELR value: 0x6020 0011 00 -- 2.1.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v5 00/26] KVM: arm64: SVE guest support
This series implements support for allowing KVM guests to use the Arm Scalable Vector Extension (SVE), superseding the previous v5 series [1]. The patches are also available on a branch for reviewer convenience. [2] The patches are based on v5.1-rc1. The series has been reworked to remove the dependency on [3], eliminating the ABI "fixes" but keeping the relevant cleanup/ refactoring. (See patch 16.) This series addresses review comments received on v5, and contains one significant change: * A new ioctl KVM_ARM_VCPU_FINALIZE is added to replace the implicit finalization behaviour in v5. If userspace enables SVE, it must now use this ioctl to finalize the vcpu configuration before KVM_RUN etc. or SVE register access are permitted. For a description of minor updates, see the individual patches. Known issues: * This update requires modifications to kvmtool that are not published yet. I will reply to this cover letter with a link when those are available (hopefully within 24 hours of this posting). The kvmtool branch referenced by the v5 cover letter **will not work** with v6... Please be patient :) Testing status: * Lightweight testing on the Arm Fast Model, primarily to exercise the new vcpu finalization API. I plan to re-run stress testing once v6 is posted. The low-level context switch internals in the series remain unchanged since v5, so it is "relatively unlikely" that new problems will crop up since the v5 testing. * ThunderX2: basic testing of arm64 defconfig, including booting guests (no SVE support on this hardware). (This was done on an interim release candidate of v6: I will redo it after this posting.) * aarch32 host testing has only been done in v5 so far. arch/arm changes between v5 and v6 are minimal. I plan to redo sanity-check testing after this posting. I will reply to this cover letter when I have v6 test outcomes to report. 
Resolved issues: * The register zeroing bug observed when testing v5 has been tracked down to architecture noncompliance in the Arm Fast Model, which led to lanes of SVE registers being zeroed in some inappropriate situations. [1] Previous series: [PATCH v5 00/26] KVM: arm64: SVE guest support https://lists.cs.columbia.edu/pipermail/kvmarm/2019-February/034695.html [2] This series in git: http://linux-arm.org/git?p=linux-dm.git;a=shortlog;h=refs/heads/sve-kvm/v6/head git://linux-arm.org/linux-dm.git sve-kvm/v6/head [3] [PATCH v2 0/2] Fix KVM_GET_REG_LIST invalid register ID regression https://lists.cs.columbia.edu/pipermail/kvmarm/2018-December/033810.html Dave Martin (27): KVM: Documentation: Document arm64 core registers in detail arm64: fpsimd: Always set TIF_FOREIGN_FPSTATE on task state flush KVM: arm64: Delete orphaned declaration for __fpsimd_enabled() KVM: arm64: Refactor kvm_arm_num_regs() for easier maintenance KVM: arm64: Add missing #includes to kvm_host.h arm64/sve: Clarify role of the VQ map maintenance functions arm64/sve: Check SVE virtualisability arm64/sve: Enable SVE state tracking for non-task contexts KVM: arm64: Add a vcpu flag to control SVE visibility for the guest KVM: arm64: Propagate vcpu into read_id_reg() KVM: arm64: Support runtime sysreg visibility filtering KVM: arm64/sve: System register context switch and access support KVM: arm64/sve: Context switch the SVE registers KVM: Allow 2048-bit register access via ioctl interface KVM: arm64: Add missing #include of in guest.c KVM: arm64: Factor out core register ID enumeration KVM: arm64: Reject ioctl access to FPSIMD V-regs on SVE vcpus KVM: arm64/sve: Add SVE support to register access ioctl interface KVM: arm64: Enumerate SVE register indices for KVM_GET_REG_LIST arm64/sve: In-kernel vector length availability query interface KVM: arm/arm64: Add hook for arch-specific KVM initialisation KVM: arm/arm64: Add KVM_ARM_VCPU_FINALIZE ioctl KVM: arm64/sve: Add pseudo-register for the guest's 
vector lengths KVM: arm64/sve: Allow userspace to enable SVE for vcpus KVM: arm64: Add a capability to advertise SVE support KVM: Document errors for KVM_GET_ONE_REG and KVM_SET_ONE_REG KVM: arm64/sve: Document KVM API extensions for SVE Documentation/virtual/kvm/api.txt | 156 +++ arch/arm/include/asm/kvm_host.h | 6 + arch/arm64/include/asm/fpsimd.h | 33 +++- arch/arm64/include/asm/kvm_host.h | 43 - arch/arm64/include/asm/kvm_hyp.h | 1 - arch/arm64/include/asm/sysreg.h | 3 + arch/arm64/include/uapi/asm/kvm.h | 22 +++ arch/arm64/kernel/cpufeature.c| 2 +- arch/arm64/kernel/fpsimd.c| 172 - arch/arm64/kernel/signal.c| 5 - arch/arm64/kvm/fpsimd.c | 17 +- arch/arm64/kvm/guest.c| 387 +++--- arch/arm64/kvm/hyp/switch.c | 74 ++-- arch/arm64/kvm/reset.c| 137 +- arch/arm64/kvm/sys_regs.c
Re: [RFC PATCH] KVM: arm/arm64: Enable direct irqfd MSI injection
On Tue, 19 Mar 2019 15:59:00 +, Zenghui Yu wrote: > > Hi Marc, > > On 2019/3/19 18:01, Marc Zyngier wrote: > > On Tue, 19 Mar 2019 09:09:43 +0800 > > Zenghui Yu wrote: > > > >> Hi all, > >> > >> On 2019/3/18 3:35, Marc Zyngier wrote: > >>> A first approach would be to keep a small cache of the last few > >>> successful translations for this ITS, cache that could be looked-up by > >>> holding a spinlock instead. A hit in this cache could directly be > >>> injected. Any command that invalidates or changes anything (DISCARD, > >>> INV, INVALL, MAPC with V=0, MAPD with V=0, MOVALL, MOVI) should nuke > >>> the cache altogether. > >>> > >>> Of course, all of that needs to be quantified. > >> > >> Thanks for all of your explanations, especially for Marc's suggestions! > >> It took me long time to figure out my mistakes, since I am not very > >> familiar with the locking stuff. Now I have to apologize for my noise. > > > > No need to apologize. The whole point of this list is to have > > discussions. Although your approach wasn't working, you did > > identify potential room for improvement. > > > >> As for the its-translation-cache code (a really good news to us), we > >> have a rough look at it and start testing now! > > > > Please let me know about your findings. My initial test doesn't show > > any improvement, but that could easily be attributed to the system I > > running this on (a tiny and slightly broken dual A53 system). The sizing > > of the cache is also important: too small, and you have the overhead of > > the lookup for no benefit; too big, and you waste memory. > > Not smoothly as expected. With below config (in the form of XML): The good news is that nothing was expected at all. > ---8<--- > > > > > > ---8<--- Sorry, I don't read XML, and I have zero idea what this represent. > > VM can't even get to boot successfully! > > > Kernel version is -stable 4.19.28. And *dmesg* on host shows: Please don't test on any other thing but mainline. 
The only thing I'm interested in at the moment is 5.1-rc1. > > ---8<--- > [ 507.908330] rcu: INFO: rcu_sched detected stalls on CPUs/tasks: > [ 507.908338] rcu: 35-...0: (0 ticks this GP) > idle=d06/1/0x4000 softirq=72150/72150 fqs=6269 > [ 507.908341] rcu: 41-...0: (0 ticks this GP) > idle=dee/1/0x4000 softirq=68144/68144 fqs=6269 > [ 507.908342] rcu: (detected by 23, t=15002 jiffies, g=68929, q=408641) > [ 507.908350] Task dump for CPU 35: > [ 507.908351] qemu-kvmR running task0 66789 1 > 0x0002 > [ 507.908354] Call trace: > [ 507.908360] __switch_to+0x94/0xe8 > [ 507.908363] _cond_resched+0x24/0x68 > [ 507.908366] __flush_work+0x58/0x280 > [ 507.908369] free_unref_page_commit+0xc4/0x198 > [ 507.908370] free_unref_page+0x84/0xa0 > [ 507.908371] __free_pages+0x58/0x68 > [ 507.908372] free_pages.part.21+0x34/0x40 > [ 507.908373] free_pages+0x2c/0x38 > [ 507.908375] poll_freewait+0xa8/0xd0 > [ 507.908377] do_sys_poll+0x3d0/0x560 > [ 507.908378] __arm64_sys_ppoll+0x180/0x1e8 > [ 507.908380] 0xa48990 > [ 507.908381] Task dump for CPU 41: > [ 507.908382] kworker/41:1R running task0 647 2 > 0x002a > [ 507.908387] Workqueue: events irqfd_inject > [ 507.908389] Call trace: > [ 507.908391] __switch_to+0x94/0xe8 > [ 507.908392] 0x20131 > [... ...] 
> [ 687.928330] rcu: INFO: rcu_sched detected stalls on CPUs/tasks: > [ 687.928339] rcu: 35-...0: (0 ticks this GP) > idle=d06/1/0x4000 softirq=72150/72150 fqs=25034 > [ 687.928341] rcu: 41-...0: (0 ticks this GP) > idle=dee/1/0x4000 softirq=68144/68144 fqs=25034 > [ 687.928343] rcu: (detected by 16, t=60007 jiffies, g=68929, > q=1601093) > [ 687.928351] Task dump for CPU 35: > [ 687.928352] qemu-kvmR running task0 66789 1 > 0x0002 > [ 687.928355] Call trace: > [ 687.928360] __switch_to+0x94/0xe8 > [ 687.928364] _cond_resched+0x24/0x68 > [ 687.928367] __flush_work+0x58/0x280 > [ 687.928369] free_unref_page_commit+0xc4/0x198 > [ 687.928370] free_unref_page+0x84/0xa0 > [ 687.928372] __free_pages+0x58/0x68 > [ 687.928373] free_pages.part.21+0x34/0x40 > [ 687.928374] free_pages+0x2c/0x38 > [ 687.928376] poll_freewait+0xa8/0xd0 > [ 687.928378] do_sys_poll+0x3d0/0x560 > [ 687.928379] __arm64_sys_ppoll+0x180/0x1e8 > [ 687.928381] 0xa48990 > [ 687.928382] Task dump for CPU 41: > [ 687.928383] kworker/41:1R running task0 647 2 > 0x002a > [ 687.928389] Workqueue: events irqfd_inject > [ 687.928391] Call trace: > [ 687.928392] __switch_to+0x94/0xe8 > [ 687.928394] 0x20131 > [...] > ---8<--- endlessly ... > > It seems that we've suffered from some locking related issues. Any > suggestions for debugging? None at the moment. And this doesn't seem quite related to the problem at hand, does it? > And could you please provide your test
Re: [PATCH] kvm: arm: Fix handling of stage2 huge mappings
Hi Suzuki, On 2019/3/19 22:11, Suzuki K Poulose wrote: We rely on the mmu_notifier call backs to handle the split/merge of huge pages and thus we are guaranteed that, while creating a block mapping, either the entire block is unmapped at stage2 or it is missing permission. However, we miss a case where the block mapping is split for dirty logging case and then could later be made block mapping, if we cancel the dirty logging. This not only creates inconsistent TLB entries for the pages in the the block, but also leakes the table pages for PMD level. Handle this corner case for the huge mappings at stage2 by unmapping the non-huge mapping for the block. This could potentially release the upper level table. So we need to restart the table walk once we unmap the range. Fixes : ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages") Reported-by: Zheng Xiang Cc: Zheng Xiang Cc: Zhengui Yu Sorry to bother you, but this should be "Zenghui Yu", thanks! zenghui Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Suzuki K Poulose --- virt/kvm/arm/mmu.c | 63 ++ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index fce0983..6ad6f19d 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1060,25 +1060,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache { pmd_t *pmd, old_pmd; +retry: pmd = stage2_get_pmd(kvm, cache, addr); VM_BUG_ON(!pmd); old_pmd = *pmd; + /* +* Multiple vcpus faulting on the same PMD entry, can +* lead to them sequentially updating the PMD with the +* same value. Following the break-before-make +* (pmd_clear() followed by tlb_flush()) process can +* hinder forward progress due to refaults generated +* on missing translations. +* +* Skip updating the page table if the entry is +* unchanged. 
+*/ + if (pmd_val(old_pmd) == pmd_val(*new_pmd)) + return 0; + if (pmd_present(old_pmd)) { /* -* Multiple vcpus faulting on the same PMD entry, can -* lead to them sequentially updating the PMD with the -* same value. Following the break-before-make -* (pmd_clear() followed by tlb_flush()) process can -* hinder forward progress due to refaults generated -* on missing translations. +* If we already have PTE level mapping for this block, +* we must unmap it to avoid inconsistent TLB state and +* leaking the table page. We could end up in this situation +* if the memory slot was marked for dirty logging and was +* reverted, leaving PTE level mappings for the pages accessed +* during the period. So, unmap the PTE level mapping for this +* block and retry, as we could have released the upper level +* table in the process. * -* Skip updating the page table if the entry is -* unchanged. +* Normal THP split/merge follows mmu_notifier callbacks and do +* get handled accordingly. */ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - + if (!pmd_thp_or_huge(old_pmd)) { + unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE); + goto retry; + } /* * Mapping in huge pages should only happen through a * fault. If a page is merged into a transparent huge @@ -1090,8 +1108,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache * should become splitting first, unmapped, merged, * and mapped back in on-demand. 
*/ - VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); - + WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { @@ -1107,6 +1124,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac { pud_t *pudp, old_pud; +retry: pudp = stage2_get_pud(kvm, cache, addr); VM_BUG_ON(!pudp); @@ -1114,16 +1132,25 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac /* * A large number of vcpus faulting on the same stage 2 entry, -* can lead to a refault due to the -* stage2_pud_clear()/tlb_flush(). Skip updating the page -* tables if there is no change. +* can lead to a refault due to the stage2_pud_clear()/tlb_flush(). +* Skip updating the page tables if there is no change.
Re: [RFC PATCH] KVM: arm/arm64: Enable direct irqfd MSI injection
Hi Marc, On 2019/3/19 18:01, Marc Zyngier wrote: On Tue, 19 Mar 2019 09:09:43 +0800 Zenghui Yu wrote: Hi all, On 2019/3/18 3:35, Marc Zyngier wrote: A first approach would be to keep a small cache of the last few successful translations for this ITS, cache that could be looked-up by holding a spinlock instead. A hit in this cache could directly be injected. Any command that invalidates or changes anything (DISCARD, INV, INVALL, MAPC with V=0, MAPD with V=0, MOVALL, MOVI) should nuke the cache altogether. Of course, all of that needs to be quantified. Thanks for all of your explanations, especially for Marc's suggestions! It took me long time to figure out my mistakes, since I am not very familiar with the locking stuff. Now I have to apologize for my noise. No need to apologize. The whole point of this list is to have discussions. Although your approach wasn't working, you did identify potential room for improvement. As for the its-translation-cache code (a really good news to us), we have a rough look at it and start testing now! Please let me know about your findings. My initial test doesn't show any improvement, but that could easily be attributed to the system I running this on (a tiny and slightly broken dual A53 system). The sizing of the cache is also important: too small, and you have the overhead of the lookup for no benefit; too big, and you waste memory. Not smoothly as expected. With below config (in the form of XML): ---8<--- ---8<--- VM can't even get to boot successfully! Kernel version is -stable 4.19.28. 
And *dmesg* on host shows: ---8<--- [ 507.908330] rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [ 507.908338] rcu: 35-...0: (0 ticks this GP) idle=d06/1/0x4000 softirq=72150/72150 fqs=6269 [ 507.908341] rcu: 41-...0: (0 ticks this GP) idle=dee/1/0x4000 softirq=68144/68144 fqs=6269 [ 507.908342] rcu: (detected by 23, t=15002 jiffies, g=68929, q=408641) [ 507.908350] Task dump for CPU 35: [ 507.908351] qemu-kvmR running task0 66789 1 0x0002 [ 507.908354] Call trace: [ 507.908360] __switch_to+0x94/0xe8 [ 507.908363] _cond_resched+0x24/0x68 [ 507.908366] __flush_work+0x58/0x280 [ 507.908369] free_unref_page_commit+0xc4/0x198 [ 507.908370] free_unref_page+0x84/0xa0 [ 507.908371] __free_pages+0x58/0x68 [ 507.908372] free_pages.part.21+0x34/0x40 [ 507.908373] free_pages+0x2c/0x38 [ 507.908375] poll_freewait+0xa8/0xd0 [ 507.908377] do_sys_poll+0x3d0/0x560 [ 507.908378] __arm64_sys_ppoll+0x180/0x1e8 [ 507.908380] 0xa48990 [ 507.908381] Task dump for CPU 41: [ 507.908382] kworker/41:1R running task0 647 2 0x002a [ 507.908387] Workqueue: events irqfd_inject [ 507.908389] Call trace: [ 507.908391] __switch_to+0x94/0xe8 [ 507.908392] 0x20131 [... ...] 
[ 687.928330] rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [ 687.928339] rcu: 35-...0: (0 ticks this GP) idle=d06/1/0x4000 softirq=72150/72150 fqs=25034 [ 687.928341] rcu: 41-...0: (0 ticks this GP) idle=dee/1/0x4000 softirq=68144/68144 fqs=25034 [ 687.928343] rcu: (detected by 16, t=60007 jiffies, g=68929, q=1601093) [ 687.928351] Task dump for CPU 35: [ 687.928352] qemu-kvmR running task0 66789 1 0x0002 [ 687.928355] Call trace: [ 687.928360] __switch_to+0x94/0xe8 [ 687.928364] _cond_resched+0x24/0x68 [ 687.928367] __flush_work+0x58/0x280 [ 687.928369] free_unref_page_commit+0xc4/0x198 [ 687.928370] free_unref_page+0x84/0xa0 [ 687.928372] __free_pages+0x58/0x68 [ 687.928373] free_pages.part.21+0x34/0x40 [ 687.928374] free_pages+0x2c/0x38 [ 687.928376] poll_freewait+0xa8/0xd0 [ 687.928378] do_sys_poll+0x3d0/0x560 [ 687.928379] __arm64_sys_ppoll+0x180/0x1e8 [ 687.928381] 0xa48990 [ 687.928382] Task dump for CPU 41: [ 687.928383] kworker/41:1R running task0 647 2 0x002a [ 687.928389] Workqueue: events irqfd_inject [ 687.928391] Call trace: [ 687.928392] __switch_to+0x94/0xe8 [ 687.928394] 0x20131 [...] ---8<--- endlessly ... It seems that we've suffered from some locking related issues. Any suggestions for debugging? And could you please provide your test steps ? So that I can run some tests on my HW to see improvement hopefully. Having thought about it a bit more, I think we can drop the invalidation on MOVI/MOVALL, as the LPI is still perfectly valid, and we don't cache the target vcpu. On the other hand, the cache must be nuked when the ITS is turned off. All of these are valuable. But it might be early for me to consider about them (I have to get the above problem solved first ...) thanks, zenghui Thanks, M. ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
Re: [PATCH 1/2] KVM: arm/arm64: vgic-its: Take the srcu lock when writing to guest memory
Hi Eric, On Tue, 19 Mar 2019 15:16:38 +, Auger Eric wrote: > > Hi Marc, > > On 3/19/19 2:30 PM, Marc Zyngier wrote: > > When halting a guest, QEMU flushes the virtual ITS caches, which > > amounts to writing to the various tables that the guest has allocated. > > > > When doing this, we fail to take the srcu lock, and the kernel > > shouts loudly if running a lockdep kernel: > > > > [ 69.680416] = > > [ 69.680819] WARNING: suspicious RCU usage > > [ 69.681526] 5.1.0-rc1-8-g600025238f51-dirty #18 Not tainted > > [ 69.682096] - > > [ 69.682501] ./include/linux/kvm_host.h:605 suspicious > > rcu_dereference_check() usage! > > [ 69.683225] > > [ 69.683225] other info that might help us debug this: > > [ 69.683225] > > [ 69.683975] > > [ 69.683975] rcu_scheduler_active = 2, debug_locks = 1 > > [ 69.684598] 6 locks held by qemu-system-aar/4097: > > [ 69.685059] #0: 34196013 (&kvm->lock){+.+.}, at: > > vgic_its_set_attr+0x244/0x3a0 > > [ 69.686087] #1: f2ed935e (&its->its_lock){+.+.}, at: > > vgic_its_set_attr+0x250/0x3a0 > > [ 69.686919] #2: 5e71ea54 (&vcpu->mutex){+.+.}, at: > > lock_all_vcpus+0x64/0xd0 > > [ 69.687698] #3: c17e548d (&vcpu->mutex){+.+.}, at: > > lock_all_vcpus+0x64/0xd0 > > [ 69.688475] #4: ba386017 (&vcpu->mutex){+.+.}, at: > > lock_all_vcpus+0x64/0xd0 > > [ 69.689978] #5: c2c3c335 (&vcpu->mutex){+.+.}, at: > > lock_all_vcpus+0x64/0xd0 > > [ 69.690729] > > [ 69.690729] stack backtrace: > > [ 69.691151] CPU: 2 PID: 4097 Comm: qemu-system-aar Not tainted > > 5.1.0-rc1-8-g600025238f51-dirty #18 > > [ 69.691984] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS > > 2019.04-rc3-00124-g2feec69fb1 03/15/2019 > > [ 69.692831] Call trace: > > [ 69.694072] lockdep_rcu_suspicious+0xcc/0x110 > > [ 69.694490] gfn_to_memslot+0x174/0x190 > > [ 69.694853] kvm_write_guest+0x50/0xb0 > > [ 69.695209] vgic_its_save_tables_v0+0x248/0x330 > > [ 69.695639] vgic_its_set_attr+0x298/0x3a0 > > [ 69.696024] kvm_device_ioctl_attr+0x9c/0xd8 > > [ 69.696424] 
kvm_device_ioctl+0x8c/0xf8 > > [ 69.696788] do_vfs_ioctl+0xc8/0x960 > > [ 69.697128] ksys_ioctl+0x8c/0xa0 > > [ 69.697445] __arm64_sys_ioctl+0x28/0x38 > > [ 69.697817] el0_svc_common+0xd8/0x138 > > [ 69.698173] el0_svc_handler+0x38/0x78 > > [ 69.698528] el0_svc+0x8/0xc > > > > The fix is to obviously take the srcu lock, just like we do on the > > read side of things since bf308242ab98. One wonders why this wasn't > > fixed at the same time, but hey... > > > > Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() > > calls with SRCU lock") > > Signed-off-by: Marc Zyngier > > --- > > arch/arm/include/asm/kvm_mmu.h | 11 +++ > > arch/arm64/include/asm/kvm_mmu.h | 11 +++ > > virt/kvm/arm/vgic/vgic-its.c | 8 > > 3 files changed, 26 insertions(+), 4 deletions(-) [...] > Don't we need to use kvm_write_guest_lock() as well also in > vgic_v3_lpi_sync_pending_status and vgic_v3_save_pending_tables? Ah, well spotted. Yes, that's needed too. I'll fix that. Thanks, M. -- Jazz is not dead, it just smell funny. ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
Re: [PATCH 2/2] KVM: arm/arm64: vgic-its: Take the srcu lock when parsing the memslots
Hi Marc, On 3/19/19 2:30 PM, Marc Zyngier wrote: > Calling kvm_is_visible_gfn() implies that we're parsing the memslots, > and doing this without the srcu lock is frown upon: > > [12704.164532] = > [12704.164544] WARNING: suspicious RCU usage > [12704.164560] 5.1.0-rc1-8-g600025238f51-dirty #16 Tainted: GW > [12704.164573] - > [12704.164589] ./include/linux/kvm_host.h:605 suspicious > rcu_dereference_check() usage! > [12704.164602] other info that might help us debug this: > [12704.164616] rcu_scheduler_active = 2, debug_locks = 1 > [12704.164631] 6 locks held by qemu-system-aar/13968: > [12704.164644] #0: 7ebdae4f (&kvm->lock){+.+.}, at: > vgic_its_set_attr+0x244/0x3a0 > [12704.164691] #1: 7d751022 (&its->its_lock){+.+.}, at: > vgic_its_set_attr+0x250/0x3a0 > [12704.164726] #2: 219d2706 (&vcpu->mutex){+.+.}, at: > lock_all_vcpus+0x64/0xd0 > [12704.164761] #3: a760aecd (&vcpu->mutex){+.+.}, at: > lock_all_vcpus+0x64/0xd0 > [12704.164794] #4: 0ef8e31d (&vcpu->mutex){+.+.}, at: > lock_all_vcpus+0x64/0xd0 > [12704.164827] #5: 7a872093 (&vcpu->mutex){+.+.}, at: > lock_all_vcpus+0x64/0xd0 > [12704.164861] stack backtrace: > [12704.164878] CPU: 2 PID: 13968 Comm: qemu-system-aar Tainted: GW > 5.1.0-rc1-8-g600025238f51-dirty #16 > [12704.164887] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS > 2019.04-rc3-00124-g2feec69fb1 03/15/2019 > [12704.164896] Call trace: > [12704.164910] dump_backtrace+0x0/0x138 > [12704.164920] show_stack+0x24/0x30 > [12704.164934] dump_stack+0xbc/0x104 > [12704.164946] lockdep_rcu_suspicious+0xcc/0x110 > [12704.164958] gfn_to_memslot+0x174/0x190 > [12704.164969] kvm_is_visible_gfn+0x28/0x70 > [12704.164980] vgic_its_check_id.isra.0+0xec/0x1e8 > [12704.164991] vgic_its_save_tables_v0+0x1ac/0x330 > [12704.165001] vgic_its_set_attr+0x298/0x3a0 > [12704.165012] kvm_device_ioctl_attr+0x9c/0xd8 > [12704.165022] kvm_device_ioctl+0x8c/0xf8 > [12704.165035] do_vfs_ioctl+0xc8/0x960 > [12704.165045] ksys_ioctl+0x8c/0xa0 > [12704.165055] 
__arm64_sys_ioctl+0x28/0x38 > [12704.165067] el0_svc_common+0xd8/0x138 > [12704.165078] el0_svc_handler+0x38/0x78 > [12704.165089] el0_svc+0x8/0xc > > Make sure the lock is taken when doing this. > > Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() > calls with SRCU lock") > Signed-off-by: Marc Zyngier Reviewed-by: Eric Auger Thanks Eric > --- > virt/kvm/arm/vgic/vgic-its.c | 13 ++--- > 1 file changed, 10 insertions(+), 3 deletions(-) > > diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c > index bb76220f2f30..7543f166943d 100644 > --- a/virt/kvm/arm/vgic/vgic-its.c > +++ b/virt/kvm/arm/vgic/vgic-its.c > @@ -880,8 +880,9 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 > baser, u32 id, > u64 indirect_ptr, type = GITS_BASER_TYPE(baser); > phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); > int esz = GITS_BASER_ENTRY_SIZE(baser); > - int index; > + int index, idx; > gfn_t gfn; > + bool ret; > > switch (type) { > case GITS_BASER_TYPE_DEVICE: > @@ -908,7 +909,8 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 > baser, u32 id, > > if (eaddr) > *eaddr = addr; > - return kvm_is_visible_gfn(its->dev->kvm, gfn); > + > + goto out; > } > > /* calculate and check the index into the 1st level */ > @@ -938,7 +940,12 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 > baser, u32 id, > > if (eaddr) > *eaddr = indirect_ptr; > - return kvm_is_visible_gfn(its->dev->kvm, gfn); > + > +out: > + idx = srcu_read_lock(&its->dev->kvm->srcu); > + ret = kvm_is_visible_gfn(its->dev->kvm, gfn); > + srcu_read_unlock(&its->dev->kvm->srcu, idx); > + return ret; > } > > static int vgic_its_alloc_collection(struct vgic_its *its, > ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
Re: [PATCH 1/2] KVM: arm/arm64: vgic-its: Take the srcu lock when writing to guest memory
Hi Marc, On 3/19/19 2:30 PM, Marc Zyngier wrote: > When halting a guest, QEMU flushes the virtual ITS caches, which > amounts to writing to the various tables that the guest has allocated. > > When doing this, we fail to take the srcu lock, and the kernel > shouts loudly if running a lockdep kernel: > > [ 69.680416] = > [ 69.680819] WARNING: suspicious RCU usage > [ 69.681526] 5.1.0-rc1-8-g600025238f51-dirty #18 Not tainted > [ 69.682096] - > [ 69.682501] ./include/linux/kvm_host.h:605 suspicious > rcu_dereference_check() usage! > [ 69.683225] > [ 69.683225] other info that might help us debug this: > [ 69.683225] > [ 69.683975] > [ 69.683975] rcu_scheduler_active = 2, debug_locks = 1 > [ 69.684598] 6 locks held by qemu-system-aar/4097: > [ 69.685059] #0: 34196013 (&kvm->lock){+.+.}, at: > vgic_its_set_attr+0x244/0x3a0 > [ 69.686087] #1: f2ed935e (&its->its_lock){+.+.}, at: > vgic_its_set_attr+0x250/0x3a0 > [ 69.686919] #2: 5e71ea54 (&vcpu->mutex){+.+.}, at: > lock_all_vcpus+0x64/0xd0 > [ 69.687698] #3: c17e548d (&vcpu->mutex){+.+.}, at: > lock_all_vcpus+0x64/0xd0 > [ 69.688475] #4: ba386017 (&vcpu->mutex){+.+.}, at: > lock_all_vcpus+0x64/0xd0 > [ 69.689978] #5: c2c3c335 (&vcpu->mutex){+.+.}, at: > lock_all_vcpus+0x64/0xd0 > [ 69.690729] > [ 69.690729] stack backtrace: > [ 69.691151] CPU: 2 PID: 4097 Comm: qemu-system-aar Not tainted > 5.1.0-rc1-8-g600025238f51-dirty #18 > [ 69.691984] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS > 2019.04-rc3-00124-g2feec69fb1 03/15/2019 > [ 69.692831] Call trace: > [ 69.694072] lockdep_rcu_suspicious+0xcc/0x110 > [ 69.694490] gfn_to_memslot+0x174/0x190 > [ 69.694853] kvm_write_guest+0x50/0xb0 > [ 69.695209] vgic_its_save_tables_v0+0x248/0x330 > [ 69.695639] vgic_its_set_attr+0x298/0x3a0 > [ 69.696024] kvm_device_ioctl_attr+0x9c/0xd8 > [ 69.696424] kvm_device_ioctl+0x8c/0xf8 > [ 69.696788] do_vfs_ioctl+0xc8/0x960 > [ 69.697128] ksys_ioctl+0x8c/0xa0 > [ 69.697445] __arm64_sys_ioctl+0x28/0x38 > [ 69.697817] 
el0_svc_common+0xd8/0x138 > [ 69.698173] el0_svc_handler+0x38/0x78 > [ 69.698528] el0_svc+0x8/0xc > > The fix is to obviously take the srcu lock, just like we do on the > read side of things since bf308242ab98. One wonders why this wasn't > fixed at the same time, but hey... > > Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() > calls with SRCU lock") > Signed-off-by: Marc Zyngier > --- > arch/arm/include/asm/kvm_mmu.h | 11 +++ > arch/arm64/include/asm/kvm_mmu.h | 11 +++ > virt/kvm/arm/vgic/vgic-its.c | 8 > 3 files changed, 26 insertions(+), 4 deletions(-) > > diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h > index 2de96a180166..31de4ab93005 100644 > --- a/arch/arm/include/asm/kvm_mmu.h > +++ b/arch/arm/include/asm/kvm_mmu.h > @@ -381,6 +381,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm, > return ret; > } > > +static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, > +const void *data, unsigned long len) > +{ > + int srcu_idx = srcu_read_lock(&kvm->srcu); > + int ret = kvm_write_guest(kvm, gpa, data, len); > + > + srcu_read_unlock(&kvm->srcu, srcu_idx); > + > + return ret; > +} > + > static inline void *kvm_get_hyp_vector(void) > { > switch(read_cpuid_part()) { > diff --git a/arch/arm64/include/asm/kvm_mmu.h > b/arch/arm64/include/asm/kvm_mmu.h > index b0742a16c6c9..ebeefcf835e8 100644 > --- a/arch/arm64/include/asm/kvm_mmu.h > +++ b/arch/arm64/include/asm/kvm_mmu.h > @@ -445,6 +445,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm, > return ret; > } > > +static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, > +const void *data, unsigned long len) > +{ > + int srcu_idx = srcu_read_lock(&kvm->srcu); > + int ret = kvm_write_guest(kvm, gpa, data, len); > + > + srcu_read_unlock(&kvm->srcu, srcu_idx); > + > + return ret; > +} > + > #ifdef CONFIG_KVM_INDIRECT_VECTORS > /* > * EL2 vectors can be mapped and rerouted in a number of ways, > diff --git 
a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c > index a60a768d9258..bb76220f2f30 100644 > --- a/virt/kvm/arm/vgic/vgic-its.c > +++ b/virt/kvm/arm/vgic/vgic-its.c > @@ -2058,7 +2058,7 @@ static int vgic_its_save_ite(struct vgic_its *its, > struct its_device *dev, > ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | > ite->collection->collection_id; > val = cpu_to_le64(val); > - return kvm_write_guest(kvm, gpa, &val, ite_esz); > + return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); > } > > /** > @@ -2205,7 +2205,7 @@ static int vgic_its_save_dte(struct vgic_its *its, > struct its_de
Re: [RFC] arm/cpu: fix soft lockup panic after resuming from stop
Hi Christoffer and Steve, On 2019/3/15 16:59, Christoffer Dall wrote: Hi Steve, On Wed, Mar 13, 2019 at 10:11:30AM +, Steven Price wrote: Personally I think what we need is: * Either a patch like the one from Heyi Guo (save/restore CNTVCT_EL0) or alternatively hooking up KVM_KVMCLOCK_CTRL to prevent the watchdog firing when user space explicitly stops scheduling the guest for a while. If we save/restore CNTVCT_EL0 and the warning goes away, does the guest wall clock timekeeping get all confused and does it figure this out automagically somehow? What's the source for guest wall clock timekeeping in current ARM64 implementation? Is it the value from CNTP_TVAL_EL0? Will guest OS keep track of this? Or is the wall clock simply platform RTC? I tested the RFC change and did not see anything unusual. Did I miss something? Does KVM_KVMCLOCK_CTRL solve that problem? It seems KVM_KVMCLOCK_CTRL is dedicated for guest pause/resume scenario, but it relies on pvclock both in KVM and Guest and right now only X86 supports it :( Could Steve provide more insights about the whole thing? Thanks, Heyi * KVM itself saving/restoring CNTVCT_EL0 during suspend/resume so the guest doesn't see time pass during a suspend. This smells like policy to me so I'd much prefer keeping as much functionality in user space as possible. If we already have the APIs we need from KVM, let's use them. * Something equivalent to MSR_KVM_WALL_CLOCK_NEW for arm which allows the guest to query the wall clock time from the host and provides an offset between CNTVCT_EL0 to wall clock time which the KVM can update during suspend/resume. This means that during a suspend/resume the guest can observe that wall clock time has passed, without having to be bothered about CNTVCT_EL0 jumping forwards. Isn't the proper Arm architectural solution for this to read the physical counter for wall clock timekeeping? 
(Yes that will require a trap on physical counter reads after migration on current systems, but migration sucks in terms of timekeeping already.) Thanks, Christoffer . ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH] kvm: arm: Fix handling of stage2 huge mappings
We rely on the mmu_notifier call backs to handle the split/merge of huge pages and thus we are guaranteed that, while creating a block mapping, either the entire block is unmapped at stage2 or it is missing permission. However, we miss a case where the block mapping is split for dirty logging case and then could later be made block mapping, if we cancel the dirty logging. This not only creates inconsistent TLB entries for the pages in the the block, but also leakes the table pages for PMD level. Handle this corner case for the huge mappings at stage2 by unmapping the non-huge mapping for the block. This could potentially release the upper level table. So we need to restart the table walk once we unmap the range. Fixes : ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages") Reported-by: Zheng Xiang Cc: Zheng Xiang Cc: Zhengui Yu Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Suzuki K Poulose --- virt/kvm/arm/mmu.c | 63 ++ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index fce0983..6ad6f19d 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1060,25 +1060,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache { pmd_t *pmd, old_pmd; +retry: pmd = stage2_get_pmd(kvm, cache, addr); VM_BUG_ON(!pmd); old_pmd = *pmd; + /* +* Multiple vcpus faulting on the same PMD entry, can +* lead to them sequentially updating the PMD with the +* same value. Following the break-before-make +* (pmd_clear() followed by tlb_flush()) process can +* hinder forward progress due to refaults generated +* on missing translations. +* +* Skip updating the page table if the entry is +* unchanged. +*/ + if (pmd_val(old_pmd) == pmd_val(*new_pmd)) + return 0; + if (pmd_present(old_pmd)) { /* -* Multiple vcpus faulting on the same PMD entry, can -* lead to them sequentially updating the PMD with the -* same value. 
Following the break-before-make -* (pmd_clear() followed by tlb_flush()) process can -* hinder forward progress due to refaults generated -* on missing translations. +* If we already have PTE level mapping for this block, +* we must unmap it to avoid inconsistent TLB state and +* leaking the table page. We could end up in this situation +* if the memory slot was marked for dirty logging and was +* reverted, leaving PTE level mappings for the pages accessed +* during the period. So, unmap the PTE level mapping for this +* block and retry, as we could have released the upper level +* table in the process. * -* Skip updating the page table if the entry is -* unchanged. +* Normal THP split/merge follows mmu_notifier callbacks and do +* get handled accordingly. */ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - + if (!pmd_thp_or_huge(old_pmd)) { + unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE); + goto retry; + } /* * Mapping in huge pages should only happen through a * fault. If a page is merged into a transparent huge @@ -1090,8 +1108,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache * should become splitting first, unmapped, merged, * and mapped back in on-demand. */ - VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); - + WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { @@ -1107,6 +1124,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac { pud_t *pudp, old_pud; +retry: pudp = stage2_get_pud(kvm, cache, addr); VM_BUG_ON(!pudp); @@ -1114,16 +1132,25 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac /* * A large number of vcpus faulting on the same stage 2 entry, -* can lead to a refault due to the -* stage2_pud_clear()/tlb_flush(). Skip updating the page -* tables if there is no change. +* can lead to a refault due to the stage2_pud_clear()/tlb_flush(). 
+* Skip updating the page tables if there is no change. */ if (pud_val(old_pud) == pud_val(*new_pudp)) return 0; if (stage2_pud_present(kvm, old_pud))
[PATCH 2/2] KVM: arm/arm64: vgic-its: Take the srcu lock when parsing the memslots
Calling kvm_is_visible_gfn() implies that we're parsing the memslots, and doing this without the srcu lock is frown upon: [12704.164532] = [12704.164544] WARNING: suspicious RCU usage [12704.164560] 5.1.0-rc1-8-g600025238f51-dirty #16 Tainted: GW [12704.164573] - [12704.164589] ./include/linux/kvm_host.h:605 suspicious rcu_dereference_check() usage! [12704.164602] other info that might help us debug this: [12704.164616] rcu_scheduler_active = 2, debug_locks = 1 [12704.164631] 6 locks held by qemu-system-aar/13968: [12704.164644] #0: 7ebdae4f (&kvm->lock){+.+.}, at: vgic_its_set_attr+0x244/0x3a0 [12704.164691] #1: 7d751022 (&its->its_lock){+.+.}, at: vgic_its_set_attr+0x250/0x3a0 [12704.164726] #2: 219d2706 (&vcpu->mutex){+.+.}, at: lock_all_vcpus+0x64/0xd0 [12704.164761] #3: a760aecd (&vcpu->mutex){+.+.}, at: lock_all_vcpus+0x64/0xd0 [12704.164794] #4: 0ef8e31d (&vcpu->mutex){+.+.}, at: lock_all_vcpus+0x64/0xd0 [12704.164827] #5: 7a872093 (&vcpu->mutex){+.+.}, at: lock_all_vcpus+0x64/0xd0 [12704.164861] stack backtrace: [12704.164878] CPU: 2 PID: 13968 Comm: qemu-system-aar Tainted: GW 5.1.0-rc1-8-g600025238f51-dirty #16 [12704.164887] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS 2019.04-rc3-00124-g2feec69fb1 03/15/2019 [12704.164896] Call trace: [12704.164910] dump_backtrace+0x0/0x138 [12704.164920] show_stack+0x24/0x30 [12704.164934] dump_stack+0xbc/0x104 [12704.164946] lockdep_rcu_suspicious+0xcc/0x110 [12704.164958] gfn_to_memslot+0x174/0x190 [12704.164969] kvm_is_visible_gfn+0x28/0x70 [12704.164980] vgic_its_check_id.isra.0+0xec/0x1e8 [12704.164991] vgic_its_save_tables_v0+0x1ac/0x330 [12704.165001] vgic_its_set_attr+0x298/0x3a0 [12704.165012] kvm_device_ioctl_attr+0x9c/0xd8 [12704.165022] kvm_device_ioctl+0x8c/0xf8 [12704.165035] do_vfs_ioctl+0xc8/0x960 [12704.165045] ksys_ioctl+0x8c/0xa0 [12704.165055] __arm64_sys_ioctl+0x28/0x38 [12704.165067] el0_svc_common+0xd8/0x138 [12704.165078] el0_svc_handler+0x38/0x78 [12704.165089] el0_svc+0x8/0xc Make sure 
the lock is taken when doing this. Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() calls with SRCU lock") Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index bb76220f2f30..7543f166943d 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -880,8 +880,9 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, u64 indirect_ptr, type = GITS_BASER_TYPE(baser); phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); int esz = GITS_BASER_ENTRY_SIZE(baser); - int index; + int index, idx; gfn_t gfn; + bool ret; switch (type) { case GITS_BASER_TYPE_DEVICE: @@ -908,7 +909,8 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (eaddr) *eaddr = addr; - return kvm_is_visible_gfn(its->dev->kvm, gfn); + + goto out; } /* calculate and check the index into the 1st level */ @@ -938,7 +940,12 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (eaddr) *eaddr = indirect_ptr; - return kvm_is_visible_gfn(its->dev->kvm, gfn); + +out: + idx = srcu_read_lock(&its->dev->kvm->srcu); + ret = kvm_is_visible_gfn(its->dev->kvm, gfn); + srcu_read_unlock(&its->dev->kvm->srcu, idx); + return ret; } static int vgic_its_alloc_collection(struct vgic_its *its, -- 2.20.1 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH 1/2] KVM: arm/arm64: vgic-its: Take the srcu lock when writing to guest memory
When halting a guest, QEMU flushes the virtual ITS caches, which amounts to writing to the various tables that the guest has allocated. When doing this, we fail to take the srcu lock, and the kernel shouts loudly if running a lockdep kernel: [ 69.680416] = [ 69.680819] WARNING: suspicious RCU usage [ 69.681526] 5.1.0-rc1-8-g600025238f51-dirty #18 Not tainted [ 69.682096] - [ 69.682501] ./include/linux/kvm_host.h:605 suspicious rcu_dereference_check() usage! [ 69.683225] [ 69.683225] other info that might help us debug this: [ 69.683225] [ 69.683975] [ 69.683975] rcu_scheduler_active = 2, debug_locks = 1 [ 69.684598] 6 locks held by qemu-system-aar/4097: [ 69.685059] #0: 34196013 (&kvm->lock){+.+.}, at: vgic_its_set_attr+0x244/0x3a0 [ 69.686087] #1: f2ed935e (&its->its_lock){+.+.}, at: vgic_its_set_attr+0x250/0x3a0 [ 69.686919] #2: 5e71ea54 (&vcpu->mutex){+.+.}, at: lock_all_vcpus+0x64/0xd0 [ 69.687698] #3: c17e548d (&vcpu->mutex){+.+.}, at: lock_all_vcpus+0x64/0xd0 [ 69.688475] #4: ba386017 (&vcpu->mutex){+.+.}, at: lock_all_vcpus+0x64/0xd0 [ 69.689978] #5: c2c3c335 (&vcpu->mutex){+.+.}, at: lock_all_vcpus+0x64/0xd0 [ 69.690729] [ 69.690729] stack backtrace: [ 69.691151] CPU: 2 PID: 4097 Comm: qemu-system-aar Not tainted 5.1.0-rc1-8-g600025238f51-dirty #18 [ 69.691984] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS 2019.04-rc3-00124-g2feec69fb1 03/15/2019 [ 69.692831] Call trace: [ 69.694072] lockdep_rcu_suspicious+0xcc/0x110 [ 69.694490] gfn_to_memslot+0x174/0x190 [ 69.694853] kvm_write_guest+0x50/0xb0 [ 69.695209] vgic_its_save_tables_v0+0x248/0x330 [ 69.695639] vgic_its_set_attr+0x298/0x3a0 [ 69.696024] kvm_device_ioctl_attr+0x9c/0xd8 [ 69.696424] kvm_device_ioctl+0x8c/0xf8 [ 69.696788] do_vfs_ioctl+0xc8/0x960 [ 69.697128] ksys_ioctl+0x8c/0xa0 [ 69.697445] __arm64_sys_ioctl+0x28/0x38 [ 69.697817] el0_svc_common+0xd8/0x138 [ 69.698173] el0_svc_handler+0x38/0x78 [ 69.698528] el0_svc+0x8/0xc The fix is to obviously take the srcu lock, just like we do on the 
read side of things since bf308242ab98. One wonders why this wasn't fixed at the same time, but hey... Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() calls with SRCU lock") Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 11 +++ arch/arm64/include/asm/kvm_mmu.h | 11 +++ virt/kvm/arm/vgic/vgic-its.c | 8 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 2de96a180166..31de4ab93005 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -381,6 +381,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm, return ret; } +static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, + const void *data, unsigned long len) +{ + int srcu_idx = srcu_read_lock(&kvm->srcu); + int ret = kvm_write_guest(kvm, gpa, data, len); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + return ret; +} + static inline void *kvm_get_hyp_vector(void) { switch(read_cpuid_part()) { diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b0742a16c6c9..ebeefcf835e8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -445,6 +445,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm, return ret; } +static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, + const void *data, unsigned long len) +{ + int srcu_idx = srcu_read_lock(&kvm->srcu); + int ret = kvm_write_guest(kvm, gpa, data, len); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + return ret; +} + #ifdef CONFIG_KVM_INDIRECT_VECTORS /* * EL2 vectors can be mapped and rerouted in a number of ways, diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index a60a768d9258..bb76220f2f30 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -2058,7 +2058,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << 
KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return kvm_write_guest(kvm, gpa, &val, ite_esz); + return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); } /** @@ -2205,7 +2205,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return kvm_write_guest(kvm, ptr, &val, dte_esz); + return kvm_write_guest_lock(kvm, ptr, &val, dte_
[PATCH 0/2] KVM: arm/arm64: vgic-its: Guest memory access fixes
While messing with something completely different, I managed to trigger a number of RCU warnings, due to the lack of srcu locking in the ITS code. Andre partially addressed those last year[1], but there was some more left. [1] http://lists.infradead.org/pipermail/linux-arm-kernel/2018-May/577056.html Marc Zyngier (2): KVM: arm/arm64: vgic-its: Take the srcu lock when writing to guest memory KVM: arm/arm64: vgic-its: Take the srcu lock when parsing the memslots arch/arm/include/asm/kvm_mmu.h | 11 +++ arch/arm64/include/asm/kvm_mmu.h | 11 +++ virt/kvm/arm/vgic/vgic-its.c | 21 ++--- 3 files changed, 36 insertions(+), 7 deletions(-) -- 2.20.1 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[RFC] Question about enabling the doorbell irq and the halt_poll process
Hi, all Using gicv4, when guest is waiting for irq, it sends wfi and traps to kvm. When vlpi is forwarded to PE after its_vpe_deschedule, before halt_poll in kvm_vcpu_block, halt_poll may increase latency for this vlpi getting to guest. In halt_poll process, it checks if there's pending irq for vcpu using pending_last. However, doorbell is not enabled at this moment and vlpi or doorbell cannot set pending_last true, to stop halt_poll. It will run until halt_poll time ends, if there's no other physical irq coming in the meantime. And then vcpu is scheduled out. This pending vlpi has to wait for vcpu getting scheduled in next time. Should we enable the doorbell before the halt_poll process? ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
Re: [RFC PATCH] KVM: arm/arm64: Enable direct irqfd MSI injection
On Tue, 19 Mar 2019 09:09:43 +0800 Zenghui Yu wrote: > Hi all, > > On 2019/3/18 3:35, Marc Zyngier wrote: > > On Sun, 17 Mar 2019 14:36:13 +, > > Zenghui Yu wrote: > >> > >> Currently, IRQFD on arm still uses the deferred workqueue mechanism > >> to inject interrupts into guest, which will likely lead to a busy > >> context-switching from/to the kworker thread. This overhead is for > >> no purpose (only in my view ...) and will result in an interrupt > >> performance degradation. > >> > >> Implement kvm_arch_set_irq_inatomic() for arm/arm64 to support direct > >> irqfd MSI injection, by which we can get rid of the annoying latency. > >> As a result, irqfd MSI intensive scenarios (e.g., DPDK with high packet > >> processing workloads) will benefit from it. > >> > >> Signed-off-by: Zenghui Yu > >> --- > >> > >> It seems that only MSI will follow the IRQFD path, did I miss something? > >> > >> This patch is still under test and sent out for early feedback. If I have > >> any mis-understanding, please fix me up and let me know. Thanks! > > > > As mentioned by other folks in the thread, this is clearly wrong. The > > first thing kvm_inject_msi does is to lock the corresponding ITS using > > a mutex. So the "no purpose" bit was a bit too quick. > > > > When doing this kind of work, I suggest you enable lockdep and all the > > related checkers. Also, for any optimisation, please post actual > > numbers for the relevant benchmarks. Saying "application X will > > benefit from it" is meaningless without any actual data. 
> > > >> > >> --- > >> virt/kvm/arm/vgic/trace.h | 22 ++ > >> virt/kvm/arm/vgic/vgic-irqfd.c | 21 + > >> 2 files changed, 43 insertions(+) > >> > >> diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h > >> index 55fed77..bc1f4db 100644 > >> --- a/virt/kvm/arm/vgic/trace.h > >> +++ b/virt/kvm/arm/vgic/trace.h > >> @@ -27,6 +27,28 @@ > >> __entry->vcpu_id, __entry->irq, __entry->level) > >> ); > >> >> +TRACE_EVENT(kvm_arch_set_irq_inatomic, > >> + TP_PROTO(u32 gsi, u32 type, int level, int irq_source_id), > >> + TP_ARGS(gsi, type, level, irq_source_id), > >> + > >> + TP_STRUCT__entry( > >> + __field(u32,gsi ) > >> + __field(u32,type) > >> + __field(int,level ) > >> + __field(int,irq_source_id ) > >> + ), > >> + > >> + TP_fast_assign( > >> + __entry->gsi= gsi; > >> + __entry->type = type; > >> + __entry->level = level; > >> + __entry->irq_source_id = irq_source_id; > >> + ), > >> + > >> + TP_printk("gsi %u type %u level %d source %d", __entry->gsi, > >> +__entry->type, __entry->level, __entry->irq_source_id) > >> +); > >> + > >> #endif /* _TRACE_VGIC_H */ > >> >> #undef TRACE_INCLUDE_PATH > >> diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c > >> b/virt/kvm/arm/vgic/vgic-irqfd.c > >> index 99e026d..4cfc3f4 100644 > >> --- a/virt/kvm/arm/vgic/vgic-irqfd.c > >> +++ b/virt/kvm/arm/vgic/vgic-irqfd.c > >> @@ -19,6 +19,7 @@ > >> #include > >> #include > >> #include "vgic.h" > >> +#include "trace.h" > >> >> /** > >>* vgic_irqfd_set_irq: inject the IRQ corresponding to the > >> @@ -105,6 +106,26 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry > >> *e, > >>return vgic_its_inject_msi(kvm, &msi); > >> } > >> >> +/** > >> + * kvm_arch_set_irq_inatomic: fast-path for irqfd injection > >> + * > >> + * Currently only direct MSI injecton is supported. 
> >> + */ > >> +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, > >> +struct kvm *kvm, int irq_source_id, int level, > >> +bool line_status) > >> +{ > >> + int ret; > >> + > >> + trace_kvm_arch_set_irq_inatomic(e->gsi, e->type, level, irq_source_id); > >> + > >> + if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) > >> + return -EWOULDBLOCK; > >> + > >> + ret = kvm_set_msi(e, kvm, irq_source_id, level, line_status); > >> + return ret; > >> +} > >> + > > > > Although we've established that the approach is wrong, maybe we can > > look at improving this aspect. > > > > A first approach would be to keep a small cache of the last few > > successful translations for this ITS, cache that could be looked-up by > > holding a spinlock instead. A hit in this cache could directly be > > injected. Any command that invalidates or changes anything (DISCARD, > > INV, INVALL, MAPC with V=0, MAPD with V=0, MOVALL, MOVI) should nuke > > the cache altogether. > > > > Of course, all of that needs to be quantified. > > Thanks for all of your explanations, especially for Marc's suggestions! > It took me long time to figure out my mistakes, since I am not very > familiar with the locking stuff. Now I have to apologize for my noise. No need to apologize. The
Re: [RFC] Question about TLB flush while setting Stage-2 huge pages
Hi Suzuki, On 2019/3/19 1:34, Suzuki K Poulose wrote: Hi ! On Sun, Mar 17, 2019 at 09:34:11PM +0800, Zenghui Yu wrote: Hi Suzuki, ---8<--- test: kvm: arm: Maybe two more fixes Applied based on Suzuki's patch. Signed-off-by: Zenghui Yu --- virt/kvm/arm/mmu.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 05765df..ccd5d5d 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1089,7 +1089,9 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache * Normal THP split/merge follows mmu_notifier * callbacks and do get handled accordingly. */ - unmap_stage2_range(kvm, (addr & S2_PMD_MASK), S2_PMD_SIZE); + addr &= S2_PMD_MASK; + unmap_stage2_ptes(kvm, pmd, addr, addr + S2_PMD_SIZE); + get_page(virt_to_page(pmd)); } else { /* @@ -1138,7 +1140,9 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac if (stage2_pud_present(kvm, old_pud)) { /* If we have PTE level mapping, unmap the entire range */ if (WARN_ON_ONCE(!stage2_pud_huge(kvm, old_pud))) { - unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); + addr &= S2_PUD_MASK; + unmap_stage2_pmds(kvm, pudp, addr, addr + S2_PUD_SIZE); + get_page(virt_to_page(pudp)); } else { stage2_pud_clear(kvm, pudp); kvm_tlb_flush_vmid_ipa(kvm, addr); This makes it a bit tricky to follow the code. The other option is to do something like : Yes. ---8>--- kvm: arm: Fix handling of stage2 huge mappings We rely on the mmu_notifier call backs to handle the split/merging of huge pages and thus we are guaranteed that while creating a block mapping, the entire block is unmapped at stage2. However, we miss a case where the block mapping is split for dirty logging case and then could later be made block mapping, if we cancel the dirty logging. This not only creates inconsistent TLB entries for the pages in the the block, but also leakes the table pages for PMD level. 
Handle these corner cases for the huge mappings at stage2 by unmapping the PTE level mapping. This could potentially release the upper level table. So we need to restart the table walk once we unmap the range. Signed-off-by: Suzuki K Poulose --- virt/kvm/arm/mmu.c | 57 +++--- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index fce0983..a38a3f1 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1060,25 +1060,41 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache { pmd_t *pmd, old_pmd; +retry: pmd = stage2_get_pmd(kvm, cache, addr); VM_BUG_ON(!pmd); old_pmd = *pmd; + /* +* Multiple vcpus faulting on the same PMD entry, can +* lead to them sequentially updating the PMD with the +* same value. Following the break-before-make +* (pmd_clear() followed by tlb_flush()) process can +* hinder forward progress due to refaults generated +* on missing translations. +* +* Skip updating the page table if the entry is +* unchanged. +*/ + if (pmd_val(old_pmd) == pmd_val(*new_pmd)) + return 0; + if (pmd_present(old_pmd)) { /* -* Multiple vcpus faulting on the same PMD entry, can -* lead to them sequentially updating the PMD with the -* same value. Following the break-before-make -* (pmd_clear() followed by tlb_flush()) process can -* hinder forward progress due to refaults generated -* on missing translations. -* -* Skip updating the page table if the entry is -* unchanged. +* If we already have PTE level mapping for this block, +* we must unmap it to avoid inconsistent TLB +* state. We could end up in this situation if +* the memory slot was marked for dirty logging +* and was reverted, leaving PTE level mappings +* for the pages accessed during the period. +* Normal THP split/merge follows mmu_notifier +* callbacks and do get handled accordingly. +* Unmap the PTE level mapping and retry. 
*/ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - + if (!pmd_thp_or_huge(old_pmd)) { + unmap_stage2_range(kvm, (addr & S2_PMD_MASK), S2_PMD_SIZE
[PATCH v7 7/10] KVM: arm/arm64: context-switch ptrauth registers
From: Mark Rutland When pointer authentication is supported, a guest may wish to use it. This patch adds the necessary KVM infrastructure for this to work, with a semi-lazy context switch of the pointer auth state. Pointer authentication feature is only enabled when VHE is built in the kernel and present in the CPU implementation so only VHE code paths are modified. When we schedule a vcpu, we disable guest usage of pointer authentication instructions and accesses to the keys. While these are disabled, we avoid context-switching the keys. When we trap the guest trying to use pointer authentication functionality, we change to eagerly context-switching the keys, and enable the feature. The next time the vcpu is scheduled out/in, we start again. However the host key save is optimized and implemented inside ptrauth instruction/register access trap. Pointer authentication consists of address authentication and generic authentication, and CPUs in a system might have varied support for either. Where support for either feature is not uniform, it is hidden from guests via ID register emulation, as a result of the cpufeature framework in the host. Unfortunately, address authentication and generic authentication cannot be trapped separately, as the architecture provides a single EL2 trap covering both. If we wish to expose one without the other, we cannot prevent a (badly-written) guest from intermittently using a feature which is not uniformly supported (when scheduled on a physical CPU which supports the relevant feature). Hence, this patch expects both type of authentication to be present in a cpu. This switch of key is done from guest enter/exit assembly as preperation for the upcoming in-kernel pointer authentication support. Hence, these key switching routines are not implemented in C code as they may cause pointer authentication key signing error in some situations. 
Signed-off-by: Mark Rutland [Only VHE, key switch in full assembly, vcpu_has_ptrauth checks , save host key in ptrauth exception trap] Signed-off-by: Amit Daniel Kachhap Reviewed-by: Julien Thierry Cc: Marc Zyngier Cc: Christoffer Dall Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/include/asm/kvm_host.h| 17 ++ arch/arm64/include/asm/kvm_ptrauth_asm.h | 100 +++ arch/arm64/kernel/asm-offsets.c | 6 ++ arch/arm64/kvm/guest.c | 14 + arch/arm64/kvm/handle_exit.c | 24 +--- arch/arm64/kvm/hyp/entry.S | 7 +++ arch/arm64/kvm/reset.c | 7 +++ arch/arm64/kvm/sys_regs.c| 46 +- virt/kvm/arm/arm.c | 2 + 9 files changed, 212 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/kvm_ptrauth_asm.h diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 9dd2918..61239a6 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -152,6 +152,18 @@ enum vcpu_sysreg { PMSWINC_EL0,/* Software Increment Register */ PMUSERENR_EL0, /* User Enable Register */ + /* Pointer Authentication Registers */ + APIAKEYLO_EL1, + APIAKEYHI_EL1, + APIBKEYLO_EL1, + APIBKEYHI_EL1, + APDAKEYLO_EL1, + APDAKEYHI_EL1, + APDBKEYLO_EL1, + APDBKEYHI_EL1, + APGAKEYLO_EL1, + APGAKEYHI_EL1, + /* 32bit specific registers. 
Keep them at the end of the range */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ @@ -497,6 +509,11 @@ static inline bool kvm_arch_requires_vhe(void) test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) && \ test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) +void kvm_arm_vcpu_ptrauth_enable(struct kvm_vcpu *vcpu); +void kvm_arm_vcpu_ptrauth_disable(struct kvm_vcpu *vcpu); +void kvm_arm_vcpu_ptrauth_setup_lazy(struct kvm_vcpu *vcpu); +void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu); + static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_ptrauth_asm.h b/arch/arm64/include/asm/kvm_ptrauth_asm.h new file mode 100644 index 000..97bb040 --- /dev/null +++ b/arch/arm64/include/asm/kvm_ptrauth_asm.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 + * arch/arm64/include/asm/kvm_ptrauth_asm.h: Guest/host ptrauth save/restore + * Copyright 2019 Arm Limited + * Author: Mark Rutland + * Amit Daniel Kachhap + */ + +#ifndef __ASM_KVM_ASM_PTRAUTH_H +#define __ASM_KVM_ASM_PTRAUTH_H + +#ifndef __ASSEMBLY__ + +#define __ptrauth_save_key(regs, key) \ +({ \ + regs[key ## KEYLO_E
[PATCH v7 9/10] KVM: arm64: docs: document KVM support of pointer authentication
This adds sections for KVM API extension for pointer authentication. A brief description about usage of pointer authentication for KVM guests is added in the arm64 documentations. Signed-off-by: Amit Daniel Kachhap Cc: Mark Rutland Cc: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu --- Documentation/arm64/pointer-authentication.txt | 15 +++ Documentation/virtual/kvm/api.txt | 6 ++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/Documentation/arm64/pointer-authentication.txt b/Documentation/arm64/pointer-authentication.txt index 5baca42..4b769e6 100644 --- a/Documentation/arm64/pointer-authentication.txt +++ b/Documentation/arm64/pointer-authentication.txt @@ -87,7 +87,14 @@ used to get and set the keys for a thread. Virtualization -- -Pointer authentication is not currently supported in KVM guests. KVM -will mask the feature bits from ID_AA64ISAR1_EL1, and attempted use of -the feature will result in an UNDEFINED exception being injected into -the guest. +Pointer authentication is enabled in KVM guest when each virtual cpu is +initialised by passing flags KVM_ARM_VCPU_PTRAUTH_[ADDRESS/GENERIC] and +requesting this feature to be enabled. Without this flag, pointer +authentication is not enabled in KVM guests and attempted use of the +feature will result in an UNDEFINED exception being injected into the +guest. + +Additionally, when these vcpu feature flags are not set then KVM will +filter out the Pointer Authentication system key registers from +KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID +register. diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 7de9eee..b5c66bc 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2659,6 +2659,12 @@ Possible features: Depends on KVM_CAP_ARM_PSCI_0_2. - KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU. Depends on KVM_CAP_ARM_PMU_V3. 
+ - KVM_ARM_VCPU_PTRAUTH_ADDRESS: + - KVM_ARM_VCPU_PTRAUTH_GENERIC: + Enables Pointer authentication for the CPU. + Depends on KVM_CAP_ARM_PTRAUTH and only on arm64 architecture. If + set, then the KVM guest allows the execution of pointer authentication + instructions. Otherwise, KVM treats these instructions as undefined. 4.83 KVM_ARM_PREFERRED_TARGET -- 2.7.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v7 8/10] KVM: arm64: Add capability to advertise ptrauth for guest
This patch advertises the capability of pointer authentication when system supports pointer authentication and VHE mode present. Signed-off-by: Amit Daniel Kachhap Cc: Mark Rutland Cc: Marc Zyngier Cc: Christoffer Dall Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/kvm/reset.c | 4 include/uapi/linux/kvm.h | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 00f0639..a3b269e 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -92,6 +92,10 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_VM_IPA_SIZE: r = kvm_ipa_limit; break; + case KVM_CAP_ARM_PTRAUTH: + r = has_vhe() && system_supports_address_auth() && + system_supports_generic_auth(); + break; default: r = 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6d4ea4b..a553477 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_VM_IPA_SIZE 165 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 #define KVM_CAP_HYPERV_CPUID 167 +#define KVM_CAP_ARM_PTRAUTH 168 #ifdef KVM_CAP_IRQ_ROUTING -- 2.7.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[kvmtool PATCH v7 10/10] KVM: arm/arm64: Add a vcpu feature for pointer authentication
This is a runtime capabality for KVM tool to enable Arm64 8.3 Pointer Authentication in guest kernel. A command line option --ptrauth is required for this. Signed-off-by: Amit Daniel Kachhap --- arm/aarch32/include/kvm/kvm-cpu-arch.h| 1 + arm/aarch64/include/asm/kvm.h | 2 ++ arm/aarch64/include/kvm/kvm-config-arch.h | 4 +++- arm/aarch64/include/kvm/kvm-cpu-arch.h| 2 ++ arm/include/arm-common/kvm-config-arch.h | 1 + arm/kvm-cpu.c | 6 ++ include/linux/kvm.h | 1 + 7 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arm/aarch32/include/kvm/kvm-cpu-arch.h b/arm/aarch32/include/kvm/kvm-cpu-arch.h index d28ea67..520ea76 100644 --- a/arm/aarch32/include/kvm/kvm-cpu-arch.h +++ b/arm/aarch32/include/kvm/kvm-cpu-arch.h @@ -13,4 +13,5 @@ #define ARM_CPU_ID 0, 0, 0 #define ARM_CPU_ID_MPIDR 5 +#define ARM_VCPU_PTRAUTH_FEATURE 0 #endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/arm/aarch64/include/asm/kvm.h b/arm/aarch64/include/asm/kvm.h index 97c3478..d4d0d8c 100644 --- a/arm/aarch64/include/asm/kvm.h +++ b/arm/aarch64/include/asm/kvm.h @@ -102,6 +102,8 @@ struct kvm_regs { #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ #define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 */ #define KVM_ARM_VCPU_PMU_V33 /* Support guest PMUv3 */ +#define KVM_ARM_VCPU_PTRAUTH_ADDRESS 4 /* CPU uses address pointer authentication */ +#define KVM_ARM_VCPU_PTRAUTH_GENERIC 5 /* CPU uses generic pointer authentication */ struct kvm_vcpu_init { __u32 target; diff --git a/arm/aarch64/include/kvm/kvm-config-arch.h b/arm/aarch64/include/kvm/kvm-config-arch.h index 04be43d..2074684 100644 --- a/arm/aarch64/include/kvm/kvm-config-arch.h +++ b/arm/aarch64/include/kvm/kvm-config-arch.h @@ -8,7 +8,9 @@ "Create PMUv3 device"), \ OPT_U64('\0', "kaslr-seed", &(cfg)->kaslr_seed, \ "Specify random seed for Kernel Address Space " \ - "Layout Randomization (KASLR)"), + "Layout Randomization (KASLR)"),\ + OPT_BOOLEAN('\0', "ptrauth", &(cfg)->has_ptrauth, \ + "Enable address authentication"), 
#include "arm-common/kvm-config-arch.h" diff --git a/arm/aarch64/include/kvm/kvm-cpu-arch.h b/arm/aarch64/include/kvm/kvm-cpu-arch.h index a9d8563..fcc2107 100644 --- a/arm/aarch64/include/kvm/kvm-cpu-arch.h +++ b/arm/aarch64/include/kvm/kvm-cpu-arch.h @@ -17,4 +17,6 @@ #define ARM_CPU_CTRL 3, 0, 1, 0 #define ARM_CPU_CTRL_SCTLR_EL1 0 +#define ARM_VCPU_PTRAUTH_FEATURE ((1UL << KVM_ARM_VCPU_PTRAUTH_ADDRESS) \ + | (1UL << KVM_ARM_VCPU_PTRAUTH_GENERIC)) #endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/arm/include/arm-common/kvm-config-arch.h b/arm/include/arm-common/kvm-config-arch.h index 5734c46..5badcbd 100644 --- a/arm/include/arm-common/kvm-config-arch.h +++ b/arm/include/arm-common/kvm-config-arch.h @@ -10,6 +10,7 @@ struct kvm_config_arch { boolaarch32_guest; boolhas_pmuv3; u64 kaslr_seed; + boolhas_ptrauth; enum irqchip_type irqchip; u64 fw_addr; }; diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c index 7780251..4ac80f8 100644 --- a/arm/kvm-cpu.c +++ b/arm/kvm-cpu.c @@ -68,6 +68,12 @@ struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id) vcpu_init.features[0] |= (1UL << KVM_ARM_VCPU_PSCI_0_2); } + /* Set KVM_ARM_VCPU_PTRAUTH if available */ + if (kvm__supports_extension(kvm, KVM_CAP_ARM_PTRAUTH)) { + if (kvm->cfg.arch.has_ptrauth) + vcpu_init.features[0] |= ARM_VCPU_PTRAUTH_FEATURE; + } + /* * If the preferred target ioctl is successful then * use preferred target else try each and every target type diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 6d4ea4b..a553477 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_VM_IPA_SIZE 165 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 #define KVM_CAP_HYPERV_CPUID 167 +#define KVM_CAP_ARM_PTRAUTH 168 #ifdef KVM_CAP_IRQ_ROUTING -- 2.7.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v7 2/10] KVM: arm64: Support runtime sysreg visibility filtering
From: Dave Martin Some optional features of the Arm architecture add new system registers that are not present in the base architecture. Where these features are optional for the guest, the visibility of these registers may need to depend on some runtime configuration, such as a flag passed to KVM_ARM_VCPU_INIT. For example, ZCR_EL1 and ID_AA64ZFR0_EL1 need to be hidden if SVE is not enabled for the guest, even though these registers may be present in the hardware and visible to the host at EL2. Adding special-case checks all over the place for individual registers is going to get messy as the number of conditionally- visible registers grows. In order to help solve this problem, this patch adds a new sysreg method restrictions() that can be used to hook in any needed runtime visibility checks. This method can currently return REG_NO_USER to inhibit enumeration and ioctl access to the register for userspace, and REG_NO_GUEST to inhibit runtime access by the guest using MSR/MRS. This allows a conditionally modified view of individual system registers such as the CPU ID registers, in addition to completely hiding register where appropriate. Signed-off-by: Dave Martin --- arch/arm64/kvm/sys_regs.c | 24 +--- arch/arm64/kvm/sys_regs.h | 13 + 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a5d14b5..75942f6 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1927,6 +1927,12 @@ static void perform_access(struct kvm_vcpu *vcpu, { trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); + /* Check for regs disabled by runtime config */ + if (restrictions(vcpu, r) & REG_NO_GUEST) { + kvm_inject_undefined(vcpu); + return; + } + /* * Not having an accessor means that we have configured a trap * that we don't know how to handle. 
This certainly qualifies @@ -2438,6 +2444,10 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (!r) return get_invariant_sys_reg(reg->id, uaddr); + /* Check for regs disabled by runtime config */ + if (restrictions(vcpu, r) & REG_NO_USER) + return -ENOENT; + if (r->get_user) return (r->get_user)(vcpu, r, reg, uaddr); @@ -2459,6 +2469,10 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (!r) return set_invariant_sys_reg(reg->id, uaddr); + /* Check for regs disabled by runtime config */ + if (restrictions(vcpu, r) & REG_NO_USER) + return -ENOENT; + if (r->set_user) return (r->set_user)(vcpu, r, reg, uaddr); @@ -2515,7 +2529,8 @@ static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) return true; } -static int walk_one_sys_reg(const struct sys_reg_desc *rd, +static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 __user **uind, unsigned int *total) { @@ -2526,6 +2541,9 @@ static int walk_one_sys_reg(const struct sys_reg_desc *rd, if (!(rd->reg || rd->get_user)) return 0; + if (restrictions(vcpu, rd) & REG_NO_USER) + return 0; + if (!copy_reg_to_user(rd, uind)) return -EFAULT; @@ -2554,9 +2572,9 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) int cmp = cmp_sys_reg(i1, i2); /* target-specific overrides generic entry. 
*/ if (cmp <= 0) - err = walk_one_sys_reg(i1, &uind, &total); + err = walk_one_sys_reg(vcpu, i1, &uind, &total); else - err = walk_one_sys_reg(i2, &uind, &total); + err = walk_one_sys_reg(vcpu, i2, &uind, &total); if (err) return err; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 3b1bc7f..b9dd4a9 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -64,8 +64,15 @@ struct sys_reg_desc { const struct kvm_one_reg *reg, void __user *uaddr); int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr); + + /* Return mask of REG_* runtime restriction flags */ + unsigned int (*restrictions)(const struct kvm_vcpu *vcpu, +const struct sys_reg_desc *rd); }; +#define REG_NO_USER(1 << 0) /* hidden from userspace ioctl interface */ +#define REG_NO_GUEST (1 << 1) /* hidden from guest */ + static inline void print_sys_reg_instr(const struct sys_reg_params *p) { /* Look, we even formatted it for yo
[PATCH v7 0/10] Add ARMv8.3 pointer authentication for kvm guest
Hi, This patch series adds pointer authentication support for KVM guest and is based on top of Linux 5.1-rc1. The basic patches in this series were originally posted by Mark Rutland earlier[1,2] and contain some history of this work. Extension Overview: = The ARMv8.3 pointer authentication extension adds functionality to detect modification of pointer values, mitigating certain classes of attack such as stack smashing, and making return oriented programming attacks harder. The extension introduces the concept of a pointer authentication code (PAC), which is stored in some upper bits of pointers. Each PAC is derived from the original pointer, another 64-bit value (e.g. the stack pointer), and a secret 128-bit key. New instructions are added which can be used to: * Insert a PAC into a pointer * Strip a PAC from a pointer * Authenticate and strip a PAC from a pointer The detailed description of ARMv8.3 pointer authentication support in userspace/kernel can be found in Kristina's generic pointer authentication patch series[3]. KVM guest work: == If pointer authentication is enabled for KVM guests then the new PAC instructions will not trap to EL2. If not then they may be ignored if in HINT region or trapped in EL2 as illegal instruction. Since each KVM guest vcpu runs as a thread, it has a key initialized which will be used by PAC. When world switch happens between host and guest then this key is exchanged. The current v7 patch series contains review comments and suggestions by James Morse, Mark Rutland and Dave Martin. The first two patches are cherry-picked from Dave Martin [9] v5 series which adds SVE support for KVM guest. Changes since v6 [8]: Major changes are listed below. * Pointer authentication key switch entirely in assembly now. * isb instruction added after Key switched to host. * Use __hyp_this_cpu_ptr for both VHE and nVHE mode. * 2 separate flags for address and generic authentication. * kvm_arm_vcpu_ptrauth_allowed renamed to has_vcpu_ptrauth. 
* kvm_arm_vcpu_ptrauth_reset renamed to kvm_arm_vcpu_ptrauth_setup_lazy. * Save of host Key registers now done in ptrauth instruction trap. * A fix to add kern_hyp_va to get correct host_ctxt pointer in nVHE mode. * Patches re-structured to better reflect ABI change. Changes since v5 [7]: Major changes are listed below. * Split hcr_el2 and mdcr_el2 save/restore in two patches. * Reverted back save/restore of sys-reg keys as done in V4 [5]. There was a suggestion by James Morse to use ptrauth utilities in a single place in arm core and use them from kvm. However this change deviates from the existing sys-reg implementations and is not scalable. * Invoked the key switch C functions from __guest_enter/__guest_exit assembly. * Host key save is now done inside vcpu_load. * Reverted back masking of cpufeature ID registers for ptrauth when disabled from userspace. * Reset of ptrauth key registers not done conditionally. * Code and Documentation cleanup. Changes since v4 [6]: Several suggestions from James Morse * Move host registers to be saved/restored inside struct kvm_cpu_context. * Similar to hcr_el2, save/restore mdcr_el2 register also. * Added save routines for ptrauth keys in generic arm core and use them during KVM context switch. * Defined a GCC attribute __no_ptrauth which discards generating ptrauth instructions in a function. This is taken from Kristina's earlier kernel pointer authentication support patches [4]. * Dropped a patch to mask cpufeature when not enabled from userspace and now only key registers are masked from register list. Changes since v3 [5]: * Use pointer authentication only when VHE is present as ARM8.3 implies ARM8.1 features to be present. * Added lazy context handling of ptrauth instructions from V2 version again. * Added more details in Documentation. Changes since v2 [1,2]: * Allow host and guest to have different HCR_EL2 settings and not just constant value HCR_HOST_VHE_FLAGS or HCR_HOST_NVHE_FLAGS. 
* Optimise the reading of HCR_EL2 in host/guest switch by fetching it once during KVM initialisation state and using it later. * Context switch pointer authentication keys when switching between guest and host. Pointer authentication was enabled in a lazy context earlier[2] and is removed now to make it simple. However it can be revisited later if there is significant performance issue. * Added a userspace option to choose pointer authentication. * Based on the userspace option, ptrauth cpufeature will be visible. * Based on the userspace option, ptrauth key registers will be accessible. * A small document is added on how to enable pointer authentication from userspace KVM API. Looking for feedback and comments. Thanks, Amit [1]: https://lore.kernel.org/lkml/20171127163806.31435-11-mark.rutl...@arm.com/ [2]: https://lore.kernel.org/lkml/20171127163806.31435-10-mark.rutl...@arm.com/ [3]: htt
[PATCH v7 1/10] KVM: arm64: Propagate vcpu into read_id_reg()
From: Dave Martin Architecture features that are conditionally visible to the guest will require run-time checks in the ID register accessor functions. In particular, read_id_reg() will need to perform checks in order to generate the correct emulated value for certain ID register fields such as ID_AA64PFR0_EL1.SVE for example. This patch propagates vcpu into read_id_reg() so that future patches can add run-time checks on the guest configuration here. For now, there is no functional change. Signed-off-by: Dave Martin Reviewed-by: Alex Bennee --- arch/arm64/kvm/sys_regs.c | 23 +-- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 539feec..a5d14b5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1044,7 +1044,8 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, } /* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(struct sys_reg_desc const *r, bool raz) +static u64 read_id_reg(const struct kvm_vcpu *vcpu, + struct sys_reg_desc const *r, bool raz) { u32 id = sys_reg((u32)r->Op0, (u32)r->Op1, (u32)r->CRn, (u32)r->CRm, (u32)r->Op2); @@ -1078,7 +1079,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_id_reg(r, raz); + p->regval = read_id_reg(vcpu, r, raz); return true; } @@ -1107,16 +1108,18 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); * are stored, and for set_id_reg() we don't allow the effective value * to be changed. 
*/ -static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, +static int __get_id_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, void __user *uaddr, bool raz) { const u64 id = sys_reg_to_index(rd); - const u64 val = read_id_reg(rd, raz); + const u64 val = read_id_reg(vcpu, rd, raz); return reg_to_user(uaddr, &val, id); } -static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, +static int __set_id_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, void __user *uaddr, bool raz) { const u64 id = sys_reg_to_index(rd); @@ -1128,7 +1131,7 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, return err; /* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(rd, raz)) + if (val != read_id_reg(vcpu, rd, raz)) return -EINVAL; return 0; @@ -1137,25 +1140,25 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(rd, uaddr, false); + return __get_id_reg(vcpu, rd, uaddr, false); } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(rd, uaddr, false); + return __set_id_reg(vcpu, rd, uaddr, false); } static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(rd, uaddr, true); + return __get_id_reg(vcpu, rd, uaddr, true); } static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(rd, uaddr, true); + return __set_id_reg(vcpu, rd, uaddr, true); } static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, -- 2.7.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu 
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v7 3/10] KVM: arm64: Move hyp_symbol_addr to fix dependency
Currently hyp_symbol_addr is placed in kvm_mmu.h which is mostly used by __hyp_this_cpu_ptr in kvm_asm.h but it cannot include kvm_mmu.h directly as kvm_mmu.h uses kvm_ksym_ref which is defined inside kvm_asm.h. Hence, hyp_symbol_addr is moved inside kvm_asm.h to fix this mutual dependency. Also, kvm_ksym_ref is the corresponding counterpart of hyp_symbol_addr, so it should ideally be placed inside kvm_asm.h. Suggested-by: James Morse Signed-off-by: Amit Daniel Kachhap Cc: Marc Zyngier Cc: Christoffer Dall Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/include/asm/kvm_asm.h | 20 arch/arm64/include/asm/kvm_mmu.h | 20 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index f5b79e9..57a07e8 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -80,6 +80,26 @@ extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); +/* + * Obtain the PC-relative address of a kernel symbol + * s: symbol + * + * The goal of this macro is to return a symbol's address based on a + * PC-relative computation, as opposed to a loading the VA from a + * constant pool or something similar. This works well for HYP, as an + * absolute VA is guaranteed to be wrong. Only use this if trying to + * obtain the address of a symbol (i.e. not something you obtained by + * following a pointer). 
+ */ +#define hyp_symbol_addr(s) \ + ({ \ + typeof(s) *addr;\ + asm("adrp %0, %1\n" \ + "add%0, %0, :lo12:%1\n" \ + : "=r" (addr) : "S" (&s)); \ + addr; \ + }) + /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */ #define __hyp_this_cpu_ptr(sym) \ ({ \ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b0742a1..3dea6af 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -118,26 +118,6 @@ static inline unsigned long __kern_hyp_va(unsigned long v) #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v /* - * Obtain the PC-relative address of a kernel symbol - * s: symbol - * - * The goal of this macro is to return a symbol's address based on a - * PC-relative computation, as opposed to a loading the VA from a - * constant pool or something similar. This works well for HYP, as an - * absolute VA is guaranteed to be wrong. Only use this if trying to - * obtain the address of a symbol (i.e. not something you obtained by - * following a pointer). - */ -#define hyp_symbol_addr(s) \ - ({ \ - typeof(s) *addr;\ - asm("adrp %0, %1\n" \ - "add%0, %0, :lo12:%1\n" \ - : "=r" (addr) : "S" (&s)); \ - addr; \ - }) - -/* * We currently support using a VM-specified IPA size. For backward * compatibility, the default IPA size is fixed to 40bits. */ -- 2.7.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v7 6/10] KVM: arm64: Add vcpu feature flags to control ptrauth accessibility
Since Pointer authentication will be enabled or disabled on a per-vcpu basis, vcpu feature flags are added in order to know which vcpus have it enabled from userspace. This features will allow the KVM guest to allow the handling of pointer authentication instructions or to treat them as undefined if not set. The helper macro added checks the feature flag along with other conditions such as VHE mode present and system supports pointer address/generic authentication. Signed-off-by: Amit Daniel Kachhap Cc: Mark Rutland Cc: Marc Zyngier Cc: Christoffer Dall Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/include/asm/kvm_host.h | 8 +++- arch/arm64/include/uapi/asm/kvm.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e3ccd7b..9dd2918 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -45,7 +45,7 @@ #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS -#define KVM_VCPU_MAX_FEATURES 4 +#define KVM_VCPU_MAX_FEATURES 6 #define KVM_REQ_SLEEP \ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) @@ -491,6 +491,12 @@ static inline bool kvm_arch_requires_vhe(void) return false; } +#define vcpu_has_ptrauth(vcpu) (has_vhe() && \ + system_supports_address_auth() && \ + system_supports_generic_auth() && \ + test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) && \ + test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) + static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 97c3478..8806f71 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -102,6 +102,8 @@ struct kvm_regs { #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ #define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 
*/ #define KVM_ARM_VCPU_PMU_V33 /* Support guest PMUv3 */ +#define KVM_ARM_VCPU_PTRAUTH_ADDRESS 4 /* VCPU uses address authentication */ +#define KVM_ARM_VCPU_PTRAUTH_GENERIC 5 /* VCPU uses generic authentication */ struct kvm_vcpu_init { __u32 target; -- 2.7.4 ___ kvmarm mailing list kvmarm@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
[PATCH v7 5/10] KVM: arm/arm64: preserve host MDCR_EL2 value
Save host MDCR_EL2 value during kvm HYP initialisation and restore after every switch from host to guest. There should not be any change in functionality due to this. The value of mdcr_el2 is now stored in struct kvm_cpu_context as both host and guest can now use this field in a common way. Signed-off-by: Amit Daniel Kachhap Acked-by: Mark Rutland Cc: Marc Zyngier Cc: Mark Rutland Cc: Christoffer Dall Cc: kvmarm@lists.cs.columbia.edu --- arch/arm/include/asm/kvm_host.h | 1 - arch/arm64/include/asm/kvm_host.h | 6 ++ arch/arm64/include/asm/kvm_hyp.h | 2 +- arch/arm64/kvm/debug.c| 28 ++-- arch/arm64/kvm/hyp/switch.c | 18 +- arch/arm64/kvm/hyp/sysreg-sr.c| 8 +++- virt/kvm/arm/arm.c| 1 - 7 files changed, 21 insertions(+), 43 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 6d0aac4..a928565 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -343,7 +343,6 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} -static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3b09fd0..e3ccd7b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -211,6 +211,8 @@ struct kvm_cpu_context { /* HYP host/guest configuration */ u64 hcr_el2; + u32 mdcr_el2; + struct kvm_vcpu *__hyp_running_vcpu; }; @@ -226,9 +228,6 @@ struct vcpu_reset_state { struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; - /* HYP configuration */ - u32 mdcr_el2; - /* Exception Information */ struct kvm_vcpu_fault_info fault; @@ -498,7 +497,6 @@ static inline void 
kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} -void kvm_arm_init_debug(void); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 4da765f..7fcde8a 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -152,7 +152,7 @@ void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); bool __fpsimd_enabled(void); void activate_traps_vhe_load(struct kvm_vcpu *vcpu); -void deactivate_traps_vhe_put(void); +void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu); u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); void __noreturn __hyp_do_panic(unsigned long, ...); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index fd917d6..99dc0a4 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -32,8 +32,6 @@ DBG_MDSCR_KDE | \ DBG_MDSCR_MDE) -static DEFINE_PER_CPU(u32, mdcr_el2); - /** * save/restore_guest_debug_regs * @@ -65,21 +63,6 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) } /** - * kvm_arm_init_debug - grab what we need for debug - * - * Currently the sole task of this function is to retrieve the initial - * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has - * presumably been set-up by some knowledgeable bootcode. - * - * It is called once per-cpu during CPU hyp initialisation. 
- */ - -void kvm_arm_init_debug(void) -{ - __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2)); -} - -/** * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state */ @@ -111,6 +94,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) { + kvm_cpu_context_t *host_cxt = this_cpu_ptr(&kvm_host_cpu_state); bool trap_debug = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY); unsigned long mdscr; @@ -120,8 +104,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) * This also clears MDCR_EL2_E2PB_MASK to disable guest access * to the profiling buffer. */ - vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; - vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | + vcpu->arch.ctxt.mdcr_el2 = host_cxt->mdcr_el2 & MDCR_EL2_HPMN_MASK; + vcpu->arch.ctxt.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMS | MDCR_EL2_TPMCR | MDCR_EL2_
[PATCH v7 4/10] KVM: arm/arm64: preserve host HCR_EL2 value
From: Mark Rutland When restoring HCR_EL2 for the host, KVM uses HCR_HOST_VHE_FLAGS, which is a constant value. This works today, as the host HCR_EL2 value is always the same, but this will get in the way of supporting extensions that require HCR_EL2 bits to be set conditionally for the host. To allow such features to work without KVM having to explicitly handle every possible host feature combination, this patch has KVM save/restore for the host HCR when switching to/from a guest HCR. The register is saved once during CPU hypervisor initialisation, and its value is restored after each switch back from the guest. For fetching HCR_EL2 during kvm initialisation, a hyp call is made using kvm_call_hyp, which is helpful in the non-VHE case. For the hyp TLB maintenance code, __tlb_switch_to_host_vhe() is updated to toggle the TGE bit with a RMW sequence, as we already do in __tlb_switch_to_guest_vhe(). The value of hcr_el2 is now stored in struct kvm_cpu_context as both host and guest can now use this field in a common way. 
Signed-off-by: Mark Rutland [Added cpu_init_host_ctxt, hcr_el2 field in struct kvm_cpu_context, save hcr_el2 in hyp init stage] Signed-off-by: Amit Daniel Kachhap Reviewed-by: James Morse Cc: Marc Zyngier Cc: Christoffer Dall Cc: kvmarm@lists.cs.columbia.edu --- arch/arm/include/asm/kvm_host.h | 2 ++ arch/arm64/include/asm/kvm_asm.h | 2 ++ arch/arm64/include/asm/kvm_emulate.h | 24 arch/arm64/include/asm/kvm_host.h| 15 ++- arch/arm64/kvm/guest.c | 2 +- arch/arm64/kvm/hyp/switch.c | 22 +- arch/arm64/kvm/hyp/sysreg-sr.c | 14 ++ arch/arm64/kvm/hyp/tlb.c | 6 +- virt/kvm/arm/arm.c | 1 + 9 files changed, 64 insertions(+), 24 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 770d732..6d0aac4 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -322,6 +322,8 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } +static inline void cpu_init_host_ctxt(void) {} + static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 57a07e8..a68205c 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -80,6 +80,8 @@ extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); +extern void __kvm_populate_host_regs(void); + /* * Obtain the PC-relative address of a kernel symbol * s: symbol diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index d384279..426815d 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -50,25 +50,25 @@ void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr); static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { - return !(vcpu->arch.hcr_el2 & HCR_RW); + return !(vcpu->arch.ctxt.hcr_el2 & HCR_RW); } static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { - vcpu->arch.hcr_el2 = 
HCR_GUEST_FLAGS; + vcpu->arch.ctxt.hcr_el2 = HCR_GUEST_FLAGS; if (is_kernel_in_hyp_mode()) - vcpu->arch.hcr_el2 |= HCR_E2H; + vcpu->arch.ctxt.hcr_el2 |= HCR_E2H; if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) { /* route synchronous external abort exceptions to EL2 */ - vcpu->arch.hcr_el2 |= HCR_TEA; + vcpu->arch.ctxt.hcr_el2 |= HCR_TEA; /* trap error record accesses */ - vcpu->arch.hcr_el2 |= HCR_TERR; + vcpu->arch.ctxt.hcr_el2 |= HCR_TERR; } if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) - vcpu->arch.hcr_el2 |= HCR_FWB; + vcpu->arch.ctxt.hcr_el2 |= HCR_FWB; if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) - vcpu->arch.hcr_el2 &= ~HCR_RW; + vcpu->arch.ctxt.hcr_el2 &= ~HCR_RW; /* * TID3: trap feature register accesses that we virtualise. @@ -76,26 +76,26 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) * are currently virtualised. */ if (!vcpu_el1_is_32bit(vcpu)) - vcpu->arch.hcr_el2 |= HCR_TID3; + vcpu->arch.ctxt.hcr_el2 |= HCR_TID3; if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) || vcpu_el1_is_32bit(vcpu)) - vcpu->arch.hcr_el2 |= HCR_TID2; + vcpu->arch.ctxt.hcr_el2 |= HCR_TID2; } static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) { - return (unsigned long *)&vcpu->arch.hcr_el2; + return (unsigned long *)&vcpu->arch.ctxt.hcr_el2; } static inline void vcpu_clear_wfe_traps(struct kvm_vcpu *vcpu) { - vcpu->arch.hcr_el2 &= ~