[PATCH kvmtool v2 1/3] vfio-pci: Release INTx's unmask eventfd properly

2019-03-19 Thread Leo Yan
The PCI device INTx path uses the eventfd 'unmask_fd' to signal deassertion
of the interrupt line from guest to host, but this eventfd isn't released
properly when INTx is disabled.

This patch adds an 'unmask_fd' field to struct vfio_pci_device to store the
unmask eventfd, and closes it when INTx is disabled.

Signed-off-by: Leo Yan 
---
 include/kvm/vfio.h | 1 +
 vfio/pci.c | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
index 60e6c54..28223cf 100644
--- a/include/kvm/vfio.h
+++ b/include/kvm/vfio.h
@@ -74,6 +74,7 @@ struct vfio_pci_device {
 
unsigned long   irq_modes;
int intx_fd;
+   int unmask_fd;
unsigned intintx_gsi;
struct vfio_pci_msi_common  msi;
struct vfio_pci_msi_common  msix;
diff --git a/vfio/pci.c b/vfio/pci.c
index 03de3c1..5224fee 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -1008,6 +1008,7 @@ static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
irq__del_irqfd(kvm, gsi, pdev->intx_fd);
 
close(pdev->intx_fd);
+   close(pdev->unmask_fd);
 }
 
 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
@@ -1095,6 +1096,7 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
}
 
pdev->intx_fd = trigger_fd;
+   pdev->unmask_fd = unmask_fd;
/* Guest is going to ovewrite our irq_line... */
pdev->intx_gsi = gsi;
 
-- 
2.19.1



[PATCH kvmtool v2 3/3] vfio-pci: Re-enable INTx mode when disable MSI/MSIX

2019-03-19 Thread Leo Yan
Since PCI forbids enabling INTx, MSI or MSIX at the same time, INTx mode is
disabled by default when MSI/MSIX mode is enabled; but this logic breaks if
the guest PCI driver detects that MSI/MSIX cannot work as expected and tries
to roll back to INTx mode.  INTx mode has already been disabled and there is
no chance to enable it again, so in this case neither INTx nor MSI/MSIX ends
up enabled in vfio.

Below shows the detailed flow for introducing this issue:

  vfio_pci_configure_dev_irqs()
`-> vfio_pci_enable_intx()

  vfio_pci_enable_msis()
`-> vfio_pci_disable_intx()

  vfio_pci_disable_msis()   => Guest PCI driver disables MSI

To fix this issue, when MSI/MSIX is disabled we need to check whether INTx
mode is available for the device; if the device supports INTx, re-enable it
so the device can fall back to it.

Two minor changes in this patch are worth noting:

- vfio_pci_disable_intx() may be called multiple times (each time the
  guest enables one MSI vector).  This patch uses 'intx_fd == -1' to
  denote that INTx is disabled; vfio_pci_disable_intx() and
  vfio_pci_enable_intx() bail out early when they detect that INTx has
  already been disabled or enabled respectively (see the sketch below).

- pci_device_header is overwritten during PCI configuration, so all
  IRQ-related info is lost.  Before re-enabling INTx mode, this patch
  therefore restores the 'irq_pin' and 'irq_line' fields in struct
  pci_device_header.
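
A minimal sketch of the 'intx_fd == -1' convention described above
(illustration only, assuming kvmtool's struct vfio_pci_device from patch 1;
the real hunks follow below, and the final hunk of this patch is truncated
in this archive):

    static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
    {
            struct vfio_pci_device *pdev = &vdev->pci;

            /* Already disabled (or never enabled): nothing to do */
            if (pdev->intx_fd == -1)
                    return;

            /* ... unregister the irqfd and close the trigger/unmask eventfds ... */

            pdev->intx_fd = -1;
    }

    static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
    {
            struct vfio_pci_device *pdev = &vdev->pci;

            /* Already enabled: nothing to do */
            if (pdev->intx_fd != -1)
                    return 0;

            /* ... create the eventfds, register the irqfd, set pdev->intx_fd ... */
            return 0;
    }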

Signed-off-by: Leo Yan 
---
 vfio/pci.c | 59 --
 1 file changed, 48 insertions(+), 11 deletions(-)

diff --git a/vfio/pci.c b/vfio/pci.c
index d025581..ba971eb 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -28,6 +28,7 @@ struct vfio_irq_eventfd {
msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
 
 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
+static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
 
 static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
bool msix)
@@ -50,17 +51,14 @@ static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
if (!msi_is_enabled(msis->virt_state))
return 0;
 
-   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
-   /*
-* PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
-* time. Since INTx has to be enabled from the start (we don't
-* have a reliable way to know when the user starts using it),
-* disable it now.
-*/
+   /*
+* PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
+* time. Since INTx has to be enabled from the start (after enabling
+* 'pdev->intx_fd' will be assigned to an eventfd and doesn't equal
+* to the init value -1), disable it now.
+*/
+   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
vfio_pci_disable_intx(kvm, vdev);
-   /* Permanently disable INTx */
-   pdev->irq_modes &= ~VFIO_PCI_IRQ_MODE_INTX;
-   }
 
eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
 
@@ -162,7 +160,34 @@ static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
msi_set_enabled(msis->phys_state, false);
msi_set_empty(msis->phys_state, true);
 
-   return 0;
+   /*
+* When MSI or MSIX is disabled, this might be called when
+* PCI driver detects the MSI interrupt failure and wants to
+* rollback to INTx mode.  Thus enable INTx if the device
+* supports INTx mode in this case.
+*/
+   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
+   /*
+* Struct pci_device_header is not only used for header,
+* it also is used for PCI configuration; and in the function
+* vfio_pci_cfg_write() it firstly writes configuration space
+* and then read back the configuration space data into the
+* header structure; thus 'irq_pin' and 'irq_line' in the
+* header will be overwritten.
+*
+* If want to enable INTx mode properly, firstly needs to
+* restore 'irq_pin' and 'irq_line' values; we can simply set 1
+* to 'irq_pin', and 'pdev->intx_gsi' keeps gsi value when
+* enable INTx mode previously so we can simply use it to
+* recover irq line number by adding offset KVM_IRQ_OFFSET.
+*/
+   pdev->hdr.irq_pin = 1;
+   pdev->hdr.irq_line = pdev->intx_gsi + KVM_IRQ_OFFSET;
+
+   ret = vfio_pci_enable_intx(kvm, vdev);
+   }
+
+   return ret >= 0 ? 0 : ret;
 }
 
 static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
@@ -1002,6 +1027,10 @@ static void vfio_pci_disable_intx(st

[PATCH kvmtool v2 0/3] vfio-pci: Support INTx mode re-enabling

2019-03-19 Thread Leo Yan
When enabling vfio-pci mode for the NIC driver on a Juno board, the IRQ
failed to be forwarded properly from host to guest; this was finally
root-caused to kvmtool being unable to re-enable INTx mode properly.

The basic flow to reproduce this issue is as follows:

  Host                          Guest
  ----                          -----
  INTx mode
                                MSI enable failed in NIC driver
                                MSI disable in NIC driver
  Switch back to INTx mode -->  kvmtool doesn't support

This patch set adds support for INTx mode re-enabling; patches 0001/0002 are
only minor fixups for releasing the eventfds and removing the useless FD
reservation for INTx.  Patch 0003 is the core patch for supporting INTx mode
re-enabling: when kvmtool detects that MSI has been disabled, it rolls back
to INTx mode.

This patch set has been tested on Juno-r2 board.

Leo Yan (3):
  vfio-pci: Release INTx's unmask eventfd properly
  vfio-pci: Remove useless FDs reservation in vfio_pci_enable_intx()
  vfio-pci: Re-enable INTx mode when disable MSI/MSIX

 include/kvm/vfio.h |  1 +
 vfio/pci.c | 61 +-
 2 files changed, 50 insertions(+), 12 deletions(-)

-- 
2.19.1



[PATCH kvmtool v2 2/3] vfio-pci: Remove useless FDs reservation in vfio_pci_enable_intx()

2019-03-19 Thread Leo Yan
Since INTx only uses 2 FDs, it's not particularly useful to reserve FDs in
vfio_pci_enable_intx(); this patch removes the FD reservation from that
function.
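
For context, the reservation helper only ensures that enough file
descriptors can be opened; as noted later in this thread, it exists to bump
the fd rlimit for devices that may need ~2048 eventfds for MSI-X.  A rough
idea of such a helper (a sketch only, not kvmtool's actual
vfio_pci_reserve_irq_fds() implementation):

    #include <sys/resource.h>

    /* Sketch: make sure the process may have at least 'num' open fds in total. */
    static int reserve_fds(rlim_t num)
    {
            struct rlimit rlim;

            if (getrlimit(RLIMIT_NOFILE, &rlim))
                    return -1;

            if (rlim.rlim_cur < num) {
                    /* Raise the soft limit, capped by the hard limit. */
                    rlim.rlim_cur = num < rlim.rlim_max ? num : rlim.rlim_max;
                    return setrlimit(RLIMIT_NOFILE, &rlim);
            }

            return 0;
    }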

Signed-off-by: Leo Yan 
---
 vfio/pci.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vfio/pci.c b/vfio/pci.c
index 5224fee..d025581 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -1025,8 +1025,6 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct 
vfio_device *vdev)
.index = VFIO_PCI_INTX_IRQ_INDEX,
};
 
-   vfio_pci_reserve_irq_fds(2);
-
ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
if (ret || irq_info.count == 0) {
vfio_dev_err(vdev, "no INTx reported by VFIO");
-- 
2.19.1



Re: [PATCH kvmtool v1 2/2] vfio-pci: Fallback to INTx mode when disable MSI/MSIX

2019-03-19 Thread Leo Yan
Hi Jean-Philippe,

On Fri, Mar 15, 2019 at 02:20:07PM +, Jean-Philippe Brucker wrote:
> On 15/03/2019 08:33, Leo Yan wrote:
> > Since PCI forbids enabling INTx, MSI or MSIX at the same time, it's by
> > default to disable INTx mode when enable MSI/MSIX mode; but this logic is
> > easily broken if the guest PCI driver detects the MSI/MSIX cannot work as
> > expected and tries to rollback to use INTx mode.  The INTx mode has been
> > disabled and it has no chance to be enabled again, thus both INTx mode
> > and MSI/MSIX mode will not be enabled in vfio for this case.
> > 
> > Below shows the detailed flow for introducing this issue:
> > 
> >   vfio_pci_configure_dev_irqs()
> > `-> vfio_pci_enable_intx()
> > 
> >   vfio_pci_enable_msis()
> > `-> vfio_pci_disable_intx()
> > 
> >   vfio_pci_disable_msis()   => Guest PCI driver disables MSI
> > 
> > To fix this issue, when disable MSI/MSIX we need to check if INTx mode
> > is available for this device or not; if the device can support INTx then
> > we need to re-enable it so the device can fallback to use it.
> > 
> > Signed-off-by: Leo Yan 
> > ---
> >  vfio/pci.c | 17 -
> >  1 file changed, 12 insertions(+), 5 deletions(-)
> > 
> > diff --git a/vfio/pci.c b/vfio/pci.c
> > index c0683f6..44727bb 100644
> > --- a/vfio/pci.c
> > +++ b/vfio/pci.c
> > @@ -28,6 +28,7 @@ struct vfio_irq_eventfd {
> > msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
> >  
> >  static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device 
> > *vdev);
> > +static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
> >  
> >  static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
> > bool msix)
> > @@ -50,7 +51,7 @@ static int vfio_pci_enable_msis(struct kvm *kvm, struct 
> > vfio_device *vdev,
> > if (!msi_is_enabled(msis->virt_state))
> > return 0;
> >  
> > -   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
> > +   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
> > /*
> >  * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
> >  * time. Since INTx has to be enabled from the start (we don't
> > @@ -58,9 +59,6 @@ static int vfio_pci_enable_msis(struct kvm *kvm, struct 
> > vfio_device *vdev,
> >  * disable it now.
> >  */
> > vfio_pci_disable_intx(kvm, vdev);
> > -   /* Permanently disable INTx */
> > -   pdev->irq_modes &= ~VFIO_PCI_IRQ_MODE_INTX;
> 
> As a result vfio_pci_disable_intx() may be called multiple times (each
> time the guest enables one MSI vector). Could you make
> vfio_pci_disable_intx() safe against that (maybe use intx_fd == -1 to
> denote the INTx state)?

Will do this.

> > -   }
> >  
> > eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
> >  
> > @@ -162,7 +160,16 @@ static int vfio_pci_disable_msis(struct kvm *kvm, 
> > struct vfio_device *vdev,
> > msi_set_enabled(msis->phys_state, false);
> > msi_set_empty(msis->phys_state, true);
> >  
> > -   return 0;
> > +   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
> > +   /*
> > +* When MSI or MSIX is disabled, this might be called when
> > +* PCI driver detects the MSI interrupt failure and wants to
> > +* rollback to INTx mode.  Thus enable INTx if the device
> > +* supports INTx mode in this case.
> > +*/
> > +   ret = vfio_pci_enable_intx(kvm, vdev);
> 
> Let's remove vfio_pci_reserve_irq_fds(2) from vfio_pci_enable_intx(); it
> should only be called once per run, and isn't particularly useful here
> since INTx only uses 2 fds. It's used to bump the fd rlimit when a
> device needs ~2048 file descriptors for MSI-X.

Understand; will do it.

Thanks a lot for very detailed suggestions.

> > +
> > +   return ret >= 0 ? 0 : ret;
> >  }
> >  
> >  static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device 
> > *vdev,
> > 
> 


[kvmarm:5.1-fixes 6/6] arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1154:35: error: 'S2_PUD_MASK' undeclared; did you mean 'S2_PMD_MASK'?

2019-03-19 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git 5.1-fixes
head:   4c6b77dd0036d42268cc1e6c5ab5a9e9d98cae65
commit: 4c6b77dd0036d42268cc1e6c5ab5a9e9d98cae65 [6/6] KVM: arm/arm64: Fix handling of stage2 huge mappings
config: arm-axm55xx_defconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        git checkout 4c6b77dd0036d42268cc1e6c5ab5a9e9d98cae65
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=arm

All errors (new ones prefixed by >>):

   arch/arm/kvm/../../../virt/kvm/arm/mmu.c: In function 'stage2_set_pud_huge':
>> arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1154:35: error: 'S2_PUD_MASK' undeclared (first use in this function); did you mean 'S2_PMD_MASK'?
      unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
                                     ^~~~~~~~~~~
                                     S2_PMD_MASK
   arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1154:35: note: each undeclared identifier is reported only once for each function it appears in
>> arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1154:48: error: 'S2_PUD_SIZE' undeclared (first use in this function); did you mean 'S2_PMD_SIZE'?
      unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
                                                  ^~~~~~~~~~~
                                                  S2_PMD_SIZE
   In file included from arch/arm/include/asm/bug.h:60:0,
                    from include/linux/bug.h:5,
                    from include/linux/mmdebug.h:5,
                    from include/linux/mm.h:9,
                    from include/linux/mman.h:5,
                    from arch/arm/kvm/../../../virt/kvm/arm/mmu.c:19:
>> arch/arm/kvm/../../../virt/kvm/arm/mmu.c:1157:17: error: implicit declaration of function 'pud_pfn'; did you mean 'pmd_pfn'? [-Werror=implicit-function-declaration]
      WARN_ON_ONCE(pud_pfn(old_pud) != pud_pfn(*new_pudp));
                   ^~~~~~~
   include/asm-generic/bug.h:148:27: note: in definition of macro 'WARN_ON_ONCE'
     int __ret_warn_once = !!(condition);   \
                              ^~~~~~~~~
   cc1: some warnings being treated as errors

vim +1154 arch/arm/kvm/../../../virt/kvm/arm/mmu.c

  1128
  1129  static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
  1130                                 phys_addr_t addr, const pud_t *new_pudp)
  1131  {
  1132          pud_t *pudp, old_pud;
  1133
  1134  retry:
  1135          pudp = stage2_get_pud(kvm, cache, addr);
  1136          VM_BUG_ON(!pudp);
  1137
  1138          old_pud = *pudp;
  1139
  1140          /*
  1141           * A large number of vcpus faulting on the same stage 2 entry,
  1142           * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
  1143           * Skip updating the page tables if there is no change.
  1144           */
  1145          if (pud_val(old_pud) == pud_val(*new_pudp))
  1146                  return 0;
  1147
  1148          if (stage2_pud_present(kvm, old_pud)) {
  1149                  /*
  1150                   * If we already have table level mapping for this block, unmap
  1151                   * the range for this block and retry.
  1152                   */
  1153                  if (!stage2_pud_huge(kvm, old_pud)) {
> 1154                          unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
  1155                          goto retry;
  1156                  } else {
> 1157                          WARN_ON_ONCE(pud_pfn(old_pud) != pud_pfn(*new_pudp));
  1158                          stage2_pud_clear(kvm, pudp);
  1159                          kvm_tlb_flush_vmid_ipa(kvm, addr);
  1160                  }
  1161          } else {
  1162                  get_page(virt_to_page(pudp));
  1163          }
  1164
  1165          kvm_set_pud(pudp, *new_pudp);
  1166          return 0;
  1167  }
  1168

---
0-DAY kernel test infrastructure            Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation




Re: [PATCH v5 00/26] KVM: arm64: SVE guest support

2019-03-19 Thread Dave Martin
On Tue, Mar 19, 2019 at 05:51:51PM +, Dave Martin wrote:

[...]

Yes, I messed up the subject line in the cover letter :P

The prefix should read "[PATCH v6 00/27]"

Cheers
---Dave


[PATCH v6 27/27] KVM: arm64/sve: Document KVM API extensions for SVE

2019-03-19 Thread Dave Martin
This patch adds sections to the KVM API documentation describing
the extensions for supporting the Scalable Vector Extension (SVE)
in guests.

Signed-off-by: Dave Martin 

---

Changes since v5:

 * Document KVM_ARM_VCPU_FINALIZE and its interactions with SVE.
---
 Documentation/virtual/kvm/api.txt | 132 +-
 1 file changed, 129 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index cd920dd..68509de 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1873,6 +1873,7 @@ Parameters: struct kvm_one_reg (in)
 Returns: 0 on success, negative value on failure
 Errors:
   ENOENT:   no such register
+  EPERM:    register access forbidden for architecture-dependent reasons
   EINVAL:   other errors, such as bad size encoding for a known register
 
 struct kvm_one_reg {
@@ -2127,13 +2128,20 @@ Specifically:
   0x6030  0010 004c SPSR_UND64  spsr[KVM_SPSR_UND]
   0x6030  0010 004e SPSR_IRQ64  spsr[KVM_SPSR_IRQ]
   0x6060  0010 0050 SPSR_FIQ64  spsr[KVM_SPSR_FIQ]
-  0x6040  0010 0054 V0 128  fp_regs.vregs[0]
-  0x6040  0010 0058 V1 128  fp_regs.vregs[1]
+  0x6040  0010 0054 V0 128  fp_regs.vregs[0](*)
+  0x6040  0010 0058 V1 128  fp_regs.vregs[1](*)
 ...
-  0x6040  0010 00d0 V31128  fp_regs.vregs[31]
+  0x6040  0010 00d0 V31128  fp_regs.vregs[31]   (*)
   0x6020  0010 00d4 FPSR32  fp_regs.fpsr
   0x6020  0010 00d5 FPCR32  fp_regs.fpcr
 
+(*) These encodings are not accepted for SVE-enabled vcpus.  See
+KVM_ARM_VCPU_INIT.
+
+The equivalent register content can be accessed via bits [127:0] of
+the corresponding SVE Zn registers instead for vcpus that have SVE
+enabled (see below).
+
 arm64 CCSIDR registers are demultiplexed by CSSELR value:
   0x6020  0011 00 
 
@@ -2143,6 +2151,61 @@ arm64 system registers have the following id bit 
patterns:
 arm64 firmware pseudo-registers have the following bit pattern:
   0x6030  0014 
 
+arm64 SVE registers have the following bit patterns:
+  0x6080 0000 0015 00<n><i>   Zn bits[2048*slice + 2047 : 2048*slice]
+  0x6050 0000 0015 04<n><i>   Pn bits[256*slice + 255 : 256*slice]
+  0x6050 0000 0015 060<i>     FFR bits[256*slice + 255 : 256*slice]
+  0x6060 0000 0015 ffff       KVM_REG_ARM64_SVE_VLS pseudo-register
+
+Access to slices beyond the maximum vector length configured for the
+vcpu (i.e., where 16 * slice >= max_vq (**)) will fail with ENOENT.
+
+These registers are only accessible on vcpus for which SVE is enabled.
+See KVM_ARM_VCPU_INIT for details.
+
+In addition, except for KVM_REG_ARM64_SVE_VLS, these registers are not
+accessible until the vcpu's SVE configuration has been finalized
+using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE).  See KVM_ARM_VCPU_INIT
+and KVM_ARM_VCPU_FINALIZE for more information about this procedure.
+
+KVM_REG_ARM64_SVE_VLS is a pseudo-register that allows the set of vector
+lengths supported by the vcpu to be discovered and configured by
+userspace.  When transferred to or from user memory via KVM_GET_ONE_REG
+or KVM_SET_ONE_REG, the value of this register is of type __u64[8], and
+encodes the set of vector lengths as follows:
+
+__u64 vector_lengths[8];
+
+if (vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX &&
+((vector_lengths[(vq - 1) / 64] >> ((vq - 1) % 64)) & 1))
+   /* Vector length vq * 16 bytes supported */
+else
+   /* Vector length vq * 16 bytes not supported */
+
+(**) The maximum value vq for which the above condition is true is
+max_vq.  This is the maximum vector length available to the guest on
+this vcpu, and determines which register slices are visible through
+this ioctl interface.
+
+(See Documentation/arm64/sve.txt for an explanation of the "vq"
+nomenclature.)
+
+KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT.
+KVM_ARM_VCPU_INIT initialises it to the best set of vector lengths that
+the host supports.
+
+Userspace may subsequently modify it if desired until the vcpu's SVE
+configuration is finalized using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE).
+
+Apart from simply removing all vector lengths from the host set that
+exceed some value, support for arbitrarily chosen sets of vector lengths
+is hardware-dependent and may not be available.  Attempting to configure
+an invalid set of vector lengths via KVM_SET_ONE_REG will fail with
+EINVAL.
+
+After the vcpu's SVE configuration is finalized, further attempts to
+write this register will fail with EPERM.
+
 
 MIPS registers are mapped using the lower 32 bits.  The upper 16 of that is
 the register group type:
@@ -2197,6 +2260,7 @@ Parameters: struct kvm_one_reg (in and out)
 Returns: 0 on success, negative value on failure
 Errors:
   ENOENT:   no such register
+  EPERM:    register access forbidden for architecture-dependent reasons
   EINVAL:   other errors, such as 
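
As a rough illustration of the procedure documented above, a VMM might
configure the vector length set and finalize SVE roughly like this (a
sketch only; KVM_ARM_VCPU_SVE and KVM_REG_ARM64_SVE_VLS are the constants
introduced by this series, and error handling is elided):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int setup_sve(int vcpu_fd, struct kvm_vcpu_init *init)
    {
            __u64 vls[8];
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_ARM64_SVE_VLS,
                    .addr = (__u64)(unsigned long)vls,
            };
            int feature = KVM_ARM_VCPU_SVE;

            /* 1. Opt in to SVE at vcpu init time. */
            init->features[feature / 32] |= 1u << (feature % 32);
            if (ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, init))
                    return -1;

            /* 2. Read the best supported vector length set... */
            if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg))
                    return -1;

            /* ...optionally clear bits here to drop unwanted lengths... */

            /* ...and write the chosen set back before finalization. */
            if (ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg))
                    return -1;

            /* 3. Freeze the SVE configuration; required before KVM_RUN. */
            return ioctl(vcpu_fd, KVM_ARM_VCPU_FINALIZE, &feature);
    }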

[PATCH v6 23/27] KVM: arm64/sve: Add pseudo-register for the guest's vector lengths

2019-03-19 Thread Dave Martin
This patch adds a new pseudo-register KVM_REG_ARM64_SVE_VLS to
allow userspace to set and query the set of vector lengths visible
to the guest.

In the future, multiple register slices per SVE register may be
visible through the ioctl interface.  Once the set of slices has
been determined we would not be able to allow the vector length set
to be changed any more, in order to avoid userspace seeing
inconsistent sets of registers.  For this reason, this patch adds
support for explicit finalization of the SVE configuration via the
KVM_ARM_VCPU_FINALIZE ioctl.

Finalization is the proper place to allocate the SVE register state
storage in vcpu->arch.sve_state, so this patch adds that as
appropriate.  The data is freed via kvm_arch_vcpu_uninit(), which
was previously a no-op on arm64.

To simplify the logic for determining what vector lengths can be
supported, some code is added to KVM init to work this out, in the
kvm_arm_init_arch_resources() hook.

The KVM_REG_ARM64_SVE_VLS pseudo-register is not exposed yet.
Subsequent patches will allow SVE to be turned on for guest vcpus,
making it visible.

Signed-off-by: Dave Martin 

---

Changes since v5:

 * [Julien Thierry] Delete overzealous BUILD_BUG_ON() checks.
   It also turns out that these could cause kernel build failures in
   some configurations, even though the checked condition is compile-
   time constant.

   Because of the way the affected functions are called, the checks
   are superfluous, so the simplest option is simply to get rid of
   them.

 * [Julien Thierry] Free vcpu->arch.sve_state (if any) in
   kvm_arch_vcpu_uninit() (which is currently a no-op).

   This was accidentally lost during a previous rebase.

 * Add kvm_arm_init_arch_resources() hook, and use it to probe SVE
   configuration for KVM, to avoid duplicating the logic elsewhere.
   We only need to do this once.

 * Move sve_state buffer allocation to kvm_arm_vcpu_finalize().

   As well as making the code more straightforward, this avoids the
   need to allocate memory in kvm_reset_vcpu(), the meat of which is
   non-preemptible since commit 358b28f09f0a ("arm/arm64: KVM: Allow a
   VCPU to fully reset itself").

   The refactoring means that if this has not been done by the time
   we hit KVM_RUN, then this allocation will happen on the
   kvm_arm_first_run_init() path, where preemption remains enabled.

 * Add a couple of comments in {get,set}_sve_reg() to make the handling
   of the KVM_REG_ARM64_SVE_VLS special case a little clearer.

 * Move mis-split rework to avoid put_user() being the correct size
   by accident in KVM_GET_REG_LIST to KVM: arm64: Enumerate SVE register
   indices for KVM_GET_REG_LIST.

 * Fix wrong WARN_ON() check sense when checking whether the
   implementation may need more SVE register slices than KVM can
   support.

 * Fix erroneous setting of vcpu->arch.sve_max_vl based on stale loop
   control variable vq.

 * Move definition of KVM_ARM_VCPU_SVE from KVM: arm64/sve: Allow
   userspace to enable SVE for vcpus.

 * Migrate to explicit finalization of the SVE configuration, using
   KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE).
---
 arch/arm64/include/asm/kvm_host.h |  15 +++--
 arch/arm64/include/uapi/asm/kvm.h |   5 ++
 arch/arm64/kvm/guest.c| 114 +-
 arch/arm64/kvm/reset.c|  89 +
 4 files changed, 215 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 98658f7..5475cc4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -23,7 +23,6 @@
 #define __ARM64_KVM_HOST_H__
 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -50,6 +49,7 @@
 
 #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
 
+/* Will be incremented when KVM_ARM_VCPU_SVE is fully implemented: */
 #define KVM_VCPU_MAX_FEATURES 4
 
 #define KVM_REQ_SLEEP \
@@ -59,10 +59,12 @@
 
 DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
-static inline int kvm_arm_init_arch_resources(void) { return 0; }
+extern unsigned int kvm_sve_max_vl;
+int kvm_arm_init_arch_resources(void);
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
 int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t 
idmap_start);
 
@@ -353,6 +355,7 @@ struct kvm_vcpu_arch {
 #define KVM_ARM64_HOST_SVE_IN_USE  (1 << 3) /* backup for host TIF_SVE */
 #define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */
 #define KVM_ARM64_GUEST_HAS_SVE(1 << 5) /* SVE exposed to 
guest */
+#define KVM_ARM64_VCPU_SVE_FINALIZED   (1 << 6) /* SVE config completed */
 
 #define vcpu_has_sve(vcpu) (system_supports_sve() && \
((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE))
@@ -525,7 +528,6 @@ static inline bool kvm_arch_requires_vhe(voi

[PATCH v6 24/27] KVM: arm64/sve: Allow userspace to enable SVE for vcpus

2019-03-19 Thread Dave Martin
Now that all the pieces are in place, this patch offers a new flag
KVM_ARM_VCPU_SVE that userspace can pass to KVM_ARM_VCPU_INIT to
turn on SVE for the guest, on a per-vcpu basis.

As part of this, support for initialisation and reset of the SVE
vector length set and registers is added in the appropriate places,
as well as finally setting the KVM_ARM64_GUEST_HAS_SVE vcpu flag,
to turn on the SVE support code.

Allocation of the SVE register storage in vcpu->arch.sve_state is
deferred until the SVE configuration is finalized, by which time
the size of the registers is known.

Setting the vector lengths supported by the vcpu is considered
configuration of the emulated hardware rather than runtime
configuration, so no support is offered for changing the vector
lengths available to an existing vcpu across reset.

Signed-off-by: Dave Martin 

---

Changes since v5:

 * Refactored to make the code flow clearer and clarify responsibility
   for the various initialisation phases/checks.

   In place of the previous, confusingly dual-purpose kvm_reset_sve(),
   enabling and resetting of SVE are split into separate functions and
   called as appropriate from kvm_reset_vcpu().

   To avoid interactions with preempt_disable(), memory allocation is
   done in the kvm_vcpu_first_run_init() path instead.  To achieve
   this, the SVE memory allocation is moved to kvm_arm_vcpu_finalize(),
   which now takes on the role of actually doing deferred setup instead
   of just setting a flag to indicate that the setup was done.

 * Add has_vhe() sanity-check into kvm_vcpu_enable_sve(), since it
   makes more sense here than when resetting the vcpu.

 * When checking for SVE finalization in kvm_reset_vcpu(), call the new
   SVE-specific function kvm_arm_vcpu_sve_finalized().  The new generic
   check kvm_arm_vcpu_is_finalized() is unnecessarily broad here: using
   the appropriate specific check makes the code more self-describing.

 * Definition of KVM_ARM_VCPU_SVE moved to KVM: arm64/sve: Add pseudo-
   register for the guest's vector lengths (which needs it for the
   KVM_ARM_VCPU_FINALIZE ioctl).
---
 arch/arm64/include/asm/kvm_host.h |  3 +--
 arch/arm64/kvm/reset.c| 45 ++-
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 5475cc4..9d57cf8 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -49,8 +49,7 @@
 
 #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
 
-/* Will be incremented when KVM_ARM_VCPU_SVE is fully implemented: */
-#define KVM_VCPU_MAX_FEATURES 4
+#define KVM_VCPU_MAX_FEATURES 5
 
 #define KVM_REQ_SLEEP \
KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e7f9c06..4f04dbf 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -20,10 +20,12 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -37,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Maximum phys_shift supported for any VM on this host */
 static u32 kvm_ipa_limit;
@@ -130,6 +133,27 @@ int kvm_arm_init_arch_resources(void)
return 0;
 }
 
+static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
+{
+   if (!system_supports_sve())
+   return -EINVAL;
+
+   /* Verify that KVM startup enforced this when SVE was detected: */
+   if (WARN_ON(!has_vhe()))
+   return -EINVAL;
+
+   vcpu->arch.sve_max_vl = kvm_sve_max_vl;
+
+   /*
+* Userspace can still customize the vector lengths by writing
+* KVM_REG_ARM64_SVE_VLS.  Allocation is deferred until
+* kvm_arm_vcpu_finalize(), which freezes the configuration.
+*/
+   vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE;
+
+   return 0;
+}
+
 /*
  * Finalize vcpu's maximum SVE vector length, allocating
  * vcpu->arch.sve_state as necessary.
@@ -188,13 +212,20 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kfree(vcpu->arch.sve_state);
 }
 
+static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
+{
+   if (vcpu_has_sve(vcpu))
+   memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu));
+}
+
 /**
  * kvm_reset_vcpu - sets core registers and sys_regs to reset value
  * @vcpu: The VCPU pointer
  *
  * This function finds the right table above and sets the registers on
  * the virtual CPU struct to their architecturally defined reset
- * values.
+ * values, except for registers whose reset is deferred until
+ * kvm_arm_vcpu_finalize().
  *
  * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT
  * ioctl or as part of handling a request issued by another VCPU in the PSCI
@@ -217,6 +248,18 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
if (loaded)
kvm_arch_vcpu_put(vcpu);
 
+   if (!kvm_arm_vcpu_sve_finalized(vcpu)) {
+ 

[PATCH v6 22/27] KVM: arm/arm64: Add KVM_ARM_VCPU_FINALIZE ioctl

2019-03-19 Thread Dave Martin
Some aspects of vcpu configuration may be too complex to be
completed inside KVM_ARM_VCPU_INIT.  Thus, there may be a
requirement for userspace to do some additional configuration
before various other ioctls will work in a consistent way.

In particular this will be the case for SVE, where userspace will
need to negotiate the set of vector lengths to be made available to
the guest before the vcpu becomes fully usable.

In order to provide an explicit way for userspace to confirm that
it has finished setting up a particular vcpu feature, this patch
adds a new ioctl KVM_ARM_VCPU_FINALIZE.

When userspace has opted into a feature that requires finalization,
typically by means of a feature flag passed to KVM_ARM_VCPU_INIT, a
matching call to KVM_ARM_VCPU_FINALIZE is now required before
KVM_RUN or KVM_GET_REG_LIST is allowed.  Individual features may
impose additional restrictions where appropriate.

No existing vcpu features are affected by this, so current
userspace implementations will continue to work exactly as before,
with no need to issue KVM_ARM_VCPU_FINALIZE.

As implemented in this patch, KVM_ARM_VCPU_FINALIZE is currently a
placeholder: no finalizable features exist yet, so the ioctl is not
required and will always yield EINVAL.  Subsequent patches will add
the finalization logic to make use of this ioctl for SVE.

No functional change for existing userspace.

Signed-off-by: Dave Martin 

---

Changes since v5:

 * Commit message, including subject line, rewritten.

   This patch is a rework of "KVM: arm/arm64: Add hook to finalize the
   vcpu configuration".  The old subject line and commit message no
   longer accurately described what the patch does.  However, the code
   is an evolution of the previous patch rather than a wholesale
   rewrite.

 * Added an explicit KVM_ARM_VCPU_FINALIZE ioctl, rather than just
   providing internal hooks in the kernel to finalize the vcpu
   configuration implicitly.  This allows userspace to confirm exactly
   when it has finished configuring the vcpu and is ready to use it.

   This results in simpler (and hopefully more maintainable) ioctl
   ordering rules.
---
 arch/arm/include/asm/kvm_host.h   |  4 
 arch/arm64/include/asm/kvm_host.h |  4 
 include/uapi/linux/kvm.h  |  3 +++
 virt/kvm/arm/arm.c| 18 ++
 4 files changed, 29 insertions(+)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index a49ee01..e80cfc1 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -19,6 +19,7 @@
 #ifndef __ARM_KVM_HOST_H__
 #define __ARM_KVM_HOST_H__
 
+#include 
 #include 
 #include 
 #include 
@@ -411,4 +412,7 @@ static inline int kvm_arm_setup_stage2(struct kvm *kvm, 
unsigned long type)
return 0;
 }
 
+#define kvm_arm_vcpu_finalize(vcpu, what) (-EINVAL)
+#define kvm_arm_vcpu_is_finalized(vcpu) true
+
 #endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 3e89509..98658f7 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -23,6 +23,7 @@
 #define __ARM64_KVM_HOST_H__
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -625,4 +626,7 @@ void kvm_arch_free_vm(struct kvm *kvm);
 
 int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
 
+#define kvm_arm_vcpu_finalize(vcpu, what) (-EINVAL)
+#define kvm_arm_vcpu_is_finalized(vcpu) true
+
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index dc77a5a..c3b8e7a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1441,6 +1441,9 @@ struct kvm_enc_region {
 /* Available with KVM_CAP_HYPERV_CPUID */
 #define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2)
 
+/* Available with KVM_CAP_ARM_SVE */
+#define KVM_ARM_VCPU_FINALIZE_IOW(KVMIO,  0xc2, int)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
/* Guest initialization commands */
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index c69e137..9edbf0f 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -545,6 +545,9 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
if (likely(vcpu->arch.has_run_once))
return 0;
 
+   if (!kvm_arm_vcpu_is_finalized(vcpu))
+   return -EPERM;
+
vcpu->arch.has_run_once = true;
 
if (likely(irqchip_in_kernel(kvm))) {
@@ -1116,6 +1119,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (unlikely(!kvm_vcpu_initialized(vcpu)))
break;
 
+   r = -EPERM;
+   if (!kvm_arm_vcpu_is_finalized(vcpu))
+   break;
+
r = -EFAULT;
if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
break;
@@ -1169,6 +1176,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 
return kvm_arm_vcpu_set_events(vc

[PATCH v6 25/27] KVM: arm64: Add a capability to advertise SVE support

2019-03-19 Thread Dave Martin
To provide a uniform way to check for KVM SVE support amongst other
features, this patch adds a suitable capability KVM_CAP_ARM_SVE,
and reports it as present when SVE is available.
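
For example, a VMM could probe for it in the usual way before trying to
create SVE-enabled vcpus (a sketch; KVM_CAP_ARM_SVE is the constant added
by this patch):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: returns nonzero if this host's KVM advertises SVE support. */
    static int kvm_has_sve(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);
            int ret;

            if (kvm_fd < 0)
                    return 0;

            ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_SVE);
            close(kvm_fd);

            return ret > 0;
    }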

Signed-off-by: Dave Martin 

---

Changes since v5:

 * [Julien Thierry] Strip out has_vhe() sanity-check, which wasn't in
   the most logical place, and anyway doesn't really belong in this
   patch.

   Moved to KVM: arm64/sve: Allow userspace to enable SVE for vcpus
   instead.
---
 arch/arm64/kvm/reset.c   | 3 +++
 include/uapi/linux/kvm.h | 1 +
 2 files changed, 4 insertions(+)

diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 4f04dbf..180d7a5 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -98,6 +98,9 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
case KVM_CAP_ARM_VM_IPA_SIZE:
r = kvm_ipa_limit;
break;
+   case KVM_CAP_ARM_SVE:
+   r = system_supports_sve();
+   break;
default:
r = 0;
}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index c3b8e7a..1d56444 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_VM_IPA_SIZE 165
 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
 #define KVM_CAP_HYPERV_CPUID 167
+#define KVM_CAP_ARM_SVE 168
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.1.4



[PATCH v6 21/27] KVM: arm/arm64: Add hook for arch-specific KVM initialisation

2019-03-19 Thread Dave Martin
This patch adds a kvm_arm_init_arch_resources() hook to perform
subarch-specific initialisation when starting up KVM.

This will be used in a subsequent patch for global SVE-related
setup on arm64.

No functional change.

Signed-off-by: Dave Martin 
---
 arch/arm/include/asm/kvm_host.h   | 2 ++
 arch/arm64/include/asm/kvm_host.h | 2 ++
 virt/kvm/arm/arm.c| 4 
 3 files changed, 8 insertions(+)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 770d732..a49ee01 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -53,6 +53,8 @@
 
 DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
+static inline int kvm_arm_init_arch_resources(void) { return 0; }
+
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 205438a..3e89509 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -58,6 +58,8 @@
 
 DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
+static inline int kvm_arm_init_arch_resources(void) { return 0; }
+
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 99c3738..c69e137 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1664,6 +1664,10 @@ int kvm_arch_init(void *opaque)
if (err)
return err;
 
+   err = kvm_arm_init_arch_resources();
+   if (err)
+   return err;
+
if (!in_hyp_mode) {
err = init_hyp_mode();
if (err)
-- 
2.1.4



[PATCH v6 26/27] KVM: Document errors for KVM_GET_ONE_REG and KVM_SET_ONE_REG

2019-03-19 Thread Dave Martin
KVM_GET_ONE_REG and KVM_SET_ONE_REG return some error codes that
are not documented (but hopefully not surprising either).  To give
an indication of what these may mean, this patch adds brief
documentation.

Signed-off-by: Dave Martin 
---
 Documentation/virtual/kvm/api.txt | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 2d4f7ce..cd920dd 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1871,6 +1871,9 @@ Architectures: all
 Type: vcpu ioctl
 Parameters: struct kvm_one_reg (in)
 Returns: 0 on success, negative value on failure
+Errors:
+  ENOENT:   no such register
+  EINVAL:   other errors, such as bad size encoding for a known register
 
 struct kvm_one_reg {
__u64 id;
@@ -2192,6 +2195,9 @@ Architectures: all
 Type: vcpu ioctl
 Parameters: struct kvm_one_reg (in and out)
 Returns: 0 on success, negative value on failure
+Errors:
+  ENOENT:   no such register
+  EINVAL:   other errors, such as bad size encoding for a known register
 
 This ioctl allows to receive the value of a single register implemented
 in a vcpu. The register to read is indicated by the "id" field of the
-- 
2.1.4



[PATCH v6 20/27] arm64/sve: In-kernel vector length availability query interface

2019-03-19 Thread Dave Martin
KVM will need to interrogate the set of SVE vector lengths
available on the system.

This patch exposes the relevant bits to the kernel, along with a
sve_vq_available() helper to check whether a particular vector
length is supported.

__vq_to_bit() and __bit_to_vq() are not intended for use outside
these functions: now that these are exposed outside fpsimd.c, they
are prefixed with __ in order to provide an extra hint that they
are not intended for general-purpose use.
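
As an illustration of the new interface, a hypothetical in-kernel caller
could enumerate the supported vector lengths like this (a sketch only,
using the helpers exposed by this patch):

    #include <linux/printk.h>
    #include <asm/fpsimd.h>

    /* Sketch: log every SVE vector length supported on this system. */
    static void dump_supported_sve_vls(void)
    {
            unsigned int vq;

            /* vq counts 128-bit granules; sve_vq_available() expects a valid vq. */
            for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq)
                    if (sve_vq_available(vq))
                            pr_info("SVE: vector length %u bytes supported\n",
                                    sve_vl_from_vq(vq));
    }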

Signed-off-by: Dave Martin 
Reviewed-by: Alex Bennée 
---
 arch/arm64/include/asm/fpsimd.h | 29 +
 arch/arm64/kernel/fpsimd.c  | 35 ---
 2 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index df7a143..ad6d2e4 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -24,10 +24,13 @@
 
 #ifndef __ASSEMBLY__
 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
+#include 
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /* Masks for extracting the FPSR and FPCR from the FPSCR */
@@ -89,6 +92,32 @@ extern u64 read_zcr_features(void);
 
 extern int __ro_after_init sve_max_vl;
 extern int __ro_after_init sve_max_virtualisable_vl;
+/* Set of available vector lengths, as vq_to_bit(vq): */
+extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
+
+/*
+ * Helpers to translate bit indices in sve_vq_map to VQ values (and
+ * vice versa).  This allows find_next_bit() to be used to find the
+ * _maximum_ VQ not exceeding a certain value.
+ */
+static inline unsigned int __vq_to_bit(unsigned int vq)
+{
+   return SVE_VQ_MAX - vq;
+}
+
+static inline unsigned int __bit_to_vq(unsigned int bit)
+{
+   if (WARN_ON(bit >= SVE_VQ_MAX))
+   bit = SVE_VQ_MAX - 1;
+
+   return SVE_VQ_MAX - bit;
+}
+
+/* Ensure vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX before calling this function */
+static inline bool sve_vq_available(unsigned int vq)
+{
+   return test_bit(__vq_to_bit(vq), sve_vq_map);
+}
 
 #ifdef CONFIG_ARM64_SVE
 
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 8a93afa..577296b 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -136,7 +136,7 @@ static int sve_default_vl = -1;
 int __ro_after_init sve_max_vl = SVE_VL_MIN;
 int __ro_after_init sve_max_virtualisable_vl = SVE_VL_MIN;
 /* Set of available vector lengths, as vq_to_bit(vq): */
-static __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
+__ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
 /* Set of vector lengths present on at least one cpu: */
 static __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX);
 static void __percpu *efi_sve_state;
@@ -270,25 +270,6 @@ void fpsimd_save(void)
 }
 
 /*
- * Helpers to translate bit indices in sve_vq_map to VQ values (and
- * vice versa).  This allows find_next_bit() to be used to find the
- * _maximum_ VQ not exceeding a certain value.
- */
-
-static unsigned int vq_to_bit(unsigned int vq)
-{
-   return SVE_VQ_MAX - vq;
-}
-
-static unsigned int bit_to_vq(unsigned int bit)
-{
-   if (WARN_ON(bit >= SVE_VQ_MAX))
-   bit = SVE_VQ_MAX - 1;
-
-   return SVE_VQ_MAX - bit;
-}
-
-/*
  * All vector length selection from userspace comes through here.
  * We're on a slow path, so some sanity-checks are included.
  * If things go wrong there's a bug somewhere, but try to fall back to a
@@ -309,8 +290,8 @@ static unsigned int find_supported_vector_length(unsigned 
int vl)
vl = max_vl;
 
bit = find_next_bit(sve_vq_map, SVE_VQ_MAX,
-   vq_to_bit(sve_vq_from_vl(vl)));
-   return sve_vl_from_vq(bit_to_vq(bit));
+   __vq_to_bit(sve_vq_from_vl(vl)));
+   return sve_vl_from_vq(__bit_to_vq(bit));
 }
 
 #ifdef CONFIG_SYSCTL
@@ -648,7 +629,7 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1); /* self-syncing */
vl = sve_get_vl();
vq = sve_vq_from_vl(vl); /* skip intervening lengths */
-   set_bit(vq_to_bit(vq), map);
+   set_bit(__vq_to_bit(vq), map);
}
 }
 
@@ -717,7 +698,7 @@ int sve_verify_vq_map(void)
 * Mismatches above sve_max_virtualisable_vl are fine, since
 * no guest is allowed to configure ZCR_EL2.LEN to exceed this:
 */
-   if (sve_vl_from_vq(bit_to_vq(b)) <= sve_max_virtualisable_vl) {
+   if (sve_vl_from_vq(__bit_to_vq(b)) <= sve_max_virtualisable_vl) {
pr_warn("SVE: cpu%d: Unsupported vector length(s) present\n",
smp_processor_id());
return -EINVAL;
@@ -801,8 +782,8 @@ void __init sve_setup(void)
 * so sve_vq_map must have at least SVE_VQ_MIN set.
 * If something went wrong, at least try to patch it up:
 */
-   if (WARN_ON(!tes

[PATCH v6 17/27] KVM: arm64: Reject ioctl access to FPSIMD V-regs on SVE vcpus

2019-03-19 Thread Dave Martin
In order to avoid the pointless complexity of maintaining two ioctl
register access views of the same data, this patch blocks ioctl
access to the FPSIMD V-registers on vcpus that support SVE.

This will make it more straightforward to add SVE register access
support.

Since SVE is an opt-in feature for userspace, this will not affect
existing users.

Signed-off-by: Dave Martin 

---

(Julien Thierry's Reviewed-by dropped due to non-trivial refactoring)

Changes since v5:

 * Refactored to cope with the removal of core_reg_size_from_offset()
   (which was added by another series which will now be handled
   independently).

   This leaves some duplication in that we still filter the V-regs out
   in two places, but this is no worse than other existing code in guest.c.
   I plan to tidy this up independently later on.
---
 arch/arm64/kvm/guest.c | 48 
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index a391a61..756d0d6 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -54,12 +54,19 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
return 0;
 }
 
+static bool core_reg_offset_is_vreg(u64 off)
+{
+   return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) &&
+   off < KVM_REG_ARM_CORE_REG(fp_regs.fpsr);
+}
+
 static u64 core_reg_offset_from_id(u64 id)
 {
return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE);
 }
 
-static int validate_core_offset(const struct kvm_one_reg *reg)
+static int validate_core_offset(const struct kvm_vcpu *vcpu,
+   const struct kvm_one_reg *reg)
 {
u64 off = core_reg_offset_from_id(reg->id);
int size;
@@ -91,11 +98,19 @@ static int validate_core_offset(const struct kvm_one_reg 
*reg)
return -EINVAL;
}
 
-   if (KVM_REG_SIZE(reg->id) == size &&
-   IS_ALIGNED(off, size / sizeof(__u32)))
-   return 0;
+   if (KVM_REG_SIZE(reg->id) != size ||
+   !IS_ALIGNED(off, size / sizeof(__u32)))
+   return -EINVAL;
 
-   return -EINVAL;
+   /*
+* The KVM_REG_ARM64_SVE regs must be used instead of
+* KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on
+* SVE-enabled vcpus:
+*/
+   if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(off))
+   return -EINVAL;
+
+   return 0;
 }
 
 static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
@@ -117,7 +132,7 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct 
kvm_one_reg *reg)
(off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
return -ENOENT;
 
-   if (validate_core_offset(reg))
+   if (validate_core_offset(vcpu, reg))
return -EINVAL;
 
if (copy_to_user(uaddr, ((u32 *)regs) + off, KVM_REG_SIZE(reg->id)))
@@ -142,7 +157,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct 
kvm_one_reg *reg)
(off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
return -ENOENT;
 
-   if (validate_core_offset(reg))
+   if (validate_core_offset(vcpu, reg))
return -EINVAL;
 
if (KVM_REG_SIZE(reg->id) > sizeof(tmp))
@@ -195,13 +210,22 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
return -EINVAL;
 }
 
-static int kvm_arm_copy_core_reg_indices(u64 __user *uindices)
+static int copy_core_reg_indices(const struct kvm_vcpu *vcpu,
+u64 __user *uindices)
 {
unsigned int i;
int n = 0;
const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | 
KVM_REG_ARM_CORE;
 
for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) {
+   /*
+* The KVM_REG_ARM64_SVE regs must be used instead of
+* KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on
+* SVE-enabled vcpus:
+*/
+   if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(i))
+   continue;
+
if (uindices) {
if (put_user(core_reg | i, uindices))
return -EFAULT;
@@ -214,9 +238,9 @@ static int kvm_arm_copy_core_reg_indices(u64 __user 
*uindices)
return n;
 }
 
-static unsigned long num_core_regs(void)
+static unsigned long num_core_regs(const struct kvm_vcpu *vcpu)
 {
-   return kvm_arm_copy_core_reg_indices(NULL);
+   return copy_core_reg_indices(vcpu, NULL);
 }
 
 /**
@@ -281,7 +305,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
 {
unsigned long res = 0;
 
-   res += num_core_regs();
+   res += num_core_regs(vcpu);
res += kvm_arm_num_sys_reg_descs(vcpu);
res += kvm_arm_get_fw_num_regs(vcpu);
res += NUM_TIMER_REGS;
@@ -298,7 +322,7 @@ int kvm_arm_copy_reg_indices(struc

[PATCH v6 13/27] KVM: arm64/sve: Context switch the SVE registers

2019-03-19 Thread Dave Martin
In order to give each vcpu its own view of the SVE registers, this
patch adds context storage via a new sve_state pointer in struct
vcpu_arch.  An additional member sve_max_vl is also added for each
vcpu, to determine the maximum vector length visible to the guest
and thus the value to be configured in ZCR_EL2.LEN while the vcpu
is active.  This also determines the layout and size of the storage
in sve_state, which is read and written by the same backend
functions that are used for context-switching the SVE state for
host tasks.

On SVE-enabled vcpus, SVE access traps are now handled by switching
in the vcpu's SVE context and disabling the trap before returning
to the guest.  On other vcpus, the trap is not handled and an exit
back to the host occurs, where the handle_sve() fallback path
reflects an undefined instruction exception back to the guest,
consistently with the behaviour of non-SVE-capable hardware (as was
done unconditionally prior to this patch).

No SVE handling is added on non-VHE-only paths, since VHE is an
architectural and Kconfig prerequisite of SVE.

Signed-off-by: Dave Martin 

---

Changes since v5:

 * [Julien Thierry, Julien Grall] Commit message typo fixes

 * [Mark Rutland] Rename trap_class to hsr_ec, for consistency with
   existing code.

 * [Mark Rutland] Simplify condition for refusing to handle an
   FPSIMD/SVE trap, using multiple if () statements for clarity.  The
   previous condition was a bit tortuous, and now that the static_key
   checks have been hoisted out, it makes little difference to the
   compiler how we express the condition here.
---
 arch/arm64/include/asm/kvm_host.h |  6 
 arch/arm64/kvm/fpsimd.c   |  5 +--
 arch/arm64/kvm/hyp/switch.c   | 75 +--
 3 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 22cf484..4fabfd2 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -228,6 +228,8 @@ struct vcpu_reset_state {
 
 struct kvm_vcpu_arch {
struct kvm_cpu_context ctxt;
+   void *sve_state;
+   unsigned int sve_max_vl;
 
/* HYP configuration */
u64 hcr_el2;
@@ -323,6 +325,10 @@ struct kvm_vcpu_arch {
bool sysregs_loaded_on_cpu;
 };
 
+/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
+#define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \
+ sve_ffr_offset((vcpu)->arch.sve_max_vl)))
+
 /* vcpu_arch flags field values: */
 #define KVM_ARM64_DEBUG_DIRTY  (1 << 0)
 #define KVM_ARM64_FP_ENABLED   (1 << 1) /* guest FP regs loaded */
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 7053bf4..6e3c9c8 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -87,10 +87,11 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
 
if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.gp_regs.fp_regs,
-NULL, SVE_VL_MIN);
+vcpu->arch.sve_state,
+vcpu->arch.sve_max_vl);
 
clear_thread_flag(TIF_FOREIGN_FPSTATE);
-   clear_thread_flag(TIF_SVE);
+   update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu));
}
 }
 
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 9d46066..5444b9c 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -100,7 +100,10 @@ static void activate_traps_vhe(struct kvm_vcpu *vcpu)
val = read_sysreg(cpacr_el1);
val |= CPACR_EL1_TTA;
val &= ~CPACR_EL1_ZEN;
-   if (!update_fp_enabled(vcpu)) {
+   if (update_fp_enabled(vcpu)) {
+   if (vcpu_has_sve(vcpu))
+   val |= CPACR_EL1_ZEN;
+   } else {
val &= ~CPACR_EL1_FPEN;
__activate_traps_fpsimd32(vcpu);
}
@@ -317,16 +320,48 @@ static bool __hyp_text __populate_fault_info(struct 
kvm_vcpu *vcpu)
return true;
 }
 
-static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu)
+/* Check for an FPSIMD/SVE trap and handle as appropriate */
+static bool __hyp_text __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
 {
-   struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state;
+   bool vhe, sve_guest, sve_host;
+   u8 hsr_ec;
 
-   if (has_vhe())
-   write_sysreg(read_sysreg(cpacr_el1) | CPACR_EL1_FPEN,
-cpacr_el1);
-   else
+   if (!system_supports_fpsimd())
+   return false;
+
+   if (system_supports_sve()) {
+   sve_guest = vcpu_has_sve(vcpu);
+   sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE;
+   vhe = true;
+   } else {
+   sve_guest = false;
+   sve_

[PATCH v6 18/27] KVM: arm64/sve: Add SVE support to register access ioctl interface

2019-03-19 Thread Dave Martin
This patch adds the following registers for access via the
KVM_{GET,SET}_ONE_REG interface:

 * KVM_REG_ARM64_SVE_ZREG(n, i) (n = 0..31) (in 2048-bit slices)
 * KVM_REG_ARM64_SVE_PREG(n, i) (n = 0..15) (in 256-bit slices)
 * KVM_REG_ARM64_SVE_FFR(i) (in 256-bit slices)

In order to adapt gracefully to future architectural extensions,
the registers are logically divided up into slices as noted above:
the i parameter denotes the slice index.

This allows us to reserve space in the ABI for future expansion of
these registers.  However, as of today the architecture does not
permit registers to be larger than a single slice, so no code is
needed in the kernel to expose additional slices, for now.  The
code can be extended later as needed to expose them up to a maximum
of 32 slices (as carved out in the architecture itself) if they
really exist someday.

The registers are only visible for vcpus that have SVE enabled.
They are not enumerated by KVM_GET_REG_LIST on vcpus that do not
have SVE.

Accesses to the FPSIMD registers via KVM_REG_ARM_CORE are not
allowed for SVE-enabled vcpus: SVE-aware userspace can use the
KVM_REG_ARM64_SVE_ZREG() interface instead to access the same
register state.  This avoids some complex and pointless emulation
in the kernel to convert between the two views of these aliased
registers.
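
For instance, SVE-aware userspace could read a full Z-register slice with
the new register IDs (a sketch; KVM_REG_ARM64_SVE_ZREG() is the macro added
by this patch, and the vcpu must have SVE enabled and finalized):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /*
     * Sketch: read Z-register 'n', slice 0, into 'buf'.  The buffer must hold
     * a full 2048-bit (256-byte) slice regardless of the vcpu's vector length;
     * bits [127:0] alias the FPSIMD view of Vn.
     */
    static int read_sve_zreg(int vcpu_fd, unsigned int n, void *buf)
    {
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_ARM64_SVE_ZREG(n, 0),
                    .addr = (__u64)(unsigned long)buf,
            };

            return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
    }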

Signed-off-by: Dave Martin 

---

Changes since v5:

 * [Julien Thierry] rename sve_reg_region() to sve_reg_to_region() to
   make its purpose a bit clearer.

 * [Julien Thierry] rename struct sve_state_region to
   sve_state_reg_region to make it clearer that this struct only
   describes the bounds of (part of) a single register within
   sve_state.

 * [Julien Thierry] Add a comment to clarify the purpose of struct
   sve_state_reg_region.
---
 arch/arm64/include/asm/kvm_host.h |  14 
 arch/arm64/include/uapi/asm/kvm.h |  17 +
 arch/arm64/kvm/guest.c| 139 ++
 3 files changed, 158 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 4fabfd2..205438a 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -329,6 +329,20 @@ struct kvm_vcpu_arch {
 #define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \
  sve_ffr_offset((vcpu)->arch.sve_max_vl)))
 
+#define vcpu_sve_state_size(vcpu) ({   \
+   size_t __size_ret;  \
+   unsigned int __vcpu_vq; \
+   \
+   if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) {  \
+   __size_ret = 0; \
+   } else {\
+   __vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl);\
+   __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq);  \
+   }   \
+   \
+   __size_ret; \
+})
+
 /* vcpu_arch flags field values: */
 #define KVM_ARM64_DEBUG_DIRTY  (1 << 0)
 #define KVM_ARM64_FP_ENABLED   (1 << 1) /* guest FP regs loaded */
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index 97c3478..ced760c 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -226,6 +226,23 @@ struct kvm_vcpu_events {
 KVM_REG_ARM_FW | ((r) & 0x))
 #define KVM_REG_ARM_PSCI_VERSION   KVM_REG_ARM_FW_REG(0)
 
+/* SVE registers */
+#define KVM_REG_ARM64_SVE  (0x15 << KVM_REG_ARM_COPROC_SHIFT)
+
+/* Z- and P-regs occupy blocks at the following offsets within this range: */
+#define KVM_REG_ARM64_SVE_ZREG_BASE0
+#define KVM_REG_ARM64_SVE_PREG_BASE0x400
+
+#define KVM_REG_ARM64_SVE_ZREG(n, i)   (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | \
+KVM_REG_ARM64_SVE_ZREG_BASE |  \
+KVM_REG_SIZE_U2048 |   \
+((n) << 5) | (i))
+#define KVM_REG_ARM64_SVE_PREG(n, i)   (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | \
+KVM_REG_ARM64_SVE_PREG_BASE |  \
+KVM_REG_SIZE_U256 |\
+((n) << 5) | (i))
+#define KVM_REG_ARM64_SVE_FFR(i)   KVM_REG_ARM64_SVE_PREG(16, i)
+
 /* Device Control API: ARM VGIC */
 #define KVM_DEV_ARM_VGIC_GRP_ADDR  0
 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 756d0d6..736d8cb 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/

[PATCH v6 12/27] KVM: arm64/sve: System register context switch and access support

2019-03-19 Thread Dave Martin
This patch adds the necessary support for context switching ZCR_EL1
for each vcpu.

ZCR_EL1 is trapped alongside the FPSIMD/SVE registers, so it makes
sense for it to be handled as part of the guest FPSIMD/SVE context
for context switch purposes instead of handling it as a general
system register.  This means that it can be switched in lazily at
the appropriate time.  No effort is made to track host context for
this register, since SVE requires VHE: thus the host's value for
this register lives permanently in ZCR_EL2 and does not alias the
guest's value at any time.

The Hyp switch and fpsimd context handling code is extended
appropriately.

Accessors are added in sys_regs.c to expose the SVE system
registers and ID register fields.  Because these need to be
conditionally visible based on the guest configuration, they are
implemented separately for now rather than by use of the generic
system register helpers.  This may be abstracted better later on
when/if there are more features requiring this model.

ID_AA64ZFR0_EL1 is RO-RAZ for MRS/MSR when SVE is disabled for the
guest, but for compatibility with non-SVE aware KVM implementations
the register should not be enumerated at all for KVM_GET_REG_LIST
in this case.  For consistency we also reject ioctl access to the
register.  This ensures that a non-SVE-enabled guest looks the same
to userspace, irrespective of whether the kernel KVM implementation
supports SVE.

Signed-off-by: Dave Martin 

---

Changes since v5:

 * Port to the renamed visibility() framework.

 * Swap visiblity() helpers so that they appear by the relevant accessor
   functions.

 * [Julien Grall] With the visibility() checks, {get,set}_zcr_el1()
   degenerate to doing exactly what the common code does, so drop them.

   The ID_AA64ZFR0_EL1 handlers are still needed to provide conditional
   RAZ behaviour.  This could be moved to the common code too, but since
   this is a one-off case I don't do this for now.  We can address this
   later if other regs need to follow the same pattern.

 * [Julien Thierry] Reset ZCR_EL1 to a fixed value using reset_val
   instead of relying on reset_unknown() honouring set bits in val
   as RES0.

   Most of the bits in ZCR_EL1 are RES0 anyway, and many implementations
   of SVE will support larger vectors than 128 bits, so 0 seems as good
   a value as any to expose to guests that forget to initialise this
   register properly.
---
 arch/arm64/include/asm/kvm_host.h |  1 +
 arch/arm64/include/asm/sysreg.h   |  3 ++
 arch/arm64/kvm/fpsimd.c   |  9 -
 arch/arm64/kvm/hyp/switch.c   |  3 ++
 arch/arm64/kvm/sys_regs.c | 83 ---
 5 files changed, 93 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index ad4f7f0..22cf484 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -121,6 +121,7 @@ enum vcpu_sysreg {
SCTLR_EL1,  /* System Control Register */
ACTLR_EL1,  /* Auxiliary Control Register */
CPACR_EL1,  /* Coprocessor Access Control */
+   ZCR_EL1,/* SVE Control */
TTBR0_EL1,  /* Translation Table Base Register 0 */
TTBR1_EL1,  /* Translation Table Base Register 1 */
TCR_EL1,/* Translation Control Register */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 5b267de..4d6262d 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -454,6 +454,9 @@
 #define SYS_ICH_LR14_EL2   __SYS__LR8_EL2(6)
 #define SYS_ICH_LR15_EL2   __SYS__LR8_EL2(7)
 
+/* VHE encodings for architectural EL0/1 system registers */
+#define SYS_ZCR_EL12   sys_reg(3, 5, 1, 2, 0)
+
 /* Common SCTLR_ELx flags. */
 #define SCTLR_ELx_DSSBS(_BITUL(44))
 #define SCTLR_ELx_ENIA (_BITUL(31))
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 1cf4f02..7053bf4 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -103,14 +103,21 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
 {
unsigned long flags;
+   bool host_has_sve = system_supports_sve();
+   bool guest_has_sve = vcpu_has_sve(vcpu);
 
local_irq_save(flags);
 
if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
+   u64 *guest_zcr = &vcpu->arch.ctxt.sys_regs[ZCR_EL1];
+
/* Clean guest FP state to memory and invalidate cpu view */
fpsimd_save();
fpsimd_flush_cpu_state();
-   } else if (system_supports_sve()) {
+
+   if (guest_has_sve)
+   *guest_zcr = read_sysreg_s(SYS_ZCR_EL12);
+   } else if (host_has_sve) {
/*
 * The FPSIMD/SVE state in the CPU has not been touched, and we
 * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) 

[PATCH v6 16/27] KVM: arm64: Factor out core register ID enumeration

2019-03-19 Thread Dave Martin
In preparation for adding logic to filter out some KVM_REG_ARM_CORE
registers from the KVM_GET_REG_LIST output, this patch factors out
the core register enumeration into a separate function and rebuilds
num_core_regs() on top of it.

This may be a little more expensive (depending on how good a job
the compiler does of specialising the code), but KVM_GET_REG_LIST
is not a hot path.

This will make it easier to consolidate ID filtering code in one
place.

No functional change.

Signed-off-by: Dave Martin 

---

Changes since v5:

 * New patch.

   This reimplements part of the separately-posted patch "KVM: arm64:
   Factor out KVM_GET_REG_LIST core register enumeration", minus aspects
   that potentially break the ABI.

   As a result, the opportunity to truly consolidate all the ID reg
   filtering in one place is deliberately left on the floor, for now.
   This will be addressed in a separate series later on.
---
 arch/arm64/kvm/guest.c | 33 +
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 3e38eb2..a391a61 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -194,9 +195,28 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
return -EINVAL;
 }
 
+static int kvm_arm_copy_core_reg_indices(u64 __user *uindices)
+{
+   unsigned int i;
+   int n = 0;
+   const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | 
KVM_REG_ARM_CORE;
+
+   for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) {
+   if (uindices) {
+   if (put_user(core_reg | i, uindices))
+   return -EFAULT;
+   uindices++;
+   }
+
+   n++;
+   }
+
+   return n;
+}
+
 static unsigned long num_core_regs(void)
 {
-   return sizeof(struct kvm_regs) / sizeof(__u32);
+   return kvm_arm_copy_core_reg_indices(NULL);
 }
 
 /**
@@ -276,15 +296,12 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
  */
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
 {
-   unsigned int i;
-   const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | 
KVM_REG_ARM_CORE;
int ret;
 
-   for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) {
-   if (put_user(core_reg | i, uindices))
-   return -EFAULT;
-   uindices++;
-   }
+   ret = kvm_arm_copy_core_reg_indices(uindices);
+   if (ret)
+   return ret;
+   uindices += ret;
 
ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices);
if (ret)
-- 
2.1.4



[PATCH v6 19/27] KVM: arm64: Enumerate SVE register indices for KVM_GET_REG_LIST

2019-03-19 Thread Dave Martin
This patch includes the SVE register IDs in the list returned by
KVM_GET_REG_LIST, as appropriate.

On a non-SVE-enabled vcpu, no new IDs are added.

On an SVE-enabled vcpu, IDs for the FPSIMD V-registers are removed
from the list, since userspace is required to access the Z-
registers instead in order to access the V-register content.  For
the variably-sized SVE registers, the appropriate set of slice IDs
are enumerated, depending on the maximum vector length for the
vcpu.

As it currently stands, the SVE architecture never requires more
than one slice to exist per register, so this patch adds no
explicit support for enumerating multiple slices.  The code can be
extended straightforwardly to support this in the future, if
needed.
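
For illustration only (not part of the patch), userspace might then
enumerate the vcpu's registers and count the SVE IDs, using the
KVM_REG_ARM64_SVE coprocessor value introduced earlier in the series;
the usual two-call KVM_GET_REG_LIST pattern is assumed and error
handling is simplified:

  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch: count registers in the SVE coprocessor space (Z, P and FFR). */
  static long count_sve_regs(int vcpu_fd)
  {
          struct kvm_reg_list probe = { .n = 0 };
          struct kvm_reg_list *list;
          long count = 0;
          __u64 i;

          /* First call is expected to fail with E2BIG and fill in probe.n. */
          ioctl(vcpu_fd, KVM_GET_REG_LIST, &probe);

          list = calloc(1, sizeof(*list) + probe.n * sizeof(__u64));
          if (!list)
                  return -1;
          list->n = probe.n;

          if (ioctl(vcpu_fd, KVM_GET_REG_LIST, list) < 0) {
                  free(list);
                  return -1;
          }

          for (i = 0; i < list->n; i++)
                  if ((list->reg[i] & KVM_REG_ARM_COPROC_MASK) ==
                      KVM_REG_ARM64_SVE)
                          count++;

          free(list);
          return count;
  }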

Signed-off-by: Dave Martin 

---

Changes since v5:

(Dropped Julien Thierry's Reviewed-by due to non-trivial rebasing)

 * Move mis-split rework to prevent put_user()s being accidentally the
   correct size from KVM: arm64/sve: Add pseudo-register for the guest's
   vector lengths.
---
 arch/arm64/kvm/guest.c | 56 ++
 1 file changed, 56 insertions(+)

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 736d8cb..585c31e5 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -411,6 +411,56 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const 
struct kvm_one_reg *reg)
return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
 }
 
+static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu)
+{
+   /* Only the first slice ever exists, for now */
+   const unsigned int slices = 1;
+
+   if (!vcpu_has_sve(vcpu))
+   return 0;
+
+   return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */);
+}
+
+static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu,
+   u64 __user *uindices)
+{
+   /* Only the first slice ever exists, for now */
+   const unsigned int slices = 1;
+   u64 reg;
+   unsigned int i, n;
+   int num_regs = 0;
+
+   if (!vcpu_has_sve(vcpu))
+   return 0;
+
+   for (i = 0; i < slices; i++) {
+   for (n = 0; n < SVE_NUM_ZREGS; n++) {
+   reg = KVM_REG_ARM64_SVE_ZREG(n, i);
+   if (put_user(reg, uindices++))
+   return -EFAULT;
+
+   num_regs++;
+   }
+
+   for (n = 0; n < SVE_NUM_PREGS; n++) {
+   reg = KVM_REG_ARM64_SVE_PREG(n, i);
+   if (put_user(reg, uindices++))
+   return -EFAULT;
+
+   num_regs++;
+   }
+
+   reg = KVM_REG_ARM64_SVE_FFR(i);
+   if (put_user(reg, uindices++))
+   return -EFAULT;
+
+   num_regs++;
+   }
+
+   return num_regs;
+}
+
 /**
  * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG
  *
@@ -421,6 +471,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
unsigned long res = 0;
 
res += num_core_regs(vcpu);
+   res += num_sve_regs(vcpu);
res += kvm_arm_num_sys_reg_descs(vcpu);
res += kvm_arm_get_fw_num_regs(vcpu);
res += NUM_TIMER_REGS;
@@ -442,6 +493,11 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 
__user *uindices)
return ret;
uindices += ret;
 
+   ret = copy_sve_reg_indices(vcpu, uindices);
+   if (ret)
+   return ret;
+   uindices += ret;
+
ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices);
if (ret)
return ret;
-- 
2.1.4



[PATCH v6 09/27] KVM: arm64: Add a vcpu flag to control SVE visibility for the guest

2019-03-19 Thread Dave Martin
Since SVE will be enabled or disabled on a per-vcpu basis, a flag
is needed in order to track which vcpus have it enabled.

This patch adds a suitable flag and a helper for checking it.

Signed-off-by: Dave Martin 
Reviewed-by: Alex Bennée 
---
 arch/arm64/include/asm/kvm_host.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 6d10100..ad4f7f0 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -328,6 +328,10 @@ struct kvm_vcpu_arch {
 #define KVM_ARM64_FP_HOST  (1 << 2) /* host FP regs loaded */
 #define KVM_ARM64_HOST_SVE_IN_USE  (1 << 3) /* backup for host TIF_SVE */
 #define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */
+#define KVM_ARM64_GUEST_HAS_SVE    (1 << 5) /* SVE exposed to guest */
+
+#define vcpu_has_sve(vcpu) (system_supports_sve() && \
+   ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE))
 
 #define vcpu_gp_regs(v)(&(v)->arch.ctxt.gp_regs)
 
-- 
2.1.4



[PATCH v6 08/27] arm64/sve: Enable SVE state tracking for non-task contexts

2019-03-19 Thread Dave Martin
The current FPSIMD/SVE context handling support for non-task (i.e.,
KVM vcpu) contexts does not take SVE into account.  This means that
only task contexts can safely use SVE at present.

In preparation for enabling KVM guests to use SVE, it is necessary
to keep track of SVE state for non-task contexts too.

This patch adds the necessary support, removing assumptions from
the context switch code about the location of the SVE context
storage.

When binding a vcpu context, its vector length is arbitrarily
specified as SVE_VL_MIN for now.  In any case, because TIF_SVE is
presently cleared at vcpu context bind time, the specified vector
length will not be used for anything yet.  In later patches TIF_SVE
will be set here as appropriate, and the appropriate maximum vector
length for the vcpu will be passed when binding.

Signed-off-by: Dave Martin 
Reviewed-by: Alex Bennée 
Reviewed-by: Julien Grall 
---
 arch/arm64/include/asm/fpsimd.h |  3 ++-
 arch/arm64/kernel/fpsimd.c  | 20 +++-
 arch/arm64/kvm/fpsimd.c |  5 -
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 964adc9..df7a143 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -56,7 +56,8 @@ extern void fpsimd_restore_current_state(void);
 extern void fpsimd_update_current_state(struct user_fpsimd_state const *state);
 
 extern void fpsimd_bind_task_to_cpu(void);
-extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state);
+extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state,
+void *sve_state, unsigned int sve_vl);
 
 extern void fpsimd_flush_task_state(struct task_struct *target);
 extern void fpsimd_flush_cpu_state(void);
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index b219796a..8a93afa 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -121,6 +121,8 @@
  */
 struct fpsimd_last_state_struct {
struct user_fpsimd_state *st;
+   void *sve_state;
+   unsigned int sve_vl;
 };
 
 static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state);
@@ -241,14 +243,15 @@ static void task_fpsimd_load(void)
  */
 void fpsimd_save(void)
 {
-   struct user_fpsimd_state *st = __this_cpu_read(fpsimd_last_state.st);
+   struct fpsimd_last_state_struct const *last =
+   this_cpu_ptr(&fpsimd_last_state);
/* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */
 
WARN_ON(!in_softirq() && !irqs_disabled());
 
if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) {
if (system_supports_sve() && test_thread_flag(TIF_SVE)) {
-   if (WARN_ON(sve_get_vl() != current->thread.sve_vl)) {
+   if (WARN_ON(sve_get_vl() != last->sve_vl)) {
/*
 * Can't save the user regs, so current would
 * re-enter user with corrupt state.
@@ -258,9 +261,11 @@ void fpsimd_save(void)
return;
}
 
-   sve_save_state(sve_pffr(&current->thread), &st->fpsr);
+   sve_save_state((char *)last->sve_state +
+   sve_ffr_offset(last->sve_vl),
+  &last->st->fpsr);
} else
-   fpsimd_save_state(st);
+   fpsimd_save_state(last->st);
}
 }
 
@@ -1034,6 +1039,8 @@ void fpsimd_bind_task_to_cpu(void)
this_cpu_ptr(&fpsimd_last_state);
 
last->st = &current->thread.uw.fpsimd_state;
+   last->sve_state = current->thread.sve_state;
+   last->sve_vl = current->thread.sve_vl;
current->thread.fpsimd_cpu = smp_processor_id();
 
if (system_supports_sve()) {
@@ -1047,7 +1054,8 @@ void fpsimd_bind_task_to_cpu(void)
}
 }
 
-void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st)
+void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state,
+ unsigned int sve_vl)
 {
struct fpsimd_last_state_struct *last =
this_cpu_ptr(&fpsimd_last_state);
@@ -1055,6 +1063,8 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state 
*st)
WARN_ON(!in_softirq() && !irqs_disabled());
 
last->st = st;
+   last->sve_state = sve_state;
+   last->sve_vl = sve_vl;
 }
 
 /*
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index aac7808..1cf4f02 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -85,7 +86,9 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(!irqs_disabled());
 
if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
-   fpsimd_bind_stat

[PATCH v6 14/27] KVM: Allow 2048-bit register access via ioctl interface

2019-03-19 Thread Dave Martin
The Arm SVE architecture defines registers that are up to 2048 bits
in size (with some possibility of further future expansion).

In order to avoid the need for an excessively large number of
ioctls when saving and restoring a vcpu's registers, this patch
adds a #define to make support for individual 2048-bit registers
through the KVM_{GET,SET}_ONE_REG ioctl interface official.  This
will allow each SVE register to be accessed in a single call.

There are sufficient spare bits in the register id size field for
this change, so there is no ABI impact, providing that
KVM_GET_REG_LIST does not enumerate any 2048-bit register unless
userspace explicitly opts in to the relevant architecture-specific
features.
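
As a quick illustration (not from the patch itself), the size field of
such an ID decodes to 256 bytes using the existing generic encoding:

  #include <assert.h>
  #include <linux/kvm.h>

  /* Sketch: decode the size field of a hypothetical 2048-bit register ID. */
  static void check_u2048_bytes(void)
  {
          __u64 id = KVM_REG_ARM64 | KVM_REG_SIZE_U2048;
          __u64 size_bits = (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT;

          assert((1ULL << size_bits) == 256);     /* 2048 bits == 256 bytes */
  }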

Signed-off-by: Dave Martin 
Reviewed-by: Alex Bennée 
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6d4ea4b..dc77a5a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1145,6 +1145,7 @@ struct kvm_dirty_tlb {
 #define KVM_REG_SIZE_U256  0x0050ULL
 #define KVM_REG_SIZE_U512  0x0060ULL
 #define KVM_REG_SIZE_U1024 0x0070ULL
+#define KVM_REG_SIZE_U2048 0x0080ULL
 
 struct kvm_reg_list {
__u64 n; /* number of regs */
-- 
2.1.4



[PATCH v6 15/27] KVM: arm64: Add missing #include of <linux/string.h> in guest.c

2019-03-19 Thread Dave Martin
arch/arm64/kvm/guest.c uses the string functions, but the
corresponding header is not included.

We seem to get away with this for now, but for completeness this
patch adds the #include, in preparation for adding yet more
memset() calls.

Signed-off-by: Dave Martin 
---
 arch/arm64/kvm/guest.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 62514cb..3e38eb2 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include <linux/string.h>
 #include 
 #include 
 #include 
-- 
2.1.4



[PATCH v6 04/27] KVM: arm64: Refactor kvm_arm_num_regs() for easier maintenance

2019-03-19 Thread Dave Martin
kvm_arm_num_regs() adds together various partial register counts in
a freeform sum expression, which makes it harder than necessary to
read diffs that add, modify or remove a single term in the sum
(which is expected to be the common case under maintenance).

This patch refactors the code to add the terms one per line, for
maximum readability.

Signed-off-by: Dave Martin 
Reviewed-by: Alex Bennée 
---
 arch/arm64/kvm/guest.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index dd436a5..62514cb 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -258,8 +258,14 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const 
struct kvm_one_reg *reg)
  */
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
 {
-   return num_core_regs() + kvm_arm_num_sys_reg_descs(vcpu)
-   + kvm_arm_get_fw_num_regs(vcpu) + NUM_TIMER_REGS;
+   unsigned long res = 0;
+
+   res += num_core_regs();
+   res += kvm_arm_num_sys_reg_descs(vcpu);
+   res += kvm_arm_get_fw_num_regs(vcpu);
+   res += NUM_TIMER_REGS;
+
+   return res;
 }
 
 /**
-- 
2.1.4



[PATCH v6 11/27] KVM: arm64: Support runtime sysreg visibility filtering

2019-03-19 Thread Dave Martin
Some optional features of the Arm architecture add new system
registers that are not present in the base architecture.

Where these features are optional for the guest, the visibility of
these registers may need to depend on some runtime configuration,
such as a flag passed to KVM_ARM_VCPU_INIT.

For example, ZCR_EL1 and ID_AA64ZFR0_EL1 need to be hidden if SVE
is not enabled for the guest, even though these registers may be
present in the hardware and visible to the host at EL2.

Adding special-case checks all over the place for individual
registers is going to get messy as the number of conditionally-
visible registers grows.

In order to help solve this problem, this patch adds a new sysreg
method visibility() that can be used to hook in any needed runtime
visibility checks.  This method can currently return
REG_HIDDEN_USER to inhibit enumeration and ioctl access to the
register for userspace, and REG_HIDDEN_GUEST to inhibit runtime
access by the guest using MSR/MRS.  Wrappers are added to allow
these flags to be conveniently queried.

This approach allows a conditionally modified view of individual
system registers such as the CPU ID registers, in addition to
completely hiding registers where appropriate.
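
As an illustration of how a hook might look (a sketch, not code from
this patch; the SVE hooks themselves appear later in the series), a
register that only exists for SVE-enabled vcpus could use:

  /* Sketch: hide the register entirely unless the vcpu has SVE. */
  static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
                                     const struct sys_reg_desc *rd)
  {
          if (vcpu_has_sve(vcpu))
                  return 0;       /* visible to guest and userspace */

          return REG_HIDDEN_USER | REG_HIDDEN_GUEST;
  }

and then set .visibility = sve_visibility in the corresponding
sys_reg_desc entry.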

Signed-off-by: Dave Martin 

---

Changes since v5:

 * Rename the visibility override flags, add some comments, and rename/
   introduce helpers to make the purpose of this code clearer.
---
 arch/arm64/kvm/sys_regs.c | 24 +---
 arch/arm64/kvm/sys_regs.h | 25 +
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index a5d14b5..c86a7b0 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1927,6 +1927,12 @@ static void perform_access(struct kvm_vcpu *vcpu,
 {
trace_kvm_sys_access(*vcpu_pc(vcpu), params, r);
 
+   /* Check for regs disabled by runtime config */
+   if (sysreg_hidden_from_guest(vcpu, r)) {
+   kvm_inject_undefined(vcpu);
+   return;
+   }
+
/*
 * Not having an accessor means that we have configured a trap
 * that we don't know how to handle. This certainly qualifies
@@ -2438,6 +2444,10 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const 
struct kvm_one_reg *reg
if (!r)
return get_invariant_sys_reg(reg->id, uaddr);
 
+   /* Check for regs disabled by runtime config */
+   if (sysreg_hidden_from_user(vcpu, r))
+   return -ENOENT;
+
if (r->get_user)
return (r->get_user)(vcpu, r, reg, uaddr);
 
@@ -2459,6 +2469,10 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const 
struct kvm_one_reg *reg
if (!r)
return set_invariant_sys_reg(reg->id, uaddr);
 
+   /* Check for regs disabled by runtime config */
+   if (sysreg_hidden_from_user(vcpu, r))
+   return -ENOENT;
+
if (r->set_user)
return (r->set_user)(vcpu, r, reg, uaddr);
 
@@ -2515,7 +2529,8 @@ static bool copy_reg_to_user(const struct sys_reg_desc 
*reg, u64 __user **uind)
return true;
 }
 
-static int walk_one_sys_reg(const struct sys_reg_desc *rd,
+static int walk_one_sys_reg(const struct kvm_vcpu *vcpu,
+   const struct sys_reg_desc *rd,
u64 __user **uind,
unsigned int *total)
 {
@@ -2526,6 +2541,9 @@ static int walk_one_sys_reg(const struct sys_reg_desc *rd,
if (!(rd->reg || rd->get_user))
return 0;
 
+   if (sysreg_hidden_from_user(vcpu, rd))
+   return 0;
+
if (!copy_reg_to_user(rd, uind))
return -EFAULT;
 
@@ -2554,9 +2572,9 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 
__user *uind)
int cmp = cmp_sys_reg(i1, i2);
/* target-specific overrides generic entry. */
if (cmp <= 0)
-   err = walk_one_sys_reg(i1, &uind, &total);
+   err = walk_one_sys_reg(vcpu, i1, &uind, &total);
else
-   err = walk_one_sys_reg(i2, &uind, &total);
+   err = walk_one_sys_reg(vcpu, i2, &uind, &total);
 
if (err)
return err;
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 3b1bc7f..2be9950 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -64,8 +64,15 @@ struct sys_reg_desc {
const struct kvm_one_reg *reg, void __user *uaddr);
int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr);
+
+   /* Return mask of REG_* runtime visibility overrides */
+   unsigned int (*visibility)(const struct kvm_vcpu *vcpu,
+  const struct sys_reg_desc *rd);
 };
 
+#define REG_HIDDEN_USER

[PATCH v6 10/27] KVM: arm64: Propagate vcpu into read_id_reg()

2019-03-19 Thread Dave Martin
Architecture features that are conditionally visible to the guest
will require run-time checks in the ID register accessor functions.
In particular, read_id_reg() will need to perform checks in order
to generate the correct emulated value for certain ID register
fields such as ID_AA64PFR0_EL1.SVE for example.

This patch propagates vcpu into read_id_reg() so that future
patches can add run-time checks on the guest configuration here.

For now, there is no functional change.
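
To make the intent concrete, here is a sketch of the kind of check a
later patch in this series can then perform inside read_id_reg()
(simplified from how the SVE field ends up being handled):

  if (id == SYS_ID_AA64PFR0_EL1 && !vcpu_has_sve(vcpu))
          val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);  /* report SVE absent */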

Signed-off-by: Dave Martin 
Reviewed-by: Alex Bennée 
---
 arch/arm64/kvm/sys_regs.c | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 539feec..a5d14b5 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1044,7 +1044,8 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
 }
 
 /* Read a sanitised cpufeature ID register by sys_reg_desc */
-static u64 read_id_reg(struct sys_reg_desc const *r, bool raz)
+static u64 read_id_reg(const struct kvm_vcpu *vcpu,
+   struct sys_reg_desc const *r, bool raz)
 {
u32 id = sys_reg((u32)r->Op0, (u32)r->Op1,
 (u32)r->CRn, (u32)r->CRm, (u32)r->Op2);
@@ -1078,7 +1079,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu,
if (p->is_write)
return write_to_read_only(vcpu, p, r);
 
-   p->regval = read_id_reg(r, raz);
+   p->regval = read_id_reg(vcpu, r, raz);
return true;
 }
 
@@ -1107,16 +1108,18 @@ static u64 sys_reg_to_index(const struct sys_reg_desc 
*reg);
  * are stored, and for set_id_reg() we don't allow the effective value
  * to be changed.
  */
-static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
+static int __get_id_reg(const struct kvm_vcpu *vcpu,
+   const struct sys_reg_desc *rd, void __user *uaddr,
bool raz)
 {
const u64 id = sys_reg_to_index(rd);
-   const u64 val = read_id_reg(rd, raz);
+   const u64 val = read_id_reg(vcpu, rd, raz);
 
return reg_to_user(uaddr, &val, id);
 }
 
-static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
+static int __set_id_reg(const struct kvm_vcpu *vcpu,
+   const struct sys_reg_desc *rd, void __user *uaddr,
bool raz)
 {
const u64 id = sys_reg_to_index(rd);
@@ -1128,7 +1131,7 @@ static int __set_id_reg(const struct sys_reg_desc *rd, 
void __user *uaddr,
return err;
 
/* This is what we mean by invariant: you can't change it. */
-   if (val != read_id_reg(rd, raz))
+   if (val != read_id_reg(vcpu, rd, raz))
return -EINVAL;
 
return 0;
@@ -1137,25 +1140,25 @@ static int __set_id_reg(const struct sys_reg_desc *rd, 
void __user *uaddr,
 static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-   return __get_id_reg(rd, uaddr, false);
+   return __get_id_reg(vcpu, rd, uaddr, false);
 }
 
 static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-   return __set_id_reg(rd, uaddr, false);
+   return __set_id_reg(vcpu, rd, uaddr, false);
 }
 
 static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-   return __get_id_reg(rd, uaddr, true);
+   return __get_id_reg(vcpu, rd, uaddr, true);
 }
 
 static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-   return __set_id_reg(rd, uaddr, true);
+   return __set_id_reg(vcpu, rd, uaddr, true);
 }
 
 static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
-- 
2.1.4



[PATCH v6 07/27] arm64/sve: Check SVE virtualisability

2019-03-19 Thread Dave Martin
Due to the way the effective SVE vector length is controlled and
trapped at different exception levels, certain mismatches in the
sets of vector lengths supported by different physical CPUs in the
system may prevent straightforward virtualisation of SVE at parity
with the host.

This patch analyses the extent to which SVE can be virtualised
safely without interfering with migration of vcpus between physical
CPUs, and rejects late secondary CPUs that would erode the
situation further.

It is left up to KVM to decide what to do with this information.

Signed-off-by: Dave Martin 
Reviewed-by: Julien Thierry 

---

QUESTION: The final structure of this code makes it quite natural to
clamp the vector length for KVM guests to the maximum value supportable
across all CPUs; such a value is guaranteed to exist, but may be
surprisingly small on a given hardware platform.

Should we be stricter and actually refuse to support KVM at all on such
hardware?  This may help to force people to fix Linux (or the
architecture) if/when this issue comes up.

For now, I stick with pr_warn() and make do with a limited SVE vector
length for guests.

Changes since v5:

 * pr_info() about the presence of unvirtualisable vector lengths in
   sve_setup() upgraded to pr_warn(), for consistency with
   sve_verify_vq_map().
---
 arch/arm64/include/asm/fpsimd.h |  1 +
 arch/arm64/kernel/cpufeature.c  |  2 +-
 arch/arm64/kernel/fpsimd.c  | 86 ++---
 3 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index dd1ad39..964adc9 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -87,6 +87,7 @@ extern void sve_kernel_enable(const struct 
arm64_cpu_capabilities *__unused);
 extern u64 read_zcr_features(void);
 
 extern int __ro_after_init sve_max_vl;
+extern int __ro_after_init sve_max_virtualisable_vl;
 
 #ifdef CONFIG_ARM64_SVE
 
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e24e94d..4b1cda5 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1862,7 +1862,7 @@ static void verify_sve_features(void)
unsigned int len = zcr & ZCR_ELx_LEN_MASK;
 
if (len < safe_len || sve_verify_vq_map()) {
-   pr_crit("CPU%d: SVE: required vector length(s) missing\n",
+   pr_crit("CPU%d: SVE: vector length support mismatch\n",
smp_processor_id());
cpu_die_early();
}
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index f59ea67..b219796a 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -18,6 +18,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -48,6 +49,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define FPEXC_IOF  (1 << 0)
 #define FPEXC_DZF  (1 << 1)
@@ -130,14 +132,18 @@ static int sve_default_vl = -1;
 
 /* Maximum supported vector length across all CPUs (initially poisoned) */
 int __ro_after_init sve_max_vl = SVE_VL_MIN;
+int __ro_after_init sve_max_virtualisable_vl = SVE_VL_MIN;
 /* Set of available vector lengths, as vq_to_bit(vq): */
 static __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
+/* Set of vector lengths present on at least one cpu: */
+static __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX);
 static void __percpu *efi_sve_state;
 
 #else /* ! CONFIG_ARM64_SVE */
 
 /* Dummy declaration for code that will be optimised out: */
 extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
+extern __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX);
 extern void __percpu *efi_sve_state;
 
 #endif /* ! CONFIG_ARM64_SVE */
@@ -623,12 +629,6 @@ int sve_get_current_vl(void)
return sve_prctl_status(0);
 }
 
-/*
- * Bitmap for temporary storage of the per-CPU set of supported vector lengths
- * during secondary boot.
- */
-static DECLARE_BITMAP(sve_secondary_vq_map, SVE_VQ_MAX);
-
 static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
 {
unsigned int vq, vl;
@@ -654,6 +654,7 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
 void __init sve_init_vq_map(void)
 {
sve_probe_vqs(sve_vq_map);
+   bitmap_copy(sve_vq_partial_map, sve_vq_map, SVE_VQ_MAX);
 }
 
 /*
@@ -663,8 +664,11 @@ void __init sve_init_vq_map(void)
  */
 void sve_update_vq_map(void)
 {
-   sve_probe_vqs(sve_secondary_vq_map);
-   bitmap_and(sve_vq_map, sve_vq_map, sve_secondary_vq_map, SVE_VQ_MAX);
+   DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);
+
+   sve_probe_vqs(tmp_map);
+   bitmap_and(sve_vq_map, sve_vq_map, tmp_map, SVE_VQ_MAX);
+   bitmap_or(sve_vq_partial_map, sve_vq_partial_map, tmp_map, SVE_VQ_MAX);
 }
 
 /*
@@ -673,18 +677,48 @@ void sve_update_vq_map(void)
  */
 int sve_verify_vq_map(void)
 {
-   int ret = 0;
+   DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);
+   unsigned long b;
 
-   sve_prob

[PATCH v6 06/27] arm64/sve: Clarify role of the VQ map maintenance functions

2019-03-19 Thread Dave Martin
The roles of sve_init_vq_map(), sve_update_vq_map() and
sve_verify_vq_map() are highly non-obvious to anyone who has not dug
through cpufeatures.c in detail.

Since the way these functions interact with each other is more
important here than a full understanding of the cpufeatures code, this
patch adds comments to make the functions' roles clearer.

No functional change.

Signed-off-by: Dave Martin 
Reviewed-by: Julien Thierry 
Reviewed-by: Julien Grall 

---

Changes since v5:

 * [Julien Thierry] This patch is useful for explaining the previous
   patch (as per the v5 ordering), and is anyway non-functional.
   Swapped it with the previous patch to provide a more logical reading
   order for the series.
---
 arch/arm64/kernel/fpsimd.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 62c37f0..f59ea67 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -647,6 +647,10 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
}
 }
 
+/*
+ * Initialise the set of known supported VQs for the boot CPU.
+ * This is called during kernel boot, before secondary CPUs are brought up.
+ */
 void __init sve_init_vq_map(void)
 {
sve_probe_vqs(sve_vq_map);
@@ -655,6 +659,7 @@ void __init sve_init_vq_map(void)
 /*
  * If we haven't committed to the set of supported VQs yet, filter out
  * those not supported by the current CPU.
+ * This function is called during the bring-up of early secondary CPUs only.
  */
 void sve_update_vq_map(void)
 {
@@ -662,7 +667,10 @@ void sve_update_vq_map(void)
bitmap_and(sve_vq_map, sve_vq_map, sve_secondary_vq_map, SVE_VQ_MAX);
 }
 
-/* Check whether the current CPU supports all VQs in the committed set */
+/*
+ * Check whether the current CPU supports all VQs in the committed set.
+ * This function is called during the bring-up of late secondary CPUs only.
+ */
 int sve_verify_vq_map(void)
 {
int ret = 0;
-- 
2.1.4



[PATCH v6 05/27] KVM: arm64: Add missing #includes to kvm_host.h

2019-03-19 Thread Dave Martin
kvm_host.h uses some declarations from other headers that are
currently included by accident, without an explicit #include.

This patch adds a few #includes that are clearly missing.  Although
the header builds without them today, this should help to avoid
future surprises.

Signed-off-by: Dave Martin 
Acked-by: Mark Rutland 

---

Changes since v5:

 * [Mark Rutland] Add additional missing #includes ,
   , , while we're about it.

   Commit message reworded to match.
---
 arch/arm64/include/asm/kvm_host.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index a01fe087..6d10100 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -22,9 +22,13 @@
 #ifndef __ARM64_KVM_HOST_H__
 #define __ARM64_KVM_HOST_H__
 
+#include 
 #include 
+#include 
 #include 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
-- 
2.1.4



[PATCH v6 02/27] arm64: fpsimd: Always set TIF_FOREIGN_FPSTATE on task state flush

2019-03-19 Thread Dave Martin
This patch updates fpsimd_flush_task_state() to mirror the new
semantics of fpsimd_flush_cpu_state() introduced by commit
d8ad71fa38a9 ("arm64: fpsimd: Fix TIF_FOREIGN_FPSTATE after
invalidating cpu regs").  Both functions now implicitly set
TIF_FOREIGN_FPSTATE to indicate that the task's FPSIMD state is not
loaded into the cpu.

As a side-effect, fpsimd_flush_task_state() now sets
TIF_FOREIGN_FPSTATE even for non-running tasks.  In the case of
non-running tasks this is not useful but also harmless, because the
flag is live only while the corresponding task is running.  This
function is not called from fast paths, so special-casing this for
the task == current case is not really worth it.

Compiler barriers previously present in restore_sve_fpsimd_context()
are pulled into fpsimd_flush_task_state() so that it can be safely
called with preemption enabled if necessary.

Explicit calls to set TIF_FOREIGN_FPSTATE that accompany
fpsimd_flush_task_state() calls and are now redundant are removed
as appropriate.

fpsimd_flush_task_state() is used to get exclusive access to the
representation of the task's state via task_struct, for the purpose
of replacing the state.  Thus, the call to this function should
happen before manipulating fpsimd_state or sve_state etc. in
task_struct.  Anomalous cases are reordered appropriately in order
to make the code more consistent, although there should be no
functional difference since these cases are protected by
local_bh_disable() anyway.

Signed-off-by: Dave Martin 
Reviewed-by: Alex Bennée 
Reviewed-by: Julien Grall 
---
 arch/arm64/kernel/fpsimd.c | 25 +++--
 arch/arm64/kernel/signal.c |  5 -
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 5ebe73b..62c37f0 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -550,7 +550,6 @@ int sve_set_vector_length(struct task_struct *task,
local_bh_disable();
 
fpsimd_save();
-   set_thread_flag(TIF_FOREIGN_FPSTATE);
}
 
fpsimd_flush_task_state(task);
@@ -816,12 +815,11 @@ asmlinkage void do_sve_acc(unsigned int esr, struct 
pt_regs *regs)
local_bh_disable();
 
fpsimd_save();
-   fpsimd_to_sve(current);
 
/* Force ret_to_user to reload the registers: */
fpsimd_flush_task_state(current);
-   set_thread_flag(TIF_FOREIGN_FPSTATE);
 
+   fpsimd_to_sve(current);
if (test_and_set_thread_flag(TIF_SVE))
WARN_ON(1); /* SVE access shouldn't have trapped */
 
@@ -894,9 +892,9 @@ void fpsimd_flush_thread(void)
 
local_bh_disable();
 
+   fpsimd_flush_task_state(current);
memset(&current->thread.uw.fpsimd_state, 0,
   sizeof(current->thread.uw.fpsimd_state));
-   fpsimd_flush_task_state(current);
 
if (system_supports_sve()) {
clear_thread_flag(TIF_SVE);
@@ -933,8 +931,6 @@ void fpsimd_flush_thread(void)
current->thread.sve_vl_onexec = 0;
}
 
-   set_thread_flag(TIF_FOREIGN_FPSTATE);
-
local_bh_enable();
 }
 
@@ -1043,12 +1039,29 @@ void fpsimd_update_current_state(struct 
user_fpsimd_state const *state)
 
 /*
  * Invalidate live CPU copies of task t's FPSIMD state
+ *
+ * This function may be called with preemption enabled.  The barrier()
+ * ensures that the assignment to fpsimd_cpu is visible to any
+ * preemption/softirq that could race with set_tsk_thread_flag(), so
+ * that TIF_FOREIGN_FPSTATE cannot be spuriously re-cleared.
+ *
+ * The final barrier ensures that TIF_FOREIGN_FPSTATE is seen set by any
+ * subsequent code.
  */
 void fpsimd_flush_task_state(struct task_struct *t)
 {
t->thread.fpsimd_cpu = NR_CPUS;
+
+   barrier();
+   set_tsk_thread_flag(t, TIF_FOREIGN_FPSTATE);
+
+   barrier();
 }
 
+/*
+ * Invalidate any task's FPSIMD state that is present on this cpu.
+ * This function must be called with softirqs disabled.
+ */
 void fpsimd_flush_cpu_state(void)
 {
__this_cpu_write(fpsimd_last_state.st, NULL);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 867a7ce..a9b0485 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -296,11 +296,6 @@ static int restore_sve_fpsimd_context(struct user_ctxs 
*user)
 */
 
fpsimd_flush_task_state(current);
-   barrier();
-   /* From now, fpsimd_thread_switch() won't clear TIF_FOREIGN_FPSTATE */
-
-   set_thread_flag(TIF_FOREIGN_FPSTATE);
-   barrier();
/* From now, fpsimd_thread_switch() won't touch thread.sve_state */
 
sve_alloc(current);
-- 
2.1.4



[PATCH v6 03/27] KVM: arm64: Delete orphaned declaration for __fpsimd_enabled()

2019-03-19 Thread Dave Martin
__fpsimd_enabled() no longer exists, but a dangling declaration has
survived in kvm_hyp.h.

This patch gets rid of it.

Signed-off-by: Dave Martin 
Reviewed-by: Alex Bennée 
---
 arch/arm64/include/asm/kvm_hyp.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 4da765f..ef8b839 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -149,7 +149,6 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu);
 
 void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
 void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
-bool __fpsimd_enabled(void);
 
 void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
 void deactivate_traps_vhe_put(void);
-- 
2.1.4



[PATCH v6 01/27] KVM: Documentation: Document arm64 core registers in detail

2019-03-19 Thread Dave Martin
Since the sizes of individual members of the core arm64
registers vary, the list of register encodings that make sense is
not a simple linear sequence.

To clarify which encodings to use, this patch adds a brief list
to the documentation.
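
For example (illustrative only), the ID shown below for X1 can be
composed in userspace from the existing uapi helpers:

  #include <stddef.h>
  #include <linux/kvm.h>
  #include <asm/kvm.h>

  /* Sketch: 0x6030 0000 0010 0002 is the 64-bit core reg at regs.regs[1]. */
  __u64 x1_id = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE |
                KVM_REG_ARM_CORE_REG(regs.regs[1]);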

Signed-off-by: Dave Martin 
Reviewed-by: Julien Grall 
Reviewed-by: Peter Maydell 
---
 Documentation/virtual/kvm/api.txt | 24 
 1 file changed, 24 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 7de9eee..2d4f7ce 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2107,6 +2107,30 @@ contains elements ranging from 32 to 128 bits. The index 
is a 32bit
 value in the kvm_regs structure seen as a 32bit array.
  0x60x0 0000 0010 <index into the kvm_regs struct:16>
 
+Specifically:
+    Encoding            Register  Bits  kvm_regs member
+
+  0x6030 0000 0010 0000 X0          64  regs.regs[0]
+  0x6030 0000 0010 0002 X1          64  regs.regs[1]
+    ...
+  0x6030 0000 0010 003c X30         64  regs.regs[30]
+  0x6030 0000 0010 003e SP          64  regs.sp
+  0x6030 0000 0010 0040 PC          64  regs.pc
+  0x6030 0000 0010 0042 PSTATE      64  regs.pstate
+  0x6030 0000 0010 0044 SP_EL1      64  sp_el1
+  0x6030 0000 0010 0046 ELR_EL1     64  elr_el1
+  0x6030 0000 0010 0048 SPSR_EL1    64  spsr[KVM_SPSR_EL1] (alias SPSR_SVC)
+  0x6030 0000 0010 004a SPSR_ABT    64  spsr[KVM_SPSR_ABT]
+  0x6030 0000 0010 004c SPSR_UND    64  spsr[KVM_SPSR_UND]
+  0x6030 0000 0010 004e SPSR_IRQ    64  spsr[KVM_SPSR_IRQ]
+  0x6030 0000 0010 0050 SPSR_FIQ    64  spsr[KVM_SPSR_FIQ]
+  0x6040 0000 0010 0054 V0         128  fp_regs.vregs[0]
+  0x6040 0000 0010 0058 V1         128  fp_regs.vregs[1]
+    ...
+  0x6040 0000 0010 00d0 V31        128  fp_regs.vregs[31]
+  0x6020 0000 0010 00d4 FPSR        32  fp_regs.fpsr
+  0x6020 0000 0010 00d5 FPCR        32  fp_regs.fpcr
+
 arm64 CCSIDR registers are demultiplexed by CSSELR value:
  0x6020 0000 0011 00 <csselr:8>
 
-- 
2.1.4



[PATCH v6 00/27] KVM: arm64: SVE guest support

2019-03-19 Thread Dave Martin
This series implements support for allowing KVM guests to use the Arm
Scalable Vector Extension (SVE), superseding the previous v5 series [1].

The patches are also available on a branch for reviewer convenience. [2]

The patches are based on v5.1-rc1.

The series has been reworked to remove the dependency on [3],
eliminating the ABI "fixes" but keeping the relevant cleanup/
refactoring.  (See patch 16.)

This series addresses review comments received on v5, and contains one
significant change:

 * A new ioctl KVM_ARM_VCPU_FINALIZE is added to replace the implicit
   finalization behaviour in v5.  If userspace enables SVE, it must now
   use this ioctl to finalize the vcpu configuration before KVM_RUN
   etc. or SVE register access are permitted.

For a description of minor updates, see the individual patches.


Known issues:

 * This update requires modifications to kvmtool that are not published
   yet.  I will reply to this cover letter with a link when those are
   available (hopefully within 24 hours of this posting).

   The kvmtool branch referenced by the v5 cover letter **will not
   work** with v6...  Please be patient :)


Testing status:

 * Lightweight testing on the Arm Fast Model, primarily to exercise the
   new vcpu finalization API.

   I plan to re-run stress testing once v6 is posted.  The low-level
   context switch internals in the series remain unchanged since v5, so
   it is "relatively unlikely" that new problems will crop up since the
   v5 testing.

 * ThunderX2: basic testing of arm64 defconfig, including booting guests
   (no SVE support on this hardware).

   (This was done on an interim release candidate of v6: I will redo it
   after this posting.)

 * aarch32 host testing has only been done in v5 so far.  arch/arm
   changes between v5 and v6 are minimal.  I plan to redo sanity-check
   testing after this posting.

I will reply to this cover letter when I have v6 test outcomes to
report.

   
Resolved issues:

 * The register zeroing bug observed when testing v5 has been tracked
   down to architecture noncompliance in the Arm Fast Model, which led
   to lanes of SVE registers being zeroed in some inappropriate
   situations.


[1] Previous series:
[PATCH v5 00/26] KVM: arm64: SVE guest support
https://lists.cs.columbia.edu/pipermail/kvmarm/2019-February/034695.html

[2] This series in git:
http://linux-arm.org/git?p=linux-dm.git;a=shortlog;h=refs/heads/sve-kvm/v6/head
git://linux-arm.org/linux-dm.git sve-kvm/v6/head

[3] [PATCH v2 0/2] Fix KVM_GET_REG_LIST invalid register ID regression
https://lists.cs.columbia.edu/pipermail/kvmarm/2018-December/033810.html


Dave Martin (27):
  KVM: Documentation: Document arm64 core registers in detail
  arm64: fpsimd: Always set TIF_FOREIGN_FPSTATE on task state flush
  KVM: arm64: Delete orphaned declaration for __fpsimd_enabled()
  KVM: arm64: Refactor kvm_arm_num_regs() for easier maintenance
  KVM: arm64: Add missing #includes to kvm_host.h
  arm64/sve: Clarify role of the VQ map maintenance functions
  arm64/sve: Check SVE virtualisability
  arm64/sve: Enable SVE state tracking for non-task contexts
  KVM: arm64: Add a vcpu flag to control SVE visibility for the guest
  KVM: arm64: Propagate vcpu into read_id_reg()
  KVM: arm64: Support runtime sysreg visibility filtering
  KVM: arm64/sve: System register context switch and access support
  KVM: arm64/sve: Context switch the SVE registers
  KVM: Allow 2048-bit register access via ioctl interface
   KVM: arm64: Add missing #include of <linux/string.h> in guest.c
  KVM: arm64: Factor out core register ID enumeration
  KVM: arm64: Reject ioctl access to FPSIMD V-regs on SVE vcpus
  KVM: arm64/sve: Add SVE support to register access ioctl interface
  KVM: arm64: Enumerate SVE register indices for KVM_GET_REG_LIST
  arm64/sve: In-kernel vector length availability query interface
  KVM: arm/arm64: Add hook for arch-specific KVM initialisation
  KVM: arm/arm64: Add KVM_ARM_VCPU_FINALIZE ioctl
  KVM: arm64/sve: Add pseudo-register for the guest's vector lengths
  KVM: arm64/sve: Allow userspace to enable SVE for vcpus
  KVM: arm64: Add a capability to advertise SVE support
  KVM: Document errors for KVM_GET_ONE_REG and KVM_SET_ONE_REG
  KVM: arm64/sve: Document KVM API extensions for SVE

 Documentation/virtual/kvm/api.txt | 156 +++
 arch/arm/include/asm/kvm_host.h   |   6 +
 arch/arm64/include/asm/fpsimd.h   |  33 +++-
 arch/arm64/include/asm/kvm_host.h |  43 -
 arch/arm64/include/asm/kvm_hyp.h  |   1 -
 arch/arm64/include/asm/sysreg.h   |   3 +
 arch/arm64/include/uapi/asm/kvm.h |  22 +++
 arch/arm64/kernel/cpufeature.c|   2 +-
 arch/arm64/kernel/fpsimd.c| 172 -
 arch/arm64/kernel/signal.c|   5 -
 arch/arm64/kvm/fpsimd.c   |  17 +-
 arch/arm64/kvm/guest.c| 387 +++---
 arch/arm64/kvm/hyp/switch.c   |  74 ++--
 arch/arm64/kvm/reset.c| 137 +-
 arch/arm64/kvm/sys_regs.c

Re: [RFC PATCH] KVM: arm/arm64: Enable direct irqfd MSI injection

2019-03-19 Thread Marc Zyngier
On Tue, 19 Mar 2019 15:59:00 +,
Zenghui Yu  wrote:
> 
> Hi Marc,
> 
> On 2019/3/19 18:01, Marc Zyngier wrote:
> > On Tue, 19 Mar 2019 09:09:43 +0800
> > Zenghui Yu  wrote:
> > 
> >> Hi all,
> >> 
> >> On 2019/3/18 3:35, Marc Zyngier wrote:
> >>> A first approach would be to keep a small cache of the last few
> >>> successful translations for this ITS, cache that could be looked-up by
> >>> holding a spinlock instead. A hit in this cache could directly be
> >>> injected. Any command that invalidates or changes anything (DISCARD,
> >>> INV, INVALL, MAPC with V=0, MAPD with V=0, MOVALL, MOVI) should nuke
> >>> the cache altogether.
> >>> 
> >>> Of course, all of that needs to be quantified.
> >> 
> >> Thanks for all of your explanations, especially for Marc's suggestions!
> >> It took me long time to figure out my mistakes, since I am not very
> >> familiar with the locking stuff. Now I have to apologize for my noise.
> > 
> > No need to apologize. The whole point of this list is to have
> > discussions. Although your approach wasn't working, you did
> > identify potential room for improvement.
> > 
> >> As for the its-translation-cache code (a really good news to us), we
> >> have a rough look at it and start testing now!
> > 
> > Please let me know about your findings. My initial test doesn't show
> > any improvement, but that could easily be attributed to the system I
> > running this on (a tiny and slightly broken dual A53 system). The sizing
> > of the cache is also important: too small, and you have the overhead of
> > the lookup for no benefit; too big, and you waste memory.
> 
> Not smoothly as expected. With below config (in the form of XML):

The good news is that nothing was expected at all.

> ---8<---
> 
>   
>   
>   
> 
> ---8<---

Sorry, I don't read XML, and I have zero idea what this represent.

> 
> VM can't even get to boot successfully!
> 
> 
> Kernel version is -stable 4.19.28. And *dmesg* on host shows:

Please don't test on any other thing but mainline. The only thing I'm
interested in at the moment is 5.1-rc1.

> 
> ---8<---
> [  507.908330] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
> [  507.908338] rcu: 35-...0: (0 ticks this GP)
> idle=d06/1/0x4000 softirq=72150/72150 fqs=6269
> [  507.908341] rcu: 41-...0: (0 ticks this GP)
> idle=dee/1/0x4000 softirq=68144/68144 fqs=6269
> [  507.908342] rcu: (detected by 23, t=15002 jiffies, g=68929, q=408641)
> [  507.908350] Task dump for CPU 35:
> [  507.908351] qemu-kvmR  running task0 66789  1
> 0x0002
> [  507.908354] Call trace:
> [  507.908360]  __switch_to+0x94/0xe8
> [  507.908363]  _cond_resched+0x24/0x68
> [  507.908366]  __flush_work+0x58/0x280
> [  507.908369]  free_unref_page_commit+0xc4/0x198
> [  507.908370]  free_unref_page+0x84/0xa0
> [  507.908371]  __free_pages+0x58/0x68
> [  507.908372]  free_pages.part.21+0x34/0x40
> [  507.908373]  free_pages+0x2c/0x38
> [  507.908375]  poll_freewait+0xa8/0xd0
> [  507.908377]  do_sys_poll+0x3d0/0x560
> [  507.908378]  __arm64_sys_ppoll+0x180/0x1e8
> [  507.908380]  0xa48990
> [  507.908381] Task dump for CPU 41:
> [  507.908382] kworker/41:1R  running task0   647  2
> 0x002a
> [  507.908387] Workqueue: events irqfd_inject
> [  507.908389] Call trace:
> [  507.908391]  __switch_to+0x94/0xe8
> [  507.908392]  0x20131
> [... ...]
> [  687.928330] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
> [  687.928339] rcu: 35-...0: (0 ticks this GP)
> idle=d06/1/0x4000 softirq=72150/72150 fqs=25034
> [  687.928341] rcu: 41-...0: (0 ticks this GP)
> idle=dee/1/0x4000 softirq=68144/68144 fqs=25034
> [  687.928343] rcu: (detected by 16, t=60007 jiffies, g=68929,
> q=1601093)
> [  687.928351] Task dump for CPU 35:
> [  687.928352] qemu-kvmR  running task0 66789  1
> 0x0002
> [  687.928355] Call trace:
> [  687.928360]  __switch_to+0x94/0xe8
> [  687.928364]  _cond_resched+0x24/0x68
> [  687.928367]  __flush_work+0x58/0x280
> [  687.928369]  free_unref_page_commit+0xc4/0x198
> [  687.928370]  free_unref_page+0x84/0xa0
> [  687.928372]  __free_pages+0x58/0x68
> [  687.928373]  free_pages.part.21+0x34/0x40
> [  687.928374]  free_pages+0x2c/0x38
> [  687.928376]  poll_freewait+0xa8/0xd0
> [  687.928378]  do_sys_poll+0x3d0/0x560
> [  687.928379]  __arm64_sys_ppoll+0x180/0x1e8
> [  687.928381]  0xa48990
> [  687.928382] Task dump for CPU 41:
> [  687.928383] kworker/41:1R  running task0   647  2
> 0x002a
> [  687.928389] Workqueue: events irqfd_inject
> [  687.928391] Call trace:
> [  687.928392]  __switch_to+0x94/0xe8
> [  687.928394]  0x20131
> [...]
> ---8<---   endlessly ...
> 
> It seems that we've suffered from some locking related issues. Any
> suggestions for debugging?

None at the moment. And this doesn't seem quite related to the problem
at hand, does it?

> And could you please provide your test

Re: [PATCH] kvm: arm: Fix handling of stage2 huge mappings

2019-03-19 Thread Zenghui Yu

Hi Suzuki,

On 2019/3/19 22:11, Suzuki K Poulose wrote:

We rely on the mmu_notifier call backs to handle the split/merge
of huge pages and thus we are guaranteed that, while creating a
block mapping, either the entire block is unmapped at stage2 or it
is missing permission.

However, we miss a case where the block mapping is split for the dirty
logging case and could later be made a block mapping again, if we cancel
dirty logging.  This not only creates inconsistent TLB entries for
the pages in the block, but also leaks the table pages at
PMD level.

Handle this corner case for the huge mappings at stage2 by
unmapping the non-huge mapping for the block. This could potentially
release the upper level table. So we need to restart the table walk
once we unmap the range.

Fixes : ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages")
Reported-by: Zheng Xiang 
Cc: Zheng Xiang 
Cc: Zhengui Yu 


Sorry to bother you, but this should be "Zenghui Yu", thanks!


zenghui


Cc: Marc Zyngier 
Cc: Christoffer Dall 
Signed-off-by: Suzuki K Poulose 
---
  virt/kvm/arm/mmu.c | 63 ++
  1 file changed, 45 insertions(+), 18 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index fce0983..6ad6f19d 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1060,25 +1060,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct 
kvm_mmu_memory_cache
  {
pmd_t *pmd, old_pmd;
  
+retry:

pmd = stage2_get_pmd(kvm, cache, addr);
VM_BUG_ON(!pmd);
  
  	old_pmd = *pmd;

+   /*
+* Multiple vcpus faulting on the same PMD entry, can
+* lead to them sequentially updating the PMD with the
+* same value. Following the break-before-make
+* (pmd_clear() followed by tlb_flush()) process can
+* hinder forward progress due to refaults generated
+* on missing translations.
+*
+* Skip updating the page table if the entry is
+* unchanged.
+*/
+   if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+   return 0;
+
if (pmd_present(old_pmd)) {
/*
-* Multiple vcpus faulting on the same PMD entry, can
-* lead to them sequentially updating the PMD with the
-* same value. Following the break-before-make
-* (pmd_clear() followed by tlb_flush()) process can
-* hinder forward progress due to refaults generated
-* on missing translations.
+* If we already have PTE level mapping for this block,
+* we must unmap it to avoid inconsistent TLB state and
+* leaking the table page. We could end up in this situation
+* if the memory slot was marked for dirty logging and was
+* reverted, leaving PTE level mappings for the pages accessed
+* during the period. So, unmap the PTE level mapping for this
+* block and retry, as we could have released the upper level
+* table in the process.
 *
-* Skip updating the page table if the entry is
-* unchanged.
+* Normal THP split/merge follows mmu_notifier callbacks and do
+* get handled accordingly.
 */
-   if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-   return 0;
-
+   if (!pmd_thp_or_huge(old_pmd)) {
+   unmap_stage2_range(kvm, addr & S2_PMD_MASK, 
S2_PMD_SIZE);
+   goto retry;
+   }
/*
 * Mapping in huge pages should only happen through a
 * fault.  If a page is merged into a transparent huge
@@ -1090,8 +1108,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct 
kvm_mmu_memory_cache
 * should become splitting first, unmapped, merged,
 * and mapped back in on-demand.
 */
-   VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+   WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
} else {
@@ -1107,6 +1124,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct 
kvm_mmu_memory_cache *cac
  {
pud_t *pudp, old_pud;
  
+retry:

pudp = stage2_get_pud(kvm, cache, addr);
VM_BUG_ON(!pudp);
  
@@ -1114,16 +1132,25 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
  
  	/*

 * A large number of vcpus faulting on the same stage 2 entry,
-* can lead to a refault due to the
-* stage2_pud_clear()/tlb_flush(). Skip updating the page
-* tables if there is no change.
+* can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+* Skip updating the page tables if there is no change.
 

Re: [RFC PATCH] KVM: arm/arm64: Enable direct irqfd MSI injection

2019-03-19 Thread Zenghui Yu

Hi Marc,

On 2019/3/19 18:01, Marc Zyngier wrote:

On Tue, 19 Mar 2019 09:09:43 +0800
Zenghui Yu  wrote:


Hi all,

On 2019/3/18 3:35, Marc Zyngier wrote:

A first approach would be to keep a small cache of the last few
successful translations for this ITS, cache that could be looked-up by
holding a spinlock instead. A hit in this cache could directly be
injected. Any command that invalidates or changes anything (DISCARD,
INV, INVALL, MAPC with V=0, MAPD with V=0, MOVALL, MOVI) should nuke
the cache altogether.

Of course, all of that needs to be quantified.


Thanks for all of your explanations, especially for Marc's suggestions!
It took me a long time to figure out my mistakes, since I am not very
familiar with the locking stuff. Now I have to apologize for my noise.


No need to apologize. The whole point of this list is to have
discussions. Although your approach wasn't working, you did
identify potential room for improvement.


As for the its-translation-cache code (really good news to us), we
have taken a rough look at it and started testing now!


Please let me know about your findings. My initial test doesn't show
any improvement, but that could easily be attributed to the system I'm
running this on (a tiny and slightly broken dual A53 system). The sizing
of the cache is also important: too small, and you have the overhead of
the lookup for no benefit; too big, and you waste memory.


Not as smoothly as expected. With the below config (in the form of XML):

---8<---

  
  
  

---8<---

The VM can't even get to boot successfully!


The kernel version is -stable 4.19.28, and *dmesg* on the host shows:

---8<---
[  507.908330] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[  507.908338] rcu: 35-...0: (0 ticks this GP) 
idle=d06/1/0x4000 softirq=72150/72150 fqs=6269
[  507.908341] rcu: 41-...0: (0 ticks this GP) 
idle=dee/1/0x4000 softirq=68144/68144 fqs=6269

[  507.908342] rcu: (detected by 23, t=15002 jiffies, g=68929, q=408641)
[  507.908350] Task dump for CPU 35:
[  507.908351] qemu-kvmR  running task0 66789  1 
0x0002

[  507.908354] Call trace:
[  507.908360]  __switch_to+0x94/0xe8
[  507.908363]  _cond_resched+0x24/0x68
[  507.908366]  __flush_work+0x58/0x280
[  507.908369]  free_unref_page_commit+0xc4/0x198
[  507.908370]  free_unref_page+0x84/0xa0
[  507.908371]  __free_pages+0x58/0x68
[  507.908372]  free_pages.part.21+0x34/0x40
[  507.908373]  free_pages+0x2c/0x38
[  507.908375]  poll_freewait+0xa8/0xd0
[  507.908377]  do_sys_poll+0x3d0/0x560
[  507.908378]  __arm64_sys_ppoll+0x180/0x1e8
[  507.908380]  0xa48990
[  507.908381] Task dump for CPU 41:
[  507.908382] kworker/41:1R  running task0   647  2 
0x002a

[  507.908387] Workqueue: events irqfd_inject
[  507.908389] Call trace:
[  507.908391]  __switch_to+0x94/0xe8
[  507.908392]  0x20131
[... ...]
[  687.928330] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[  687.928339] rcu: 35-...0: (0 ticks this GP) 
idle=d06/1/0x4000 softirq=72150/72150 fqs=25034
[  687.928341] rcu: 41-...0: (0 ticks this GP) 
idle=dee/1/0x4000 softirq=68144/68144 fqs=25034
[  687.928343] rcu: (detected by 16, t=60007 jiffies, g=68929, 
q=1601093)

[  687.928351] Task dump for CPU 35:
[  687.928352] qemu-kvmR  running task0 66789  1 
0x0002

[  687.928355] Call trace:
[  687.928360]  __switch_to+0x94/0xe8
[  687.928364]  _cond_resched+0x24/0x68
[  687.928367]  __flush_work+0x58/0x280
[  687.928369]  free_unref_page_commit+0xc4/0x198
[  687.928370]  free_unref_page+0x84/0xa0
[  687.928372]  __free_pages+0x58/0x68
[  687.928373]  free_pages.part.21+0x34/0x40
[  687.928374]  free_pages+0x2c/0x38
[  687.928376]  poll_freewait+0xa8/0xd0
[  687.928378]  do_sys_poll+0x3d0/0x560
[  687.928379]  __arm64_sys_ppoll+0x180/0x1e8
[  687.928381]  0xa48990
[  687.928382] Task dump for CPU 41:
[  687.928383] kworker/41:1R  running task0   647  2 
0x002a

[  687.928389] Workqueue: events irqfd_inject
[  687.928391] Call trace:
[  687.928392]  __switch_to+0x94/0xe8
[  687.928394]  0x20131
[...]
---8<---   endlessly ...

It seems that we've suffered from some locking-related issues. Any
suggestions for debugging?

And could you please provide your test steps, so that I can run
some tests on my HW and hopefully see the improvement?



Having thought about it a bit more, I think we can drop the
invalidation on MOVI/MOVALL, as the LPI is still perfectly valid, and
we don't cache the target vcpu. On the other hand, the cache must be
nuked when the ITS is turned off.


All of these are valuable. But it might be too early for me to consider
them (I have to get the above problem solved first ...)
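
To make the suggestion concrete, a minimal sketch of such a per-ITS
translation cache -- purely illustrative, with made-up structure and field
names, and with the invalidation and ITS-disable handling only noted in
comments -- might look like:

#include <linux/spinlock.h>
#include <linux/types.h>

struct vgic_irq;	/* opaque here; the real type lives in vgic.h */

/*
 * A tiny cache of successful (devid, eventid) -> irq translations,
 * looked up under a spinlock so an irqfd fast path could avoid the
 * ITS mutex.
 */
struct its_xlat_entry {
	u32			devid;
	u32			eventid;
	struct vgic_irq		*irq;	/* NULL means unused slot */
};

#define ITS_XLAT_CACHE_SIZE	8	/* sizing still to be quantified */

struct its_xlat_cache {
	spinlock_t		lock;
	struct its_xlat_entry	entries[ITS_XLAT_CACHE_SIZE];
};

/*
 * Fast-path lookup: return the cached irq, or NULL to fall back to the
 * slow path that takes the ITS mutex.
 */
static struct vgic_irq *its_xlat_cache_lookup(struct its_xlat_cache *cache,
					      u32 devid, u32 eventid)
{
	struct vgic_irq *irq = NULL;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&cache->lock, flags);
	for (i = 0; i < ITS_XLAT_CACHE_SIZE; i++) {
		struct its_xlat_entry *e = &cache->entries[i];

		if (e->irq && e->devid == devid && e->eventid == eventid) {
			irq = e->irq;
			break;
		}
	}
	spin_unlock_irqrestore(&cache->lock, flags);

	return irq;
}

/*
 * Any command that invalidates or changes a mapping, and the ITS being
 * turned off, would simply clear the whole array under the same lock
 * (exactly which commands need this is still being discussed above).
 */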


thanks,

zenghui



Thanks,

M.



___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 1/2] KVM: arm/arm64: vgic-its: Take the srcu lock when writing to guest memory

2019-03-19 Thread Marc Zyngier
Hi Eric,

On Tue, 19 Mar 2019 15:16:38 +,
Auger Eric  wrote:
> 
> Hi Marc,
> 
> On 3/19/19 2:30 PM, Marc Zyngier wrote:
> > When halting a guest, QEMU flushes the virtual ITS caches, which
> > amounts to writing to the various tables that the guest has allocated.
> > 
> > When doing this, we fail to take the srcu lock, and the kernel
> > shouts loudly if running a lockdep kernel:
> > 
> > [   69.680416] =
> > [   69.680819] WARNING: suspicious RCU usage
> > [   69.681526] 5.1.0-rc1-8-g600025238f51-dirty #18 Not tainted
> > [   69.682096] -
> > [   69.682501] ./include/linux/kvm_host.h:605 suspicious 
> > rcu_dereference_check() usage!
> > [   69.683225]
> > [   69.683225] other info that might help us debug this:
> > [   69.683225]
> > [   69.683975]
> > [   69.683975] rcu_scheduler_active = 2, debug_locks = 1
> > [   69.684598] 6 locks held by qemu-system-aar/4097:
> > [   69.685059]  #0: 34196013 (&kvm->lock){+.+.}, at: 
> > vgic_its_set_attr+0x244/0x3a0
> > [   69.686087]  #1: f2ed935e (&its->its_lock){+.+.}, at: 
> > vgic_its_set_attr+0x250/0x3a0
> > [   69.686919]  #2: 5e71ea54 (&vcpu->mutex){+.+.}, at: 
> > lock_all_vcpus+0x64/0xd0
> > [   69.687698]  #3: c17e548d (&vcpu->mutex){+.+.}, at: 
> > lock_all_vcpus+0x64/0xd0
> > [   69.688475]  #4: ba386017 (&vcpu->mutex){+.+.}, at: 
> > lock_all_vcpus+0x64/0xd0
> > [   69.689978]  #5: c2c3c335 (&vcpu->mutex){+.+.}, at: 
> > lock_all_vcpus+0x64/0xd0
> > [   69.690729]
> > [   69.690729] stack backtrace:
> > [   69.691151] CPU: 2 PID: 4097 Comm: qemu-system-aar Not tainted 
> > 5.1.0-rc1-8-g600025238f51-dirty #18
> > [   69.691984] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS 
> > 2019.04-rc3-00124-g2feec69fb1 03/15/2019
> > [   69.692831] Call trace:
> > [   69.694072]  lockdep_rcu_suspicious+0xcc/0x110
> > [   69.694490]  gfn_to_memslot+0x174/0x190
> > [   69.694853]  kvm_write_guest+0x50/0xb0
> > [   69.695209]  vgic_its_save_tables_v0+0x248/0x330
> > [   69.695639]  vgic_its_set_attr+0x298/0x3a0
> > [   69.696024]  kvm_device_ioctl_attr+0x9c/0xd8
> > [   69.696424]  kvm_device_ioctl+0x8c/0xf8
> > [   69.696788]  do_vfs_ioctl+0xc8/0x960
> > [   69.697128]  ksys_ioctl+0x8c/0xa0
> > [   69.697445]  __arm64_sys_ioctl+0x28/0x38
> > [   69.697817]  el0_svc_common+0xd8/0x138
> > [   69.698173]  el0_svc_handler+0x38/0x78
> > [   69.698528]  el0_svc+0x8/0xc
> > 
> > The fix is to obviously take the srcu lock, just like we do on the
> > read side of things since bf308242ab98. One wonders why this wasn't
> > fixed at the same time, but hey...
> > 
> > Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() 
> > calls with SRCU lock")
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm/include/asm/kvm_mmu.h   | 11 +++
> >  arch/arm64/include/asm/kvm_mmu.h | 11 +++
> >  virt/kvm/arm/vgic/vgic-its.c |  8 
> >  3 files changed, 26 insertions(+), 4 deletions(-)

[...]

> Don't we need to use kvm_write_guest_lock() as well in
> vgic_v3_lpi_sync_pending_status and vgic_v3_save_pending_tables?

Ah, well spotted. Yes, that's needed too. I'll fix that.

Thanks,

M.

-- 
Jazz is not dead, it just smell funny.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 2/2] KVM: arm/arm64: vgic-its: Take the srcu lock when parsing the memslots

2019-03-19 Thread Auger Eric
Hi Marc,

On 3/19/19 2:30 PM, Marc Zyngier wrote:
> Calling kvm_is_visible_gfn() implies that we're parsing the memslots,
> and doing this without the srcu lock is frowned upon:
> 
> [12704.164532] =
> [12704.164544] WARNING: suspicious RCU usage
> [12704.164560] 5.1.0-rc1-8-g600025238f51-dirty #16 Tainted: GW
> [12704.164573] -
> [12704.164589] ./include/linux/kvm_host.h:605 suspicious 
> rcu_dereference_check() usage!
> [12704.164602] other info that might help us debug this:
> [12704.164616] rcu_scheduler_active = 2, debug_locks = 1
> [12704.164631] 6 locks held by qemu-system-aar/13968:
> [12704.164644]  #0: 7ebdae4f (&kvm->lock){+.+.}, at: 
> vgic_its_set_attr+0x244/0x3a0
> [12704.164691]  #1: 7d751022 (&its->its_lock){+.+.}, at: 
> vgic_its_set_attr+0x250/0x3a0
> [12704.164726]  #2: 219d2706 (&vcpu->mutex){+.+.}, at: 
> lock_all_vcpus+0x64/0xd0
> [12704.164761]  #3: a760aecd (&vcpu->mutex){+.+.}, at: 
> lock_all_vcpus+0x64/0xd0
> [12704.164794]  #4: 0ef8e31d (&vcpu->mutex){+.+.}, at: 
> lock_all_vcpus+0x64/0xd0
> [12704.164827]  #5: 7a872093 (&vcpu->mutex){+.+.}, at: 
> lock_all_vcpus+0x64/0xd0
> [12704.164861] stack backtrace:
> [12704.164878] CPU: 2 PID: 13968 Comm: qemu-system-aar Tainted: GW
>  5.1.0-rc1-8-g600025238f51-dirty #16
> [12704.164887] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS 
> 2019.04-rc3-00124-g2feec69fb1 03/15/2019
> [12704.164896] Call trace:
> [12704.164910]  dump_backtrace+0x0/0x138
> [12704.164920]  show_stack+0x24/0x30
> [12704.164934]  dump_stack+0xbc/0x104
> [12704.164946]  lockdep_rcu_suspicious+0xcc/0x110
> [12704.164958]  gfn_to_memslot+0x174/0x190
> [12704.164969]  kvm_is_visible_gfn+0x28/0x70
> [12704.164980]  vgic_its_check_id.isra.0+0xec/0x1e8
> [12704.164991]  vgic_its_save_tables_v0+0x1ac/0x330
> [12704.165001]  vgic_its_set_attr+0x298/0x3a0
> [12704.165012]  kvm_device_ioctl_attr+0x9c/0xd8
> [12704.165022]  kvm_device_ioctl+0x8c/0xf8
> [12704.165035]  do_vfs_ioctl+0xc8/0x960
> [12704.165045]  ksys_ioctl+0x8c/0xa0
> [12704.165055]  __arm64_sys_ioctl+0x28/0x38
> [12704.165067]  el0_svc_common+0xd8/0x138
> [12704.165078]  el0_svc_handler+0x38/0x78
> [12704.165089]  el0_svc+0x8/0xc
> 
> Make sure the lock is taken when doing this.
> 
> Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() 
> calls with SRCU lock")
> Signed-off-by: Marc Zyngier 
Reviewed-by: Eric Auger 

Thanks

Eric
> ---
>  virt/kvm/arm/vgic/vgic-its.c | 13 ++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
> index bb76220f2f30..7543f166943d 100644
> --- a/virt/kvm/arm/vgic/vgic-its.c
> +++ b/virt/kvm/arm/vgic/vgic-its.c
> @@ -880,8 +880,9 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 
> baser, u32 id,
>   u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
>   phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
>   int esz = GITS_BASER_ENTRY_SIZE(baser);
> - int index;
> + int index, idx;
>   gfn_t gfn;
> + bool ret;
>  
>   switch (type) {
>   case GITS_BASER_TYPE_DEVICE:
> @@ -908,7 +909,8 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 
> baser, u32 id,
>  
>   if (eaddr)
>   *eaddr = addr;
> - return kvm_is_visible_gfn(its->dev->kvm, gfn);
> +
> + goto out;
>   }
>  
>   /* calculate and check the index into the 1st level */
> @@ -938,7 +940,12 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 
> baser, u32 id,
>  
>   if (eaddr)
>   *eaddr = indirect_ptr;
> - return kvm_is_visible_gfn(its->dev->kvm, gfn);
> +
> +out:
> + idx = srcu_read_lock(&its->dev->kvm->srcu);
> + ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
> + srcu_read_unlock(&its->dev->kvm->srcu, idx);
> + return ret;
>  }
>  
>  static int vgic_its_alloc_collection(struct vgic_its *its,
> 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 1/2] KVM: arm/arm64: vgic-its: Take the srcu lock when writing to guest memory

2019-03-19 Thread Auger Eric
Hi Marc,

On 3/19/19 2:30 PM, Marc Zyngier wrote:
> When halting a guest, QEMU flushes the virtual ITS caches, which
> amounts to writing to the various tables that the guest has allocated.
> 
> When doing this, we fail to take the srcu lock, and the kernel
> shouts loudly if running a lockdep kernel:
> 
> [   69.680416] =
> [   69.680819] WARNING: suspicious RCU usage
> [   69.681526] 5.1.0-rc1-8-g600025238f51-dirty #18 Not tainted
> [   69.682096] -
> [   69.682501] ./include/linux/kvm_host.h:605 suspicious 
> rcu_dereference_check() usage!
> [   69.683225]
> [   69.683225] other info that might help us debug this:
> [   69.683225]
> [   69.683975]
> [   69.683975] rcu_scheduler_active = 2, debug_locks = 1
> [   69.684598] 6 locks held by qemu-system-aar/4097:
> [   69.685059]  #0: 34196013 (&kvm->lock){+.+.}, at: 
> vgic_its_set_attr+0x244/0x3a0
> [   69.686087]  #1: f2ed935e (&its->its_lock){+.+.}, at: 
> vgic_its_set_attr+0x250/0x3a0
> [   69.686919]  #2: 5e71ea54 (&vcpu->mutex){+.+.}, at: 
> lock_all_vcpus+0x64/0xd0
> [   69.687698]  #3: c17e548d (&vcpu->mutex){+.+.}, at: 
> lock_all_vcpus+0x64/0xd0
> [   69.688475]  #4: ba386017 (&vcpu->mutex){+.+.}, at: 
> lock_all_vcpus+0x64/0xd0
> [   69.689978]  #5: c2c3c335 (&vcpu->mutex){+.+.}, at: 
> lock_all_vcpus+0x64/0xd0
> [   69.690729]
> [   69.690729] stack backtrace:
> [   69.691151] CPU: 2 PID: 4097 Comm: qemu-system-aar Not tainted 
> 5.1.0-rc1-8-g600025238f51-dirty #18
> [   69.691984] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS 
> 2019.04-rc3-00124-g2feec69fb1 03/15/2019
> [   69.692831] Call trace:
> [   69.694072]  lockdep_rcu_suspicious+0xcc/0x110
> [   69.694490]  gfn_to_memslot+0x174/0x190
> [   69.694853]  kvm_write_guest+0x50/0xb0
> [   69.695209]  vgic_its_save_tables_v0+0x248/0x330
> [   69.695639]  vgic_its_set_attr+0x298/0x3a0
> [   69.696024]  kvm_device_ioctl_attr+0x9c/0xd8
> [   69.696424]  kvm_device_ioctl+0x8c/0xf8
> [   69.696788]  do_vfs_ioctl+0xc8/0x960
> [   69.697128]  ksys_ioctl+0x8c/0xa0
> [   69.697445]  __arm64_sys_ioctl+0x28/0x38
> [   69.697817]  el0_svc_common+0xd8/0x138
> [   69.698173]  el0_svc_handler+0x38/0x78
> [   69.698528]  el0_svc+0x8/0xc
> 
> The fix is to obviously take the srcu lock, just like we do on the
> read side of things since bf308242ab98. One wonders why this wasn't
> fixed at the same time, but hey...
> 
> Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() 
> calls with SRCU lock")
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm/include/asm/kvm_mmu.h   | 11 +++
>  arch/arm64/include/asm/kvm_mmu.h | 11 +++
>  virt/kvm/arm/vgic/vgic-its.c |  8 
>  3 files changed, 26 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 2de96a180166..31de4ab93005 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -381,6 +381,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
>   return ret;
>  }
>  
> +static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
> +const void *data, unsigned long len)
> +{
> + int srcu_idx = srcu_read_lock(&kvm->srcu);
> + int ret = kvm_write_guest(kvm, gpa, data, len);
> +
> + srcu_read_unlock(&kvm->srcu, srcu_idx);
> +
> + return ret;
> +}
> +
>  static inline void *kvm_get_hyp_vector(void)
>  {
>   switch(read_cpuid_part()) {
> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> b/arch/arm64/include/asm/kvm_mmu.h
> index b0742a16c6c9..ebeefcf835e8 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -445,6 +445,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
>   return ret;
>  }
>  
> +static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
> +const void *data, unsigned long len)
> +{
> + int srcu_idx = srcu_read_lock(&kvm->srcu);
> + int ret = kvm_write_guest(kvm, gpa, data, len);
> +
> + srcu_read_unlock(&kvm->srcu, srcu_idx);
> +
> + return ret;
> +}
> +
>  #ifdef CONFIG_KVM_INDIRECT_VECTORS
>  /*
>   * EL2 vectors can be mapped and rerouted in a number of ways,
> diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
> index a60a768d9258..bb76220f2f30 100644
> --- a/virt/kvm/arm/vgic/vgic-its.c
> +++ b/virt/kvm/arm/vgic/vgic-its.c
> @@ -2058,7 +2058,7 @@ static int vgic_its_save_ite(struct vgic_its *its, 
> struct its_device *dev,
>  ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
>   ite->collection->collection_id;
>   val = cpu_to_le64(val);
> - return kvm_write_guest(kvm, gpa, &val, ite_esz);
> + return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
>  }
>  
>  /**
> @@ -2205,7 +2205,7 @@ static int vgic_its_save_dte(struct vgic_its *its, 
> struct its_de

Re: [RFC] arm/cpu: fix soft lockup panic after resuming from stop

2019-03-19 Thread Heyi Guo

Hi Christoffer and Steve,


On 2019/3/15 16:59, Christoffer Dall wrote:

Hi Steve,

On Wed, Mar 13, 2019 at 10:11:30AM +, Steven Price wrote:

Personally I think what we need is:

* Either a patch like the one from Heyi Guo (save/restore CNTVCT_EL0) or
alternatively hooking up KVM_KVMCLOCK_CTRL to prevent the watchdog
firing when user space explicitly stops scheduling the guest for a while.

If we save/restore CNTVCT_EL0 and the warning goes away, does the guest
wall clock timekeeping get all confused and does it figure this out
automagically somehow?

What's the source for guest wall clock timekeeping in the current ARM64
implementation? Is it the value from CNTP_TVAL_EL0? Will the guest OS keep
track of this? Or is the wall clock simply the platform RTC?

I tested the RFC change and did not see anything unusual. Did I miss something?
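
For reference, the save/restore approach discussed above boils down to
reading the virtual counter before stopping the vcpus and writing it back
before resuming. A minimal userspace sketch -- not the actual RFC patch,
and assuming the counter is exposed through the ONE_REG interface as
KVM_REG_ARM_TIMER_CNT -- could look like:

#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Read the guest's virtual counter for one vcpu (call on VM stop). */
static int timer_cnt_get(int vcpu_fd, uint64_t *cnt)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_ARM_TIMER_CNT,
		.addr = (uint64_t)(uintptr_t)cnt,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}

/* Write the saved value back on resume, so the guest does not observe a
 * large jump in CNTVCT_EL0 across the stop. */
static int timer_cnt_set(int vcpu_fd, uint64_t cnt)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_ARM_TIMER_CNT,
		.addr = (uint64_t)(uintptr_t)&cnt,
	};

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}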



Does KVM_KVMCLOCK_CTRL solve that problem?

It seems KVM_KVMCLOCK_CTRL is dedicated to the guest pause/resume scenario, but
it relies on pvclock in both KVM and the guest, and right now only x86 supports
it :(

Could Steve provide more insights about the whole thing?

Thanks,
Heyi




* KVM itself saving/restoring CNTVCT_EL0 during suspend/resume so the
guest doesn't see time pass during a suspend.

This smells like policy to me so I'd much prefer keeping as much
functionality in user space as possible.  If we already have the APIs we
need from KVM, let's use them.


* Something equivalent to MSR_KVM_WALL_CLOCK_NEW for arm which allows
the guest to query the wall clock time from the host and provides an
offset between CNTVCT_EL0 to wall clock time which the KVM can update
during suspend/resume. This means that during a suspend/resume the guest
can observe that wall clock time has passed, without having to be
bothered about CNTVCT_EL0 jumping forwards.


Isn't the proper Arm architectural solution for this to read the
physical counter for wall clock time keeping ?

(Yes that will require a trap on physical counter reads after migration
on current systems, but migration sucks in terms of timekeeping
already.)


Thanks,

 Christoffer

.




___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH] kvm: arm: Fix handling of stage2 huge mappings

2019-03-19 Thread Suzuki K Poulose
We rely on the mmu_notifier callbacks to handle the split/merge
of huge pages and thus we are guaranteed that, while creating a
block mapping, either the entire block is unmapped at stage2 or it
is missing permission.

However, we miss a case where the block mapping is split for the dirty
logging case and then could later be made a block mapping again, if we
cancel the dirty logging. This not only creates inconsistent TLB entries
for the pages in the block, but also leaks the table pages for the
PMD level.

Handle this corner case for the huge mappings at stage2 by
unmapping the non-huge mapping for the block. This could potentially
release the upper level table. So we need to restart the table walk
once we unmap the range.

Fixes: ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages")
Reported-by: Zheng Xiang 
Cc: Zheng Xiang 
Cc: Zhengui Yu 
Cc: Marc Zyngier 
Cc: Christoffer Dall 
Signed-off-by: Suzuki K Poulose 
---
 virt/kvm/arm/mmu.c | 63 ++
 1 file changed, 45 insertions(+), 18 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index fce0983..6ad6f19d 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1060,25 +1060,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct 
kvm_mmu_memory_cache
 {
pmd_t *pmd, old_pmd;
 
+retry:
pmd = stage2_get_pmd(kvm, cache, addr);
VM_BUG_ON(!pmd);
 
old_pmd = *pmd;
+   /*
+* Multiple vcpus faulting on the same PMD entry, can
+* lead to them sequentially updating the PMD with the
+* same value. Following the break-before-make
+* (pmd_clear() followed by tlb_flush()) process can
+* hinder forward progress due to refaults generated
+* on missing translations.
+*
+* Skip updating the page table if the entry is
+* unchanged.
+*/
+   if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+   return 0;
+
if (pmd_present(old_pmd)) {
/*
-* Multiple vcpus faulting on the same PMD entry, can
-* lead to them sequentially updating the PMD with the
-* same value. Following the break-before-make
-* (pmd_clear() followed by tlb_flush()) process can
-* hinder forward progress due to refaults generated
-* on missing translations.
+* If we already have PTE level mapping for this block,
+* we must unmap it to avoid inconsistent TLB state and
+* leaking the table page. We could end up in this situation
+* if the memory slot was marked for dirty logging and was
+* reverted, leaving PTE level mappings for the pages accessed
+* during the period. So, unmap the PTE level mapping for this
+* block and retry, as we could have released the upper level
+* table in the process.
 *
-* Skip updating the page table if the entry is
-* unchanged.
+* Normal THP split/merge follows mmu_notifier callbacks and do
+* get handled accordingly.
 */
-   if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-   return 0;
-
+   if (!pmd_thp_or_huge(old_pmd)) {
+   unmap_stage2_range(kvm, addr & S2_PMD_MASK, 
S2_PMD_SIZE);
+   goto retry;
+   }
/*
 * Mapping in huge pages should only happen through a
 * fault.  If a page is merged into a transparent huge
@@ -1090,8 +1108,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct 
kvm_mmu_memory_cache
 * should become splitting first, unmapped, merged,
 * and mapped back in on-demand.
 */
-   VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+   WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
} else {
@@ -1107,6 +1124,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct 
kvm_mmu_memory_cache *cac
 {
pud_t *pudp, old_pud;
 
+retry:
pudp = stage2_get_pud(kvm, cache, addr);
VM_BUG_ON(!pudp);
 
@@ -1114,16 +1132,25 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct 
kvm_mmu_memory_cache *cac
 
/*
 * A large number of vcpus faulting on the same stage 2 entry,
-* can lead to a refault due to the
-* stage2_pud_clear()/tlb_flush(). Skip updating the page
-* tables if there is no change.
+* can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+* Skip updating the page tables if there is no change.
 */
if (pud_val(old_pud) == pud_val(*new_pudp))
return 0;
 
if (stage2_pud_present(kvm, old_pud))

[PATCH 2/2] KVM: arm/arm64: vgic-its: Take the srcu lock when parsing the memslots

2019-03-19 Thread Marc Zyngier
Calling kvm_is_visible_gfn() implies that we're parsing the memslots,
and doing this without the srcu lock is frowned upon:

[12704.164532] =
[12704.164544] WARNING: suspicious RCU usage
[12704.164560] 5.1.0-rc1-8-g600025238f51-dirty #16 Tainted: GW
[12704.164573] -
[12704.164589] ./include/linux/kvm_host.h:605 suspicious 
rcu_dereference_check() usage!
[12704.164602] other info that might help us debug this:
[12704.164616] rcu_scheduler_active = 2, debug_locks = 1
[12704.164631] 6 locks held by qemu-system-aar/13968:
[12704.164644]  #0: 7ebdae4f (&kvm->lock){+.+.}, at: 
vgic_its_set_attr+0x244/0x3a0
[12704.164691]  #1: 7d751022 (&its->its_lock){+.+.}, at: 
vgic_its_set_attr+0x250/0x3a0
[12704.164726]  #2: 219d2706 (&vcpu->mutex){+.+.}, at: 
lock_all_vcpus+0x64/0xd0
[12704.164761]  #3: a760aecd (&vcpu->mutex){+.+.}, at: 
lock_all_vcpus+0x64/0xd0
[12704.164794]  #4: 0ef8e31d (&vcpu->mutex){+.+.}, at: 
lock_all_vcpus+0x64/0xd0
[12704.164827]  #5: 7a872093 (&vcpu->mutex){+.+.}, at: 
lock_all_vcpus+0x64/0xd0
[12704.164861] stack backtrace:
[12704.164878] CPU: 2 PID: 13968 Comm: qemu-system-aar Tainted: GW  
   5.1.0-rc1-8-g600025238f51-dirty #16
[12704.164887] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS 
2019.04-rc3-00124-g2feec69fb1 03/15/2019
[12704.164896] Call trace:
[12704.164910]  dump_backtrace+0x0/0x138
[12704.164920]  show_stack+0x24/0x30
[12704.164934]  dump_stack+0xbc/0x104
[12704.164946]  lockdep_rcu_suspicious+0xcc/0x110
[12704.164958]  gfn_to_memslot+0x174/0x190
[12704.164969]  kvm_is_visible_gfn+0x28/0x70
[12704.164980]  vgic_its_check_id.isra.0+0xec/0x1e8
[12704.164991]  vgic_its_save_tables_v0+0x1ac/0x330
[12704.165001]  vgic_its_set_attr+0x298/0x3a0
[12704.165012]  kvm_device_ioctl_attr+0x9c/0xd8
[12704.165022]  kvm_device_ioctl+0x8c/0xf8
[12704.165035]  do_vfs_ioctl+0xc8/0x960
[12704.165045]  ksys_ioctl+0x8c/0xa0
[12704.165055]  __arm64_sys_ioctl+0x28/0x38
[12704.165067]  el0_svc_common+0xd8/0x138
[12704.165078]  el0_svc_handler+0x38/0x78
[12704.165089]  el0_svc+0x8/0xc

Make sure the lock is taken when doing this.

Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() calls 
with SRCU lock")
Signed-off-by: Marc Zyngier 
---
 virt/kvm/arm/vgic/vgic-its.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index bb76220f2f30..7543f166943d 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -880,8 +880,9 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 
baser, u32 id,
u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
int esz = GITS_BASER_ENTRY_SIZE(baser);
-   int index;
+   int index, idx;
gfn_t gfn;
+   bool ret;
 
switch (type) {
case GITS_BASER_TYPE_DEVICE:
@@ -908,7 +909,8 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 
baser, u32 id,
 
if (eaddr)
*eaddr = addr;
-   return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+   goto out;
}
 
/* calculate and check the index into the 1st level */
@@ -938,7 +940,12 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 
baser, u32 id,
 
if (eaddr)
*eaddr = indirect_ptr;
-   return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+out:
+   idx = srcu_read_lock(&its->dev->kvm->srcu);
+   ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+   srcu_read_unlock(&its->dev->kvm->srcu, idx);
+   return ret;
 }
 
 static int vgic_its_alloc_collection(struct vgic_its *its,
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 1/2] KVM: arm/arm64: vgic-its: Take the srcu lock when writing to guest memory

2019-03-19 Thread Marc Zyngier
When halting a guest, QEMU flushes the virtual ITS caches, which
amounts to writing to the various tables that the guest has allocated.

When doing this, we fail to take the srcu lock, and the kernel
shouts loudly if running a lockdep kernel:

[   69.680416] =
[   69.680819] WARNING: suspicious RCU usage
[   69.681526] 5.1.0-rc1-8-g600025238f51-dirty #18 Not tainted
[   69.682096] -
[   69.682501] ./include/linux/kvm_host.h:605 suspicious 
rcu_dereference_check() usage!
[   69.683225]
[   69.683225] other info that might help us debug this:
[   69.683225]
[   69.683975]
[   69.683975] rcu_scheduler_active = 2, debug_locks = 1
[   69.684598] 6 locks held by qemu-system-aar/4097:
[   69.685059]  #0: 34196013 (&kvm->lock){+.+.}, at: 
vgic_its_set_attr+0x244/0x3a0
[   69.686087]  #1: f2ed935e (&its->its_lock){+.+.}, at: 
vgic_its_set_attr+0x250/0x3a0
[   69.686919]  #2: 5e71ea54 (&vcpu->mutex){+.+.}, at: 
lock_all_vcpus+0x64/0xd0
[   69.687698]  #3: c17e548d (&vcpu->mutex){+.+.}, at: 
lock_all_vcpus+0x64/0xd0
[   69.688475]  #4: ba386017 (&vcpu->mutex){+.+.}, at: 
lock_all_vcpus+0x64/0xd0
[   69.689978]  #5: c2c3c335 (&vcpu->mutex){+.+.}, at: 
lock_all_vcpus+0x64/0xd0
[   69.690729]
[   69.690729] stack backtrace:
[   69.691151] CPU: 2 PID: 4097 Comm: qemu-system-aar Not tainted 
5.1.0-rc1-8-g600025238f51-dirty #18
[   69.691984] Hardware name: rockchip evb_rk3399/evb_rk3399, BIOS 
2019.04-rc3-00124-g2feec69fb1 03/15/2019
[   69.692831] Call trace:
[   69.694072]  lockdep_rcu_suspicious+0xcc/0x110
[   69.694490]  gfn_to_memslot+0x174/0x190
[   69.694853]  kvm_write_guest+0x50/0xb0
[   69.695209]  vgic_its_save_tables_v0+0x248/0x330
[   69.695639]  vgic_its_set_attr+0x298/0x3a0
[   69.696024]  kvm_device_ioctl_attr+0x9c/0xd8
[   69.696424]  kvm_device_ioctl+0x8c/0xf8
[   69.696788]  do_vfs_ioctl+0xc8/0x960
[   69.697128]  ksys_ioctl+0x8c/0xa0
[   69.697445]  __arm64_sys_ioctl+0x28/0x38
[   69.697817]  el0_svc_common+0xd8/0x138
[   69.698173]  el0_svc_handler+0x38/0x78
[   69.698528]  el0_svc+0x8/0xc

The fix is to obviously take the srcu lock, just like we do on the
read side of things since bf308242ab98. One wonders why this wasn't
fixed at the same time, but hey...

Fixes: bf308242ab98 ("KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest() calls 
with SRCU lock")
Signed-off-by: Marc Zyngier 
---
 arch/arm/include/asm/kvm_mmu.h   | 11 +++
 arch/arm64/include/asm/kvm_mmu.h | 11 +++
 virt/kvm/arm/vgic/vgic-its.c |  8 
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 2de96a180166..31de4ab93005 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -381,6 +381,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
return ret;
 }
 
+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+  const void *data, unsigned long len)
+{
+   int srcu_idx = srcu_read_lock(&kvm->srcu);
+   int ret = kvm_write_guest(kvm, gpa, data, len);
+
+   srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+   return ret;
+}
+
 static inline void *kvm_get_hyp_vector(void)
 {
switch(read_cpuid_part()) {
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b0742a16c6c9..ebeefcf835e8 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -445,6 +445,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
return ret;
 }
 
+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+  const void *data, unsigned long len)
+{
+   int srcu_idx = srcu_read_lock(&kvm->srcu);
+   int ret = kvm_write_guest(kvm, gpa, data, len);
+
+   srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+   return ret;
+}
+
 #ifdef CONFIG_KVM_INDIRECT_VECTORS
 /*
  * EL2 vectors can be mapped and rerouted in a number of ways,
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index a60a768d9258..bb76220f2f30 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -2058,7 +2058,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct 
its_device *dev,
   ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
ite->collection->collection_id;
val = cpu_to_le64(val);
-   return kvm_write_guest(kvm, gpa, &val, ite_esz);
+   return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
 }
 
 /**
@@ -2205,7 +2205,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct 
its_device *dev,
   (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
(dev->num_eventid_bits - 1));
val = cpu_to_le64(val);
-   return kvm_write_guest(kvm, ptr, &val, dte_esz);
+   return kvm_write_guest_lock(kvm, ptr, &val, dte_

[PATCH 0/2] KVM: arm/arm64: vgic-its: Guest memory access fixes

2019-03-19 Thread Marc Zyngier
While messing with something completely different, I managed to
trigger a number of RCU warnings, due to the lack of srcu locking
in the ITS code.

Andre partially addressed those last year[1], but there were some
more left.

[1] http://lists.infradead.org/pipermail/linux-arm-kernel/2018-May/577056.html

Marc Zyngier (2):
  KVM: arm/arm64: vgic-its: Take the srcu lock when writing to guest
memory
  KVM: arm/arm64: vgic-its: Take the srcu lock when parsing the memslots

 arch/arm/include/asm/kvm_mmu.h   | 11 +++
 arch/arm64/include/asm/kvm_mmu.h | 11 +++
 virt/kvm/arm/vgic/vgic-its.c | 21 ++---
 3 files changed, 36 insertions(+), 7 deletions(-)

-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[RFC] Question about enable doorbell irq and halt_poll process

2019-03-19 Thread Tangnianyao (ICT)
Hi, all

Using GICv4, when the guest is waiting for an irq, it executes wfi and traps to
kvm. When a vlpi is forwarded to the PE after its_vpe_deschedule, but before
halt_poll runs in kvm_vcpu_block, halt_poll may add latency before this vlpi
reaches the guest. The halt_poll loop checks whether there's a pending irq for
the vcpu using pending_last. However, the doorbell is not enabled at this point,
so neither the vlpi nor a doorbell can set pending_last to true and stop
halt_poll. Polling will therefore run until the halt_poll time ends (if no other
physical irq arrives in the meantime), and then the vcpu is scheduled out.
The pending vlpi has to wait until the vcpu is scheduled in again.

Should we enable the doorbell before the halt_poll process?
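
For readers not familiar with the generic code, the halt-polling part of
kvm_vcpu_block() has roughly the shape sketched below (paraphrased for
illustration, not the exact upstream code). The question is whether the
GICv4 doorbell should already be live before this loop, so that a
forwarded vlpi can set pending_last and terminate the poll early.

/*
 * Rough, simplified shape of the halt-polling step in kvm_vcpu_block();
 * paraphrased for illustration only.
 */
static void halt_poll_sketch(struct kvm_vcpu *vcpu)
{
	if (vcpu->halt_poll_ns) {
		ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);

		/* <-- the doorbell would need to be enabled by this point */
		do {
			/*
			 * Only interrupts visible to the vgic (e.g. via
			 * pending_last) can stop the poll; a vlpi sitting
			 * in the vPT is not seen here.
			 */
			if (kvm_vcpu_check_block(vcpu) < 0)
				return;	/* pending work: stop polling */
			cpu_relax();
		} while (ktime_before(ktime_get(), stop));
	}

	/* ...otherwise fall through to actually blocking the vcpu. */
}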

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC PATCH] KVM: arm/arm64: Enable direct irqfd MSI injection

2019-03-19 Thread Marc Zyngier
On Tue, 19 Mar 2019 09:09:43 +0800
Zenghui Yu  wrote:

> Hi all,
> 
> On 2019/3/18 3:35, Marc Zyngier wrote:
> > On Sun, 17 Mar 2019 14:36:13 +,
> > Zenghui Yu  wrote:  
> >>
> >> Currently, IRQFD on arm still uses the deferred workqueue mechanism
> >> to inject interrupts into guest, which will likely lead to a busy
> >> context-switching from/to the kworker thread. This overhead is for
> >> no purpose (only in my view ...) and will result in an interrupt
> >> performance degradation.
> >>
> >> Implement kvm_arch_set_irq_inatomic() for arm/arm64 to support direct
> >> irqfd MSI injection, by which we can get rid of the annoying latency.
> >> As a result, irqfd MSI intensive scenarios (e.g., DPDK with high packet
> >> processing workloads) will benefit from it.
> >>
> >> Signed-off-by: Zenghui Yu 
> >> ---
> >>
> >> It seems that only MSI will follow the IRQFD path, did I miss something?
> >>
> >> This patch is still under test and sent out for early feedback. If I have
> >> any mis-understanding, please fix me up and let me know. Thanks!  
> > 
> > As mentioned by other folks in the thread, this is clearly wrong. The
> > first thing kvm_inject_msi does is to lock the corresponding ITS using
> > a mutex. So the "no purpose" bit was a bit too quick.
> > 
> > When doing this kind of work, I suggest you enable lockdep and all the
> > related checkers. Also, for any optimisation, please post actual
> > numbers for the relevant benchmarks. Saying "application X will
> > benefit from it" is meaningless without any actual data.
> >   
> >>
> >> ---
> >>   virt/kvm/arm/vgic/trace.h  | 22 ++
> >>   virt/kvm/arm/vgic/vgic-irqfd.c | 21 +
> >>   2 files changed, 43 insertions(+)
> >>
> >> diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h
> >> index 55fed77..bc1f4db 100644
> >> --- a/virt/kvm/arm/vgic/trace.h
> >> +++ b/virt/kvm/arm/vgic/trace.h
> >> @@ -27,6 +27,28 @@
> >>  __entry->vcpu_id, __entry->irq, __entry->level)
> >>   );  
> >>   >> +TRACE_EVENT(kvm_arch_set_irq_inatomic,  
> >> +  TP_PROTO(u32 gsi, u32 type, int level, int irq_source_id),
> >> +  TP_ARGS(gsi, type, level, irq_source_id),
> >> +
> >> +  TP_STRUCT__entry(
> >> +  __field(u32,gsi )
> >> +  __field(u32,type)
> >> +  __field(int,level   )
> >> +  __field(int,irq_source_id   )
> >> +  ),
> >> +
> >> +  TP_fast_assign(
> >> +  __entry->gsi= gsi;
> >> +  __entry->type   = type;
> >> +  __entry->level  = level;
> >> +  __entry->irq_source_id  = irq_source_id;
> >> +  ),
> >> +
> >> +  TP_printk("gsi %u type %u level %d source %d", __entry->gsi,
> >> +__entry->type, __entry->level, __entry->irq_source_id)
> >> +);
> >> +
> >>   #endif /* _TRACE_VGIC_H */  
> >>   >>   #undef TRACE_INCLUDE_PATH  
> >> diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c 
> >> b/virt/kvm/arm/vgic/vgic-irqfd.c
> >> index 99e026d..4cfc3f4 100644
> >> --- a/virt/kvm/arm/vgic/vgic-irqfd.c
> >> +++ b/virt/kvm/arm/vgic/vgic-irqfd.c
> >> @@ -19,6 +19,7 @@
> >>   #include 
> >>   #include 
> >>   #include "vgic.h"
> >> +#include "trace.h"  
> >>   >>   /**  
> >>* vgic_irqfd_set_irq: inject the IRQ corresponding to the
> >> @@ -105,6 +106,26 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry 
> >> *e,
> >>return vgic_its_inject_msi(kvm, &msi);
> >>   }  
> >>   >> +/**  
> >> + * kvm_arch_set_irq_inatomic: fast-path for irqfd injection
> >> + *
> >> + * Currently only direct MSI injecton is supported.
> >> + */
> >> +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
> >> +struct kvm *kvm, int irq_source_id, int level,
> >> +bool line_status)
> >> +{
> >> +  int ret;
> >> +
> >> +  trace_kvm_arch_set_irq_inatomic(e->gsi, e->type, level, irq_source_id);
> >> +
> >> +  if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
> >> +  return -EWOULDBLOCK;
> >> +
> >> +  ret = kvm_set_msi(e, kvm, irq_source_id, level, line_status);
> >> +  return ret;
> >> +}
> >> +  
> > 
> > Although we've established that the approach is wrong, maybe we can
> > look at improving this aspect.
> > 
> > A first approach would be to keep a small cache of the last few
> > successful translations for this ITS, cache that could be looked-up by
> > holding a spinlock instead. A hit in this cache could directly be
> > injected. Any command that invalidates or changes anything (DISCARD,
> > INV, INVALL, MAPC with V=0, MAPD with V=0, MOVALL, MOVI) should nuke
> > the cache altogether.
> > 
> > Of course, all of that needs to be quantified.  
> 
> Thanks for all of your explanations, especially for Marc's suggestions!
> It took me a long time to figure out my mistakes, since I am not very
> familiar with the locking stuff. Now I have to apologize for my noise.

No need to apologize. The 

Re: [RFC] Question about TLB flush while set Stage-2 huge pages

2019-03-19 Thread Zenghui Yu

Hi Suzuki,

On 2019/3/19 1:34, Suzuki K Poulose wrote:

Hi !
On Sun, Mar 17, 2019 at 09:34:11PM +0800, Zenghui Yu wrote:

Hi Suzuki,

---8<---

test: kvm: arm: Maybe two more fixes

Applied based on Suzuki's patch.

Signed-off-by: Zenghui Yu 
---
  virt/kvm/arm/mmu.c | 8 ++--
  1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 05765df..ccd5d5d 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1089,7 +1089,9 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct
kvm_mmu_memory_cache
 * Normal THP split/merge follows mmu_notifier
 * callbacks and do get handled accordingly.
 */
-   unmap_stage2_range(kvm, (addr & S2_PMD_MASK), 
S2_PMD_SIZE);
+   addr &= S2_PMD_MASK;
+   unmap_stage2_ptes(kvm, pmd, addr, addr + S2_PMD_SIZE);
+   get_page(virt_to_page(pmd));
} else {

/*
@@ -1138,7 +1140,9 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct
kvm_mmu_memory_cache *cac
if (stage2_pud_present(kvm, old_pud)) {
/* If we have PTE level mapping, unmap the entire range */
if (WARN_ON_ONCE(!stage2_pud_huge(kvm, old_pud))) {
-   unmap_stage2_range(kvm, addr & S2_PUD_MASK, 
S2_PUD_SIZE);
+   addr &= S2_PUD_MASK;
+   unmap_stage2_pmds(kvm, pudp, addr, addr + S2_PUD_SIZE);
+   get_page(virt_to_page(pudp));
} else {
stage2_pud_clear(kvm, pudp);
kvm_tlb_flush_vmid_ipa(kvm, addr);


This makes it a bit tricky to follow the code. The other option is to
do something like :


Yes.




---8>---

kvm: arm: Fix handling of stage2 huge mappings

We rely on the mmu_notifier callbacks to handle the split/merging
of huge pages and thus we are guaranteed that, while creating a
block mapping, the entire block is unmapped at stage2. However,
we miss a case where the block mapping is split for the dirty
logging case and then could later be made a block mapping again,
if we cancel the dirty logging. This not only creates inconsistent
TLB entries for the pages in the block, but also leaks the table
pages for the PMD level.

Handle these corner cases for the huge mappings at stage2 by
unmapping the PTE level mapping. This could potentially release
the upper level table. So we need to restart the table walk
once we unmap the range.

Signed-off-by: Suzuki K Poulose 
---
  virt/kvm/arm/mmu.c | 57 +++---
  1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index fce0983..a38a3f1 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1060,25 +1060,41 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct 
kvm_mmu_memory_cache
  {
pmd_t *pmd, old_pmd;
  
+retry:

pmd = stage2_get_pmd(kvm, cache, addr);
VM_BUG_ON(!pmd);
  
  	old_pmd = *pmd;

+   /*
+* Multiple vcpus faulting on the same PMD entry, can
+* lead to them sequentially updating the PMD with the
+* same value. Following the break-before-make
+* (pmd_clear() followed by tlb_flush()) process can
+* hinder forward progress due to refaults generated
+* on missing translations.
+*
+* Skip updating the page table if the entry is
+* unchanged.
+*/
+   if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+   return 0;
+
if (pmd_present(old_pmd)) {
/*
-* Multiple vcpus faulting on the same PMD entry, can
-* lead to them sequentially updating the PMD with the
-* same value. Following the break-before-make
-* (pmd_clear() followed by tlb_flush()) process can
-* hinder forward progress due to refaults generated
-* on missing translations.
-*
-* Skip updating the page table if the entry is
-* unchanged.
+* If we already have PTE level mapping for this block,
+* we must unmap it to avoid inconsistent TLB
+* state. We could end up in this situation if
+* the memory slot was marked for dirty logging
+* and was reverted, leaving PTE level mappings
+* for the pages accessed during the period.
+* Normal THP split/merge follows mmu_notifier
+* callbacks and do get handled accordingly.
+* Unmap the PTE level mapping and retry.
 */
-   if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-   return 0;
-
+   if (!pmd_thp_or_huge(old_pmd)) {
+   unmap_stage2_range(kvm, (addr & S2_PMD_MASK), 
S2_PMD_SIZE

[PATCH v7 7/10] KVM: arm/arm64: context-switch ptrauth registers

2019-03-19 Thread Amit Daniel Kachhap
From: Mark Rutland 

When pointer authentication is supported, a guest may wish to use it.
This patch adds the necessary KVM infrastructure for this to work, with
a semi-lazy context switch of the pointer auth state.

The pointer authentication feature is only enabled when VHE is built
into the kernel and present in the CPU implementation, so only VHE code
paths are modified.

When we schedule a vcpu, we disable guest usage of pointer
authentication instructions and accesses to the keys. While these are
disabled, we avoid context-switching the keys. When we trap the guest
trying to use pointer authentication functionality, we change to eagerly
context-switching the keys, and enable the feature. The next time the
vcpu is scheduled out/in, we start again. However, the host key save is
optimized and implemented inside the ptrauth instruction/register access
trap.
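
In other words, the trap path conceptually does something like the
following -- an illustrative sketch of the flow described here, not the
exact code added below (which also takes care of saving the host keys):

/* Illustrative sketch of the semi-lazy enable on first trapped use. */
void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu)
{
	if (vcpu_has_ptrauth(vcpu))
		kvm_arm_vcpu_ptrauth_enable(vcpu);	/* switch to eager key save/restore */
	else
		kvm_inject_undefined(vcpu);		/* feature not enabled for this guest */
}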

Pointer authentication consists of address authentication and generic
authentication, and CPUs in a system might have varied support for
either. Where support for either feature is not uniform, it is hidden
from guests via ID register emulation, as a result of the cpufeature
framework in the host.

Unfortunately, address authentication and generic authentication cannot
be trapped separately, as the architecture provides a single EL2 trap
covering both. If we wish to expose one without the other, we cannot
prevent a (badly-written) guest from intermittently using a feature
which is not uniformly supported (when scheduled on a physical CPU which
supports the relevant feature). Hence, this patch expects both types of
authentication to be present in a CPU.

This key switch is done from the guest enter/exit assembly as preparation
for the upcoming in-kernel pointer authentication support. Hence, these
key switching routines are not implemented in C code, as they may cause
pointer authentication key signing errors in some situations.

Signed-off-by: Mark Rutland 
[Only VHE, key switch in full assembly, vcpu_has_ptrauth checks
, save host key in ptrauth exception trap]
Signed-off-by: Amit Daniel Kachhap 
Reviewed-by: Julien Thierry 
Cc: Marc Zyngier 
Cc: Christoffer Dall 
Cc: kvmarm@lists.cs.columbia.edu
---
 arch/arm64/include/asm/kvm_host.h|  17 ++
 arch/arm64/include/asm/kvm_ptrauth_asm.h | 100 +++
 arch/arm64/kernel/asm-offsets.c  |   6 ++
 arch/arm64/kvm/guest.c   |  14 +
 arch/arm64/kvm/handle_exit.c |  24 +---
 arch/arm64/kvm/hyp/entry.S   |   7 +++
 arch/arm64/kvm/reset.c   |   7 +++
 arch/arm64/kvm/sys_regs.c|  46 +-
 virt/kvm/arm/arm.c   |   2 +
 9 files changed, 212 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/include/asm/kvm_ptrauth_asm.h

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 9dd2918..61239a6 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -152,6 +152,18 @@ enum vcpu_sysreg {
PMSWINC_EL0,/* Software Increment Register */
PMUSERENR_EL0,  /* User Enable Register */
 
+   /* Pointer Authentication Registers */
+   APIAKEYLO_EL1,
+   APIAKEYHI_EL1,
+   APIBKEYLO_EL1,
+   APIBKEYHI_EL1,
+   APDAKEYLO_EL1,
+   APDAKEYHI_EL1,
+   APDBKEYLO_EL1,
+   APDBKEYHI_EL1,
+   APGAKEYLO_EL1,
+   APGAKEYHI_EL1,
+
/* 32bit specific registers. Keep them at the end of the range */
DACR32_EL2, /* Domain Access Control Register */
IFSR32_EL2, /* Instruction Fault Status Register */
@@ -497,6 +509,11 @@ static inline bool kvm_arch_requires_vhe(void)
test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) && \
test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features))
 
+void kvm_arm_vcpu_ptrauth_enable(struct kvm_vcpu *vcpu);
+void kvm_arm_vcpu_ptrauth_disable(struct kvm_vcpu *vcpu);
+void kvm_arm_vcpu_ptrauth_setup_lazy(struct kvm_vcpu *vcpu);
+void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu);
+
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
diff --git a/arch/arm64/include/asm/kvm_ptrauth_asm.h 
b/arch/arm64/include/asm/kvm_ptrauth_asm.h
new file mode 100644
index 000..97bb040
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_ptrauth_asm.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * arch/arm64/include/asm/kvm_ptrauth_asm.h: Guest/host ptrauth save/restore
+ * Copyright 2019 Arm Limited
+ * Author: Mark Rutland 
+ * Amit Daniel Kachhap 
+ */
+
+#ifndef __ASM_KVM_ASM_PTRAUTH_H
+#define __ASM_KVM_ASM_PTRAUTH_H
+
+#ifndef __ASSEMBLY__
+
+#define __ptrauth_save_key(regs, key)  
\
+({ 
\
+   regs[key ## KEYLO_E

[PATCH v7 9/10] KVM: arm64: docs: document KVM support of pointer authentication

2019-03-19 Thread Amit Daniel Kachhap
This adds sections on the KVM API extension for pointer authentication.
A brief description of the usage of pointer authentication for KVM guests
is added to the arm64 documentation.

Signed-off-by: Amit Daniel Kachhap 
Cc: Mark Rutland 
Cc: Christoffer Dall 
Cc: Marc Zyngier 
Cc: kvmarm@lists.cs.columbia.edu
---
 Documentation/arm64/pointer-authentication.txt | 15 +++
 Documentation/virtual/kvm/api.txt  |  6 ++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/Documentation/arm64/pointer-authentication.txt 
b/Documentation/arm64/pointer-authentication.txt
index 5baca42..4b769e6 100644
--- a/Documentation/arm64/pointer-authentication.txt
+++ b/Documentation/arm64/pointer-authentication.txt
@@ -87,7 +87,14 @@ used to get and set the keys for a thread.
 Virtualization
 --
 
-Pointer authentication is not currently supported in KVM guests. KVM
-will mask the feature bits from ID_AA64ISAR1_EL1, and attempted use of
-the feature will result in an UNDEFINED exception being injected into
-the guest.
+Pointer authentication is enabled in KVM guest when each virtual cpu is
+initialised by passing flags KVM_ARM_VCPU_PTRAUTH_[ADDRESS/GENERIC] and
+requesting this feature to be enabled. Without this flag, pointer
+authentication is not enabled in KVM guests and attempted use of the
+feature will result in an UNDEFINED exception being injected into the
+guest.
+
+Additionally, when these vcpu feature flags are not set then KVM will
+filter out the Pointer Authentication system key registers from
+KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID
+register.
diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 7de9eee..b5c66bc 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2659,6 +2659,12 @@ Possible features:
  Depends on KVM_CAP_ARM_PSCI_0_2.
- KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU.
  Depends on KVM_CAP_ARM_PMU_V3.
+   - KVM_ARM_VCPU_PTRAUTH_ADDRESS:
+   - KVM_ARM_VCPU_PTRAUTH_GENERIC:
+ Enables Pointer authentication for the CPU.
+ Depends on KVM_CAP_ARM_PTRAUTH and only on arm64 architecture. If
+ set, then the KVM guest allows the execution of pointer authentication
+ instructions. Otherwise, KVM treats these instructions as undefined.
 
 
 4.83 KVM_ARM_PREFERRED_TARGET
-- 
2.7.4

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v7 8/10] KVM: arm64: Add capability to advertise ptrauth for guest

2019-03-19 Thread Amit Daniel Kachhap
This patch advertises the pointer authentication capability when the
system supports pointer authentication and VHE mode is present.

Signed-off-by: Amit Daniel Kachhap 
Cc: Mark Rutland 
Cc: Marc Zyngier 
Cc: Christoffer Dall 
Cc: kvmarm@lists.cs.columbia.edu
---
 arch/arm64/kvm/reset.c   | 4 
 include/uapi/linux/kvm.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 00f0639..a3b269e 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -92,6 +92,10 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
case KVM_CAP_ARM_VM_IPA_SIZE:
r = kvm_ipa_limit;
break;
+   case KVM_CAP_ARM_PTRAUTH:
+   r = has_vhe() && system_supports_address_auth() &&
+   system_supports_generic_auth();
+   break;
default:
r = 0;
}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6d4ea4b..a553477 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_VM_IPA_SIZE 165
 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
 #define KVM_CAP_HYPERV_CPUID 167
+#define KVM_CAP_ARM_PTRAUTH 168
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.7.4

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[kvmtool PATCH v7 10/10] KVM: arm/arm64: Add a vcpu feature for pointer authentication

2019-03-19 Thread Amit Daniel Kachhap
This is a runtime capability for the KVM tool to enable Armv8.3 Pointer
Authentication in the guest kernel. A command line option, --ptrauth, is
required for this.

Signed-off-by: Amit Daniel Kachhap 
---
 arm/aarch32/include/kvm/kvm-cpu-arch.h| 1 +
 arm/aarch64/include/asm/kvm.h | 2 ++
 arm/aarch64/include/kvm/kvm-config-arch.h | 4 +++-
 arm/aarch64/include/kvm/kvm-cpu-arch.h| 2 ++
 arm/include/arm-common/kvm-config-arch.h  | 1 +
 arm/kvm-cpu.c | 6 ++
 include/linux/kvm.h   | 1 +
 7 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/arm/aarch32/include/kvm/kvm-cpu-arch.h 
b/arm/aarch32/include/kvm/kvm-cpu-arch.h
index d28ea67..520ea76 100644
--- a/arm/aarch32/include/kvm/kvm-cpu-arch.h
+++ b/arm/aarch32/include/kvm/kvm-cpu-arch.h
@@ -13,4 +13,5 @@
 #define ARM_CPU_ID 0, 0, 0
 #define ARM_CPU_ID_MPIDR   5
 
+#define ARM_VCPU_PTRAUTH_FEATURE   0
 #endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/arm/aarch64/include/asm/kvm.h b/arm/aarch64/include/asm/kvm.h
index 97c3478..d4d0d8c 100644
--- a/arm/aarch64/include/asm/kvm.h
+++ b/arm/aarch64/include/asm/kvm.h
@@ -102,6 +102,8 @@ struct kvm_regs {
 #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */
 #define KVM_ARM_VCPU_PSCI_0_2  2 /* CPU uses PSCI v0.2 */
 #define KVM_ARM_VCPU_PMU_V33 /* Support guest PMUv3 */
+#define KVM_ARM_VCPU_PTRAUTH_ADDRESS   4 /* CPU uses address pointer 
authentication */
+#define KVM_ARM_VCPU_PTRAUTH_GENERIC   5 /* CPU uses generic pointer 
authentication */
 
 struct kvm_vcpu_init {
__u32 target;
diff --git a/arm/aarch64/include/kvm/kvm-config-arch.h 
b/arm/aarch64/include/kvm/kvm-config-arch.h
index 04be43d..2074684 100644
--- a/arm/aarch64/include/kvm/kvm-config-arch.h
+++ b/arm/aarch64/include/kvm/kvm-config-arch.h
@@ -8,7 +8,9 @@
"Create PMUv3 device"), \
OPT_U64('\0', "kaslr-seed", &(cfg)->kaslr_seed, \
"Specify random seed for Kernel Address Space " \
-   "Layout Randomization (KASLR)"),
+   "Layout Randomization (KASLR)"),\
+   OPT_BOOLEAN('\0', "ptrauth", &(cfg)->has_ptrauth,   \
+   "Enable address authentication"),
 
 #include "arm-common/kvm-config-arch.h"
 
diff --git a/arm/aarch64/include/kvm/kvm-cpu-arch.h 
b/arm/aarch64/include/kvm/kvm-cpu-arch.h
index a9d8563..fcc2107 100644
--- a/arm/aarch64/include/kvm/kvm-cpu-arch.h
+++ b/arm/aarch64/include/kvm/kvm-cpu-arch.h
@@ -17,4 +17,6 @@
 #define ARM_CPU_CTRL   3, 0, 1, 0
 #define ARM_CPU_CTRL_SCTLR_EL1 0
 
+#define ARM_VCPU_PTRAUTH_FEATURE   ((1UL << KVM_ARM_VCPU_PTRAUTH_ADDRESS) \
+   | (1UL << KVM_ARM_VCPU_PTRAUTH_GENERIC))
 #endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/arm/include/arm-common/kvm-config-arch.h b/arm/include/arm-common/kvm-config-arch.h
index 5734c46..5badcbd 100644
--- a/arm/include/arm-common/kvm-config-arch.h
+++ b/arm/include/arm-common/kvm-config-arch.h
@@ -10,6 +10,7 @@ struct kvm_config_arch {
boolaarch32_guest;
boolhas_pmuv3;
u64 kaslr_seed;
+   boolhas_ptrauth;
enum irqchip_type irqchip;
u64 fw_addr;
 };
diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c
index 7780251..4ac80f8 100644
--- a/arm/kvm-cpu.c
+++ b/arm/kvm-cpu.c
@@ -68,6 +68,12 @@ struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
vcpu_init.features[0] |= (1UL << KVM_ARM_VCPU_PSCI_0_2);
}
 
+   /* Set KVM_ARM_VCPU_PTRAUTH if available */
+   if (kvm__supports_extension(kvm, KVM_CAP_ARM_PTRAUTH)) {
+   if (kvm->cfg.arch.has_ptrauth)
+   vcpu_init.features[0] |= ARM_VCPU_PTRAUTH_FEATURE;
+   }
+
/*
 * If the preferred target ioctl is successful then
 * use preferred target else try each and every target type
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6d4ea4b..a553477 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_VM_IPA_SIZE 165
 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
 #define KVM_CAP_HYPERV_CPUID 167
+#define KVM_CAP_ARM_PTRAUTH 168
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.7.4
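
For context, a hedged sketch of the resulting KVM_ARM_VCPU_INIT call when
--ptrauth is given and the capability is reported (vm_fd/vcpu_fd are assumed
to come from the usual kvmtool setup, and the headers are assumed to carry
the new PTRAUTH feature bits; error handling trimmed):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: mirrors what kvm_cpu__arch_init() above ends up doing. */
static int init_vcpu_with_ptrauth(int vm_fd, int vcpu_fd)
{
        struct kvm_vcpu_init init = { 0 };

        /* Let KVM pick the target, then request both ptrauth features. */
        if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init) < 0)
                return -1;

        init.features[0] |= (1UL << KVM_ARM_VCPU_PTRAUTH_ADDRESS) |
                            (1UL << KVM_ARM_VCPU_PTRAUTH_GENERIC);

        return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
}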



[PATCH v7 2/10] KVM: arm64: Support runtime sysreg visibility filtering

2019-03-19 Thread Amit Daniel Kachhap
From: Dave Martin 

Some optional features of the Arm architecture add new system
registers that are not present in the base architecture.

Where these features are optional for the guest, the visibility of
these registers may need to depend on some runtime configuration,
such as a flag passed to KVM_ARM_VCPU_INIT.

For example, ZCR_EL1 and ID_AA64ZFR0_EL1 need to be hidden if SVE
is not enabled for the guest, even though these registers may be
present in the hardware and visible to the host at EL2.

Adding special-case checks all over the place for individual
registers is going to get messy as the number of conditionally-
visible registers grows.

In order to help solve this problem, this patch adds a new sysreg
method restrictions() that can be used to hook in any needed
runtime visibility checks.  This method can currently return
REG_NO_USER to inhibit enumeration and ioctl access to the register
for userspace, and REG_NO_GUEST to inhibit runtime access by the
guest using MSR/MRS.

This allows a conditionally modified view of individual system
registers, such as the CPU ID registers, in addition to completely
hiding registers where appropriate.

Signed-off-by: Dave Martin 
---
 arch/arm64/kvm/sys_regs.c | 24 +---
 arch/arm64/kvm/sys_regs.h | 13 +
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index a5d14b5..75942f6 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1927,6 +1927,12 @@ static void perform_access(struct kvm_vcpu *vcpu,
 {
trace_kvm_sys_access(*vcpu_pc(vcpu), params, r);
 
+   /* Check for regs disabled by runtime config */
+   if (restrictions(vcpu, r) & REG_NO_GUEST) {
+   kvm_inject_undefined(vcpu);
+   return;
+   }
+
/*
 * Not having an accessor means that we have configured a trap
 * that we don't know how to handle. This certainly qualifies
@@ -2438,6 +2444,10 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
if (!r)
return get_invariant_sys_reg(reg->id, uaddr);
 
+   /* Check for regs disabled by runtime config */
+   if (restrictions(vcpu, r) & REG_NO_USER)
+   return -ENOENT;
+
if (r->get_user)
return (r->get_user)(vcpu, r, reg, uaddr);
 
@@ -2459,6 +2469,10 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
if (!r)
return set_invariant_sys_reg(reg->id, uaddr);
 
+   /* Check for regs disabled by runtime config */
+   if (restrictions(vcpu, r) & REG_NO_USER)
+   return -ENOENT;
+
if (r->set_user)
return (r->set_user)(vcpu, r, reg, uaddr);
 
@@ -2515,7 +2529,8 @@ static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind)
return true;
 }
 
-static int walk_one_sys_reg(const struct sys_reg_desc *rd,
+static int walk_one_sys_reg(const struct kvm_vcpu *vcpu,
+   const struct sys_reg_desc *rd,
u64 __user **uind,
unsigned int *total)
 {
@@ -2526,6 +2541,9 @@ static int walk_one_sys_reg(const struct sys_reg_desc *rd,
if (!(rd->reg || rd->get_user))
return 0;
 
+   if (restrictions(vcpu, rd) & REG_NO_USER)
+   return 0;
+
if (!copy_reg_to_user(rd, uind))
return -EFAULT;
 
@@ -2554,9 +2572,9 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
int cmp = cmp_sys_reg(i1, i2);
/* target-specific overrides generic entry. */
if (cmp <= 0)
-   err = walk_one_sys_reg(i1, &uind, &total);
+   err = walk_one_sys_reg(vcpu, i1, &uind, &total);
else
-   err = walk_one_sys_reg(i2, &uind, &total);
+   err = walk_one_sys_reg(vcpu, i2, &uind, &total);
 
if (err)
return err;
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 3b1bc7f..b9dd4a9 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -64,8 +64,15 @@ struct sys_reg_desc {
const struct kvm_one_reg *reg, void __user *uaddr);
int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr);
+
+   /* Return mask of REG_* runtime restriction flags */
+   unsigned int (*restrictions)(const struct kvm_vcpu *vcpu,
+const struct sys_reg_desc *rd);
 };
 
+#define REG_NO_USER    (1 << 0) /* hidden from userspace ioctl interface */
+#define REG_NO_GUEST   (1 << 1) /* hidden from guest */
+
 static inline void print_sys_reg_instr(const struct sys_reg_params *p)
 {
/* Look, we even formatted it for yo
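
To illustrate how a register entry might use the new hook, a hypothetical
sketch (not taken from this patch; the callback and the example table wiring
are stand-ins, and vcpu_has_ptrauth() is the helper added later in the series):

static unsigned int ptrauth_restrictions(const struct kvm_vcpu *vcpu,
                                         const struct sys_reg_desc *rd)
{
        /* Hide the register from userspace and the guest when the vcpu
         * was not created with the relevant feature. */
        if (!vcpu_has_ptrauth(vcpu))
                return REG_NO_USER | REG_NO_GUEST;

        return 0;
}

/* In the sys_regs table, schematically:
 *   { SYS_DESC(SYS_APIAKEYLO_EL1), ..., .restrictions = ptrauth_restrictions },
 */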

[PATCH v7 0/10] Add ARMv8.3 pointer authentication for kvm guest

2019-03-19 Thread Amit Daniel Kachhap
Hi,

This patch series adds pointer authentication support for KVM guests and
is based on top of Linux 5.1-rc1. The basic patches in this series were
originally posted by Mark Rutland earlier [1,2], and those postings contain
some history of this work.

Extension Overview:
=

The ARMv8.3 pointer authentication extension adds functionality to detect
modification of pointer values, mitigating certain classes of attack such as
stack smashing, and making return oriented programming attacks harder.

The extension introduces the concept of a pointer authentication code (PAC),
which is stored in some upper bits of pointers. Each PAC is derived from the
original pointer, another 64-bit value (e.g. the stack pointer), and a secret
128-bit key.

New instructions are added which can be used to:

* Insert a PAC into a pointer
* Strip a PAC from a pointer
* Authenticate and strip a PAC from a pointer
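
As a purely conceptual model of the above (not the architectural definition:
the real PAC function is QARMA-based, and key selection and bit placement are
fixed by the architecture and depend on the configured VA size; all names and
constants below are illustrative only):

#include <stdint.h>

#define PAC_MASK 0x007f000000000000ULL   /* assumed example PAC field */

/* Toy mixing function, NOT the architected algorithm. */
static uint64_t pac_hash(uint64_t ptr, uint64_t modifier, const uint64_t key[2])
{
        return (ptr ^ modifier ^ key[0]) * (key[1] | 1);
}

static uint64_t strip_pac(uint64_t ptr)
{
        return ptr & ~PAC_MASK;
}

/* Insert a PAC into the pointer's upper bits. */
static uint64_t sign_ptr(uint64_t ptr, uint64_t mod, const uint64_t key[2])
{
        return strip_pac(ptr) | (pac_hash(strip_pac(ptr), mod, key) & PAC_MASK);
}

/* Authenticate: recompute the PAC and compare with the embedded one. */
static int auth_ptr(uint64_t sptr, uint64_t mod, const uint64_t key[2])
{
        return (sptr & PAC_MASK) ==
               (pac_hash(strip_pac(sptr), mod, key) & PAC_MASK);
}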

The detailed description of ARMv8.3 pointer authentication support in
userspace/kernel can be found in Kristina's generic pointer authentication
patch series[3].

KVM guest work:
==

If pointer authentication is enabled for KVM guests then the new PAC
instructions will not trap to EL2. If it is not enabled, they may be ignored
(if they are in the HINT region) or trapped to EL2 as illegal instructions.
Since each KVM guest vcpu runs as a host thread, it already has keys
initialized which are used by PAC. When a world switch happens between
host and guest, these keys are exchanged.

The current v7 patch series incorporates review comments and suggestions from
James Morse, Mark Rutland and Dave Martin.

The first two patches are cherry-picked from Dave Martin's [9] v5 series, which
adds SVE support for KVM guests.

Changes since v6 [8]: Major changes are listed below.

* Pointer authentication key switch entirely in assembly now.
* isb instruction added after Key switched to host.
* Use __hyp_this_cpu_ptr for both VHE and nVHE mode.
* 2 separate flags for address and generic authentication.
* kvm_arm_vcpu_ptrauth_allowed renamed to has_vcpu_ptrauth.
* kvm_arm_vcpu_ptrauth_reset renamed to kvm_arm_vcpu_ptrauth_setup_lazy.
* Save of host Key registers now done in ptrauth instruction trap.
* A fix to add kern_hyp_va to get correct host_ctxt pointer in nVHE mode.
* Patches re-structured to better reflect ABI change.

Changes since v5 [7]: Major changes are listed below.

* Split hcr_el2 and mdcr_el2 save/restore in two patches.
* Reverted back to save/restore of sys-reg keys as done in V4 [5]. There was
  a suggestion by James Morse to use ptrauth utilities in a single place
  in the arm core and use them from kvm. However this change deviates from the
  existing sys-reg implementations and is not scalable.
* Invoked the key switch C functions from __guest_enter/__guest_exit assembly.
* Host key save is now done inside vcpu_load.
* Reverted back the masking of cpufeature ID registers for ptrauth when it is
  disabled from userspace.
* Reset of ptrauth key registers not done conditionally.
* Code and Documentation cleanup.

Changes since v4 [6]: Several suggestions from James Morse
* Move host registers to be saved/restored inside struct kvm_cpu_context.
* Similar to hcr_el2, save/restore mdcr_el2 register also.
* Added save routines for ptrauth keys in generic arm core and
  use them during KVM context switch.
* Defined a GCC attribute __no_ptrauth which discards generating
  ptrauth instructions in a function. This is taken from Kristina's
  earlier kernel pointer authentication support patches [4].
* Dropped a patch to mask cpufeature when not enabled from userspace and
  now only key registers are masked from register list.

Changes since v3 [5]:
* Use pointer authentication only when VHE is present, as ARMv8.3 implies that
  ARMv8.1 features are also present.
* Added lazy context handling of ptrauth instructions from V2 version again. 
* Added more details in Documentation.

Changes since v2 [1,2]:
* Allow host and guest to have different HCR_EL2 settings and not just constant
  value HCR_HOST_VHE_FLAGS or HCR_HOST_NVHE_FLAGS.
* Optimise the reading of HCR_EL2 in host/guest switch by fetching it once
  during KVM initialisation state and using it later.
* Context switch pointer authentication keys when switching between guest
  and host. Pointer authentication was enabled in a lazy context earlier[2] and
  is removed now to make it simple. However it can be revisited later if there
  is significant performance issue.
* Added a userspace option to choose pointer authentication.
* Based on the userspace option, ptrauth cpufeature will be visible.
* Based on the userspace option, ptrauth key registers will be accessible.
* A small document is added on how to enable pointer authentication from
  userspace KVM API.

Looking for feedback and comments.

Thanks,
Amit

[1]: https://lore.kernel.org/lkml/20171127163806.31435-11-mark.rutl...@arm.com/
[2]: https://lore.kernel.org/lkml/20171127163806.31435-10-mark.rutl...@arm.com/
[3]: htt

[PATCH v7 1/10] KVM: arm64: Propagate vcpu into read_id_reg()

2019-03-19 Thread Amit Daniel Kachhap
From: Dave Martin 

Architecture features that are conditionally visible to the guest
will require run-time checks in the ID register accessor functions.
In particular, read_id_reg() will need to perform checks in order
to generate the correct emulated value for certain ID register
fields such as ID_AA64PFR0_EL1.SVE for example.

This patch propagates vcpu into read_id_reg() so that future
patches can add run-time checks on the guest configuration here.

For now, there is no functional change.

Signed-off-by: Dave Martin 
Reviewed-by: Alex Bennee 
---
 arch/arm64/kvm/sys_regs.c | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 539feec..a5d14b5 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1044,7 +1044,8 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
 }
 
 /* Read a sanitised cpufeature ID register by sys_reg_desc */
-static u64 read_id_reg(struct sys_reg_desc const *r, bool raz)
+static u64 read_id_reg(const struct kvm_vcpu *vcpu,
+   struct sys_reg_desc const *r, bool raz)
 {
u32 id = sys_reg((u32)r->Op0, (u32)r->Op1,
 (u32)r->CRn, (u32)r->CRm, (u32)r->Op2);
@@ -1078,7 +1079,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu,
if (p->is_write)
return write_to_read_only(vcpu, p, r);
 
-   p->regval = read_id_reg(r, raz);
+   p->regval = read_id_reg(vcpu, r, raz);
return true;
 }
 
@@ -1107,16 +1108,18 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
  * are stored, and for set_id_reg() we don't allow the effective value
  * to be changed.
  */
-static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
+static int __get_id_reg(const struct kvm_vcpu *vcpu,
+   const struct sys_reg_desc *rd, void __user *uaddr,
bool raz)
 {
const u64 id = sys_reg_to_index(rd);
-   const u64 val = read_id_reg(rd, raz);
+   const u64 val = read_id_reg(vcpu, rd, raz);
 
return reg_to_user(uaddr, &val, id);
 }
 
-static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
+static int __set_id_reg(const struct kvm_vcpu *vcpu,
+   const struct sys_reg_desc *rd, void __user *uaddr,
bool raz)
 {
const u64 id = sys_reg_to_index(rd);
@@ -1128,7 +1131,7 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
return err;
 
/* This is what we mean by invariant: you can't change it. */
-   if (val != read_id_reg(rd, raz))
+   if (val != read_id_reg(vcpu, rd, raz))
return -EINVAL;
 
return 0;
@@ -1137,25 +1140,25 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
 static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-   return __get_id_reg(rd, uaddr, false);
+   return __get_id_reg(vcpu, rd, uaddr, false);
 }
 
 static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-   return __set_id_reg(rd, uaddr, false);
+   return __set_id_reg(vcpu, rd, uaddr, false);
 }
 
 static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-   return __get_id_reg(rd, uaddr, true);
+   return __get_id_reg(vcpu, rd, uaddr, true);
 }
 
 static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-   return __set_id_reg(rd, uaddr, true);
+   return __set_id_reg(vcpu, rd, uaddr, true);
 }
 
 static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
-- 
2.7.4
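
As an illustration of the kind of follow-up this enables, a hedged sketch of a
runtime check that later patches can add inside read_id_reg() (the field
masking shown is indicative, not the exact hunk from this series;
vcpu_has_ptrauth() is the helper introduced in patch 6/10):

static u64 mask_id_aa64isar1(const struct kvm_vcpu *vcpu, u64 val)
{
        /* Hide the ptrauth ID fields when the vcpu is not configured
         * with pointer authentication (indicative sketch only). */
        if (!vcpu_has_ptrauth(vcpu))
                val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) |
                         (0xfUL << ID_AA64ISAR1_API_SHIFT) |
                         (0xfUL << ID_AA64ISAR1_GPA_SHIFT) |
                         (0xfUL << ID_AA64ISAR1_GPI_SHIFT));

        return val;
}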



[PATCH v7 3/10] KVM: arm64: Move hyp_symbol_addr to fix dependency

2019-03-19 Thread Amit Daniel Kachhap
Currently hyp_symbol_addr is placed in kvm_mmu.h and is mostly
used by __hyp_this_cpu_ptr in kvm_asm.h, but kvm_asm.h cannot include
kvm_mmu.h directly, as kvm_mmu.h uses kvm_ksym_ref, which is
defined inside kvm_asm.h. Hence, hyp_symbol_addr is moved into
kvm_asm.h to break this circular dependency.

Also, kvm_ksym_ref is the corresponding counterpart of hyp_symbol_addr,
so it is ideally placed inside kvm_asm.h as well.

Suggested-by: James Morse 
Signed-off-by: Amit Daniel Kachhap 
Cc: Marc Zyngier 
Cc: Christoffer Dall 
Cc: kvmarm@lists.cs.columbia.edu
---
 arch/arm64/include/asm/kvm_asm.h | 20 
 arch/arm64/include/asm/kvm_mmu.h | 20 
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index f5b79e9..57a07e8 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -80,6 +80,26 @@ extern void __vgic_v3_init_lrs(void);
 
 extern u32 __kvm_get_mdcr_el2(void);
 
+/*
+ * Obtain the PC-relative address of a kernel symbol
+ * s: symbol
+ *
+ * The goal of this macro is to return a symbol's address based on a
+ * PC-relative computation, as opposed to a loading the VA from a
+ * constant pool or something similar. This works well for HYP, as an
+ * absolute VA is guaranteed to be wrong. Only use this if trying to
+ * obtain the address of a symbol (i.e. not something you obtained by
+ * following a pointer).
+ */
+#define hyp_symbol_addr(s) \
+   ({  \
+   typeof(s) *addr;\
+   asm("adrp   %0, %1\n"   \
+   "add%0, %0, :lo12:%1\n" \
+   : "=r" (addr) : "S" (&s));  \
+   addr;   \
+   })
+
 /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */
 #define __hyp_this_cpu_ptr(sym)					\
({  \
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b0742a1..3dea6af 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -118,26 +118,6 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
 #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v
 
 /*
- * Obtain the PC-relative address of a kernel symbol
- * s: symbol
- *
- * The goal of this macro is to return a symbol's address based on a
- * PC-relative computation, as opposed to a loading the VA from a
- * constant pool or something similar. This works well for HYP, as an
- * absolute VA is guaranteed to be wrong. Only use this if trying to
- * obtain the address of a symbol (i.e. not something you obtained by
- * following a pointer).
- */
-#define hyp_symbol_addr(s) \
-   ({  \
-   typeof(s) *addr;\
-   asm("adrp   %0, %1\n"   \
-   "add%0, %0, :lo12:%1\n" \
-   : "=r" (addr) : "S" (&s));  \
-   addr;   \
-   })
-
-/*
  * We currently support using a VM-specified IPA size. For backward
  * compatibility, the default IPA size is fixed to 40bits.
  */
-- 
2.7.4
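
For context, the consumer pattern that motivates the move, shown as a
simplified sketch (just the shape of a hyp per-cpu accessor; not a verbatim
copy of the kernel's __hyp_this_cpu_ptr()):

/* Simplified sketch: take the PC-relative address of a per-cpu symbol at
 * HYP and offset it by this CPU's tpidr_el2. */
#define my_hyp_this_cpu_ptr(sym)					\
	({								\
		void *__ptr = (void *)hyp_symbol_addr(sym);		\
		__ptr += read_sysreg(tpidr_el2);			\
		(typeof(&sym))__ptr;					\
	})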



[PATCH v7 6/10] KVM: arm64: Add vcpu feature flags to control ptrauth accessibility

2019-03-19 Thread Amit Daniel Kachhap
Since pointer authentication will be enabled or disabled on a
per-vcpu basis, vcpu feature flags are added so that it is known which
vcpus have it enabled from userspace.

These feature flags allow the KVM guest to handle pointer authentication
instructions, or to treat them as undefined if the flags are not set.

The added helper macro checks the feature flags along with other
conditions, such as VHE mode being present and the system supporting
address/generic pointer authentication.

Signed-off-by: Amit Daniel Kachhap 
Cc: Mark Rutland 
Cc: Marc Zyngier 
Cc: Christoffer Dall 
Cc: kvmarm@lists.cs.columbia.edu
---
 arch/arm64/include/asm/kvm_host.h | 8 +++-
 arch/arm64/include/uapi/asm/kvm.h | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e3ccd7b..9dd2918 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -45,7 +45,7 @@
 
 #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
 
-#define KVM_VCPU_MAX_FEATURES 4
+#define KVM_VCPU_MAX_FEATURES 6
 
 #define KVM_REQ_SLEEP \
KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
@@ -491,6 +491,12 @@ static inline bool kvm_arch_requires_vhe(void)
return false;
 }
 
+#define vcpu_has_ptrauth(vcpu) (has_vhe() && \
+   system_supports_address_auth() && \
+   system_supports_generic_auth() && \
+   test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) && \
+   test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features))
+
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 97c3478..8806f71 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -102,6 +102,8 @@ struct kvm_regs {
 #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */
 #define KVM_ARM_VCPU_PSCI_0_2  2 /* CPU uses PSCI v0.2 */
 #define KVM_ARM_VCPU_PMU_V3    3 /* Support guest PMUv3 */
+#define KVM_ARM_VCPU_PTRAUTH_ADDRESS   4 /* VCPU uses address authentication */
+#define KVM_ARM_VCPU_PTRAUTH_GENERIC   5 /* VCPU uses generic authentication */
 
 struct kvm_vcpu_init {
__u32 target;
-- 
2.7.4
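
For illustration, a hedged sketch of how such a predicate is typically
consumed later in the series (the handler name is indicative, not the exact
function added):

static void handle_ptrauth_trap_sketch(struct kvm_vcpu *vcpu)
{
        /* Treat ptrauth use as UNDEFINED unless the vcpu was created with
         * both feature flags and the system/VHE checks pass. */
        if (!vcpu_has_ptrauth(vcpu)) {
                kvm_inject_undefined(vcpu);
                return;
        }

        /* Otherwise switch in the guest keys lazily and retry. */
}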



[PATCH v7 5/10] KVM: arm/arm64: preserve host MDCR_EL2 value

2019-03-19 Thread Amit Daniel Kachhap
Save the host MDCR_EL2 value during KVM HYP initialisation and restore
it on every switch back from guest to host. There should not be any
change in functionality due to this.

The value of mdcr_el2 is now stored in struct kvm_cpu_context, as
both host and guest can now use this field in a common way.

Signed-off-by: Amit Daniel Kachhap 
Acked-by: Mark Rutland 
Cc: Marc Zyngier 
Cc: Mark Rutland 
Cc: Christoffer Dall 
Cc: kvmarm@lists.cs.columbia.edu
---
 arch/arm/include/asm/kvm_host.h   |  1 -
 arch/arm64/include/asm/kvm_host.h |  6 ++
 arch/arm64/include/asm/kvm_hyp.h  |  2 +-
 arch/arm64/kvm/debug.c| 28 ++--
 arch/arm64/kvm/hyp/switch.c   | 18 +-
 arch/arm64/kvm/hyp/sysreg-sr.c|  8 +++-
 virt/kvm/arm/arm.c|  1 -
 7 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 6d0aac4..a928565 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -343,7 +343,6 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
-static inline void kvm_arm_init_debug(void) {}
 static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {}
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3b09fd0..e3ccd7b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -211,6 +211,8 @@ struct kvm_cpu_context {
 
/* HYP host/guest configuration */
u64 hcr_el2;
+   u32 mdcr_el2;
+
struct kvm_vcpu *__hyp_running_vcpu;
 };
 
@@ -226,9 +228,6 @@ struct vcpu_reset_state {
 struct kvm_vcpu_arch {
struct kvm_cpu_context ctxt;
 
-   /* HYP configuration */
-   u32 mdcr_el2;
-
/* Exception Information */
struct kvm_vcpu_fault_info fault;
 
@@ -498,7 +497,6 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
-void kvm_arm_init_debug(void);
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 4da765f..7fcde8a 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
 void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
 bool __fpsimd_enabled(void);
 
 void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
-void deactivate_traps_vhe_put(void);
+void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu);
 
 u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
 void __noreturn __hyp_do_panic(unsigned long, ...);
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index fd917d6..99dc0a4 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -32,8 +32,6 @@
DBG_MDSCR_KDE | \
DBG_MDSCR_MDE)
 
-static DEFINE_PER_CPU(u32, mdcr_el2);
-
 /**
  * save/restore_guest_debug_regs
  *
@@ -65,21 +63,6 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
 }
 
 /**
- * kvm_arm_init_debug - grab what we need for debug
- *
- * Currently the sole task of this function is to retrieve the initial
- * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has
- * presumably been set-up by some knowledgeable bootcode.
- *
- * It is called once per-cpu during CPU hyp initialisation.
- */
-
-void kvm_arm_init_debug(void)
-{
-   __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2));
-}
-
-/**
  * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
  */
 
@@ -111,6 +94,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
 
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
 {
+   kvm_cpu_context_t *host_cxt = this_cpu_ptr(&kvm_host_cpu_state);
bool trap_debug = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY);
unsigned long mdscr;
 
@@ -120,8 +104,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
 * This also clears MDCR_EL2_E2PB_MASK to disable guest access
 * to the profiling buffer.
 */
-   vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
-   vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+   vcpu->arch.ctxt.mdcr_el2 = host_cxt->mdcr_el2 & MDCR_EL2_HPMN_MASK;
+   vcpu->arch.ctxt.mdcr_el2 |= (MDCR_EL2_TPM |
MDCR_EL2_TPMS |
MDCR_EL2_TPMCR |
MDCR_EL2_
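
The restore direction, which falls below the truncation point above, amounts
to writing the saved host value back when guest traps are deactivated;
roughly (a sketch, assuming host_ctxt is the per-cpu host kvm_cpu_context
populated at hyp init):

static void __hyp_text restore_host_mdcr(struct kvm_cpu_context *host_ctxt)
{
        /* Put back the host's MDCR_EL2, preserving whatever the bootcode
         * configured (e.g. MDCR_EL2.HPMN). */
        write_sysreg(host_ctxt->mdcr_el2, mdcr_el2);
}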

[PATCH v7 4/10] KVM: arm/arm64: preserve host HCR_EL2 value

2019-03-19 Thread Amit Daniel Kachhap
From: Mark Rutland 

When restoring HCR_EL2 for the host, KVM uses HCR_HOST_VHE_FLAGS, which
is a constant value. This works today, as the host HCR_EL2 value is
always the same, but this will get in the way of supporting extensions
that require HCR_EL2 bits to be set conditionally for the host.

To allow such features to work without KVM having to explicitly handle
every possible host feature combination, this patch has KVM save/restore
the host HCR_EL2 value when switching to/from a guest HCR_EL2 value. The
register is saved once during CPU hypervisor initialisation and is simply
restored after each switch back from the guest.

To fetch HCR_EL2 during KVM initialisation, a hyp call is made using
kvm_call_hyp; this is mainly needed for the non-VHE case.

For the hyp TLB maintenance code, __tlb_switch_to_host_vhe() is updated
to toggle the TGE bit with a RMW sequence, as we already do in
__tlb_switch_to_guest_vhe().

The value of hcr_el2 is now stored in struct kvm_cpu_context as both host
and guest can now use this field in a common way.

Signed-off-by: Mark Rutland 
[Added cpu_init_host_ctxt, hcr_el2 field in struct kvm_cpu_context,
save hcr_el2 in hyp init stage]
Signed-off-by: Amit Daniel Kachhap 
Reviewed-by: James Morse 
Cc: Marc Zyngier 
Cc: Christoffer Dall 
Cc: kvmarm@lists.cs.columbia.edu
---
 arch/arm/include/asm/kvm_host.h  |  2 ++
 arch/arm64/include/asm/kvm_asm.h |  2 ++
 arch/arm64/include/asm/kvm_emulate.h | 24 
 arch/arm64/include/asm/kvm_host.h| 15 ++-
 arch/arm64/kvm/guest.c   |  2 +-
 arch/arm64/kvm/hyp/switch.c  | 22 +-
 arch/arm64/kvm/hyp/sysreg-sr.c   | 14 ++
 arch/arm64/kvm/hyp/tlb.c |  6 +-
 virt/kvm/arm/arm.c   |  1 +
 9 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 770d732..6d0aac4 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -322,6 +322,8 @@ static inline void __cpu_init_stage2(void)
kvm_call_hyp(__init_stage2_translation);
 }
 
+static inline void cpu_init_host_ctxt(void) {}
+
 static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
return 0;
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 57a07e8..a68205c 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -80,6 +80,8 @@ extern void __vgic_v3_init_lrs(void);
 
 extern u32 __kvm_get_mdcr_el2(void);
 
+extern void __kvm_populate_host_regs(void);
+
 /*
  * Obtain the PC-relative address of a kernel symbol
  * s: symbol
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index d384279..426815d 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -50,25 +50,25 @@ void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr);
 
 static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
 {
-   return !(vcpu->arch.hcr_el2 & HCR_RW);
+   return !(vcpu->arch.ctxt.hcr_el2 & HCR_RW);
 }
 
 static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 {
-   vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
+   vcpu->arch.ctxt.hcr_el2 = HCR_GUEST_FLAGS;
if (is_kernel_in_hyp_mode())
-   vcpu->arch.hcr_el2 |= HCR_E2H;
+   vcpu->arch.ctxt.hcr_el2 |= HCR_E2H;
if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) {
/* route synchronous external abort exceptions to EL2 */
-   vcpu->arch.hcr_el2 |= HCR_TEA;
+   vcpu->arch.ctxt.hcr_el2 |= HCR_TEA;
/* trap error record accesses */
-   vcpu->arch.hcr_el2 |= HCR_TERR;
+   vcpu->arch.ctxt.hcr_el2 |= HCR_TERR;
}
if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   vcpu->arch.hcr_el2 |= HCR_FWB;
+   vcpu->arch.ctxt.hcr_el2 |= HCR_FWB;
 
if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
-   vcpu->arch.hcr_el2 &= ~HCR_RW;
+   vcpu->arch.ctxt.hcr_el2 &= ~HCR_RW;
 
/*
 * TID3: trap feature register accesses that we virtualise.
@@ -76,26 +76,26 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 * are currently virtualised.
 */
if (!vcpu_el1_is_32bit(vcpu))
-   vcpu->arch.hcr_el2 |= HCR_TID3;
+   vcpu->arch.ctxt.hcr_el2 |= HCR_TID3;
 
if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
vcpu_el1_is_32bit(vcpu))
-   vcpu->arch.hcr_el2 |= HCR_TID2;
+   vcpu->arch.ctxt.hcr_el2 |= HCR_TID2;
 }
 
 static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
 {
-   return (unsigned long *)&vcpu->arch.hcr_el2;
+   return (unsigned long *)&vcpu->arch.ctxt.hcr_el2;
 }
 
 static inline void vcpu_clear_wfe_traps(struct kvm_vcpu *vcpu)
 {
-   vcpu->arch.hcr_el2 &= ~
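
The TGE toggle mentioned above, shown as a hedged sketch of the
read-modify-write pattern (the actual __tlb_switch_to_host_vhe() hunk falls
below the truncation point):

static void __hyp_text host_tge_set_sketch(void)
{
        /* Set HCR_EL2.TGE with a read-modify-write rather than writing the
         * constant HCR_HOST_VHE_FLAGS, so other host HCR_EL2 bits survive. */
        u64 val = read_sysreg(hcr_el2);

        val |= HCR_TGE;
        write_sysreg(val, hcr_el2);
}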