[PATCH kvmtool v3 2/3] vfio-pci: Add new function for INTx one-time initialisation

2019-03-26 Thread Leo Yan
To support enabling INTx multiple times, we first need to extract the
one-time initialisation and move the related code into a new function,
vfio_pci_init_intx(); when INTx is later disabled and re-enabled, these
one-time operations can be skipped.

This patch moves the following three operations for INTx one-time
initialisation out of vfio_pci_enable_intx() and into
vfio_pci_init_intx():

- Reserve 2 FDs for INTx;
- Sanity check with ioctl VFIO_DEVICE_GET_IRQ_INFO;
- Setup pdev->intx_gsi.

Suggested-by: Jean-Philippe Brucker 
Signed-off-by: Leo Yan 
---
 vfio/pci.c | 67 --
 1 file changed, 40 insertions(+), 27 deletions(-)

diff --git a/vfio/pci.c b/vfio/pci.c
index 5224fee..3c39844 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -1018,30 +1018,7 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
struct vfio_irq_eventfd trigger;
struct vfio_irq_eventfd unmask;
struct vfio_pci_device *pdev = &vdev->pci;
-   int gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
-
-   struct vfio_irq_info irq_info = {
-   .argsz = sizeof(irq_info),
-   .index = VFIO_PCI_INTX_IRQ_INDEX,
-   };
-
-   vfio_pci_reserve_irq_fds(2);
-
-   ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
-   if (ret || irq_info.count == 0) {
-   vfio_dev_err(vdev, "no INTx reported by VFIO");
-   return -ENODEV;
-   }
-
-   if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
-   vfio_dev_err(vdev, "interrupt not eventfd capable");
-   return -EINVAL;
-   }
-
-   if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
-   vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
-   return -EINVAL;
-   }
+   int gsi = pdev->intx_gsi;
 
/*
 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
@@ -1097,8 +1074,6 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
 
pdev->intx_fd = trigger_fd;
pdev->unmask_fd = unmask_fd;
-   /* Guest is going to ovewrite our irq_line... */
-   pdev->intx_gsi = gsi;
 
return 0;
 
@@ -1117,6 +1092,39 @@ err_close:
return ret;
 }
 
+static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
+{
+   int ret;
+   struct vfio_pci_device *pdev = &vdev->pci;
+   struct vfio_irq_info irq_info = {
+   .argsz = sizeof(irq_info),
+   .index = VFIO_PCI_INTX_IRQ_INDEX,
+   };
+
+   vfio_pci_reserve_irq_fds(2);
+
+   ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+   if (ret || irq_info.count == 0) {
+   vfio_dev_err(vdev, "no INTx reported by VFIO");
+   return -ENODEV;
+   }
+
+   if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+   vfio_dev_err(vdev, "interrupt not eventfd capable");
+   return -EINVAL;
+   }
+
+   if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
+   vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
+   return -EINVAL;
+   }
+
+   /* Guest is going to ovewrite our irq_line... */
+   pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
+
+   return 0;
+}
+
 static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
 {
int ret = 0;
@@ -1142,8 +1150,13 @@ static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
return ret;
}
 
-   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
+   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
+   ret = vfio_pci_init_intx(kvm, vdev);
+   if (ret)
+   return ret;
+
ret = vfio_pci_enable_intx(kvm, vdev);
+   }
 
return ret;
 }
-- 
2.19.1



[PATCH kvmtool v3 3/3] vfio-pci: Re-enable INTx mode when disable MSI/MSIX

2019-03-26 Thread Leo Yan
Since PCI forbids enabling INTx, MSI or MSIX at the same time, INTx mode
is disabled by default when MSI/MSIX mode is enabled; but this logic is
easily broken if the guest PCI driver detects that MSI/MSIX cannot work as
expected and tries to roll back to INTx mode.  In this case, INTx mode has
already been disabled and there is no chance to re-enable it, so neither
INTx mode nor MSI/MSIX mode can work in vfio.

The flow below shows how this issue is introduced:

  vfio_pci_configure_dev_irqs()
`-> vfio_pci_enable_intx()

  vfio_pci_enable_msis()
`-> vfio_pci_disable_intx()

  vfio_pci_disable_msis()   => Guest PCI driver disables MSI

To fix this issue, when MSI/MSIX is disabled we need to check whether
INTx mode is available for this device; if the device supports INTx, then
re-enable it so that the device can fall back to using it.

Since the vfio_pci_disable_intx() / vfio_pci_enable_intx() pair of
functions may be called multiple times, this patch uses 'intx_fd == -1' to
denote that INTx is disabled; the two functions bail out directly when
they detect that INTx is already disabled or enabled, respectively.

Suggested-by: Jean-Philippe Brucker 
Signed-off-by: Leo Yan 
---
 vfio/pci.c | 41 ++---
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/vfio/pci.c b/vfio/pci.c
index 3c39844..3b2b1e7 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -28,6 +28,7 @@ struct vfio_irq_eventfd {
msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
 
 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
+static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
 
 static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
bool msix)
@@ -50,17 +51,14 @@ static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
if (!msi_is_enabled(msis->virt_state))
return 0;
 
-   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
-   /*
-* PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
-* time. Since INTx has to be enabled from the start (we don't
-* have a reliable way to know when the user starts using it),
-* disable it now.
-*/
+   /*
+* PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
+* time. Since INTx has to be enabled from the start (once enabled,
+* 'pdev->intx_fd' holds an eventfd and no longer equals its initial
+* value of -1), disable it now.
+*/
+   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
vfio_pci_disable_intx(kvm, vdev);
-   /* Permanently disable INTx */
-   pdev->irq_modes &= ~VFIO_PCI_IRQ_MODE_INTX;
-   }
 
eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
 
@@ -162,7 +160,16 @@ static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
msi_set_enabled(msis->phys_state, false);
msi_set_empty(msis->phys_state, true);
 
-   return 0;
+   /*
+* When MSI or MSIX gets disabled, this may be because the guest
+* PCI driver has detected an MSI/MSIX failure and wants to roll
+* back to INTx mode.  Re-enable INTx here if the device supports
+* INTx mode.
+*/
+   if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
+   ret = vfio_pci_enable_intx(kvm, vdev);
+
+   return ret >= 0 ? 0 : ret;
 }
 
 static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
@@ -1002,6 +1009,10 @@ static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
.index  = VFIO_PCI_INTX_IRQ_INDEX,
};
 
+   /* INTx mode has been disabled */
+   if (pdev->intx_fd == -1)
+   return;
+
pr_debug("user requested MSI, disabling INTx %d", gsi);
 
ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
@@ -1009,6 +1020,7 @@ static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
 
close(pdev->intx_fd);
close(pdev->unmask_fd);
+   pdev->intx_fd = -1;
 }
 
 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
@@ -1020,6 +1032,10 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
struct vfio_pci_device *pdev = &vdev->pci;
int gsi = pdev->intx_gsi;
 
+   /* INTx mode has been enabled */
+   if (pdev->intx_fd != -1)
+   return 0;
+
/*
 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
 * signals an interrupt from host to guest, and unmask_fd signals the
@@ -1122,6 +1138,9 @@ static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
/* Guest is going to ovewrite our irq_line... */
pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
 
+   /* Use intx_fd = -1 to denote INTx is disabled */

[PATCH kvmtool v3 1/3] vfio-pci: Release INTx's unmask eventfd properly

2019-03-26 Thread Leo Yan
PCI INTx uses the eventfd 'unmask_fd' to signal the deassertion of the
line from guest to host, but this eventfd isn't released properly when
INTx is disabled.

This patch adds an 'unmask_fd' field to struct vfio_pci_device to store
the unmask eventfd, and closes it when INTx is disabled.

Reviewed-by: Jean-Philippe Brucker 
Signed-off-by: Leo Yan 
---
 include/kvm/vfio.h | 1 +
 vfio/pci.c | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
index 60e6c54..28223cf 100644
--- a/include/kvm/vfio.h
+++ b/include/kvm/vfio.h
@@ -74,6 +74,7 @@ struct vfio_pci_device {
 
unsigned long   irq_modes;
int intx_fd;
+   int unmask_fd;
unsigned intintx_gsi;
struct vfio_pci_msi_common  msi;
struct vfio_pci_msi_common  msix;
diff --git a/vfio/pci.c b/vfio/pci.c
index 03de3c1..5224fee 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -1008,6 +1008,7 @@ static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
irq__del_irqfd(kvm, gsi, pdev->intx_fd);
 
close(pdev->intx_fd);
+   close(pdev->unmask_fd);
 }
 
 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
@@ -1095,6 +1096,7 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
}
 
pdev->intx_fd = trigger_fd;
+   pdev->unmask_fd = unmask_fd;
/* Guest is going to ovewrite our irq_line... */
pdev->intx_gsi = gsi;
 
-- 
2.19.1



[PATCH kvmtool v3 0/3] vfio-pci: Support INTx mode re-enabling

2019-03-26 Thread Leo Yan
When enabling vfio-pci mode for the NIC driver on a Juno board, the IRQ
fails to be forwarded properly from host to guest; the root cause of this
issue is that kvmtool cannot re-enable INTx mode properly.

The basic flow to reproduce this issue is as follows:

  Host                          Guest
  ----                          -----
  INTx mode
                                MSI enable failed in NIC driver
                                MSI disable in NIC driver
                                Switch back to INTx mode --> kvmtool doesn't support

This patch set adds support for re-enabling INTx mode: patch 0001 is a
minor fix for eventfd releasing; patch 0002 introduces a new function,
vfio_pci_init_intx(), which performs the INTx one-time initialisation;
patch 0003 is the core patch supporting INTx mode re-enabling, so that
when kvmtool detects MSI being disabled it rolls back to INTx mode.

This patch set has been tested on Juno-r2 board.

== Changes for V3 ==
* Add new function vfio_pci_init_intx() for one-time initialisation.
* Simplified INTx re-enabling (don't change irq_line anymore at
  runtime).


Leo Yan (3):
  vfio-pci: Release INTx's unmask eventfd properly
  vfio-pci: Add new function for INTx one-time initialisation
  vfio-pci: Re-enable INTx mode when disable MSI/MSIX

 include/kvm/vfio.h |   1 +
 vfio/pci.c | 108 +
 2 files changed, 72 insertions(+), 37 deletions(-)

-- 
2.19.1



Re: [PATCH v5 00/26] KVM: arm64: SVE guest support

2019-03-26 Thread Dave Martin
On Mon, Mar 25, 2019 at 05:49:56PM +0100, Andrew Jones wrote:
> On Mon, Mar 25, 2019 at 04:38:03PM +, Peter Maydell wrote:
> > On Mon, 25 Mar 2019 at 16:33, Andrew Jones  wrote:
> > >
> > > On Tue, Mar 19, 2019 at 05:51:51PM +, Dave Martin wrote:
> > > > This series implements support for allowing KVM guests to use the Arm
> > > > Scalable Vector Extension (SVE), superseding the previous v5 series [1].
> > 
> > > Hi Dave and Peter,
> > >
> > > Do either of you know if anyone has picked up the QEMU side of this?
> > > If not, then I'll take it.
> > 
> > Nobody has yet, so feel free.
> >
> 
> OK, hopefully I'll get something sent by the end of next week.

If one of you guys can review the patches with QEMU in mind in the
meantime, that would be very helpful.

Just looking at the documentation updates would be a good place to
start.

Cheers
---Dave


Re: [RFC] arm/cpu: fix soft lockup panic after resuming from stop

2019-03-26 Thread Heyi Guo

Hi Steven,

Thanks for your explanation. Please see my comments below.


On 2019/3/21 0:29, Steven Price wrote:

On 19/03/2019 14:39, Heyi Guo wrote:

Hi Christoffer and Steve,


On 2019/3/15 16:59, Christoffer Dall wrote:

Hi Steve,

On Wed, Mar 13, 2019 at 10:11:30AM +, Steven Price wrote:

Personally I think what we need is:

* Either a patch like the one from Heyi Guo (save/restore CNTVCT_EL0) or
alternatively hooking up KVM_KVMCLOCK_CTRL to prevent the watchdog
firing when user space explicitly stops scheduling the guest for a
while.

If we save/restore CNTVCT_EL0 and the warning goes away, does the guest
wall clock timekeeping get all confused and does it figure this out
automagically somehow?

What's the source for guest wall clock timekeeping in current ARM64
implementation? Is it the value from CNTP_TVAL_EL0? Will guest OS keep
track of this? Or is the wall clock simply platform RTC?

I tested the RFC change and did not see anything unusual. Did I miss
something?

Are you running ARM or ARM64? I'm assuming ARM64 here...

Yes, definitely :)



Unless I'm mistaken you should see the time reported by the guest not
progress when you have stopped it using the QEMU monitor console.

Running something like "while /bin/true; do date; sleep 1; done" should
allow you to see that without the patch time will jump in the guest
(i.e. the time reflects wall-clock time). With the patch I believe it
will not jump (i.e. the clock will be behind wall-clock time after the
pause).

I did the test and the result was exactly like you said.


However, this behaviour does depend on the exact system being emulated.
From drivers/clocksource/arm_arch_timer.c:


static void __init arch_counter_register(unsigned type)
{
u64 start_count;

/* Register the CP15 based counter if we have one */
if (type & ARCH_TIMER_TYPE_CP15) {
if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) ||
arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI)
arch_timer_read_counter = arch_counter_get_cntvct;
else
arch_timer_read_counter = arch_counter_get_cntpct;

clocksource_counter.archdata.vdso_direct = vdso_default;
} else {
arch_timer_read_counter = arch_counter_get_cntvct_mem;
}

So we can see here that there are three functions for reading the
counter - there's the memory interface for CNTVCT, the "CP15" interface
also for CNTVCT, and an interface for CNTPCT. CNTPCT is only used for
!ARM64 or if hyp mode is available (not relevant until nested
virtualisation is added).

The upshot is that on arm64 we're using the virtual counter (CNTVCT).
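
For illustration, a minimal sketch of reading that counter directly (this
is the register a stopped-but-not-saved vCPU sees jumping forward; it
assumes an arm64 compiler and that EL0 access to the counter is permitted):

	static inline unsigned long long read_cntvct(void)
	{
		unsigned long long val;

		/* CNTVCT_EL0 is the virtual counter backing the guest's
		 * clocksource; it keeps running while the vCPU is stopped
		 * unless the VMM saves/restores it. */
		asm volatile("mrs %0, cntvct_el0" : "=r" (val));
		return val;
	}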


Does KVM_KVMCLOCK_CTRL solve that problem?

It seems KVM_KVMCLOCK_CTRL is dedicated for guest pause/resume scenario,
but it relies on pvclock both in KVM and Guest and right now only X86
supports it :(

KVM_KVMCLOCK_CTRL simply provides a mechanism for the host to inform the
guest that a vCPU hasn't been scheduled for "a while". The guest can
then use that information to avoid triggering the watchdog timeout. As
you note it is only currently implemented for X86.


Could Steve provide more insights about the whole thing?

I'll try - see below.


Thanks,
Heyi


* KVM itself saving/restoring CNTVCT_EL0 during suspend/resume so the
guest doesn't see time pass during a suspend.

This smells like policy to me so I'd much prefer keeping as much
functionality in user space as possible.  If we already have the APIs we
need from KVM, let's use them.

The problem with suspend/resume is that user space doesn't get a
look-in. There's no way (generically) for a user space application to
save/restore registers during the suspend. User space simply sees time
jump forwards - which is precisely what we're trying to hide from the
guest (as it otherwise gets upset that it appears a vCPU has deadlocked
for a while).

So while I agree this is "policy", it's something we need the kernel to
do on our behalf. Potentially we can put it behind an ioctl so that user
space can opt-in to it.


* Something equivalent to MSR_KVM_WALL_CLOCK_NEW for arm which allows
the guest to query the wall clock time from the host and provides an
offset between CNTVCT_EL0 to wall clock time which the KVM can update
during suspend/resume. This means that during a suspend/resume the guest
can observe that wall clock time has passed, without having to be
bothered about CNTVCT_EL0 jumping forwards.


Isn't the proper Arm architectural solution for this to read the
physical counter for wall clock time keeping ?

(Yes that will require a trap on physical counter reads after migration
on current systems, but migration sucks in terms of timekeeping
already.)

The physical counter is certainly tempting as it largely does what we
want as a secondary counter. However I'm wary of using it because it
starts to get "really interesting" when dealing with nested
virtualisation. And by "really interesti

Re: [RFC] arm/cpu: fix soft lockup panic after resuming from stop

2019-03-26 Thread Heyi Guo

I also tested save/restore operations, and observed that the clock in the
guest does not jump after restoring either. If we consider the guest clock
not being synchronized with the real wall clock to be an issue, does that
mean the save/restore function has the same issue?

Thanks,

Heyi



Re: [PATCH v6 12/27] KVM: arm64/sve: System register context switch and access support

2019-03-26 Thread Julien Thierry
Hi Dave,

On 19/03/2019 17:52, Dave Martin wrote:
> This patch adds the necessary support for context switching ZCR_EL1
> for each vcpu.
> 
> ZCR_EL1 is trapped alongside the FPSIMD/SVE registers, so it makes
> sense for it to be handled as part of the guest FPSIMD/SVE context
> for context switch purposes instead of handling it as a general
> system register.  This means that it can be switched in lazily at
> the appropriate time.  No effort is made to track host context for
> this register, since SVE requires VHE: thus the hosts's value for
> this register lives permanently in ZCR_EL2 and does not alias the
> guest's value at any time.
> 
> The Hyp switch and fpsimd context handling code is extended
> appropriately.
> 
> Accessors are added in sys_regs.c to expose the SVE system
> registers and ID register fields.  Because these need to be
> conditionally visible based on the guest configuration, they are
> implemented separately for now rather than by use of the generic
> system register helpers.  This may be abstracted better later on
> when/if there are more features requiring this model.
> 
> ID_AA64ZFR0_EL1 is RO-RAZ for MRS/MSR when SVE is disabled for the
> guest, but for compatibility with non-SVE aware KVM implementations
> the register should not be enumerated at all for KVM_GET_REG_LIST
> in this case.  For consistency we also reject ioctl access to the
> register.  This ensures that a non-SVE-enabled guest looks the same
> to userspace, irrespective of whether the kernel KVM implementation
> supports SVE.
> 
> Signed-off-by: Dave Martin 

Reviewed-by: Julien Thierry 

Cheers,

Julien

> 
> ---
> 
> Changes since v5:
> 
>  * Port to the renamed visibility() framework.
> 
>  * Swap visibility() helpers so that they appear by the relevant accessor
>functions.
> 
>  * [Julien Grall] With the visibility() checks, {get,set}_zcr_el1()
>degenerate to doing exactly what the common code does, so drop them.
> 
>The ID_AA64ZFR0_EL1 handlers are still needed to provide conditional
>RAZ behaviour.  This could be moved to the common code too, but since
>this is a one-off case I don't do this for now.  We can address this
>later if other regs need to follow the same pattern.
> 
>  * [Julien Thierry] Reset ZCR_EL1 to a fixed value using reset_val
>instead of relying on reset_unknown() honouring set bits in val
>as RES0.
> 
>Most of the bits in ZCR_EL1 are RES0 anyway, and many implementations
>of SVE will support larger vectors than 128 bits, so 0 seems as good
>a value as any to expose guests that forget to initialise this
>register properly.
> ---
>  arch/arm64/include/asm/kvm_host.h |  1 +
>  arch/arm64/include/asm/sysreg.h   |  3 ++
>  arch/arm64/kvm/fpsimd.c   |  9 -
>  arch/arm64/kvm/hyp/switch.c   |  3 ++
>  arch/arm64/kvm/sys_regs.c | 83 ---
>  5 files changed, 93 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index ad4f7f0..22cf484 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -121,6 +121,7 @@ enum vcpu_sysreg {
>   SCTLR_EL1,  /* System Control Register */
>   ACTLR_EL1,  /* Auxiliary Control Register */
>   CPACR_EL1,  /* Coprocessor Access Control */
> + ZCR_EL1,/* SVE Control */
>   TTBR0_EL1,  /* Translation Table Base Register 0 */
>   TTBR1_EL1,  /* Translation Table Base Register 1 */
>   TCR_EL1,/* Translation Control Register */
> diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
> index 5b267de..4d6262d 100644
> --- a/arch/arm64/include/asm/sysreg.h
> +++ b/arch/arm64/include/asm/sysreg.h
> @@ -454,6 +454,9 @@
>  #define SYS_ICH_LR14_EL2 __SYS__LR8_EL2(6)
>  #define SYS_ICH_LR15_EL2 __SYS__LR8_EL2(7)
>  
> +/* VHE encodings for architectural EL0/1 system registers */
> +#define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0)
> +
>  /* Common SCTLR_ELx flags. */
>  #define SCTLR_ELx_DSSBS  (_BITUL(44))
>  #define SCTLR_ELx_ENIA   (_BITUL(31))
> diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
> index 1cf4f02..7053bf4 100644
> --- a/arch/arm64/kvm/fpsimd.c
> +++ b/arch/arm64/kvm/fpsimd.c
> @@ -103,14 +103,21 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
>  void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
>  {
>   unsigned long flags;
> + bool host_has_sve = system_supports_sve();
> + bool guest_has_sve = vcpu_has_sve(vcpu);
>  
>   local_irq_save(flags);
>  
>   if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
> + u64 *guest_zcr = &vcpu->arch.ctxt.sys_regs[ZCR_EL1];
> +
>   /* Clean guest FP state to memory and invalidate cpu view */
>   fpsimd_save();
>   fpsimd_flush_cpu_state();
> - } else if (system_supports_sve()) {
> +
> +   

Re: [RFC] arm/cpu: fix soft lockup panic after resuming from stop

2019-03-26 Thread Steven Price
Hi Heyi,

On 26/03/2019 13:53, Heyi Guo wrote:
> I also tested save/restore operations, and observed that clock in guest
> would not jump after restoring either. If we consider guest clock not
> being synchronized with real wall clock as an issue, does it mean
> save/restore function has the same issue?

Basically at the moment when the guest isn't running you have a choice
of two behaviours:

 1. Stop (i.e. save/restore) CNTVCT - this means that the guest sees no
time occur. If the guest needs to have a concept of wall-clock time
(e.g. it communicates with other systems over a network) then this can
cause problems (e.g. timeouts might be wrong, certificates might start
appearing to be in the future etc).

 2. Leave CNTVCT running - the guest sees the time pass but interprets
the vCPUs as effectively having locked up. Linux will trigger the soft
lockup warning.

There are two ways of solving this, which match the two behaviours above:

 1. Provide the guest with a view of wall-clock time. The obvious way of
doing this is with a pvclock implementation like MSR_KVM_WALL_CLOCK_NEW
for x86.

 2. Inform the guest to ignore the apparent "soft-lockup". There's
already an ioctl for x86 for this: KVM_KVMCLOCK_CTRL
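
To make option 2 concrete: on x86 this is just a vCPU ioctl with no
payload. A minimal userspace sketch (assuming vcpu_fd is a vCPU file
descriptor obtained from KVM_CREATE_VCPU; x86-only for now):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Tell KVM the host deliberately stopped scheduling this vCPU, so
	 * the guest's kvmclock-aware soft-lockup detector can ignore the
	 * gap in time. */
	static int mark_vcpu_paused(int vcpu_fd)
	{
		return ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0);
	}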

My preference is for option 1 - as this gives the guest a good view of
both the time that it is actually executing (useful for internal
watchdog timers like the soft-lockup one in Linux) and maintains a view
of wall-clock time (useful when communicating with other external
services - e.g. a server on the internet). Your patch to QEMU
provides the first step of that, but as you mention there's much more to do.

One thing I haven't investigated in great detail is how KVM handles the
timer during various forms of suspend. In particular for suspend types
like full hibernation the host's physical counter will jump (quite
possibly backwards) - I haven't looked in detail how KVM presents this
to the guest. Hopefully not by making it go backwards!

I'm not sure how much time I'm going to have to look at this in the near
future, but please keep me in the loop if you decide to tackle any of this.

Thanks,

Steve


Re: [PATCH v7 7/10] KVM: arm/arm64: context-switch ptrauth registers

2019-03-26 Thread Kristina Martsenko
On 26/03/2019 04:03, Amit Daniel Kachhap wrote:
> Hi,
> 
> On 3/26/19 1:34 AM, Kristina Martsenko wrote:
>> On 19/03/2019 08:30, Amit Daniel Kachhap wrote:
>>> From: Mark Rutland 
>>>
>>> When pointer authentication is supported, a guest may wish to use it.
>>> This patch adds the necessary KVM infrastructure for this to work, with
>>> a semi-lazy context switch of the pointer auth state.
>>
>> [...]
>>
>>> +    if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) ||
>>> +    test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) {
>>> +    /* Verify that KVM startup matches the conditions for ptrauth */
>>> +    if (WARN_ON(!vcpu_has_ptrauth(vcpu)))
>>> +    return -EINVAL;
>>> +    }
>>
>> I think this now needs to have "goto out;" instead of "return -EINVAL;",
>> since 5.1-rcX contains commit e761a927bc9a ("KVM: arm/arm64: Reset the
>> VCPU without preemption and vcpu state loaded") which changed some of
>> this code.
> ok missed the changes for this commit.

One more thing - I think the WARN_ON() here should be removed. Otherwise
if panic_on_warn is set then userspace can panic the kernel. I think
WARN_ON is only for internal kernel errors (see comment in
include/asm-generic/bug.h).
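
To make that concrete, a rough sketch of the check without the WARN_ON
(assuming the 'ret'/'out' error path from commit e761a927bc9a mentioned
above; a sketch, not the authoritative fix):

	if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) ||
	    test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) {
		/* Userspace-triggerable condition: reject it quietly rather
		 * than WARN_ON(), which would panic with panic_on_warn set. */
		if (!vcpu_has_ptrauth(vcpu)) {
			ret = -EINVAL;
			goto out;
		}
	}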

Thanks,
Kristina


Re: [PATCH v7 7/10] KVM: arm/arm64: context-switch ptrauth registers

2019-03-26 Thread Amit Daniel Kachhap



On 3/26/19 11:31 PM, Kristina Martsenko wrote:

On 26/03/2019 04:03, Amit Daniel Kachhap wrote:

Hi,

On 3/26/19 1:34 AM, Kristina Martsenko wrote:

On 19/03/2019 08:30, Amit Daniel Kachhap wrote:

From: Mark Rutland 

When pointer authentication is supported, a guest may wish to use it.
This patch adds the necessary KVM infrastructure for this to work, with
a semi-lazy context switch of the pointer auth state.


[...]


+    if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) ||
+    test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) {
+    /* Verify that KVM startup matches the conditions for ptrauth */
+    if (WARN_ON(!vcpu_has_ptrauth(vcpu)))
+    return -EINVAL;
+    }


I think this now needs to have "goto out;" instead of "return -EINVAL;",
since 5.1-rcX contains commit e761a927bc9a ("KVM: arm/arm64: Reset the
VCPU without preemption and vcpu state loaded") which changed some of
this code.

ok missed the changes for this commit.


One more thing - I think the WARN_ON() here should be removed. Otherwise
if panic_on_warn is set then userspace can panic the kernel. I think
WARN_ON is only for internal kernel errors (see comment in
include/asm-generic/bug.h).


The documentation makes sense, so in this case a pr_err-like message
will suffice. By the way, there is one WARN in the function
kvm_set_ipa_limit in the same file.


Thanks,
Amit D


Thanks,
Kristina




RE: [PATCH v5 00/26] KVM: arm64: SVE guest support

2019-03-26 Thread Zhang, Lei
Hi guys,


> -Original Message-
> From: linux-arm-kernel  On
> Behalf Of Dave Martin
> Sent: Wednesday, March 20, 2019 2:59 AM
> To: kvmarm@lists.cs.columbia.edu
> Cc: Peter Maydell ; Okamoto, Takayuki/��本 高幸
> ; Christoffer Dall ; Ard
> Biesheuvel ; Marc Zyngier
> ; Catalin Marinas ; Will
> Deacon ; Zhang, Lei
> ; Julien Grall ; Alex
> Bennée ; linux-arm-ker...@lists.infradead.org
> Subject: Re: [PATCH v5 00/26] KVM: arm64: SVE guest support
> 
> On Tue, Mar 19, 2019 at 05:51:51PM +, Dave Martin wrote:
> 
> [...]
> 
> Yes, I messed up the subject line in the cover letter :P
> 
> The prefix should read "[PATCH v6 00/27]"

[>] 
I have tested the v6 patches on the Fujitsu A64FX chip, which implements
the SVE feature.
All of the tests have passed.
Please add the following:
Tested-by: zhang.lei 


Best Regards,
Zhang Lei