RE: [PATCH v9 01/13] KVM: PPC: POWERNV: move iommu_add_device earlier
Hi Alex, Looks like this patch is not picked by anyone, Are you going to pick this patch? My vfio/iommu patches have dependency on this patch (this is already tested by me). Thanks -Bharat > -Original Message- > From: Linuxppc-dev [mailto:linuxppc-dev- > bounces+bharat.bhushan=freescale@lists.ozlabs.org] On Behalf Of Alexey > Kardashevskiy > Sent: Wednesday, August 28, 2013 2:08 PM > To: linuxppc-...@lists.ozlabs.org > Cc: kvm@vger.kernel.org; Gleb Natapov; Alexey Kardashevskiy; Alexander Graf; > kvm-...@vger.kernel.org; linux-ker...@vger.kernel.org; linux...@kvack.org; > Paul > Mackerras; Paolo Bonzini; David Gibson > Subject: [PATCH v9 01/13] KVM: PPC: POWERNV: move iommu_add_device earlier > > The current implementation of IOMMU on sPAPR does not use iommu_ops and > therefore does not call IOMMU API's bus_set_iommu() which > 1) sets iommu_ops for a bus > 2) registers a bus notifier > Instead, PCI devices are added to IOMMU groups from > subsys_initcall_sync(tce_iommu_init) which does basically the same thing > without > using iommu_ops callbacks. > > However Freescale PAMU driver (https://lkml.org/lkml/2013/7/1/158) > implements iommu_ops and when tce_iommu_init is called, every PCI device is > already added to some group so there is a conflict. > > This patch does 2 things: > 1. removes the loop in which PCI devices were added to groups and adds > explicit > iommu_add_device() calls to add devices as soon as they get the iommu_table > pointer assigned to them. > 2. moves a bus notifier to powernv code in order to avoid conflict with the > notifier from Freescale driver. > > iommu_add_device() and iommu_del_device() are public now. 
> > Signed-off-by: Alexey Kardashevskiy > --- > Changes: > v8: > * added the check for iommu_group!=NULL before removing device from a group as > suggested by Wei Yang > > v2: > * added a helper - set_iommu_table_base_and_group - which does > set_iommu_table_base() and iommu_add_device() > --- > arch/powerpc/include/asm/iommu.h| 9 +++ > arch/powerpc/kernel/iommu.c | 41 > +++-- > arch/powerpc/platforms/powernv/pci-ioda.c | 8 +++--- > arch/powerpc/platforms/powernv/pci-p5ioc2.c | 2 +- > arch/powerpc/platforms/powernv/pci.c| 33 ++- > arch/powerpc/platforms/pseries/iommu.c | 8 +++--- > 6 files changed, 55 insertions(+), 46 deletions(-) > > diff --git a/arch/powerpc/include/asm/iommu.h > b/arch/powerpc/include/asm/iommu.h > index c34656a..19ad77f 100644 > --- a/arch/powerpc/include/asm/iommu.h > +++ b/arch/powerpc/include/asm/iommu.h > @@ -103,6 +103,15 @@ extern struct iommu_table *iommu_init_table(struct > iommu_table * tbl, > int nid); > extern void iommu_register_group(struct iommu_table *tbl, >int pci_domain_number, unsigned long pe_num); > +extern int iommu_add_device(struct device *dev); extern void > +iommu_del_device(struct device *dev); > + > +static inline void set_iommu_table_base_and_group(struct device *dev, > + void *base) > +{ > + set_iommu_table_base(dev, base); > + iommu_add_device(dev); > +} > > extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl, > struct scatterlist *sglist, int nelems, diff --git > a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index > b20ff17..15f8ca8 100644 > --- a/arch/powerpc/kernel/iommu.c > +++ b/arch/powerpc/kernel/iommu.c > @@ -1105,7 +1105,7 @@ void iommu_release_ownership(struct iommu_table *tbl) } > EXPORT_SYMBOL_GPL(iommu_release_ownership); > > -static int iommu_add_device(struct device *dev) > +int iommu_add_device(struct device *dev) > { > struct iommu_table *tbl; > int ret = 0; > @@ -1134,46 +1134,13 @@ static int iommu_add_device(struct device *dev) > > return ret; > } > 
+EXPORT_SYMBOL_GPL(iommu_add_device); > > -static void iommu_del_device(struct device *dev) > +void iommu_del_device(struct device *dev) > { > iommu_group_remove_device(dev); > } > - > -static int iommu_bus_notifier(struct notifier_block *nb, > - unsigned long action, void *data) > -{ > - struct device *dev = data; > - > - switch (action) { > - case BUS_NOTIFY_ADD_DEVICE: > - return iommu_add_device(dev); > - case BUS_NOTIFY_DEL_DEVICE: > - iommu_del_device(dev); > - return 0; > - default: > - return 0; > - } > -} > - > -static struct notifier_block tce_iommu_bus_nb = { > - .notifier_call = iommu_bus_notifier, > -}; > - > -static int __init tce_iommu_init(void) > -{ > - struct pci_dev *pdev = NULL; > - > - BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE); > - > - for_each_pci_dev(pdev) > - iommu_add_device(&pdev->dev); > - > - bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); > - return 0
RE: [PATCH 2/7] Initial skeleton of VFIO support for Device Tree based devices
> -Original Message- > From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf > Of > Alex Williamson > Sent: Tuesday, October 29, 2013 5:17 PM > To: Don Dutile > Cc: Bhushan Bharat-R65777; Antonios Motakis; kvm...@lists.cs.columbia.edu; > linux-samsung-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Yoder > Stuart-B08248; io...@lists.linux-foundation.org; t...@virtualopensystems.com > Subject: Re: [PATCH 2/7] Initial skeleton of VFIO support for Device Tree > based > devices > > On Mon, 2013-10-28 at 21:29 -0400, Don Dutile wrote: > > On 09/30/2013 11:37 AM, Bhushan Bharat-R65777 wrote: > > > > > > > > >> -Original Message- > > >> From: iommu-boun...@lists.linux-foundation.org [mailto:iommu- > > >> boun...@lists.linux-foundation.org] On Behalf Of Antonios Motakis > > >> Sent: Monday, September 30, 2013 8:59 PM > > >> To: kvm...@lists.cs.columbia.edu; alex.william...@redhat.com > > >> Cc: linux-samsung-...@vger.kernel.org; kvm@vger.kernel.org; > > >> ag...@suse.de; Yoder Stuart-B08248; > > >> io...@lists.linux-foundation.org; Antonios Motakis; > > >> t...@virtualopensystems.com > > >> Subject: [PATCH 2/7] Initial skeleton of VFIO support for Device > > >> Tree based devices > > >> > > >> Platform devices in the Linux kernel are usually managed by the DT > interface. > > >> This patch forms the base to support these kind of devices with VFIO. > > >> > > >> Signed-off-by: Antonios Motakis > > >> --- > > >> drivers/vfio/Kconfig | 11 +++ > > >> drivers/vfio/Makefile| 1 + > > >> drivers/vfio/vfio_platform.c | 187 > +++ > > >> include/uapi/linux/vfio.h| 1 + > > >> 4 files changed, 200 insertions(+) > > >> create mode 100644 drivers/vfio/vfio_platform.c > > >> > > >> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index > > >> 1f84eda..35254b7 > > >> 100644 > > >> --- a/drivers/vfio/Kconfig > > >> +++ b/drivers/vfio/Kconfig > > >> @@ -13,4 +13,15 @@ menuconfig VFIO > > >> > > >>If you don't know what to do here, say N. 
> > >> > > >> +config VFIO_PLATFORM > > >> +tristate "VFIO support for device tree based platform devices" > > >> +depends on VFIO&& EVENTFD&& OF > > >> +help > > >> + Support for platform devices with VFIO. This is required to > > >> make > > >> + use of platform devices present on device tree nodes using > > >> the VFIO > > >> + framework. Devices that are not described in the device tree > > >> cannot > > >> + be used by this driver. > > >> + > > >> + If you don't know what to do here, say N. > > >> + > > >> source "drivers/vfio/pci/Kconfig" > > >> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index > > >> 2398d4a..575c8dd 100644 > > >> --- a/drivers/vfio/Makefile > > >> +++ b/drivers/vfio/Makefile > > >> @@ -1,3 +1,4 @@ > > >> obj-$(CONFIG_VFIO) += vfio.o > > >> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o > > >> obj-$(CONFIG_VFIO_PCI) += pci/ > > >> +obj-$(CONFIG_VFIO_PLATFORM) += vfio_platform.o > > >> diff --git a/drivers/vfio/vfio_platform.c > > >> b/drivers/vfio/vfio_platform.c new > > > > > > We can make this parallel to PCI, something like > > > drivers/vfio/platform/platform.c > > > > > pls, no. 'platform' is too generic, and it really means 'arm-dt' ... > > so can move it to the arch/arm space, and have its kconfig conditional on > ARM&&VFIO. > > if kept under drivers/vfio, then use a better directory name that ties it to > arm-dt. > > thanks. > > The intention is that vfio platform device support is not arm-dt specific. > This > is to be used by both arm and embedded ppc. The devices we intend to support > with them are known as platform drivers in the kernel, thus the name. I > suppose > the question remains whether the interface here is really generic for any > "platform" device or whether we're making an interface > specifically for device tree platfo
RE: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via sysfs only
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, October 29, 2013 10:25 AM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; Alex Williamson; Kim Phillips; Yoder Stuart-B08248; > christoffer.d...@linaro.org; linux-ker...@vger.kernel.org; > a.mota...@virtualopensystems.com; ag...@suse.de; Sethi Varun-B16395; > peter.mayd...@linaro.org; santosh.shu...@linaro.org; kvm@vger.kernel.org; > gre...@linuxfoundation.org > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via > sysfs only > > On Mon, 2013-10-28 at 23:45 -0500, Bhushan Bharat-R65777 wrote: > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Tuesday, October 29, 2013 10:05 AM > > > To: Bhushan Bharat-R65777 > > > Cc: Wood Scott-B07421; Alex Williamson; Kim Phillips; Yoder > > > Stuart-B08248; christoffer.d...@linaro.org; > > > linux-ker...@vger.kernel.org; a.mota...@virtualopensystems.com; > > > ag...@suse.de; Sethi Varun-B16395; peter.mayd...@linaro.org; > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > gre...@linuxfoundation.org > > > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit > > > binding via sysfs only > > > > > > On Mon, 2013-10-28 at 23:31 -0500, Bhushan Bharat-R65777 wrote: > > > > > > > > > -Original Message- > > > > > From: Wood Scott-B07421 > > > > > Sent: Tuesday, October 29, 2013 10:00 AM > > > > > To: Bhushan Bharat-R65777 > > > > > Cc: Wood Scott-B07421; Alex Williamson; Kim Phillips; Yoder > > > > > Stuart-B08248; christoffer.d...@linaro.org; > > > > > linux-ker...@vger.kernel.org; a.mota...@virtualopensystems.com; > > > > > ag...@suse.de; Sethi Varun-B16395; peter.mayd...@linaro.org; > > > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > > > gre...@linuxfoundation.org > > > > > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit > > > > > binding via sysfs only > > > > > > > > > > On Mon, 2013-10-28 at 22:52 -0500, Bhushan Bharat-R65777 wrote: > > > > > > So when ids == NULL it does 
not check of vendor etc and calls > > > > > > pci_add_dynid() > > > > > which in turn calls driver_attach(). > > > > > > > > > > > > If we change the above loop to break if ids->vendor == > > > > > >PCI_ANY_ID && ids- subvendor == PCI_ANY_ID then also we will call > pci_add_dyids(). > > > > > > > > > > What problem are you trying to solve? > > > > > > > > new_id interface to continue working as before. > > > > > > In what specific way does this allow new_id to continue working as > > > before? Be verbose. > > > > > > What I observed that this patch (kim's patch) new_id interface stops > > working. > > Yes. > > > This is found to be because store_new_id() checks for pdrv->id_table > > which is no more NULL, so the below check fails > > I do not think that is the reason. The reason is because sysfs_bind_only is > set, and this is not a direct sysfs bind. > > > if (ids) { > > ^^ > > This is no more NULL, so enter inside the loop > > > > retval = -EINVAL; > > while (ids->vendor || ids->subvendor || ids->class_mask) { > > if (driver_data == ids->driver_data) { > > retval = 0; > > break; > > } > > ids++; > > } > > if (retval) /* No match */ > > return retval; ^ This is where it returns > > as -EINVAL > > Why wouldn't it have broken out of the loop earlier, since driver_data and > ids- > >driver_data should both be zero? I assume this is with a patch to do > PCI_ANY_ID in vfio-pci. hmmm, I am pretty sure I have seen that issue a few time (below is command line output) but now I am not getting any error reported. Although device is not binding to driver because of sysfs_bind_only as you mentioned (I thought of this as a second issue). 
If I am able to reproduce the first issue, I will let you know; otherwise there was no first issue :( root@p5040ds:/sys/bus/pci# echo :01:00.0 > devices/\:01\:00.0/driver/unbind e1000e :01:00.0 eth0: removed PHC root@p5040ds:/sys/bus/pci# echo 8086 10d3 > drivers/vfio-pci/new_id -sh: echo: write error: Invalid argument root@p5040ds:/sys/bus/pci# echo :01:00.0 > drivers/vfio-pci/bind -Bharat > > -Scott >
RE: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via sysfs only
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, October 29, 2013 10:05 AM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; Alex Williamson; Kim Phillips; Yoder Stuart-B08248; > christoffer.d...@linaro.org; linux-ker...@vger.kernel.org; > a.mota...@virtualopensystems.com; ag...@suse.de; Sethi Varun-B16395; > peter.mayd...@linaro.org; santosh.shu...@linaro.org; kvm@vger.kernel.org; > gre...@linuxfoundation.org > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via > sysfs only > > On Mon, 2013-10-28 at 23:31 -0500, Bhushan Bharat-R65777 wrote: > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Tuesday, October 29, 2013 10:00 AM > > > To: Bhushan Bharat-R65777 > > > Cc: Wood Scott-B07421; Alex Williamson; Kim Phillips; Yoder > > > Stuart-B08248; christoffer.d...@linaro.org; > > > linux-ker...@vger.kernel.org; a.mota...@virtualopensystems.com; > > > ag...@suse.de; Sethi Varun-B16395; peter.mayd...@linaro.org; > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > gre...@linuxfoundation.org > > > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit > > > binding via sysfs only > > > > > > On Mon, 2013-10-28 at 22:52 -0500, Bhushan Bharat-R65777 wrote: > > > > > > > > > -Original Message- > > > > > From: Wood Scott-B07421 > > > > > Sent: Tuesday, October 29, 2013 9:11 AM > > > > > To: Bhushan Bharat-R65777 > > > > > Cc: Wood Scott-B07421; Alex Williamson; Kim Phillips; Yoder > > > > > Stuart-B08248; christoffer.d...@linaro.org; > > > > > linux-ker...@vger.kernel.org; a.mota...@virtualopensystems.com; > > > > > ag...@suse.de; Sethi Varun-B16395; peter.mayd...@linaro.org; > > > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > > > gre...@linuxfoundation.org > > > > > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit > > > > > binding via sysfs only > > > > > > > > > > On Mon, 2013-10-28 at 22:38 -0500, Bhushan Bharat-R65777 wrote: > > > > > > > > > > > > > -Original 
Message- > > > > > > > From: Wood Scott-B07421 > > > > > > > Sent: Monday, October 28, 2013 11:40 PM > > > > > > > To: Alex Williamson > > > > > > > Cc: Kim Phillips; Bhushan Bharat-R65777; Wood Scott-B07421; > > > > > > > Yoder Stuart-B08248; christoffer.d...@linaro.org; > > > > > > > linux-ker...@vger.kernel.org; > > > > > > > a.mota...@virtualopensystems.com; ag...@suse.de; Sethi > > > > > > > Varun-B16395; peter.mayd...@linaro.org; > > > > > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > > > > > gre...@linuxfoundation.org > > > > > > > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for > > > > > > > explicit binding via sysfs only > > > > > > > > > > > > > > On Mon, 2013-10-28 at 13:00 -0500, Scott Wood wrote: > > > > > > > > On Mon, 2013-10-28 at 11:47 -0600, Alex Williamson wrote: > > > > > > > > > On Fri, 2013-10-11 at 01:27 -0500, Kim Phillips wrote: > > > > > > > > > > Force the vfio-pci driver to only be bound explicitly > > > > > > > > > > via sysfs to avoid conflics with other drivers in the > > > > > > > > > > event of a > > > hotplug. > > > > > > > > > > > > > > > > > > We can't break userspace, so we can't disable the > > > > > > > > > current method of binding devices to vfio-pci. We can > > > > > > > > > add a new method and perhaps deprecate the existing > > > > > > > > > mechanism to be removed at some point in the future. > > > > > > > > > Thanks, > > > > > > > > > > > > > > > > I thought the existing method involved using sysfs bind, > > > > > > > > and this was just eliminating a race. How does the bind > > > > > > > > get triggered > > > currently? > > > > > > > > > > > > > > OK, so it seems it's relying on the write to new_id calling > > > driver_attach(). > > > > > >
RE: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via sysfs only
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, October 29, 2013 10:00 AM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; Alex Williamson; Kim Phillips; Yoder Stuart-B08248; > christoffer.d...@linaro.org; linux-ker...@vger.kernel.org; > a.mota...@virtualopensystems.com; ag...@suse.de; Sethi Varun-B16395; > peter.mayd...@linaro.org; santosh.shu...@linaro.org; kvm@vger.kernel.org; > gre...@linuxfoundation.org > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via > sysfs only > > On Mon, 2013-10-28 at 22:52 -0500, Bhushan Bharat-R65777 wrote: > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Tuesday, October 29, 2013 9:11 AM > > > To: Bhushan Bharat-R65777 > > > Cc: Wood Scott-B07421; Alex Williamson; Kim Phillips; Yoder > > > Stuart-B08248; christoffer.d...@linaro.org; > > > linux-ker...@vger.kernel.org; a.mota...@virtualopensystems.com; > > > ag...@suse.de; Sethi Varun-B16395; peter.mayd...@linaro.org; > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > gre...@linuxfoundation.org > > > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit > > > binding via sysfs only > > > > > > On Mon, 2013-10-28 at 22:38 -0500, Bhushan Bharat-R65777 wrote: > > > > > > > > > -Original Message- > > > > > From: Wood Scott-B07421 > > > > > Sent: Monday, October 28, 2013 11:40 PM > > > > > To: Alex Williamson > > > > > Cc: Kim Phillips; Bhushan Bharat-R65777; Wood Scott-B07421; > > > > > Yoder Stuart-B08248; christoffer.d...@linaro.org; > > > > > linux-ker...@vger.kernel.org; a.mota...@virtualopensystems.com; > > > > > ag...@suse.de; Sethi Varun-B16395; peter.mayd...@linaro.org; > > > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > > > gre...@linuxfoundation.org > > > > > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit > > > > > binding via sysfs only > > > > > > > > > > On Mon, 2013-10-28 at 13:00 -0500, Scott Wood wrote: > > > > > > On Mon, 2013-10-28 at 11:47 -0600, Alex 
Williamson wrote: > > > > > > > On Fri, 2013-10-11 at 01:27 -0500, Kim Phillips wrote: > > > > > > > > Force the vfio-pci driver to only be bound explicitly via > > > > > > > > sysfs to avoid conflics with other drivers in the event of a > hotplug. > > > > > > > > > > > > > > We can't break userspace, so we can't disable the current > > > > > > > method of binding devices to vfio-pci. We can add a new > > > > > > > method and perhaps deprecate the existing mechanism to be > > > > > > > removed at some point in the future. Thanks, > > > > > > > > > > > > I thought the existing method involved using sysfs bind, and > > > > > > this was just eliminating a race. How does the bind get triggered > currently? > > > > > > > > > > OK, so it seems it's relying on the write to new_id calling > driver_attach(). > > > > > Sigh. I guess we could make driver-sysfs-bind-only be settable > > > > > via sysfs, and have new-userspace set both that and PCI_ANY_ID > > > > > (or the specific ID if userspace > > > > > prefers) via new_id. The platform bus patches could continue as > > > > > is, since there's no existing mechanism to break. > > > > > > > > What about changing the store_new_id() to bypass exact ids check > > > > if driver > > > have PCI_ANY_ID? > > > > > > I don't follow. 
> > > > store_new_id() function is defined as: > > > > static ssize_t store_new_id(struct device_driver *driver, const char > > *buf, size_t count) { > > struct pci_driver *pdrv = to_pci_driver(driver); > > const struct pci_device_id *ids = pdrv->id_table; > > > > > > /* Only accept driver_data values that match an existing id_table > >entry */ > > if (ids) { > > retval = -EINVAL; > > while (ids->vendor || ids->subvendor || ids->class_mask) { > > if (driver_data == ids->driver_data) { > > retval = 0; > > break; > > } > > ids++; > > } > > if (retval) /* No match */ > > return retval; > > } > > > > retval = pci_add_dynid(pdrv, vendor, device, subvendor, subdevice, > >class, class_mask, driver_data); > > > > > > So when ids == NULL it does not check vendor etc. and calls > > pci_add_dynid() > which in turn calls driver_attach(). > > > > If we change the above loop to break if ids->vendor == PCI_ANY_ID && ids- > >subvendor == PCI_ANY_ID then also we will call pci_add_dynid(). > > What problem are you trying to solve? I want the new_id interface to continue working as before. -Bharat > > -Scott >
RE: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via sysfs only
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, October 29, 2013 9:11 AM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; Alex Williamson; Kim Phillips; Yoder Stuart-B08248; > christoffer.d...@linaro.org; linux-ker...@vger.kernel.org; > a.mota...@virtualopensystems.com; ag...@suse.de; Sethi Varun-B16395; > peter.mayd...@linaro.org; santosh.shu...@linaro.org; kvm@vger.kernel.org; > gre...@linuxfoundation.org > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via > sysfs only > > On Mon, 2013-10-28 at 22:38 -0500, Bhushan Bharat-R65777 wrote: > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Monday, October 28, 2013 11:40 PM > > > To: Alex Williamson > > > Cc: Kim Phillips; Bhushan Bharat-R65777; Wood Scott-B07421; Yoder > > > Stuart-B08248; christoffer.d...@linaro.org; > > > linux-ker...@vger.kernel.org; a.mota...@virtualopensystems.com; > > > ag...@suse.de; Sethi Varun-B16395; peter.mayd...@linaro.org; > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > gre...@linuxfoundation.org > > > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit > > > binding via sysfs only > > > > > > On Mon, 2013-10-28 at 13:00 -0500, Scott Wood wrote: > > > > On Mon, 2013-10-28 at 11:47 -0600, Alex Williamson wrote: > > > > > On Fri, 2013-10-11 at 01:27 -0500, Kim Phillips wrote: > > > > > > Force the vfio-pci driver to only be bound explicitly via > > > > > > sysfs to avoid conflics with other drivers in the event of a > > > > > > hotplug. > > > > > > > > > > We can't break userspace, so we can't disable the current method > > > > > of binding devices to vfio-pci. We can add a new method and > > > > > perhaps deprecate the existing mechanism to be removed at some > > > > > point in the future. Thanks, > > > > > > > > I thought the existing method involved using sysfs bind, and this > > > > was just eliminating a race. How does the bind get triggered currently? 
> > > > > > OK, so it seems it's relying on the write to new_id calling > > > driver_attach(). > > > Sigh. I guess we could make driver-sysfs-bind-only be settable via > > > sysfs, and have new-userspace set both that and PCI_ANY_ID (or the > > > specific ID if userspace > > > prefers) via new_id. The platform bus patches could continue as is, > > > since there's no existing mechanism to break. > > > > What about changing the store_new_id() to bypass exact ids check if driver > have PCI_ANY_ID? > > I don't follow. store_new_id() function id defined as: static ssize_t store_new_id(struct device_driver *driver, const char *buf, size_t count) { struct pci_driver *pdrv = to_pci_driver(driver); const struct pci_device_id *ids = pdrv->id_table; /* Only accept driver_data values that match an existing id_table entry */ if (ids) { retval = -EINVAL; while (ids->vendor || ids->subvendor || ids->class_mask) { if (driver_data == ids->driver_data) { retval = 0; break; } ids++; } if (retval) /* No match */ return retval; } retval = pci_add_dynid(pdrv, vendor, device, subvendor, subdevice, class, class_mask, driver_data); So when ids == NULL it does not check of vendor etc and calls pci_add_dynid() which in turn calls driver_attach(). If we change the above loop to break if ids->vendor == PCI_ANY_ID && ids->subvendor == PCI_ANY_ID then also we will call pci_add_dyids(). -Bharat > > -Scott >
RE: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via sysfs only
> -Original Message- > From: Wood Scott-B07421 > Sent: Monday, October 28, 2013 11:40 PM > To: Alex Williamson > Cc: Kim Phillips; Bhushan Bharat-R65777; Wood Scott-B07421; Yoder > Stuart-B08248; > christoffer.d...@linaro.org; linux-ker...@vger.kernel.org; > a.mota...@virtualopensystems.com; ag...@suse.de; Sethi Varun-B16395; > peter.mayd...@linaro.org; santosh.shu...@linaro.org; kvm@vger.kernel.org; > gre...@linuxfoundation.org > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via > sysfs only > > On Mon, 2013-10-28 at 13:00 -0500, Scott Wood wrote: > > On Mon, 2013-10-28 at 11:47 -0600, Alex Williamson wrote: > > > On Fri, 2013-10-11 at 01:27 -0500, Kim Phillips wrote: > > > > Force the vfio-pci driver to only be bound explicitly via sysfs to > > > > avoid conflicts with other drivers in the event of a hotplug. > > > > > > We can't break userspace, so we can't disable the current method of > > > binding devices to vfio-pci. We can add a new method and perhaps > > > deprecate the existing mechanism to be removed at some point in the > > > future. Thanks, > > > > I thought the existing method involved using sysfs bind, and this was > > just eliminating a race. How does the bind get triggered currently? > > OK, so it seems it's relying on the write to new_id calling driver_attach(). > Sigh. I guess we could make driver-sysfs-bind-only be settable via sysfs, and > have new-userspace set both that and PCI_ANY_ID (or the specific ID if > userspace > prefers) via new_id. The platform bus patches could continue as is, since > there's no existing mechanism to break. What about changing store_new_id() to bypass the exact ID check if the driver has a PCI_ANY_ID entry? -Bharat > > -Scott >
RE: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via sysfs only
> -Original Message- > From: Kim Phillips [mailto:kim.phill...@linaro.org] > Sent: Saturday, October 12, 2013 4:47 AM > To: Wood Scott-B07421 > Cc: Bhushan Bharat-R65777; Wood Scott-B07421; Yoder Stuart-B08248; > christoffer.d...@linaro.org; alex.william...@redhat.com; linux- > ker...@vger.kernel.org; a.mota...@virtualopensystems.com; ag...@suse.de; Sethi > Varun-B16395; peter.mayd...@linaro.org; santosh.shu...@linaro.org; > kvm@vger.kernel.org; gre...@linuxfoundation.org > Subject: Re: [PATCH 3/4] VFIO: pci: amend vfio-pci for explicit binding via > sysfs only > > On Fri, 11 Oct 2013 15:43:40 -0500 > Scott Wood wrote: > > > On Fri, 2013-10-11 at 01:27 -0500, Kim Phillips wrote: > > > Force the vfio-pci driver to only be bound explicitly via sysfs to avoid > > > conflics with other drivers in the event of a hotplug. > > > > > > Signed-off-by: Kim Phillips > > > --- > > > drivers/vfio/pci/vfio_pci.c | 3 +++ > > > 1 file changed, 3 insertions(+) > > > > > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > > > index 6ab71b9..bdd7833 100644 > > > --- a/drivers/vfio/pci/vfio_pci.c > > > +++ b/drivers/vfio/pci/vfio_pci.c > > > @@ -901,6 +901,9 @@ static struct pci_driver vfio_pci_driver = { > > > .probe = vfio_pci_probe, > > > .remove = vfio_pci_remove, > > > .err_handler= &vfio_err_handlers, > > > + .driver = { > > > + .sysfs_bind_only = true, > > > + }, > > > }; > > > > > > static void __exit vfio_pci_cleanup(void) > > > > You also need to add a PCI_ANY_ID match in order to be able to get rid > > of the new_id usage. > > thanks - see below. > > Can someone with a PCI bus test this? Bharat? Hello Kim, I can test that we can get rid of new_id and use "bind" to bind the device to vfio_pci. Other thing is generating hotplug, or reorder the driver registration by tweaking Makefile to test sysfs_bind_only way to bind is not yet tested. 
Thanks -Bharat > > Kim > > From a8d6c12f2ec763c2ac7fd384a3397c370cc1b932 Mon Sep 17 00:00:00 2001 > From: Kim Phillips > Date: Thu, 10 Oct 2013 22:16:34 -0500 > Subject: [PATCH 3/4 v2] VFIO: pci: amend vfio-pci for explicit binding via > sysfs > only > > Force the vfio-pci driver to only be bound explicitly via sysfs to avoid > conflics with other drivers in the event of a hotplug. Also replace > the only dynamic ids assignment with a table with a single PCI_ANY_ID > entry since writing the sysfs bind file without having to specify ids > via the new_id file first should no longer be necessary. > > Signed-off-by: Kim Phillips > --- > drivers/vfio/pci/vfio_pci.c | 12 +++- > 1 file changed, 11 insertions(+), 1 deletion(-) > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index 6ab71b9..c5b434f 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -895,12 +895,22 @@ static struct pci_error_handlers vfio_err_handlers = { > .error_detected = vfio_pci_aer_err_detected, > }; > > +static DEFINE_PCI_DEVICE_TABLE(vfio_pci_id_table) = { > +{ PCI_DEVICE(PCI_ANY_ID, PCI_ANY_ID) }, > +{ 0 } > +}; > + > +MODULE_DEVICE_TABLE(pci, vfio_pci_id_table); > + > static struct pci_driver vfio_pci_driver = { > .name = "vfio-pci", > - .id_table = NULL, /* only dynamic ids */ > + .id_table = vfio_pci_id_table, /* no dynamic ids */ > .probe = vfio_pci_probe, > .remove = vfio_pci_remove, > .err_handler= &vfio_err_handlers, > + .driver = { > + .sysfs_bind_only = true, /* bind only via sysfs */ > + }, > }; > > static void __exit vfio_pci_cleanup(void) > -- > 1.8.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: RFC: (re-)binding the VFIO platform driver to a platform device
> -Original Message- > From: Wood Scott-B07421 > Sent: Thursday, October 10, 2013 8:53 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; Yoder Stuart-B08248; Kim Phillips; Christoffer Dall; > Alex > Williamson; linux-ker...@vger.kernel.org; a.mota...@virtualopensystems.com; > ag...@suse.de; Sethi Varun-B16395; peter.mayd...@linaro.org; > santosh.shu...@linaro.org; kvm@vger.kernel.org; gre...@linuxfoundation.org > Subject: Re: RFC: (re-)binding the VFIO platform driver to a platform device > > On Thu, 2013-10-10 at 02:45 -0500, Bhushan Bharat-R65777 wrote: > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Thursday, October 10, 2013 1:33 AM > > > To: Yoder Stuart-B08248 > > > Cc: Wood Scott-B07421; Kim Phillips; Christoffer Dall; Alex > > > Williamson; linux- ker...@vger.kernel.org; > > > a.mota...@virtualopensystems.com; ag...@suse.de; Sethi Varun-B16395; > > > Bhushan Bharat-R65777; peter.mayd...@linaro.org; > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > gre...@linuxfoundation.org > > > Subject: Re: RFC: (re-)binding the VFIO platform driver to a > > > platform device > > > > > > On Wed, 2013-10-09 at 14:44 -0500, Yoder Stuart-B08248 wrote: > > > > Ah, think I understand now...yes that works as well, and would be > > > > less intrusive. So are you writing a patch? :) > > > > > > I've been meaning to since the previous round of discussion, but I've been > busy. > > > Would someone else be able to test it in the context of using it for VFIO? > > > > I wish I could have, but I do not have the vfio-platform stuff. > > VFIO PCI without new_id would also be a useful test. I will do that :) -Bharat > > -Scott >
RE: [PATCH 3/4] kvm: powerpc: define a linux pte lookup function
> -Original Message- > From: Paul Mackerras [mailto:pau...@samba.org] > Sent: Thursday, October 10, 2013 4:06 PM > To: Wood Scott-B07421 > Cc: Bhushan Bharat-R65777; Wood Scott-B07421; ag...@suse.de; Yoder Stuart- > B08248; kvm@vger.kernel.org; kvm-...@vger.kernel.org > Subject: Re: [PATCH 3/4] kvm: powerpc: define a linux pte lookup function > > On Wed, Oct 09, 2013 at 12:47:31PM -0500, Scott Wood wrote: > > On Wed, 2013-10-09 at 03:48 -0500, Bhushan Bharat-R65777 wrote: > > > > > > What lookup_linux_pte_and_update() does:- > > > - find_linux_pte_or_hugepte() > > > - does size and some other trivial checks > > > - Then atomically update the pte:- > > >=> while() > > >=> wait till _PAGE_BUSY is clear > > >=> atomically update the pte > > >=> if not updated then go back to while() above else break > > > > > > > > > While what lookup_linux_pte() does:- > > > - find_linux_pte_or_hugepte() > > > - does size and some other trivial checks > > > - wait till _PAGE_BUSY is clear > > > - return pte > > > > > > I am finding it difficult to call lookup_linux_pte() from > lookup_linux_pte_and_update(). > > > > You could factor out a common lookup_linux_ptep(). > > I don't really think it's enough code to be worth wringing out the last drop > of > duplication. However, if he removed the checks for _PAGE_BUSY and > _PAGE_PRESENT > as I suggested in another mail, and made it return the pte pointer rather than > the value, it would then essentially be a lookup_linux_ptep() as you suggest. Do we want to have lookup_linux_pte(), lookup_linux_ptep(), or both, with lookup_linux_pte() and lookup_linux_pte_and_update() both calling lookup_linux_ptep()? -Bharat > > Paul. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2] kvm: ppc: booke: check range page invalidation progress on page setup
> -Original Message- > From: Paolo Bonzini [mailto:paolo.bonz...@gmail.com] On Behalf Of Paolo > Bonzini > Sent: Monday, October 07, 2013 5:35 PM > To: Alexander Graf > Cc: Bhushan Bharat-R65777; Paul Mackerras; Wood Scott-B07421; kvm- > p...@vger.kernel.org; kvm@vger.kernel.org mailing list; Bhushan Bharat-R65777; > Gleb Natapov > Subject: Re: [PATCH 2/2] kvm: ppc: booke: check range page invalidation > progress > on page setup > > Il 04/10/2013 15:38, Alexander Graf ha scritto: > > > > On 07.08.2013, at 12:03, Bharat Bhushan wrote: > > > >> When the MM code is invalidating a range of pages, it calls the KVM > >> kvm_mmu_notifier_invalidate_range_start() notifier function, which calls > >> kvm_unmap_hva_range(), which arranges to flush all the TLBs for guest > >> pages. > >> However, the Linux PTEs for the range being flushed are still valid at > >> that point. We are not supposed to establish any new references to pages > >> in the range until the ...range_end() notifier gets called. > >> The PPC-specific KVM code doesn't get any explicit notification of that; > >> instead, we are supposed to use mmu_notifier_retry() to test whether we > >> are or have been inside a range flush notifier pair while we have been > >> referencing a page. > >> > >> This patch calls the mmu_notifier_retry() while mapping the guest > >> page to ensure we are not referencing a page when in range invalidation. > >> > >> This call is inside a region locked with kvm->mmu_lock, which is the > >> same lock that is called by the KVM MMU notifier functions, thus > >> ensuring that no new notification can proceed while we are in the > >> locked region. > >> > >> Signed-off-by: Bharat Bhushan > > > > Acked-by: Alexander Graf > > > > Gleb, Paolo, please queue for 3.12 directly. > > Here is the backport. The second hunk has a nontrivial conflict, so > someone please give their {Tested,Reviewed,Compiled}-by. 
{Compiled,Reviewed}-by: Bharat Bhushan Thanks -Bharat > > Paolo > > diff --git a/arch/powerpc/kvm/e500_mmu_host.c > b/arch/powerpc/kvm/e500_mmu_host.c > index 1c6a9d7..c65593a 100644 > --- a/arch/powerpc/kvm/e500_mmu_host.c > +++ b/arch/powerpc/kvm/e500_mmu_host.c > @@ -332,6 +332,13 @@ static inline int kvmppc_e500_shadow_map(struct > kvmppc_vcpu_e500 *vcpu_e500, > unsigned long hva; > int pfnmap = 0; > int tsize = BOOK3E_PAGESZ_4K; > + int ret = 0; > + unsigned long mmu_seq; > + struct kvm *kvm = vcpu_e500->vcpu.kvm; > + > + /* used to check for invalidations in progress */ > + mmu_seq = kvm->mmu_notifier_seq; > + smp_rmb(); > > /* >* Translate guest physical to true physical, acquiring > @@ -449,6 +456,12 @@ static inline int kvmppc_e500_shadow_map(struct > kvmppc_vcpu_e500 *vcpu_e500, > gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); > } > > + spin_lock(&kvm->mmu_lock); > + if (mmu_notifier_retry(kvm, mmu_seq)) { > + ret = -EAGAIN; > + goto out; > + } > + > kvmppc_e500_ref_setup(ref, gtlbe, pfn); > > kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, > @@ -457,10 +470,13 @@ static inline int kvmppc_e500_shadow_map(struct > kvmppc_vcpu_e500 *vcpu_e500, > /* Clear i-cache for new pages */ > kvmppc_mmu_flush_icache(pfn); > > +out: > + spin_unlock(&kvm->mmu_lock); > + > /* Drop refcount on page, so that mmu notifiers can clear it */ > kvm_release_pfn_clean(pfn); > > - return 0; > + return ret; > } > > /* XXX only map the one-one case, for now use TLB0 */ > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: RFC: (re-)binding the VFIO platform driver to a platform device
> -Original Message- > From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf > Of > Kim Phillips > Sent: Thursday, October 10, 2013 8:36 AM > To: Wood Scott-B07421 > Cc: Yoder Stuart-B08248; Wood Scott-B07421; christoffer.d...@linaro.org; > alex.william...@redhat.com; linux-ker...@vger.kernel.org; > a.mota...@virtualopensystems.com; ag...@suse.de; Sethi Varun-B16395; Bhushan > Bharat-R65777; peter.mayd...@linaro.org; santosh.shu...@linaro.org; > kvm@vger.kernel.org; gre...@linuxfoundation.org > Subject: Re: RFC: (re-)binding the VFIO platform driver to a platform device > > On Wed, 9 Oct 2013 15:03:19 -0500 > Scott Wood wrote: > > > On Wed, 2013-10-09 at 14:44 -0500, Yoder Stuart-B08248 wrote: > > > > From: Wood Scott-B07421 > > > > Sent: Wednesday, October 09, 2013 2:22 PM > > > > > > > > On Wed, 2013-10-09 at 14:02 -0500, Yoder Stuart-B08248 wrote: > > > > > Have been thinking about this issue some more. As Scott > > > > > mentioned, > > thanks for bringing this up again. > > > > > There's already a "bool suppress_bind_attrs" to prevent sysfs > > > > bind/unbind. I suggested a similar flag to mean the oppsosite -- > > > > bind > > > > *only* through sysfs. Greg KH was skeptical and wanted to see a > > > > patch before any further discussion. > > > > > > Ah, think I understand now...yes that works as well, and would be > > > less intrustive. So are you writing a patch? :) > > > > I've been meaning to since the previous round of discussion, but I've > > been busy. Would someone else be able to test it in the context of > > using it for VFIO? > > yes - see below. > > > Otherwise, that looks about right, for the driver side (though > > driver_attach could error out earlier rather than testing it inside > > the loop). > > I've made the changes you suggested and tested the resulting diff below on an > arndale board. 
I successfully performed the following sequence of commands > after first changing the i2c@12C8 node in the device tree to be > exclusively > compatible with "vfio": > > === > # ls -l /sys/bus/platform/drivers/vfio-platform/ > total 0 > --w--- 1 root root 4096 Sep 24 19:17 bind > --w--- 1 root root 4096 Sep 24 19:13 uevent > --w--- 1 root root 4096 Sep 24 19:18 unbind # ls -l > /sys/bus/platform/drivers/s3c-i2c total 0 > lrwxrwxrwx 1 root root0 Sep 24 19:11 12c6.i2c -> > ../../../../devices/12c6.i2c > lrwxrwxrwx 1 root root0 Sep 24 19:11 12c9.i2c -> > ../../../../devices/12c9.i2c > lrwxrwxrwx 1 root root0 Sep 24 19:20 12ce.i2c -> > ../../../../devices/12ce.i2c > --w--- 1 root root 4096 Sep 24 19:18 bind > --w--- 1 root root 4096 Sep 24 19:11 uevent > --w--- 1 root root 4096 Sep 24 19:17 unbind # ls -l > /sys/devices/12c8.i2c/driver # this is the one with the 'vfio' compatible > ls: cannot access /sys/devices/12c8.i2c/driver: No such file or directory > # > ls -l /sys/devices/12ce.i2c/driver lrwxrwxrwx 1 root root 0 Sep 24 19:18 > /sys/devices/12ce.i2c/driver -> ../../bus/platform/drivers/s3c-i2c > # echo 12ce.i2c > /sys/bus/platform/drivers/s3c-i2c/unbind > # ls -l /sys/devices/12ce.i2c/driver > ls: cannot access /sys/devices/12ce.i2c/driver: No such file or directory > # > echo 12ce.i2c > /sys/bus/platform/drivers/vfio-platform/bind > # ls -l /sys/devices/12ce.i2c/driver lrwxrwxrwx 1 root root 0 Sep 24 19:21 > /sys/devices/12ce.i2c/driver -> ../../bus/platform/drivers/vfio-platform > # echo 12ce.i2c > /sys/bus/platform/drivers/vfio-platform/unbind > # ls -l /sys/devices/12ce.i2c/driver # echo 12ce.i2c > > /sys/bus/platform/drivers/s3c-i2c/bind > [ 722.137524] s3c-i2c 12ce.i2c: slave address 0x38 [ 722.141037] s3c-i2c > 12ce.i2c: bus frequency set to 65 KHz [ 722.150605] s3c-i2c 12ce.i2c: > i2c-8: S3C I2C adapter # ls -l /sys/devices/12ce.i2c/driver lrwxrwxrwx 1 > root root 0 Sep 24 19:21 /sys/devices/12ce.i2c/driver -> > ../../bus/platform/drivers/s3c-i2c > # > > 
> so it's correctly not allowing 'vfio' driver to bind to a device tree > compatible > it's declared, and it then can bind the i2c @ 12ce device to the vfio- > platform driver, and unbind and bind it back to the i2c driver. > > For clarity's sake, before this diff, the command: > > echo 12ce.i2c > /sys/bus/platform/drivers/vfio-platform/bind > > would error with: > > echo: write
RE: RFC: (re-)binding the VFIO platform driver to a platform device
> -Original Message- > From: Wood Scott-B07421 > Sent: Thursday, October 10, 2013 1:33 AM > To: Yoder Stuart-B08248 > Cc: Wood Scott-B07421; Kim Phillips; Christoffer Dall; Alex Williamson; linux- > ker...@vger.kernel.org; a.mota...@virtualopensystems.com; ag...@suse.de; Sethi > Varun-B16395; Bhushan Bharat-R65777; peter.mayd...@linaro.org; > santosh.shu...@linaro.org; kvm@vger.kernel.org; gre...@linuxfoundation.org > Subject: Re: RFC: (re-)binding the VFIO platform driver to a platform device > > On Wed, 2013-10-09 at 14:44 -0500, Yoder Stuart-B08248 wrote: > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Wednesday, October 09, 2013 2:22 PM > > > To: Yoder Stuart-B08248 > > > Cc: Wood Scott-B07421; Kim Phillips; Christoffer Dall; Alex > > > Williamson; linux-ker...@vger.kernel.org; > > > a.mota...@virtualopensystems.com; ag...@suse.de; Sethi Varun-B16395; > > > Bhushan Bharat-R65777; peter.mayd...@linaro.org; > > > santosh.shu...@linaro.org; kvm@vger.kernel.org; > > > gre...@linuxfoundation.org > > > Subject: Re: RFC: (re-)binding the VFIO platform driver to a > > > platform device > > > > > > On Wed, 2013-10-09 at 14:02 -0500, Yoder Stuart-B08248 wrote: > > > > Have been thinking about this issue some more. As Scott > > > > mentioned, 'wildcard' matching for a driver can be fairly done in > > > > the platform bus driver. 
We could add a new flag to the platform driver > struct: > > > > > > > > diff --git a/drivers/base/platform.c b/drivers/base/platform.c > > > > index 4f8bef3..4d6cf14 100644 > > > > --- a/drivers/base/platform.c > > > > +++ b/drivers/base/platform.c > > > > @@ -727,6 +727,10 @@ static int platform_match(struct device *dev, > > > struct device_driver *drv) > > > > struct platform_device *pdev = to_platform_device(dev); > > > > struct platform_driver *pdrv = to_platform_driver(drv); > > > > > > > > + /* the driver matches any device */ > > > > + if (pdrv->match_any) > > > > + return 1; > > > > + > > > > /* Attempt an OF style match first */ > > > > if (of_driver_match_device(dev, drv)) > > > > return 1; > > > > > > > > However, the more problematic issue is that a bus driver has no > > > > way to differentiate from an explicit bind request via sysfs and a > > > > bind that happened through bus probing. > > > > > > Again, I think the wildcard match should be orthogonal to "don't > > > bind by default" as far as the mechanism goes. > > > > > > There's already a "bool suppress_bind_attrs" to prevent sysfs > > > bind/unbind. I suggested a similar flag to mean the opposite -- > > > bind > > > *only* through sysfs. Greg KH was skeptical and wanted to see a > > > patch before any further discussion. > > > > Ah, think I understand now...yes that works as well, and would be > > less intrusive. So are you writing a patch? :) > > I've been meaning to since the previous round of discussion, but I've been > busy. > Would someone else be able to test it in the context of using it for VFIO? I wish I could have but I do not have vfio-platform stuff. > > > It would be something like this, right? 
> > > > diff --git a/drivers/base/dd.c b/drivers/base/dd.c index > > 35fa368..c9a61ea 100644 > > --- a/drivers/base/dd.c > > +++ b/drivers/base/dd.c > > @@ -389,7 +389,7 @@ static int __device_attach(struct device_driver > > *drv, void *data) { > > struct device *dev = data; > > > > - if (!driver_match_device(drv, dev)) > > + if (!drv->explicit_bind_only && !driver_match_device(drv, > > + dev)) > > return 0; > > if (drv->explicit_bind_only || !driver_match_device(drv, dev)) > return 0; Scott, I am trying to understand what you are proposing here (example "DEVICE" can be handled by "DRIVER1" and "VFIO-PLATFORM-DRIVER"): - By default drv->explicit_bind_only will be clear in all drivers. - By default device->explicit_bind_only will also be clear for all devices. - On boot, matching devices will be bound to the respective driver (DEVICE >==> DRIVER1). This will never be bound to VFIO-PLATFORM-DRIVER. So far the same as before. - Via Sysfs interface set drv->explicit_bind_only
RE: [PATCH 3/4] kvm: powerpc: define a linux pte lookup function
> -Original Message- > From: Wood Scott-B07421 > Sent: Wednesday, October 09, 2013 3:07 AM > To: Bhushan Bharat-R65777 > Cc: ag...@suse.de; Yoder Stuart-B08248; kvm@vger.kernel.org; kvm- > p...@vger.kernel.org; pau...@samba.org; Bhushan Bharat-R65777 > Subject: Re: [PATCH 3/4] kvm: powerpc: define a linux pte lookup function > > On Tue, 2013-10-08 at 11:33 +0530, Bharat Bhushan wrote: > > We need to search linux "pte" to get "pte" attributes for setting TLB > > in KVM. > > This patch defines a linux_pte_lookup() function for same. > > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/include/asm/pgtable.h | 35 > > +++ > > 1 files changed, 35 insertions(+), 0 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/pgtable.h > > b/arch/powerpc/include/asm/pgtable.h > > index 7d6eacf..fd26c04 100644 > > --- a/arch/powerpc/include/asm/pgtable.h > > +++ b/arch/powerpc/include/asm/pgtable.h > > @@ -223,6 +223,41 @@ extern int gup_hugepte(pte_t *ptep, unsigned long > > sz, unsigned long addr, #endif pte_t > > *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, > > unsigned *shift); > > + > > +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, > > +unsigned long *pte_sizep) > > +{ > > + pte_t *ptep; > > + pte_t pte; > > + unsigned long ps = *pte_sizep; > > + unsigned int shift; > > + > > + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); > > + if (!ptep) > > + return __pte(0); > > + if (shift) > > + *pte_sizep = 1ul << shift; > > + else > > + *pte_sizep = PAGE_SIZE; > > + > > + if (ps > *pte_sizep) > > + return __pte(0); > > + > > + /* wait until _PAGE_BUSY is clear */ > > + while (1) { > > + pte = pte_val(*ptep); > > + if (unlikely(pte & _PAGE_BUSY)) { > > + cpu_relax(); > > + continue; > > + } > > + } > > + > > + /* If pte is not present return None */ > > + if (unlikely(!(pte & _PAGE_PRESENT))) > > + return __pte(0); > > + > > + return pte; > > +} > > Can lookup_linux_pte_and_update() call lookup_linux_pte()? 
What lookup_linux_pte_and_update() does:- - find_linux_pte_or_hugepte() - does size and some other trivial checks - Then atomically update the pte:- => while() => wait till _PAGE_BUSY is clear => atomically update the pte => if not updated then go back to while() above else break While what lookup_linux_pte() does:- - find_linux_pte_or_hugepte() - does size and some other trivial checks - wait till _PAGE_BUSY is clear - return pte I am finding it difficult to call lookup_linux_pte() from lookup_linux_pte_and_update(). Thanks -Bharat > > -Scott >
RE: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to epapr_hypercall()
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, October 07, 2013 9:43 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; kvm-...@vger.kernel.org; kvm@vger.kernel.org > Subject: Re: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to > epapr_hypercall() > > > On 07.10.2013, at 18:04, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: kvm-ppc-ow...@vger.kernel.org > >> [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf > >> Sent: Monday, October 07, 2013 9:16 PM > >> To: Bhushan Bharat-R65777 > >> Cc: Wood Scott-B07421; kvm-...@vger.kernel.org; kvm@vger.kernel.org > >> Subject: Re: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to > >> epapr_hypercall() > >> > >> > >> On 07.10.2013, at 17:43, Bhushan Bharat-R65777 > >> wrote: > >> > >>>>>>>>>>> at least when I can avoid it. With the current code the > >>>>>>>>>>> compiler would be > >>>>>>>> smart enough to just optimize out the complete branch. > >>>>>>>>>> > >>>>>>>>>> Sure. My point is, where would you be calling that where the > >>>>>>>>>> entire file isn't predicated on (or selecting) > >>>>>>>>>> CONFIG_KVM_GUEST or > >>>> similar? > >>>>>>>>>> > >>>>>>>>>> We don't do these stubs for every single function in the > >>>>>>>>>> kernel > >>>>>>>>>> -- only ones where the above is a reasonable use case. > >>>>>>>>> > >>>>>>>>> Yeah, I'm fine on dropping it, but we need to make that a > >>>>>>>>> conscious decision > >>>>>>>> and verify that no caller relies on it. > >>>>>>>> > >>>>>>>> kvm_para_has_feature() is called from > >>>>>>>> arch/powerpc/kernel/kvm.c, arch/x86/kernel/kvm.c, and > >>>>>>>> arch/x86/kernel/kvmclock.c, all of which are enabled by > CONFIG_KVM_GUEST. > >>>>>>>> > >>>>>>>> I did find one example of kvm_para_available() being used in an > >>>>>>>> unexpected place > >>>>>>>> -- sound/pci/intel8x0.c. 
It defines its own > >>>>>>>> non-CONFIG_KVM_GUEST stub, even though x86 defines > >>>>>>>> kvm_para_available() using inline CPUID stuff which should work > >>>>>>>> without > CONFIG_KVM_GUEST. > >>>>>>>> I'm not sure why it even needs to do that, though -- shouldn't > >>>>>>>> the subsequent PCI subsystem vendor/device check should be > >>>>>>>> sufficient? > >>>>>>>> No hypercalls are involved. > >>>>>>>> > >>>>>>>> That said, the possibility that some random driver might want > >>>>>>>> to make use of paravirt features is a decent argument for keeping the > stub. > >>>>>>>> > >>>>>>> > >>>>>>> I am not sure where we are agreeing on? > >>>>>>> Do we want to remove the stub in > >>>>>>> arch/powerpc/include/asm/kvm_para.h > >>>>>>> ? as > >>>>>> there is no caller without KVM_GUEST and in future caller ensure > >>>>>> this to be called only from code selected by KVM_GUEST? > >>>>>>> > >>>>>>> Or let this stub stay to avoid any random driver calling this ? > >>>>>> > >>>>>> I think the most reasonable way forward is to add a stub for > >>>>>> non-CONFIG_EPAPR to the epapr code, then replace the kvm bits > >>>>>> with generic epapr bits (which your patches already do). > >>>>> > >>>>> Please describe which stub you are talking about. > >>>> > >>>> kvm_hypercall is always available, regardless of the config option, > >>>> which makes all its subfunctions always available as well. > >>> > >>> This patch renames kvm_hypercall() to epapr_hypercall() and which is > >>> always > >> available. And the kvm_hypercall() friends now directly calls > epapr_hypercall(). > >>> IIUC, So what you are trying to say is let the kvm_hypercall() > >>> friends keep on > >> calling kvm_hypercall() itself and a sub something like this: > >> > >> No, what I'm saying is that we either > >> > >> a) drop the whole #ifndef code path consciously. This would have to > >> be a separate patch with a separate discussion. 
It's orthogonal to > >> combining > >> kvm_hypercall() and epapr_hypercall() > >> > >> b) add the #ifndef path to epapr_hypercall() > > > > Do you mean like this in arch/powerpc/include/asm/epapr_hcalls.h > > > > #ifdef CONFIG_KVM_GUEST > > CONFIG_EPAPR_PARAVIRT Yes, I was getting confused why only KVM_GUEST as this not specific to KVM-GUEST. Thank you > > Apart from that, yes, I think that's what we want. > > > Alex > > > static inline unsigned long epapr_hypercall(unsigned long *in, > > unsigned long *out, > > unsigned long nr) { // code for this > > function } #else static inline unsigned long epapr_hypercall(unsigned > > long *in, > > unsigned long *out, > > unsigned long nr) { > > return EV_UNIMPLEMENTED; > > } > > #endif > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to epapr_hypercall()
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Alexander Graf > Sent: Monday, October 07, 2013 9:16 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; kvm-...@vger.kernel.org; kvm@vger.kernel.org > Subject: Re: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to > epapr_hypercall() > > > On 07.10.2013, at 17:43, Bhushan Bharat-R65777 wrote: > > >>>>>>>>> at least when I can avoid it. With the current code the > >>>>>>>>> compiler would be > >>>>>> smart enough to just optimize out the complete branch. > >>>>>>>> > >>>>>>>> Sure. My point is, where would you be calling that where the > >>>>>>>> entire file isn't predicated on (or selecting) CONFIG_KVM_GUEST > >>>>>>>> or > >> similar? > >>>>>>>> > >>>>>>>> We don't do these stubs for every single function in the kernel > >>>>>>>> -- only ones where the above is a reasonable use case. > >>>>>>> > >>>>>>> Yeah, I'm fine on dropping it, but we need to make that a > >>>>>>> conscious decision > >>>>>> and verify that no caller relies on it. > >>>>>> > >>>>>> kvm_para_has_feature() is called from arch/powerpc/kernel/kvm.c, > >>>>>> arch/x86/kernel/kvm.c, and arch/x86/kernel/kvmclock.c, all of > >>>>>> which are enabled by CONFIG_KVM_GUEST. > >>>>>> > >>>>>> I did find one example of kvm_para_available() being used in an > >>>>>> unexpected place > >>>>>> -- sound/pci/intel8x0.c. It defines its own non-CONFIG_KVM_GUEST > >>>>>> stub, even though x86 defines kvm_para_available() using inline > >>>>>> CPUID stuff which should work without CONFIG_KVM_GUEST. > >>>>>> I'm not sure why it even needs to do that, though -- shouldn't > >>>>>> the subsequent PCI subsystem vendor/device check should be sufficient? > >>>>>> No hypercalls are involved. > >>>>>> > >>>>>> That said, the possibility that some random driver might want to > >>>>>> make use of paravirt features is a decent argument for keeping the > >>>>>> stub. 
> >>>>>> > >>>>> > >>>>> I am not sure where we are agreeing on? > >>>>> Do we want to remove the stub in > >>>>> arch/powerpc/include/asm/kvm_para.h > >>>>> ? as > >>>> there is no caller without KVM_GUEST and in future caller ensure > >>>> this to be called only from code selected by KVM_GUEST? > >>>>> > >>>>> Or let this stub stay to avoid any random driver calling this ? > >>>> > >>>> I think the most reasonable way forward is to add a stub for > >>>> non-CONFIG_EPAPR to the epapr code, then replace the kvm bits with > >>>> generic epapr bits (which your patches already do). > >>> > >>> Please describe which stub you are talking about. > >> > >> kvm_hypercall is always available, regardless of the config option, > >> which makes all its subfunctions always available as well. > > > > This patch renames kvm_hypercall() to epapr_hypercall() and which is always > available. And the kvm_hypercall() friends now directly calls > epapr_hypercall(). > > IIUC, So what you are trying to say is let the kvm_hypercall() friends keep > > on > calling kvm_hypercall() itself and a sub something like this: > > No, what I'm saying is that we either > > a) drop the whole #ifndef code path consciously. This would have to be a > separate patch with a separate discussion. It's orthogonal to combining > kvm_hypercall() and epapr_hypercall() > > b) add the #ifndef path to epapr_hypercall() Do you mean like this in arch/powerpc/include/asm/epapr_hcalls.h #ifdef CONFIG_KVM_GUEST static inline unsigned long epapr_hypercall(unsigned long *in, unsigned long *out, unsigned long nr) { // code for this function } #else static inline unsigned long epapr_hypercall(unsigned long *in, unsigned long *out, unsigned long nr) { return EV_UNIMPLEMENTED; } #endif > > I prefer b, Scott prefers b. 
> > > Alex > > -- > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body > of a message to majord...@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2] arm64: KVM: Yield CPU when vcpu executes a WFE
> -Original Message- > From: Marc Zyngier [mailto:marc.zyng...@arm.com] > Sent: Monday, October 07, 2013 9:11 PM > To: linux-arm-ker...@lists.infradead.org; kvm...@lists.cs.columbia.edu; > kvm@vger.kernel.org > Subject: [PATCH 2/2] arm64: KVM: Yield CPU when vcpu executes a WFE > > On an (even slightly) oversubscribed system, spinlocks are quickly becoming a > bottleneck, as some vcpus are spinning, waiting for a lock to be released, > while > the vcpu holding the lock may not be running at all. > > The solution is to trap blocking WFEs and tell KVM that we're now spinning. > This > ensures that other vpus will get a scheduling boost, allowing the lock to be > released more quickly. > > Signed-off-by: Marc Zyngier > --- > arch/arm64/include/asm/kvm_arm.h | 8 ++-- > arch/arm64/kvm/handle_exit.c | 18 +- > 2 files changed, 19 insertions(+), 7 deletions(-) > > diff --git a/arch/arm64/include/asm/kvm_arm.h > b/arch/arm64/include/asm/kvm_arm.h > index a5f28e2..c98ef47 100644 > --- a/arch/arm64/include/asm/kvm_arm.h > +++ b/arch/arm64/include/asm/kvm_arm.h > @@ -63,6 +63,7 @@ > * TAC: Trap ACTLR > * TSC: Trap SMC > * TSW: Trap cache operations by set/way > + * TWE: Trap WFE > * TWI: Trap WFI > * TIDCP:Trap L2CTLR/L2ECTLR > * BSU_IS: Upgrade barriers to the inner shareable domain > @@ -72,8 +73,9 @@ > * FMO: Override CPSR.F and enable signaling with VF > * SWIO: Turn set/way invalidates into set/way clean+invalidate > */ > -#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | > \ > - HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ > +#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ > + HCR_BSU_IS | HCR_FB | HCR_TAC | \ > + HCR_AMO | HCR_IMO | HCR_FMO | \ >HCR_SWIO | HCR_TIDCP | HCR_RW) > #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) > > @@ -242,4 +244,6 @@ > > #define ESR_EL2_EC_xABT_xFSR_EXTABT 0x10 > > +#define ESR_EL2_EC_WFI_ISS_WFE (1 << 0) In another patch this is named as WHI_IS_WFE whereas here it is 
WFI_ISS_WFE, looks like a typo. Anyway, what I am interested to understand is: what does this macro mean? Thanks -Bharat > + > #endif /* __ARM64_KVM_ARM_H__ */ > diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index > 9beaca03..8da5606 100644 > --- a/arch/arm64/kvm/handle_exit.c > +++ b/arch/arm64/kvm/handle_exit.c > @@ -47,21 +47,29 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct > kvm_run > *run) } > > /** > - * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a > guest > + * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event > + * instruction executed by a guest > + * > * @vcpu:the vcpu pointer > * > - * Simply call kvm_vcpu_block(), which will halt execution of > + * WFE: Yield the CPU and come back to this vcpu when the scheduler > + * decides to. > + * WFI: Simply call kvm_vcpu_block(), which will halt execution of > * world-switches and schedule other host processes until there is an > * incoming IRQ or FIQ to the VM. > */ > -static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run) > +static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) > { > - kvm_vcpu_block(vcpu); > + if (kvm_vcpu_get_hsr(vcpu) & ESR_EL2_EC_WFI_ISS_WFE) > + kvm_vcpu_on_spin(vcpu); > + else > + kvm_vcpu_block(vcpu); > + > return 1; > } > > static exit_handle_fn arm_exit_handlers[] = { > - [ESR_EL2_EC_WFI]= kvm_handle_wfi, > + [ESR_EL2_EC_WFI]= kvm_handle_wfx, > [ESR_EL2_EC_CP15_32]= kvm_handle_cp15_32, > [ESR_EL2_EC_CP15_64]= kvm_handle_cp15_64, > [ESR_EL2_EC_CP14_MR]= kvm_handle_cp14_access, > -- > 1.8.2.3 > > > > ___ > kvmarm mailing list > kvm...@lists.cs.columbia.edu > https://lists.cs.columbia.edu/cucslists/listinfo/kvmarm -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to epapr_hypercall()
> >>> at least when I can avoid it. With the current code the compiler > >>> would be > smart enough to just optimize out the complete branch. > >> > >> Sure. My point is, where would you be calling that where the > >> entire file isn't predicated on (or selecting) CONFIG_KVM_GUEST or > similar? > >> > >> We don't do these stubs for every single function in the kernel > >> -- only ones where the above is a reasonable use case. > > > > Yeah, I'm fine on dropping it, but we need to make that a > > conscious decision > and verify that no caller relies on it. > > kvm_para_has_feature() is called from arch/powerpc/kernel/kvm.c, > arch/x86/kernel/kvm.c, and arch/x86/kernel/kvmclock.c, all of which > are enabled by CONFIG_KVM_GUEST. > > I did find one example of kvm_para_available() being used in an > unexpected place > -- sound/pci/intel8x0.c. It defines its own non-CONFIG_KVM_GUEST > stub, even though x86 defines kvm_para_available() using inline > CPUID stuff which should work without CONFIG_KVM_GUEST. > I'm not sure why it even needs to do that, though -- shouldn't the > subsequent PCI subsystem vendor/device check should be sufficient? > No hypercalls are involved. > > That said, the possibility that some random driver might want to > make use of paravirt features is a decent argument for keeping the stub. > > >>> > >>> I am not sure where we are agreeing on? > >>> Do we want to remove the stub in arch/powerpc/include/asm/kvm_para.h > >>> ? as > >> there is no caller without KVM_GUEST and in future caller ensure this > >> to be called only from code selected by KVM_GUEST? > >>> > >>> Or let this stub stay to avoid any random driver calling this ? > >> > >> I think the most reasonable way forward is to add a stub for > >> non-CONFIG_EPAPR to the epapr code, then replace the kvm bits with > >> generic epapr bits (which your patches already do). > > > > Please describe which stub you are talking about. 
> > kvm_hypercall is always available, regardless of the config option, which > makes > all its subfunctions always available as well. This patch renames kvm_hypercall() to epapr_hypercall(), which is always available. And the kvm_hypercall() friends now directly call epapr_hypercall(). IIUC, So what you are trying to say is let the kvm_hypercall() friends keep on calling kvm_hypercall() itself and a stub, something like this: #ifdef CONFIG_KVM_GUEST static unsigned long kvm_hypercall(unsigned long *in, unsigned long *out, unsigned long nr) { return epapr_hypercall(in, out, nr); } #else static unsigned long kvm_hypercall(unsigned long *in, unsigned long *out, unsigned long nr) { return EV_UNIMPLEMENTED; } - I am still not really convinced about why we want to keep this stub where we know this is not called outside KVM_GUEST and calling this without KVM_GUEST is debatable. Thanks -Bharat Thanks -Bharat > > > Alex > > --- > > #ifdef CONFIG_KVM_GUEST > > #include > > static inline int kvm_para_available(void) { > struct device_node *hyper_node; > > hyper_node = of_find_node_by_path("/hypervisor"); > if (!hyper_node) > return 0; > > if (!of_device_is_compatible(hyper_node, "linux,kvm")) > return 0; > > return 1; > } > > extern unsigned long kvm_hypercall(unsigned long *in, >unsigned long *out, >unsigned long nr); > > #else > > static inline int kvm_para_available(void) { > return 0; > } > > static unsigned long kvm_hypercall(unsigned long *in, >unsigned long *out, >unsigned long nr) { > return EV_UNIMPLEMENTED; > } > > #endif > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to epapr_hypercall()
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Friday, October 04, 2013 4:46 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; kvm-...@vger.kernel.org; kvm@vger.kernel.org > Subject: Re: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to > epapr_hypercall() > > > On 04.10.2013, at 06:26, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Wood Scott-B07421 > >> Sent: Thursday, October 03, 2013 12:04 AM > >> To: Alexander Graf > >> Cc: Bhushan Bharat-R65777; kvm-...@vger.kernel.org; > >> kvm@vger.kernel.org; Bhushan > >> Bharat-R65777 > >> Subject: Re: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to > >> epapr_hypercall() > >> > >> On Wed, 2013-10-02 at 19:54 +0200, Alexander Graf wrote: > >>> On 02.10.2013, at 19:49, Scott Wood wrote: > >>> > >>>> On Wed, 2013-10-02 at 19:46 +0200, Alexander Graf wrote: > >>>>> On 02.10.2013, at 19:42, Scott Wood wrote: > >>>>> > >>>>>> On Wed, 2013-10-02 at 19:17 +0200, Alexander Graf wrote: > >>>>>>> On 02.10.2013, at 19:04, Scott Wood wrote: > >>>>>>> > >>>>>>>> On Wed, 2013-10-02 at 18:53 +0200, Alexander Graf wrote: > >>>>>>>>> On 02.10.2013, at 18:40, Scott Wood wrote: > >>>>>>>>> > >>>>>>>>>> On Wed, 2013-10-02 at 16:19 +0200, Alexander Graf wrote: > >>>>>>>>>>> Won't this break when CONFIG_EPAPR_PARAVIRT=n? We wouldn't > >>>>>>>>>>> have > >> epapr_hcalls.S compiled into the code base then and the bl above > >> would reference an unknown function. > >>>>>>>>>> > >>>>>>>>>> KVM_GUEST selects EPAPR_PARAVIRT. > >>>>>>>>> > >>>>>>>>> But you can not select KVM_GUEST and still call these inline > >>>>>>>>> functions, > >> no? > >>>>>>>> > >>>>>>>> No. > >>>>>>>> > >>>>>>>>> Like kvm_arch_para_features(). > >>>>>>>> > >>>>>>>> Where does that get called without KVM_GUEST? > >>>>>>>> > >>>>>>>> How would that work currently, with the call to kvm_hypercall() > >>>>>>>> in arch/powerpc/kernel/kvm.c (which calls epapr_hypercall, BTW)? 
> >>>>>>> > >>>>>>> It wouldn't ever get called because kvm_hypercall() ends up > >>>>>>> always > >> returning EV_UNIMPLEMENTED when #ifndef CONFIG_KVM_GUEST. > >>>>>> > >>>>>> OK, so the objection is to removing that stub? Where would we > >>>>>> actually want to call this without knowing that KVM_GUEST or > >>>>>> EPAPR_PARAVIRT are enabled? > >>>>> > >>>>> In probing code. I usually prefer > >>>>> > >>>>> if (kvm_feature_available(X)) { > >>>>> ... > >>>>> } > >>>>> > >>>>> over > >>>>> > >>>>> #ifdef CONFIG_KVM_GUEST > >>>>> if (kvm_feature_available(X)) { > >>>>> ... > >>>>> } > >>>>> #endif > >>>>> > >>>>> at least when I can avoid it. With the current code the compiler > >>>>> would be > >> smart enough to just optimize out the complete branch. > >>>> > >>>> Sure. My point is, where would you be calling that where the > >>>> entire file isn't predicated on (or selecting) CONFIG_KVM_GUEST or > >>>> similar? > >>>> > >>>> We don't do these stubs for every single function in the kernel -- > >>>> only ones where the above is a reasonable use case. > >>> > >>> Yeah, I'm fine on dropping it, but we need to make that a conscious > >>> decision > >> and verify that no caller relies on it. > >> > >> kvm_para_has_feature() is called from arch/powerpc/kernel/kvm.c, > >> arch/x86/kernel/kvm.c, and arch/x86/kernel/kvmclock.c, all of which > >> are enabled by CONFIG_KVM_GUEST. > >> > >> I did find one example of kvm_para_available() being used in an > >> unexpected place > >> -- sound/pci/intel8x0.c. It defines its own non-CONFIG_KVM_GUEST > >> stub, even though x86 defines kvm_para_available() using inline CPUID > >> stuff which should work without CONFIG_KVM_GUEST. > >> I'm not sure why it even needs to do that, though -- shouldn't the > >> subsequent PCI subsystem vendor/device check should be sufficient? > >> No hypercalls are involved. 
> >> > >> That said, the possibility that some random driver might want to make > >> use of paravirt features is a decent argument for keeping the stub. > >> > > > > I am not sure where we are agreeing on? > > Do we want to remove the stub in arch/powerpc/include/asm/kvm_para.h ? as > there is no caller without KVM_GUEST and in future caller ensure this to be > called only from code selected by KVM_GUEST? > > > > Or let this stub stay to avoid any random driver calling this ? > > I think the most reasonable way forward is to add a stub for non-CONFIG_EPAPR > to > the epapr code, then replace the kvm bits with generic epapr bits (which your > patches already do). Please describe which stub you are talking about. Thanks -Bharat > > With that we should be 100% equivalent to today's code, just with a lot less > lines of code :). > > > Alex > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 1/3 v6] kvm: powerpc: keep only pte search logic in lookup_linux_pte
Hi Paul, > -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Paul Mackerras > Sent: Monday, October 07, 2013 4:39 AM > To: Bhushan Bharat-R65777 > Cc: ag...@suse.de; kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott- > B07421; b...@kernel.crashing.org; Bhushan Bharat-R65777 > Subject: Re: [PATCH 1/3 v6] kvm: powerpc: keep only pte search logic in > lookup_linux_pte > > On Fri, Oct 04, 2013 at 08:25:31PM +0530, Bharat Bhushan wrote: > > lookup_linux_pte() was searching for a pte and also sets access flags > > is writable. This function now searches only pte while access flag > > setting is done explicitly. > > So in order to reduce some code duplication, you have added code duplication > in > the existing callers of this function. I'm not convinced it's an overall win. lookup_linux_pte(): as per name it is supposed to only lookup for a pte, but it is doing more than that (Also updating the pte). So I made this function to only do lookup (which also check size). I am not an MM expert but I think we can make this function better like you suggested checking pte_present() only if _PAGE_BUSY not set. > What's left in this function is pretty trivial, just a call to > find_linux_pte_or_hugepte() and some pagesize computations. I would prefer > you > found a way to do what you want without adding code duplication at the > existing > call sites. What about doing this way: 1) A function which will do the lookup for Linux pte. May be call that as lookup_linux_pte() 2) lookup + page update (what the existing function lookup_linux_pte() is doing). Will rename this function to lookup_linux_pte_and_update(), which will call above defined lookup_linux_pte() Thanks -Bharat > Maybe you could have a new find_linux_pte_and_check_pagesize() and > call that from the existing lookup_linux_pte(). 
> > The other thing you've done, without commenting on why you have done it, is to > add a pte_present check without having looked at _PAGE_BUSY. > kvmppc_read_update_linux_pte() only checks _PAGE_PRESENT after checking that > _PAGE_BUSY is clear, so this is a semantic change, which I think is wrong for > server processors. > > So, on the whole, NACK from me for this patch. > > Paul. > -- > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body > of a message to majord...@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 1/3 v6] kvm: powerpc: keep only pte search logic in lookup_linux_pte
Adding Paul > -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Friday, October 04, 2013 8:49 PM > To: Bhushan Bharat-R65777 > Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; > b...@kernel.crashing.org; Bhushan Bharat-R65777 > Subject: Re: [PATCH 1/3 v6] kvm: powerpc: keep only pte search logic in > lookup_linux_pte > > > On 04.10.2013, at 16:55, Bharat Bhushan wrote: > > > lookup_linux_pte() was searching for a pte and also sets access flags > > is writable. This function now searches only pte while access flag > > setting is done explicitly. > > > > This pte lookup is not kvm specific, so moved to common code > > (asm/pgtable.h) My Followup patch will use this on booke. > > > > Signed-off-by: Bharat Bhushan > > Paul, please ack. > > > Alex > > > --- > > v5->v6 > > - return NULL rather than _pte(0) as this was > > giving compilation error with STRICT_MM_TYPECHECKS > > - Also not only check for NULL pointer in caller rather than > > calling pte_present() twice > > > > arch/powerpc/include/asm/pgtable.h | 24 +++ > > arch/powerpc/kvm/book3s_hv_rm_mmu.c | 36 > > +++--- > > 2 files changed, 36 insertions(+), 24 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/pgtable.h > > b/arch/powerpc/include/asm/pgtable.h > > index 7d6eacf..5e41a31 100644 > > --- a/arch/powerpc/include/asm/pgtable.h > > +++ b/arch/powerpc/include/asm/pgtable.h > > @@ -223,6 +223,30 @@ extern int gup_hugepte(pte_t *ptep, unsigned long > > sz, unsigned long addr, #endif pte_t *find_linux_pte_or_hugepte(pgd_t > > *pgdir, unsigned long ea, > > unsigned *shift); > > + > > +static inline pte_t *lookup_linux_pte(pgd_t *pgdir, unsigned long hva, > > +unsigned long *pte_sizep) > > +{ > > + pte_t *ptep; > > + unsigned long ps = *pte_sizep; > > + unsigned int shift; > > + > > + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); > > + if (!ptep) > > + return NULL; > > + if (shift) > > + *pte_sizep = 1ul << shift; > > + else > > + *pte_sizep = PAGE_SIZE; 
> > + > > + if (ps > *pte_sizep) > > + return NULL; > > + > > + if (!pte_present(*ptep)) > > + return NULL; > > + > > + return ptep; > > +} > > #endif /* __ASSEMBLY__ */ > > > > #endif /* __KERNEL__ */ > > diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c > > b/arch/powerpc/kvm/book3s_hv_rm_mmu.c > > index 45e30d6..8ab54e8 100644 > > --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c > > +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c > > @@ -134,25 +134,6 @@ static void remove_revmap_chain(struct kvm *kvm, long > pte_index, > > unlock_rmap(rmap); > > } > > > > -static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, > > - int writing, unsigned long *pte_sizep) > > -{ > > - pte_t *ptep; > > - unsigned long ps = *pte_sizep; > > - unsigned int hugepage_shift; > > - > > - ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift); > > - if (!ptep) > > - return __pte(0); > > - if (hugepage_shift) > > - *pte_sizep = 1ul << hugepage_shift; > > - else > > - *pte_sizep = PAGE_SIZE; > > - if (ps > *pte_sizep) > > - return __pte(0); > > - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); > > -} > > - > > static inline void unlock_hpte(unsigned long *hpte, unsigned long > > hpte_v) { > > asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); @@ -173,6 +154,7 > > @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, > > unsigned long is_io; > > unsigned long *rmap; > > pte_t pte; > > + pte_t *ptep; > > unsigned int writing; > > unsigned long mmu_seq; > > unsigned long rcbits; > > @@ -231,8 +213,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned > > long flags, > > > > /* Look up the Linux PTE for the backing page */ > > pte_size = psize; > > - pte = lookup_linux_pte(pgdir, hva, writing, &pte_size); > > - if (pte_present(pte)) { > > + ptep = lookup_linux_pte(pgdir, hva, &pte_size); > > + if (ptep) { > > +
RE: [PATCH 4/6 v5] kvm: powerpc: keep only pte search logic in lookup_linux_pte
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Friday, October 04, 2013 6:57 PM > To: Bhushan Bharat-R65777 > Cc: b...@kernel.crashing.org; pau...@samba.org; kvm@vger.kernel.org; kvm- > p...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; Wood Scott-B07421; > Bhushan > Bharat-R65777 > Subject: Re: [PATCH 4/6 v5] kvm: powerpc: keep only pte search logic in > lookup_linux_pte > > > On 19.09.2013, at 08:02, Bharat Bhushan wrote: > > > lookup_linux_pte() was searching for a pte and also sets access flags > > is writable. This function now searches only pte while access flag > > setting is done explicitly. > > > > This pte lookup is not kvm specific, so moved to common code > > (asm/pgtable.h) My Followup patch will use this on booke. > > > > Signed-off-by: Bharat Bhushan > > --- > > v4->v5 > > - No change > > > > arch/powerpc/include/asm/pgtable.h | 24 +++ > > arch/powerpc/kvm/book3s_hv_rm_mmu.c | 36 > > +++--- > > 2 files changed, 36 insertions(+), 24 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/pgtable.h > > b/arch/powerpc/include/asm/pgtable.h > > index 7d6eacf..3a5de5c 100644 > > --- a/arch/powerpc/include/asm/pgtable.h > > +++ b/arch/powerpc/include/asm/pgtable.h > > @@ -223,6 +223,30 @@ extern int gup_hugepte(pte_t *ptep, unsigned long > > sz, unsigned long addr, #endif pte_t *find_linux_pte_or_hugepte(pgd_t > > *pgdir, unsigned long ea, > > unsigned *shift); > > + > > +static inline pte_t *lookup_linux_pte(pgd_t *pgdir, unsigned long hva, > > +unsigned long *pte_sizep) > > +{ > > + pte_t *ptep; > > + unsigned long ps = *pte_sizep; > > + unsigned int shift; > > + > > + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); > > + if (!ptep) > > + return __pte(0); > > This returns a struct pte_t, but your return value of the function is a struct > pte_t *. So this code will fail compiling with STRICT_MM_TYPECHECKS set. Any > reason you don't just return NULL here? 
I want to return the ptep (pte pointer) , so yes this should be NULL. Will correct this. Thanks -Bharat > > That way callers could simply check on if (ptep) ... or you leave the return > value as struct pte_t. > > > Alex > > > + if (shift) > > + *pte_sizep = 1ul << shift; > > + else > > + *pte_sizep = PAGE_SIZE; > > + > > + if (ps > *pte_sizep) > > + return __pte(0); > > + > > + if (!pte_present(*ptep)) > > + return __pte(0); > > > + > > + return ptep; > > +} > > #endif /* __ASSEMBLY__ */ > > > > #endif /* __KERNEL__ */ > > diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c > > b/arch/powerpc/kvm/book3s_hv_rm_mmu.c > > index 45e30d6..74fa7f8 100644 > > --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c > > +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c > > @@ -134,25 +134,6 @@ static void remove_revmap_chain(struct kvm *kvm, long > pte_index, > > unlock_rmap(rmap); > > } > > > > -static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, > > - int writing, unsigned long *pte_sizep) > > -{ > > - pte_t *ptep; > > - unsigned long ps = *pte_sizep; > > - unsigned int hugepage_shift; > > - > > - ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift); > > - if (!ptep) > > - return __pte(0); > > - if (hugepage_shift) > > - *pte_sizep = 1ul << hugepage_shift; > > - else > > - *pte_sizep = PAGE_SIZE; > > - if (ps > *pte_sizep) > > - return __pte(0); > > - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); > > -} > > - > > static inline void unlock_hpte(unsigned long *hpte, unsigned long > > hpte_v) { > > asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); @@ -173,6 +154,7 > > @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, > > unsigned long is_io; > > unsigned long *rmap; > > pte_t pte; > > + pte_t *ptep; > > unsigned int writing; > > unsigned long mmu_seq; > > unsigned long rcbits; > > @@ -231,8 +213,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned > > long flags, > > > > /* Look up the Linux PTE for the backing page */ > > pt
RE: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to epapr_hypercall()
> -Original Message- > From: Wood Scott-B07421 > Sent: Thursday, October 03, 2013 12:04 AM > To: Alexander Graf > Cc: Bhushan Bharat-R65777; kvm-...@vger.kernel.org; kvm@vger.kernel.org; > Bhushan > Bharat-R65777 > Subject: Re: [PATCH 1/2] kvm/powerpc: rename kvm_hypercall() to > epapr_hypercall() > > On Wed, 2013-10-02 at 19:54 +0200, Alexander Graf wrote: > > On 02.10.2013, at 19:49, Scott Wood wrote: > > > > > On Wed, 2013-10-02 at 19:46 +0200, Alexander Graf wrote: > > >> On 02.10.2013, at 19:42, Scott Wood wrote: > > >> > > >>> On Wed, 2013-10-02 at 19:17 +0200, Alexander Graf wrote: > > >>>> On 02.10.2013, at 19:04, Scott Wood wrote: > > >>>> > > >>>>> On Wed, 2013-10-02 at 18:53 +0200, Alexander Graf wrote: > > >>>>>> On 02.10.2013, at 18:40, Scott Wood wrote: > > >>>>>> > > >>>>>>> On Wed, 2013-10-02 at 16:19 +0200, Alexander Graf wrote: > > >>>>>>>> Won't this break when CONFIG_EPAPR_PARAVIRT=n? We wouldn't have > epapr_hcalls.S compiled into the code base then and the bl above would > reference > an unknown function. > > >>>>>>> > > >>>>>>> KVM_GUEST selects EPAPR_PARAVIRT. > > >>>>>> > > >>>>>> But you can not select KVM_GUEST and still call these inline > > >>>>>> functions, > no? > > >>>>> > > >>>>> No. > > >>>>> > > >>>>>> Like kvm_arch_para_features(). > > >>>>> > > >>>>> Where does that get called without KVM_GUEST? > > >>>>> > > >>>>> How would that work currently, with the call to kvm_hypercall() > > >>>>> in arch/powerpc/kernel/kvm.c (which calls epapr_hypercall, BTW)? > > >>>> > > >>>> It wouldn't ever get called because kvm_hypercall() ends up always > returning EV_UNIMPLEMENTED when #ifndef CONFIG_KVM_GUEST. > > >>> > > >>> OK, so the objection is to removing that stub? Where would we > > >>> actually want to call this without knowing that KVM_GUEST or > > >>> EPAPR_PARAVIRT are enabled? > > >> > > >> In probing code. I usually prefer > > >> > > >> if (kvm_feature_available(X)) { > > >> ... 
> > >> } > > >> > > >> over > > >> > > >> #ifdef CONFIG_KVM_GUEST > > >> if (kvm_feature_available(X)) { > > >> ... > > >> } > > >> #endif > > >> > > >> at least when I can avoid it. With the current code the compiler would be > smart enough to just optimize out the complete branch. > > > > > > Sure. My point is, where would you be calling that where the entire > > > file isn't predicated on (or selecting) CONFIG_KVM_GUEST or similar? > > > > > > We don't do these stubs for every single function in the kernel -- > > > only ones where the above is a reasonable use case. > > > > Yeah, I'm fine on dropping it, but we need to make that a conscious decision > and verify that no caller relies on it. > > kvm_para_has_feature() is called from arch/powerpc/kernel/kvm.c, > arch/x86/kernel/kvm.c, and arch/x86/kernel/kvmclock.c, all of which are > enabled > by CONFIG_KVM_GUEST. > > I did find one example of kvm_para_available() being used in an unexpected > place > -- sound/pci/intel8x0.c. It defines its own non-CONFIG_KVM_GUEST stub, even > though x86 defines kvm_para_available() using inline CPUID stuff which should > work without CONFIG_KVM_GUEST. > I'm not sure why it even needs to do that, though -- shouldn't the subsequent > PCI subsystem vendor/device check should be sufficient? No hypercalls are > involved. > > That said, the possibility that some random driver might want to make use of > paravirt features is a decent argument for keeping the stub. > I am not sure where we are agreeing on? Do we want to remove the stub in arch/powerpc/include/asm/kvm_para.h ? as there is no caller without KVM_GUEST and in future caller ensure this to be called only from code selected by KVM_GUEST? Or let this stub stay to avoid any random driver calling this ? Thanks -Bharat
RE: [PATCH 3/7] Return info for device and its memory regions and interrupts
> -Original Message- > From: iommu-boun...@lists.linux-foundation.org [mailto:iommu- > boun...@lists.linux-foundation.org] On Behalf Of Antonios Motakis > Sent: Monday, September 30, 2013 8:59 PM > To: kvm...@lists.cs.columbia.edu; alex.william...@redhat.com > Cc: linux-samsung-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; > Yoder > Stuart-B08248; io...@lists.linux-foundation.org; Antonios Motakis; > t...@virtualopensystems.com > Subject: [PATCH 3/7] Return info for device and its memory regions and > interrupts > > A VFIO userspace driver will start by opening the VFIO device that corresponds > to an IOMMU group, and will use the ioctl interface to get the basic device > info, such as number of memory regions and interrupts, and their properties. > > This patch implements the IOCTLs: > - VFIO_DEVICE_GET_INFO > - VFIO_DEVICE_GET_REGION_INFO > - VFIO_DEVICE_GET_IRQ_INFO > > Signed-off-by: Antonios Motakis > --- > drivers/vfio/vfio_platform.c | 60 > ++-- > 1 file changed, 53 insertions(+), 7 deletions(-) > > diff --git a/drivers/vfio/vfio_platform.c b/drivers/vfio/vfio_platform.c index > b9686b0..a0abcfa 100644 > --- a/drivers/vfio/vfio_platform.c > +++ b/drivers/vfio/vfio_platform.c > @@ -28,6 +28,10 @@ > #include > #include > #include > +#include > +#include > +#include > +#include > > #define DRIVER_VERSION "0.1" > #define DRIVER_AUTHOR "Antonios Motakis " > @@ -54,10 +58,13 @@ static long vfio_platform_ioctl(void *device_data, > unsigned int cmd, unsigned long arg) { > struct vfio_platform_device *vdev = device_data; > + struct device_node *of_node = vdev->pdev->dev.of_node; > unsigned long minsz; > > if (cmd == VFIO_DEVICE_GET_INFO) { > struct vfio_device_info info; > + struct resource res; > + int cnt = 0; > > minsz = offsetofend(struct vfio_device_info, num_irqs); > > @@ -68,18 +75,57 @@ static long vfio_platform_ioctl(void *device_data, > return -EINVAL; > > info.flags = VFIO_DEVICE_FLAGS_PLATFORM; > - info.num_regions = 0; > - info.num_irqs = 0; > 
+ > + while (!of_address_to_resource(of_node, cnt, &res)) > + cnt++; > + > + info.num_regions = cnt; > + > + info.num_irqs = of_irq_count(of_node); > > return copy_to_user((void __user *)arg, &info, minsz); > > - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) > - return -EINVAL; > + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { > + struct vfio_region_info info; > + struct resource res; > > - else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) > - return -EINVAL; > + minsz = offsetofend(struct vfio_region_info, offset); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + if(of_address_to_resource(of_node, info.index, &res)) > + return -EINVAL; > + > + info.offset = res.start;/* map phys addr with offset */ > + info.size = resource_size(&res); > + info.flags = 0; > + > + return copy_to_user((void __user *)arg, &info, minsz); > + > + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { > + struct vfio_irq_info info; > + struct resource res; > + > + minsz = offsetofend(struct vfio_irq_info, count); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + of_irq_to_resource(of_node, info.index, &res); Why are we calling the above function if not using res? > + > + info.flags = 0; > + info.count = 1; I believe count here is number of interrupts, and we can have devices with more than 1 interrupt. 
-Bharat > + > + return copy_to_user((void __user *)arg, &info, minsz); > > - else if (cmd == VFIO_DEVICE_SET_IRQS) > + } else if (cmd == VFIO_DEVICE_SET_IRQS) > return -EINVAL; > > else if (cmd == VFIO_DEVICE_RESET) > -- > 1.8.1.2 > > ___ > iommu mailing list > io...@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/iommu -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/7] Initial skeleton of VFIO support for Device Tree based devices
> -Original Message- > From: iommu-boun...@lists.linux-foundation.org [mailto:iommu- > boun...@lists.linux-foundation.org] On Behalf Of Antonios Motakis > Sent: Monday, September 30, 2013 8:59 PM > To: kvm...@lists.cs.columbia.edu; alex.william...@redhat.com > Cc: linux-samsung-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; > Yoder > Stuart-B08248; io...@lists.linux-foundation.org; Antonios Motakis; > t...@virtualopensystems.com > Subject: [PATCH 2/7] Initial skeleton of VFIO support for Device Tree based > devices > > Platform devices in the Linux kernel are usually managed by the DT interface. > This patch forms the base to support these kind of devices with VFIO. > > Signed-off-by: Antonios Motakis > --- > drivers/vfio/Kconfig | 11 +++ > drivers/vfio/Makefile| 1 + > drivers/vfio/vfio_platform.c | 187 > +++ > include/uapi/linux/vfio.h| 1 + > 4 files changed, 200 insertions(+) > create mode 100644 drivers/vfio/vfio_platform.c > > diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index > 1f84eda..35254b7 > 100644 > --- a/drivers/vfio/Kconfig > +++ b/drivers/vfio/Kconfig > @@ -13,4 +13,15 @@ menuconfig VFIO > > If you don't know what to do here, say N. > > +config VFIO_PLATFORM > + tristate "VFIO support for device tree based platform devices" > + depends on VFIO && EVENTFD && OF > + help > + Support for platform devices with VFIO. This is required to make > + use of platform devices present on device tree nodes using the VFIO > + framework. Devices that are not described in the device tree cannot > + be used by this driver. > + > + If you don't know what to do here, say N. 
> + > source "drivers/vfio/pci/Kconfig" > diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index > 2398d4a..575c8dd 100644 > --- a/drivers/vfio/Makefile > +++ b/drivers/vfio/Makefile > @@ -1,3 +1,4 @@ > obj-$(CONFIG_VFIO) += vfio.o > obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o > obj-$(CONFIG_VFIO_PCI) += pci/ > +obj-$(CONFIG_VFIO_PLATFORM) += vfio_platform.o > diff --git a/drivers/vfio/vfio_platform.c b/drivers/vfio/vfio_platform.c new We can make this parallel to PCI, something like drivers/vfio/platform/platform.c -Bharat > file mode 100644 index 000..b9686b0 > --- /dev/null > +++ b/drivers/vfio/vfio_platform.c > @@ -0,0 +1,187 @@ > +/* > + * Copyright (C) 2013 - Virtual Open Systems > + * Author: Antonios Motakis > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License, version 2, as > + * published by the Free Software Foundation. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
> + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#define DRIVER_VERSION "0.1" > +#define DRIVER_AUTHOR "Antonios Motakis " > +#define DRIVER_DESC "VFIO Device Tree devices - User Level meta-driver" > + > +struct vfio_platform_device { > + struct platform_device *pdev; > +}; > + > +static void vfio_platform_release(void *device_data) { > + module_put(THIS_MODULE); > +} > + > +static int vfio_platform_open(void *device_data) { > + if (!try_module_get(THIS_MODULE)) > + return -ENODEV; > + > + return 0; > +} > + > +static long vfio_platform_ioctl(void *device_data, > +unsigned int cmd, unsigned long arg) { > + struct vfio_platform_device *vdev = device_data; > + unsigned long minsz; > + > + if (cmd == VFIO_DEVICE_GET_INFO) { > + struct vfio_device_info info; > + > + minsz = offsetofend(struct vfio_device_info, num_irqs); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + info.flags = VFIO_DEVICE_FLAGS_PLATFORM; > + info.num_regions = 0; > + info.num_irqs = 0; > + > + return copy_to_user((void __user *)arg, &info, minsz); > + > + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) > + return -EINVAL; > + > + else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) > + return -EINVAL; > + > + else if (cmd == VFIO_DEVICE_SET_IRQS) > + return -EINVAL; > + > + else if (cmd == VFIO_DEVICE_RESET) > + return -EINVA
RE: [PATCH 5/6 v5] kvm: booke: clear host tlb reference flag on guest tlb invalidation
> -Original Message- > From: Wood Scott-B07421 > Sent: Friday, September 20, 2013 9:48 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; b...@kernel.crashing.org; ag...@suse.de; > pau...@samba.org; kvm@vger.kernel.org; kvm-...@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org > Subject: Re: [PATCH 5/6 v5] kvm: booke: clear host tlb reference flag on guest > tlb invalidation > > On Thu, 2013-09-19 at 23:19 -0500, Bhushan Bharat-R65777 wrote: > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Friday, September 20, 2013 2:38 AM > > > To: Bhushan Bharat-R65777 > > > Cc: b...@kernel.crashing.org; ag...@suse.de; pau...@samba.org; > > > kvm@vger.kernel.org; kvm-...@vger.kernel.org; > > > linuxppc-...@lists.ozlabs.org; Bhushan Bharat-R65777 > > > Subject: Re: [PATCH 5/6 v5] kvm: booke: clear host tlb reference > > > flag on guest tlb invalidation > > > > > > This breaks when you have both E500_TLB_BITMAP and E500_TLB_TLB0 set. > > > > I do not see any case where we set both E500_TLB_BITMAP and > > E500_TLB_TLB0. > > This would happen if you have a guest TLB1 entry that is backed by some 4K > pages > and some larger pages (e.g. if the guest maps CCSR with one big > TLB1 and there are varying I/O passthrough regions mapped). It's not common, > but it's possible. Agree > > > Also we have not optimized that yet (keeping track of multiple shadow > > TLB0 entries for one guest TLB1 entry) > > This is about correctness, not optimization. > > > We uses these bit flags only for TLB1 and if size of stlbe is 4K then > > we set E500_TLB_TLB0 otherwise we set E500_TLB_BITMAP. Although I > > think that E500_TLB_BITMAP should be set only if stlbe size is less > > than gtlbe size. > > Why? Even if there's only one bit set in the map, we need it to keep track of > which entry was used. If there is one entry then will not this be simple/faster to not lookup bitmap and guest->host array? A flag indicate it is 1:1 map and this is physical address. 
-Bharat > > -Scott >
RE: [PATCH 5/6 v5] kvm: booke: clear host tlb reference flag on guest tlb invalidation
> -Original Message- > From: Wood Scott-B07421 > Sent: Friday, September 20, 2013 11:38 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; b...@kernel.crashing.org; ag...@suse.de; > pau...@samba.org; kvm@vger.kernel.org; kvm-...@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org > Subject: Re: [PATCH 5/6 v5] kvm: booke: clear host tlb reference flag on guest > tlb invalidation > > On Fri, 2013-09-20 at 13:04 -0500, Bhushan Bharat-R65777 wrote: > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Friday, September 20, 2013 9:48 PM > > > To: Bhushan Bharat-R65777 > > > Cc: Wood Scott-B07421; b...@kernel.crashing.org; ag...@suse.de; > > > pau...@samba.org; kvm@vger.kernel.org; kvm-...@vger.kernel.org; > > > linuxppc- d...@lists.ozlabs.org > > > Subject: Re: [PATCH 5/6 v5] kvm: booke: clear host tlb reference > > > flag on guest tlb invalidation > > > > > > On Thu, 2013-09-19 at 23:19 -0500, Bhushan Bharat-R65777 wrote: > > > > We uses these bit flags only for TLB1 and if size of stlbe is 4K > > > > then we set E500_TLB_TLB0 otherwise we set E500_TLB_BITMAP. > > > > Although I think that E500_TLB_BITMAP should be set only if stlbe > > > > size is less than gtlbe size. > > > > > > Why? Even if there's only one bit set in the map, we need it to > > > keep track of which entry was used. > > > > If there is one entry then will not this be simple/faster to not lookup > > bitmap > and guest->host array? > > A flag indicate it is 1:1 map and this is physical address. > > The difference would be negligible, and you'd have added overhead (both > runtime > and complexity) of making this a special case. Maybe you are right; I will see if I can give it a try :) BTW, I have already sent v6 of this patch. -Bharat > > -Scott >
RE: [PATCH 5/6 v5] kvm: booke: clear host tlb reference flag on guest tlb invalidation
> -Original Message- > From: Wood Scott-B07421 > Sent: Friday, September 20, 2013 2:38 AM > To: Bhushan Bharat-R65777 > Cc: b...@kernel.crashing.org; ag...@suse.de; pau...@samba.org; > kvm@vger.kernel.org; kvm-...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; > Bhushan Bharat-R65777 > Subject: Re: [PATCH 5/6 v5] kvm: booke: clear host tlb reference flag on guest > tlb invalidation > > On Thu, 2013-09-19 at 11:32 +0530, Bharat Bhushan wrote: > > On booke, "struct tlbe_ref" contains host tlb mapping information > > (pfn: for guest-pfn to pfn, flags: attribute associated with this > > mapping) for a guest tlb entry. So when a guest creates a TLB entry > > then "struct tlbe_ref" is set to point to valid "pfn" and set > > attributes in "flags" field of the above said structure. When a guest > > TLB entry is invalidated then flags field of corresponding "struct > > tlbe_ref" is updated to point that this is no more valid, also we > > selectively clear some other attribute bits, example: if > > E500_TLB_BITMAP was set then we clear E500_TLB_BITMAP, if E500_TLB_TLB0 is > > set > then we clear this. > > > > Ideally we should clear complete "flags" as this entry is invalid and > > does not have anything to re-used. The other part of the problem is > > that when we use the same entry again then also we do not clear (started > > doing > or-ing etc). > > > > So far it was working because the selectively clearing mentioned above > > actually clears "flags" what was set during TLB mapping. But the > > problem starts coming when we add more attributes to this then we need > > to selectively clear them and which is not needed. 
> > > > This patch we do both > > - Clear "flags" when invalidating; > > - Clear "flags" when reusing same entry later > > > > Signed-off-by: Bharat Bhushan > > --- > > v3-> v5 > > - New patch (found this issue when doing vfio-pci development) > > > > arch/powerpc/kvm/e500_mmu_host.c | 12 +++- > > 1 files changed, 7 insertions(+), 5 deletions(-) > > > > diff --git a/arch/powerpc/kvm/e500_mmu_host.c > > b/arch/powerpc/kvm/e500_mmu_host.c > > index 1c6a9d7..60f5a3c 100644 > > --- a/arch/powerpc/kvm/e500_mmu_host.c > > +++ b/arch/powerpc/kvm/e500_mmu_host.c > > @@ -217,7 +217,8 @@ void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 > *vcpu_e500, int tlbsel, > > } > > mb(); > > vcpu_e500->g2h_tlb1_map[esel] = 0; > > - ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID); > > + /* Clear flags as TLB is not backed by the host anymore */ > > + ref->flags = 0; > > local_irq_restore(flags); > > } > > This breaks when you have both E500_TLB_BITMAP and E500_TLB_TLB0 set. I do not see any case where we set both E500_TLB_BITMAP and E500_TLB_TLB0. Also we have not optimized that yet (keeping track of multiple shadow TLB0 entries for one guest TLB1 entry) We use these bit flags only for TLB1 and if size of stlbe is 4K then we set E500_TLB_TLB0 otherwise we set E500_TLB_BITMAP. Although I think that E500_TLB_BITMAP should be set only if stlbe size is less than gtlbe size. > > Instead, just convert the final E500_TLB_VALID clearing at the end into > ref->flags = 0, and convert the early return a few lines earlier into > conditional execution of the tlbil_one(). This looks better, will send the patch shortly. Thanks -Bharat > > -Scott >
RE: [PATCH 0/2] KVM: PPC: BOOKE: MMU Fixes
Hi Alex, Second patch (kvm: ppc: booke: check range page invalidation progress on page setup) of this patch series fixes a critical issue and we would like that to be part of 2.12. First Patch is not that important but pretty simple. Thanks -Bharat > -Original Message- > From: Bhushan Bharat-R65777 > Sent: Wednesday, August 07, 2013 3:34 PM > To: pau...@samba.org; Wood Scott-B07421; ag...@suse.de; > kvm-...@vger.kernel.org; > kvm@vger.kernel.org > Cc: Bhushan Bharat-R65777 > Subject: [PATCH 0/2] KVM: PPC: BOOKE: MMU Fixes > > From: Bharat Bhushan > > First Patch set missing _PAGE_ACCESSED when a guest page is accessed > > Second Patch check for MMU notifier range invalidation progress when setting a > reference for a guest page. This is based on > "KVM: PPC: Book3S PR: Use mmu_notifier_retry() in kvmppc_mmu_map_page()" > patch sent by Pauls (still in review). > > Bharat Bhushan (2): > kvm: powerpc: mark page accessed when mapping a guest page > kvm: ppc: booke: check range page invalidation progress on page setup > > arch/powerpc/kvm/e500_mmu_host.c | 22 -- > 1 files changed, 20 insertions(+), 2 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 6/6 v3] kvm: powerpc: use caching attributes as per linux pte
> -Original Message- > From: Wood Scott-B07421 > Sent: Saturday, August 10, 2013 6:35 AM > To: Bhushan Bharat-R65777 > Cc: b...@kernel.crashing.org; ag...@suse.de; pau...@samba.org; > kvm@vger.kernel.org; kvm-...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; > Bhushan Bharat-R65777 > Subject: Re: [PATCH 6/6 v3] kvm: powerpc: use caching attributes as per linux > pte > > On Tue, 2013-08-06 at 17:01 +0530, Bharat Bhushan wrote: > > @@ -449,7 +446,16 @@ static inline int kvmppc_e500_shadow_map(struct > kvmppc_vcpu_e500 *vcpu_e500, > > gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); > > } > > > > - kvmppc_e500_ref_setup(ref, gtlbe, pfn); > > + pgdir = vcpu_e500->vcpu.arch.pgdir; > > + ptep = lookup_linux_pte(pgdir, hva, &tsize_pages); > > + if (pte_present(*ptep)) { > > + wimg = (pte_val(*ptep) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; > > + } else { > > + printk(KERN_ERR "pte not present: gfn %lx, pfn %lx\n", > > + (long)gfn, pfn); > > + return -EINVAL; > > Don't let the guest spam the host kernel console by repeatedly accessing bad > mappings (even if it requires host userspace to assist by pointing a memslot > at > a bad hva). This should at most be printk_ratelimited(), and probably just > pr_debug(). It should also have __func__ context. Very good point, I will make this printk_ratelimited() in this patch. And convert this and other error prints to pr_debug() when we will send machine check on error in this flow. > > Also, I don't see the return value getting checked (the immediate callers > check > it and propogate the error, but kvmppc_mmu_map() doesn't). > We want to send a machine check to the guest if this happens (or possibly exit > to userspace since it indicates a bad memslot, not just a guest bug). We > don't > want to just silently retry over and over. I completely agree with you, but this was something already missing (error return by this function is nothing new added in this patch), So I would like to take that separately. 
> > Otherwise, this series looks good to me. Thank you. :) -Bharat > > -Scott >
RE: [PATCH 21/23] KVM: PPC: Book3S PR: Use mmu_notifier_retry() in kvmppc_mmu_map_page()
> -----Original Message----- > From: Paul Mackerras [mailto:pau...@samba.org] > Sent: Wednesday, August 07, 2013 1:58 PM > To: Bhushan Bharat-R65777 > Cc: Alexander Graf; Benjamin Herrenschmidt; kvm-...@vger.kernel.org; > kvm@vger.kernel.org > Subject: Re: [PATCH 21/23] KVM: PPC: Book3S PR: Use mmu_notifier_retry() in > kvmppc_mmu_map_page() > > On Wed, Aug 07, 2013 at 05:17:29AM +0000, Bhushan Bharat-R65777 wrote: > > > > Pauls, I am trying to understand the flow; does retry mean that we do not > create the mapping and return to guest, which will fault again and then we > will > retry? > > Yes, and you do put_page or kvm_release_pfn_clean for any page that you got. Ok, but what is the value of returning to the guest when we know it is going to generate the fault again? Can we not retry within KVM? Thanks -Bharat > > Paul. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 21/23] KVM: PPC: Book3S PR: Use mmu_notifier_retry() in kvmppc_mmu_map_page()
> -----Original Message----- > From: Paul Mackerras [mailto:pau...@samba.org] > Sent: Wednesday, August 07, 2013 9:59 AM > To: Bhushan Bharat-R65777 > Cc: Alexander Graf; Benjamin Herrenschmidt; kvm-...@vger.kernel.org; > kvm@vger.kernel.org > Subject: Re: [PATCH 21/23] KVM: PPC: Book3S PR: Use mmu_notifier_retry() in > kvmppc_mmu_map_page() > > On Wed, Aug 07, 2013 at 04:13:34AM +0000, Bhushan Bharat-R65777 wrote: > > > > > + /* used to check for invalidations in progress */ > > > + mmu_seq = kvm->mmu_notifier_seq; > > > + smp_rmb(); > > > > Should not the smp_rmb() come before reading kvm->mmu_notifier_seq? > > No, it should come after, because it is ordering the read of > kvm->mmu_notifier_seq before the read of the Linux PTE. Ahh, ok. Thanks -Bharat > > Paul. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 21/23] KVM: PPC: Book3S PR: Use mmu_notifier_retry() in kvmppc_mmu_map_page()
> -Original Message- > From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf > Of > Paul Mackerras > Sent: Tuesday, August 06, 2013 9:58 AM > To: Alexander Graf; Benjamin Herrenschmidt > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org > Subject: [PATCH 21/23] KVM: PPC: Book3S PR: Use mmu_notifier_retry() in > kvmppc_mmu_map_page() > > When the MM code is invalidating a range of pages, it calls the KVM > kvm_mmu_notifier_invalidate_range_start() notifier function, which calls > kvm_unmap_hva_range(), which arranges to flush all the existing host > HPTEs for guest pages. However, the Linux PTEs for the range being > flushed are still valid at that point. We are not supposed to establish > any new references to pages in the range until the ...range_end() > notifier gets called. The PPC-specific KVM code doesn't get any > explicit notification of that; instead, we are supposed to use > mmu_notifier_retry() to test whether we are or have been inside a > range flush notifier pair while we have been getting a page and > instantiating a host HPTE for the page. > > This therefore adds a call to mmu_notifier_retry inside > kvmppc_mmu_map_page(). This call is inside a region locked with > kvm->mmu_lock, which is the same lock that is called by the KVM > MMU notifier functions, thus ensuring that no new notification can > proceed while we are in the locked region. Inside this region we > also create the host HPTE and link the corresponding hpte_cache > structure into the lists used to find it later. We cannot allocate > the hpte_cache structure inside this locked region because that can > lead to deadlock, so we allocate it outside the region and free it > if we end up not using it. 
> > This also moves the updates of vcpu3s->hpte_cache_count inside the > regions locked with vcpu3s->mmu_lock, and does the increment in > kvmppc_mmu_hpte_cache_map() when the pte is added to the cache > rather than when it is allocated, in order that the hpte_cache_count > is accurate. > > Signed-off-by: Paul Mackerras > --- > arch/powerpc/include/asm/kvm_book3s.h | 1 + > arch/powerpc/kvm/book3s_64_mmu_host.c | 37 > ++- > arch/powerpc/kvm/book3s_mmu_hpte.c| 14 + > 3 files changed, 39 insertions(+), 13 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_book3s.h > b/arch/powerpc/include/asm/kvm_book3s.h > index 4fe6864..e711e77 100644 > --- a/arch/powerpc/include/asm/kvm_book3s.h > +++ b/arch/powerpc/include/asm/kvm_book3s.h > @@ -143,6 +143,7 @@ extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, > gva_t > eaddr, > > extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct > hpte_cache > *pte); > extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); > +extern void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte); > extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu); > extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu); > extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct > hpte_cache > *pte); > diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c > b/arch/powerpc/kvm/book3s_64_mmu_host.c > index 7fcf38f..b7e9504 100644 > --- a/arch/powerpc/kvm/book3s_64_mmu_host.c > +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c > @@ -93,6 +93,13 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct > kvmppc_pte *orig_pte, > int r = 0; > int hpsize = MMU_PAGE_4K; > bool writable; > + unsigned long mmu_seq; > + struct kvm *kvm = vcpu->kvm; > + struct hpte_cache *cpte; > + > + /* used to check for invalidations in progress */ > + mmu_seq = kvm->mmu_notifier_seq; > + smp_rmb(); > > /* Get host physical address for gpa */ > hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT, > @@ -143,6 +150,14 @@ int 
kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct > kvmppc_pte *orig_pte, > > hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M); > > + cpte = kvmppc_mmu_hpte_cache_next(vcpu); > + > + spin_lock(&kvm->mmu_lock); > + if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { > + r = -EAGAIN; Pauls, I am trying to understand the flow; does retry mean that we do not create the mapping and return to guest, which will fault again and then we will retry? Thanks -Bharat > + goto out_unlock; > + } > + > map_again: > hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); > > @@ -150,7 +165,7 @@ map_again: > if (attempt > 1) > if (ppc_md.hpte_remove(hpteg) < 0) { > r = -1; > - goto out; > + goto out_unlock; > } > > ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, > @@ -163,8 +178,6 @@ map_again: > attempt++; > goto map_again; > } else { > - struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); > -
RE: [PATCH 21/23] KVM: PPC: Book3S PR: Use mmu_notifier_retry() in kvmppc_mmu_map_page()
> -Original Message- > From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf > Of > Paul Mackerras > Sent: Tuesday, August 06, 2013 9:58 AM > To: Alexander Graf; Benjamin Herrenschmidt > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org > Subject: [PATCH 21/23] KVM: PPC: Book3S PR: Use mmu_notifier_retry() in > kvmppc_mmu_map_page() > > When the MM code is invalidating a range of pages, it calls the KVM > kvm_mmu_notifier_invalidate_range_start() notifier function, which calls > kvm_unmap_hva_range(), which arranges to flush all the existing host HPTEs for > guest pages. However, the Linux PTEs for the range being flushed are still > valid at that point. We are not supposed to establish any new references to > pages in the range until the ...range_end() notifier gets called. The PPC- > specific KVM code doesn't get any explicit notification of that; instead, we > are > supposed to use > mmu_notifier_retry() to test whether we are or have been inside a range flush > notifier pair while we have been getting a page and instantiating a host HPTE > for the page. > > This therefore adds a call to mmu_notifier_retry inside kvmppc_mmu_map_page(). > This call is inside a region locked with > kvm->mmu_lock, which is the same lock that is called by the KVM > MMU notifier functions, thus ensuring that no new notification can proceed > while > we are in the locked region. Inside this region we also create the host HPTE > and link the corresponding hpte_cache structure into the lists used to find it > later. We cannot allocate the hpte_cache structure inside this locked region > because that can lead to deadlock, so we allocate it outside the region and > free > it if we end up not using it. 
> > This also moves the updates of vcpu3s->hpte_cache_count inside the regions > locked with vcpu3s->mmu_lock, and does the increment in > kvmppc_mmu_hpte_cache_map() when the pte is added to the cache rather than > when > it is allocated, in order that the hpte_cache_count is accurate. > > Signed-off-by: Paul Mackerras > --- > arch/powerpc/include/asm/kvm_book3s.h | 1 + > arch/powerpc/kvm/book3s_64_mmu_host.c | 37 ++- > arch/powerpc/kvm/book3s_mmu_hpte.c| 14 + > 3 files changed, 39 insertions(+), 13 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_book3s.h > b/arch/powerpc/include/asm/kvm_book3s.h > index 4fe6864..e711e77 100644 > --- a/arch/powerpc/include/asm/kvm_book3s.h > +++ b/arch/powerpc/include/asm/kvm_book3s.h > @@ -143,6 +143,7 @@ extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, > gva_t > eaddr, > > extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct > hpte_cache > *pte); extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu > *vcpu); > +extern void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte); > extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu); extern int > kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu); extern void > kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte); > diff - > -git a/arch/powerpc/kvm/book3s_64_mmu_host.c > b/arch/powerpc/kvm/book3s_64_mmu_host.c > index 7fcf38f..b7e9504 100644 > --- a/arch/powerpc/kvm/book3s_64_mmu_host.c > +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c > @@ -93,6 +93,13 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct > kvmppc_pte *orig_pte, > int r = 0; > int hpsize = MMU_PAGE_4K; > bool writable; > + unsigned long mmu_seq; > + struct kvm *kvm = vcpu->kvm; > + struct hpte_cache *cpte; > + > + /* used to check for invalidations in progress */ > + mmu_seq = kvm->mmu_notifier_seq; > + smp_rmb(); Should not the smp_rmb() come before reading kvm->mmu_notifier_seq. 
-Bharat > > /* Get host physical address for gpa */ > hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT, @@ > -143,6 > +150,14 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte > *orig_pte, > > hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M); > > + cpte = kvmppc_mmu_hpte_cache_next(vcpu); > + > + spin_lock(&kvm->mmu_lock); > + if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { > + r = -EAGAIN; > + goto out_unlock; > + } > + > map_again: > hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); > > @@ -150,7 +165,7 @@ map_again: > if (attempt > 1) > if (ppc_md.hpte_remove(hpteg) < 0) { > r = -1; > - goto out; > + goto out_unlock; > } > > ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, @@ -163,8 > +178,6 @@ map_again: > attempt++; > goto map_again; > } else { > - struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); > - > trace_kvm_book3s_64_mmu_map(rflags, hpteg, >
RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like booke3s
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, August 06, 2013 12:49 AM > To: Bhushan Bharat-R65777 > Cc: Benjamin Herrenschmidt; Wood Scott-B07421; ag...@suse.de; kvm- > p...@vger.kernel.org; kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like > booke3s > > On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote: > > > > > -Original Message- > > > From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org] > > > Sent: Saturday, August 03, 2013 9:54 AM > > > To: Bhushan Bharat-R65777 > > > Cc: Wood Scott-B07421; ag...@suse.de; kvm-...@vger.kernel.org; > > > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org > > > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte > > > lookup like booke3s > > > > > > On Sat, 2013-08-03 at 02:58 +, Bhushan Bharat-R65777 wrote: > > > > One of the problem I saw was that if I put this code in > > > > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and other > > > > friend function (on which this code depends) are defined in pgtable.h. > > > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h > > > > before it defines pte_present() and friends functions. > > > > > > > > Ok I move wove this in asm/pgtable*.h, initially I fought with > > > > myself to take this code in pgtable* but finally end up doing here > > > > (got biased by book3s :)). > > > > > > Is there a reason why these routines can not be completely generic > > > in pgtable.h ? 
> > > > How about the generic function: > > > > diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h > > b/arch/powerpc/include/asm/pgtable-ppc64.h > > index d257d98..21daf28 100644 > > --- a/arch/powerpc/include/asm/pgtable-ppc64.h > > +++ b/arch/powerpc/include/asm/pgtable-ppc64.h > > @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct > *mm, > > return old; > > } > > > > +static inline unsigned long pte_read(pte_t *p) { #ifdef > > +PTE_ATOMIC_UPDATES > > + pte_t pte; > > + pte_t tmp; > > + __asm__ __volatile__ ( > > + "1: ldarx %0,0,%3\n" > > + " andi. %1,%0,%4\n" > > + " bne-1b\n" > > + " ori %1,%0,%4\n" > > + " stdcx. %1,0,%3\n" > > + " bne-1b" > > + : "=&r" (pte), "=&r" (tmp), "=m" (*p) > > + : "r" (p), "i" (_PAGE_BUSY) > > + : "cc"); > > + > > + return pte; > > +#else > > + return pte_val(*p); > > +#endif > > +#endif > > +} > > static inline int __ptep_test_and_clear_young(struct mm_struct *mm, > > unsigned long addr, > > pte_t *ptep) > > Please leave a blank line between functions. > > > { > > diff --git a/arch/powerpc/include/asm/pgtable.h > > b/arch/powerpc/include/asm/pgtable.h > > index 690c8c2..dad712c 100644 > > --- a/arch/powerpc/include/asm/pgtable.h > > +++ b/arch/powerpc/include/asm/pgtable.h > > @@ -254,6 +254,45 @@ static inline pte_t > > *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } #endif > > /* !CONFIG_HUGETLB_PAGE */ > > > > +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, > > +int writing, unsigned long > > +*pte_sizep) > > The name implies that it just reads the PTE. Setting accessed/dirty shouldn't > be an undocumented side-effect. Why can't the caller do that (or a different > function that the caller calls afterward if desired)? Scott, I sent the next version of patch based on above idea. Now I think we do not need to update the pte flags on booke So we do not need to solve the kvmppc_read_update_linux_pte() stuff of book3s. 
-Bharat > > Though even then you have the undocumented side effect of locking the PTE on > certain targets. > > > +{ > > + pte_t *ptep; > > + pte_t pte; > > + unsigned long ps = *pte_sizep; > > + unsigned int shift; > > + > > + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); > > + if (!ptep) > > + return __pte(0); > > + if (shift) > > + *pte_sizep = 1ul << shift; > > + else > > + *pte_sizep = PAGE_SIZE; > > + > > + if (ps > *pte_sizep) > > + return __pte(0); > > + > > + if (!pte_present(*ptep)) > > + return __pte(0); > > + > > +#ifdef CONFIG_PPC64 > > + /* Lock PTE (set _PAGE_BUSY) and read */ > > + pte = pte_read(ptep); > > +#else > > + pte = pte_val(*ptep); > > +#endif > > What about 32-bit platforms that need atomic PTEs? > > -Scott > N�r��yb�X��ǧv�^�){.n�+h����ܨ}���Ơz�&j:+v���zZ+��+zf���h���~i���z��w���?�&�)ߢf
RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like booke3s
> -Original Message- > From: Bhushan Bharat-R65777 > Sent: Tuesday, August 06, 2013 6:42 AM > To: Wood Scott-B07421 > Cc: Benjamin Herrenschmidt; ag...@suse.de; kvm-...@vger.kernel.org; > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org > Subject: RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like > booke3s > > > > > -Original Message- > > From: Wood Scott-B07421 > > Sent: Tuesday, August 06, 2013 12:49 AM > > To: Bhushan Bharat-R65777 > > Cc: Benjamin Herrenschmidt; Wood Scott-B07421; ag...@suse.de; kvm- > > p...@vger.kernel.org; kvm@vger.kernel.org; > > linuxppc-...@lists.ozlabs.org > > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup > > like booke3s > > > > On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote: > > > > > > > -Original Message- > > > > From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org] > > > > Sent: Saturday, August 03, 2013 9:54 AM > > > > To: Bhushan Bharat-R65777 > > > > Cc: Wood Scott-B07421; ag...@suse.de; kvm-...@vger.kernel.org; > > > > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org > > > > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte > > > > lookup like booke3s > > > > > > > > On Sat, 2013-08-03 at 02:58 +, Bhushan Bharat-R65777 wrote: > > > > > One of the problem I saw was that if I put this code in > > > > > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and > > > > > other friend function (on which this code depends) are defined in > pgtable.h. > > > > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h > > > > > before it defines pte_present() and friends functions. > > > > > > > > > > Ok I move wove this in asm/pgtable*.h, initially I fought with > > > > > myself to take this code in pgtable* but finally end up doing > > > > > here (got biased by book3s :)). > > > > > > > > Is there a reason why these routines can not be completely generic > > > > in pgtable.h ? 
> > > > > > How about the generic function: > > > > > > diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h > > > b/arch/powerpc/include/asm/pgtable-ppc64.h > > > index d257d98..21daf28 100644 > > > --- a/arch/powerpc/include/asm/pgtable-ppc64.h > > > +++ b/arch/powerpc/include/asm/pgtable-ppc64.h > > > @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct > > > mm_struct > > *mm, > > > return old; > > > } > > > > > > +static inline unsigned long pte_read(pte_t *p) { #ifdef > > > +PTE_ATOMIC_UPDATES > > > + pte_t pte; > > > + pte_t tmp; > > > + __asm__ __volatile__ ( > > > + "1: ldarx %0,0,%3\n" > > > + " andi. %1,%0,%4\n" > > > + " bne-1b\n" > > > + " ori %1,%0,%4\n" > > > + " stdcx. %1,0,%3\n" > > > + " bne-1b" > > > + : "=&r" (pte), "=&r" (tmp), "=m" (*p) > > > + : "r" (p), "i" (_PAGE_BUSY) > > > + : "cc"); > > > + > > > + return pte; > > > +#else > > > + return pte_val(*p); > > > +#endif > > > +#endif > > > +} > > > static inline int __ptep_test_and_clear_young(struct mm_struct *mm, > > > unsigned long addr, > > > pte_t *ptep) > > > > Please leave a blank line between functions. > > > > > { > > > diff --git a/arch/powerpc/include/asm/pgtable.h > > > b/arch/powerpc/include/asm/pgtable.h > > > index 690c8c2..dad712c 100644 > > > --- a/arch/powerpc/include/asm/pgtable.h > > > +++ b/arch/powerpc/include/asm/pgtable.h > > > @@ -254,6 +254,45 @@ static inline pte_t > > > *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } > > > #endif > > > /* !CONFIG_HUGETLB_PAGE */ > > > > > > +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, > > > +int writing, unsigned long > > > +*pte_sizep) > > > > The name implies that it just reads the PTE. Setting accessed/dirty > > shouldn't be an undocumented side-eff
RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like booke3s
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, August 06, 2013 12:49 AM > To: Bhushan Bharat-R65777 > Cc: Benjamin Herrenschmidt; Wood Scott-B07421; ag...@suse.de; kvm- > p...@vger.kernel.org; kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like > booke3s > > On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote: > > > > > -Original Message- > > > From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org] > > > Sent: Saturday, August 03, 2013 9:54 AM > > > To: Bhushan Bharat-R65777 > > > Cc: Wood Scott-B07421; ag...@suse.de; kvm-...@vger.kernel.org; > > > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org > > > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte > > > lookup like booke3s > > > > > > On Sat, 2013-08-03 at 02:58 +, Bhushan Bharat-R65777 wrote: > > > > One of the problem I saw was that if I put this code in > > > > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and other > > > > friend function (on which this code depends) are defined in pgtable.h. > > > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h > > > > before it defines pte_present() and friends functions. > > > > > > > > Ok I move wove this in asm/pgtable*.h, initially I fought with > > > > myself to take this code in pgtable* but finally end up doing here > > > > (got biased by book3s :)). > > > > > > Is there a reason why these routines can not be completely generic > > > in pgtable.h ? 
> > > > How about the generic function: > > > > diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h > > b/arch/powerpc/include/asm/pgtable-ppc64.h > > index d257d98..21daf28 100644 > > --- a/arch/powerpc/include/asm/pgtable-ppc64.h > > +++ b/arch/powerpc/include/asm/pgtable-ppc64.h > > @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct > *mm, > > return old; > > } > > > > +static inline unsigned long pte_read(pte_t *p) { #ifdef > > +PTE_ATOMIC_UPDATES > > + pte_t pte; > > + pte_t tmp; > > + __asm__ __volatile__ ( > > + "1: ldarx %0,0,%3\n" > > + " andi. %1,%0,%4\n" > > + " bne-1b\n" > > + " ori %1,%0,%4\n" > > + " stdcx. %1,0,%3\n" > > + " bne-1b" > > + : "=&r" (pte), "=&r" (tmp), "=m" (*p) > > + : "r" (p), "i" (_PAGE_BUSY) > > + : "cc"); > > + > > + return pte; > > +#else > > + return pte_val(*p); > > +#endif > > +#endif > > +} > > static inline int __ptep_test_and_clear_young(struct mm_struct *mm, > > unsigned long addr, > > pte_t *ptep) > > Please leave a blank line between functions. > > > { > > diff --git a/arch/powerpc/include/asm/pgtable.h > > b/arch/powerpc/include/asm/pgtable.h > > index 690c8c2..dad712c 100644 > > --- a/arch/powerpc/include/asm/pgtable.h > > +++ b/arch/powerpc/include/asm/pgtable.h > > @@ -254,6 +254,45 @@ static inline pte_t > > *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } #endif > > /* !CONFIG_HUGETLB_PAGE */ > > > > +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, > > +int writing, unsigned long > > +*pte_sizep) > > The name implies that it just reads the PTE. Setting accessed/dirty shouldn't > be an undocumented side-effect. Ok, will rename and document. > Why can't the caller do that (or a different > function that the caller calls afterward if desired)? The current implementation in book3s is; 1) find a pte/hugepte 2) return null if pte not present 3) take _PAGE_BUSY lock 4) set accessed/dirty 5) clear _PAGE_BUSY. 
What I tried was 1) find a pte/hugepte 2) return null if pte not present 3) return pte (not take lock by not setting _PAGE_BUSY) 4) then user calls __ptep_set_access_flags() to atomic update the dirty/accessed flags in pte. - but the benchmark results were not good - Also can there be race as we do not take lock in step 3 and update in step 4 ? > > Though even then you have the undocumented side effect of locking the PTE on > certain targets. > > > +{ > > + pte_t *ptep; > > +
RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like booke3s
> -Original Message- > From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org] > Sent: Saturday, August 03, 2013 9:54 AM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; ag...@suse.de; kvm-...@vger.kernel.org; > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like > booke3s > > On Sat, 2013-08-03 at 02:58 +, Bhushan Bharat-R65777 wrote: > > One of the problem I saw was that if I put this code in > > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and other > > friend function (on which this code depends) are defined in pgtable.h. > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h before it > > defines pte_present() and friends functions. > > > > Ok I move wove this in asm/pgtable*.h, initially I fought with myself > > to take this code in pgtable* but finally end up doing here (got > > biased by book3s :)). > > Is there a reason why these routines can not be completely generic in > pgtable.h > ? How about the generic function: diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index d257d98..21daf28 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct *mm, return old; } +static inline unsigned long pte_read(pte_t *p) +{ +#ifdef PTE_ATOMIC_UPDATES + pte_t pte; + pte_t tmp; + __asm__ __volatile__ ( + "1: ldarx %0,0,%3\n" + " andi. %1,%0,%4\n" + " bne-1b\n" + " ori %1,%0,%4\n" + " stdcx. 
%1,0,%3\n" + " bne-1b" + : "=&r" (pte), "=&r" (tmp), "=m" (*p) + : "r" (p), "i" (_PAGE_BUSY) + : "cc"); + + return pte; +#else + return pte_val(*p); +#endif +#endif +} static inline int __ptep_test_and_clear_young(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 690c8c2..dad712c 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -254,6 +254,45 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } #endif /* !CONFIG_HUGETLB_PAGE */ +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, +int writing, unsigned long *pte_sizep) +{ + pte_t *ptep; + pte_t pte; + unsigned long ps = *pte_sizep; + unsigned int shift; + + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); + if (!ptep) + return __pte(0); + if (shift) + *pte_sizep = 1ul << shift; + else + *pte_sizep = PAGE_SIZE; + + if (ps > *pte_sizep) + return __pte(0); + + if (!pte_present(*ptep)) + return __pte(0); + +#ifdef CONFIG_PPC64 + /* Lock PTE (set _PAGE_BUSY) and read */ + pte = pte_read(ptep); +#else + pte = pte_val(*ptep); +#endif + if (pte_present(pte)) { + pte = pte_mkyoung(pte); + if (writing && pte_write(pte)) + pte = pte_mkdirty(pte); + } + + *ptep = __pte(pte); /* 64bit: Also unlock pte (clear _PAGE_BUSY) */ + + return pte; +} + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */
RE: [PATCH 6/6 v2] kvm: powerpc: use caching attributes as per linux pte
> -Original Message- > From: Wood Scott-B07421 > Sent: Saturday, August 03, 2013 5:05 AM > To: Bhushan Bharat-R65777 > Cc: b...@kernel.crashing.org; ag...@suse.de; kvm-...@vger.kernel.org; > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org; Bhushan Bharat-R65777 > Subject: Re: [PATCH 6/6 v2] kvm: powerpc: use caching attributes as per linux > pte > > On Thu, Aug 01, 2013 at 04:42:38PM +0530, Bharat Bhushan wrote: > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index > > 17722d8..eb2 100644 > > --- a/arch/powerpc/kvm/booke.c > > +++ b/arch/powerpc/kvm/booke.c > > @@ -697,7 +697,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, > > struct kvm_vcpu *vcpu) #endif > > > > kvmppc_fix_ee_before_entry(); > > - > > + vcpu->arch.pgdir = current->mm->pgd; > > ret = __kvmppc_vcpu_run(kvm_run, vcpu); > > kvmppc_fix_ee_before_entry() is supposed to be the last thing that happens > before __kvmppc_vcpu_run(). > > > @@ -332,6 +324,8 @@ static inline int kvmppc_e500_shadow_map(struct > kvmppc_vcpu_e500 *vcpu_e500, > > unsigned long hva; > > int pfnmap = 0; > > int tsize = BOOK3E_PAGESZ_4K; > > + pte_t pte; > > + int wimg = 0; > > > > /* > > * Translate guest physical to true physical, acquiring @@ -437,6 > > +431,8 @@ static inline int kvmppc_e500_shadow_map(struct > > kvmppc_vcpu_e500 *vcpu_e500, > > > > if (likely(!pfnmap)) { > > unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); > > + pgd_t *pgdir; > > + > > pfn = gfn_to_pfn_memslot(slot, gfn); > > if (is_error_noslot_pfn(pfn)) { > > printk(KERN_ERR "Couldn't get real page for gfn > > %lx!\n", @@ > -447,9 > > +443,18 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 > *vcpu_e500, > > /* Align guest and physical address to page map boundaries */ > > pfn &= ~(tsize_pages - 1); > > gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); > > + pgdir = vcpu_e500->vcpu.arch.pgdir; > > + pte = lookup_linux_pte(pgdir, hva, 1, &tsize_pages); > > + if (pte_present(pte)) { > > + wimg = (pte >> 
PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; > > + } else { > > + printk(KERN_ERR "pte not present: gfn %lx, pfn %lx\n", > > + (long)gfn, pfn); > > + return -EINVAL; > > + } > > } > > How does wimg get set in the pfnmap case? Pfnmap pages are not kernel-managed pages, right? So should we set I+G there? > > Could you explain why we need to set dirty/referenced on the PTE, when we > didn't > need to do that before? All we're getting from the PTE is wimg. > We have MMU notifiers to take care of the page being unmapped, and we've > already > marked the page itself as dirty if the TLB entry is writeable. I pulled this code from book3s. Ben, can you describe why we need this on book3s? Thanks -Bharat > > -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like booke3s
> -Original Message- > From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org] > Sent: Saturday, August 03, 2013 4:47 AM > To: Wood Scott-B07421 > Cc: Bhushan Bharat-R65777; ag...@suse.de; kvm-...@vger.kernel.org; > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org; Bhushan Bharat-R65777 > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like > booke3s > > On Fri, 2013-08-02 at 17:58 -0500, Scott Wood wrote: > > > > What about 64-bit PTEs on 32-bit kernels? > > > > In any case, this code does not belong in KVM. It should be in the > > main PPC mm code, even if KVM is the only user. > > Also don't we do similar things in BookS KVM ? At the very least that stuff > should become common. And yes, I agree, it should probably also move to > pgtable* One of the problems I saw was that if I put this code in asm/pgtable-32.h and asm/pgtable-64.h, then pte_present() and its friend functions (on which this code depends) are defined in pgtable.h, and pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h before it defines pte_present() and friends. OK, I will move this into asm/pgtable*.h; initially I fought with myself over putting this code in pgtable*, but finally ended up doing it here (got biased by book3s :)). Thanks -Bharat > > Cheers, > Ben.
RE: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
> -Original Message- > From: Wood Scott-B07421 > Sent: Wednesday, July 31, 2013 12:19 AM > To: Bhushan Bharat-R65777 > Cc: Benjamin Herrenschmidt; Alexander Graf; kvm-...@vger.kernel.org; > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org; Wood Scott-B07421 > Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages > > On 07/30/2013 11:22:54 AM, Bhushan Bharat-R65777 wrote: > > diff --git a/arch/powerpc/kvm/e500_mmu_host.c > > b/arch/powerpc/kvm/e500_mmu_host.c > > index 5cbdc8f..a48c13f 100644 > > --- a/arch/powerpc/kvm/e500_mmu_host.c > > +++ b/arch/powerpc/kvm/e500_mmu_host.c > > @@ -40,6 +40,84 @@ > > > > static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; > > > > +/* > > + * find_linux_pte returns the address of a linux pte for a given > > + * effective address and directory. If not found, it returns zero. > > + */ > > +static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) { > > +pgd_t *pg; > > +pud_t *pu; > > +pmd_t *pm; > > +pte_t *pt = NULL; > > + > > +pg = pgdir + pgd_index(ea); > > +if (!pgd_none(*pg)) { > > +pu = pud_offset(pg, ea); > > +if (!pud_none(*pu)) { > > +pm = pmd_offset(pu, ea); > > +if (pmd_present(*pm)) > > +pt = pte_offset_kernel(pm, ea); > > +} > > +} > > +return pt; > > +} > > How is this specific to KVM or e500? > > > +#ifdef CONFIG_HUGETLB_PAGE > > +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, > > + unsigned *shift); #else static > > +inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, > > unsigned long ea, > > + unsigned *shift) { > > +if (shift) > > +*shift = 0; > > +return find_linux_pte(pgdir, ea); } #endif /* > > +!CONFIG_HUGETLB_PAGE */ > > This is already declared in asm/pgtable.h. If we need a non-hugepage > alternative, that should also go in asm/pgtable.h. > > > +/* > > + * Lock and read a linux PTE. If it's present and writable, > > atomically > > + * set dirty and referenced bits and return the PTE, otherwise > > return 0. 
> > + */ > > +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int > > writing) > > +{ > > + pte_t pte = pte_val(*p); > > + > > + if (pte_present(pte)) { > > + pte = pte_mkyoung(pte); > > + if (writing && pte_write(pte)) > > + pte = pte_mkdirty(pte); > > + } > > + > > + *p = pte; > > + > > + return pte; > > +} > > + > > +static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, > > + int writing, unsigned long *pte_sizep) { > > + pte_t *ptep; > > + unsigned long ps = *pte_sizep; > > + unsigned int shift; > > + > > + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); > > + if (!ptep) > > + return __pte(0); > > + if (shift) > > + *pte_sizep = 1ul << shift; > > + else > > + *pte_sizep = PAGE_SIZE; > > + > > + if (ps > *pte_sizep) > > + return __pte(0); > > + if (!pte_present(*ptep)) > > + return __pte(0); > > + > > + return kvmppc_read_update_linux_pte(ptep, writing); } > > + > > None of this belongs in this file either. > > > @@ -326,8 +405,8 @@ static void kvmppc_e500_setup_stlbe( > > > > /* Force IPROT=0 for all guest mappings. */ > > stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | > > MAS1_VALID; > > - stlbe->mas2 = (gvaddr & MAS2_EPN) | > > - e500_shadow_mas2_attrib(gtlbe->mas2, pfn); > > + stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & > > E500_TLB_WIMGE_MASK); > > +// e500_shadow_mas2_attrib(gtlbe->mas2, pfn); > > MAS2_E and MAS2_G should be safe to come from the guest. This is handled when setting WIMGE in ref->flags. > > How does this work for TLB1? One ref corresponds to one guest entry, which > may > correspond to multiple host entries, potentially each with different WIM > settings. Yes, one ref corresponds to one guest e
RE: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
> -Original Message- > From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org] > Sent: Saturday, July 27, 2013 3:57 AM > To: Bhushan Bharat-R65777 > Cc: Alexander Graf; kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org; Wood Scott-B07421 > Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages > > On Fri, 2013-07-26 at 15:03 +, Bhushan Bharat-R65777 wrote: > > Will not searching the Linux PTE is a overkill? > > That's the best approach. Also we are searching it already to resolve the page > fault. That does mean we search twice but on the other hand that also means > it's > hot in the cache. Below is early git diff (not a proper cleanup patch), to be sure that this is what we want on PowerPC and take early feedback. Also I run some benchmark to understand the overhead if any. Using kvm_is_mmio_pfn(); what the current patch does: Real: 0m46.616s + 0m49.517s + 0m49.510s + 0m46.936s + 0m46.889s + 0m46.684s = Avg; 47.692s User: 0m31.636s + 0m31.816s + 0m31.456s + 0m31.752s + 0m32.028s + 0m31.848s = Avg; 31.756s Sys: 0m11.596s + 0m11.868s + 0m12.244s + 0m11.672s + 0m11.356s + 0m11.432s = Avg; 11.695s Using kernel page table search (below changes): Real: 0m46.431s + 0m50.269s + 0m46.724s + 0m46.645s + 0m46.670s + 0m50.259s = Avg; 47.833s User: 0m31.568s + 0m31.816s + 0m31.444s + 0m31.808s + 0m31.312s + 0m31.740s = Avg; 31.614s Sys: 0m11.516s + 0m12.060s + 0m11.872s + 0m11.476s + 0m12.000s + 0m12.152s = Avg; 11.846s -- diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3328353..d6d0dac 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -532,6 +532,7 @@ struct kvm_vcpu_arch { u32 epr; u32 crit_save; struct kvmppc_booke_debug_reg dbg_reg; + pgd_t *pgdir; #endif gpa_t paddr_accessed; gva_t vaddr_accessed; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 17722d8..eb2 100644 --- a/arch/powerpc/kvm/booke.c +++ 
b/arch/powerpc/kvm/booke.c @@ -697,7 +697,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif kvmppc_fix_ee_before_entry(); - + vcpu->arch.pgdir = current->mm->pgd; ret = __kvmppc_vcpu_run(kvm_run, vcpu); /* No need for kvm_guest_exit. It's done in handle_exit. diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 4fd9650..fc4b2f6 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -31,11 +31,13 @@ enum vcpu_ftr { #define E500_TLB_NUM 2 /* entry is mapped somewhere in host TLB */ -#define E500_TLB_VALID (1 << 0) +#define E500_TLB_VALID (1 << 31) /* TLB1 entry is mapped by host TLB1, tracked by bitmaps */ -#define E500_TLB_BITMAP(1 << 1) +#define E500_TLB_BITMAP(1 << 30) /* TLB1 entry is mapped by host TLB0 */ -#define E500_TLB_TLB0 (1 << 2) +#define E500_TLB_TLB0 (1 << 29) +/* Lower 5 bits have WIMGE value */ +#define E500_TLB_WIMGE_MASK(0x1f) struct tlbe_ref { pfn_t pfn; /* valid only for TLB0, except briefly */ diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 5cbdc8f..a48c13f 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -40,6 +40,84 @@ static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; +/* + * find_linux_pte returns the address of a linux pte for a given + * effective address and directory. If not found, it returns zero. 
+ */ +static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) +{ +pgd_t *pg; +pud_t *pu; +pmd_t *pm; +pte_t *pt = NULL; + +pg = pgdir + pgd_index(ea); +if (!pgd_none(*pg)) { +pu = pud_offset(pg, ea); +if (!pud_none(*pu)) { +pm = pmd_offset(pu, ea); +if (pmd_present(*pm)) +pt = pte_offset_kernel(pm, ea); +} +} +return pt; +} + +#ifdef CONFIG_HUGETLB_PAGE +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift); +#else +static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) +{ +if (shift) +*shift = 0; +return find_linux_pte(pgdir, ea); +} +#endif /* !CONFIG_HUGETLB_PAGE */ + +/* + * Lock and read a linux PTE. If it's present and writable, atomically + * set dirty and referenced bits and return the PTE, otherwise return 0. + */ +static inline pte_t kvmp
RE: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Alexander Graf > Sent: Friday, July 26, 2013 2:20 PM > To: Benjamin Herrenschmidt > Cc: Bhushan Bharat-R65777; kvm-...@vger.kernel.org; kvm@vger.kernel.org; > linuxppc-...@lists.ozlabs.org; Wood Scott-B07421; Bhushan Bharat-R65777 > Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages > > > On 26.07.2013, at 10:26, Benjamin Herrenschmidt wrote: > > > On Fri, 2013-07-26 at 11:16 +0530, Bharat Bhushan wrote: > >> If the page is RAM then map this as cacheable and coherent (set "M" > >> bit) otherwise this page is treated as I/O and map this as cache > >> inhibited and guarded (set "I + G") > >> > >> This helps setting proper MMU mapping for direct assigned device. > >> > >> NOTE: There can be devices that require cacheable mapping, which is not yet > supported. > > > > Why don't you do like server instead and enforce the use of the same I > > and M bits as the corresponding qemu PTE ? > > Specifically, Ben is talking about this code: > > > /* Translate to host virtual address */ > hva = __gfn_to_hva_memslot(memslot, gfn); > > /* Look up the Linux PTE for the backing page */ > pte_size = psize; > pte = lookup_linux_pte(pgdir, hva, writing, &pte_size); > if (pte_present(pte)) { > if (writing && !pte_write(pte)) > /* make the actual HPTE be read-only */ > ptel = hpte_make_readonly(ptel); > is_io = hpte_cache_bits(pte_val(pte)); > pa = pte_pfn(pte) << PAGE_SHIFT; > } > Wouldn't searching the Linux PTE be overkill? -Bharat -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Alexander Graf > Sent: Friday, July 26, 2013 2:20 PM > To: Benjamin Herrenschmidt > Cc: Bhushan Bharat-R65777; kvm-...@vger.kernel.org; kvm@vger.kernel.org; > linuxppc-...@lists.ozlabs.org; Wood Scott-B07421; Bhushan Bharat-R65777 > Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages > > > On 26.07.2013, at 10:26, Benjamin Herrenschmidt wrote: > > > On Fri, 2013-07-26 at 11:16 +0530, Bharat Bhushan wrote: > >> If the page is RAM then map this as cacheable and coherent (set "M" > >> bit) otherwise this page is treated as I/O and map this as cache > >> inhibited and guarded (set "I + G") > >> > >> This helps setting proper MMU mapping for direct assigned device. > >> > >> NOTE: There can be devices that require cacheable mapping, which is not yet > supported. > > > > Why don't you do like server instead and enforce the use of the same I > > and M bits as the corresponding qemu PTE ? > > Specifically, Ben is talking about this code: > > > /* Translate to host virtual address */ > hva = __gfn_to_hva_memslot(memslot, gfn); > > /* Look up the Linux PTE for the backing page */ > pte_size = psize; > pte = lookup_linux_pte(pgdir, hva, writing, &pte_size); > if (pte_present(pte)) { > if (writing && !pte_write(pte)) > /* make the actual HPTE be read-only */ > ptel = hpte_make_readonly(ptel); > is_io = hpte_cache_bits(pte_val(pte)); > pa = pte_pfn(pte) << PAGE_SHIFT; > } > Ok Thanks -Bharat > > Alex > > -- > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body > of a message to majord...@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
> -Original Message- > From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org] > Sent: Friday, July 26, 2013 1:57 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; > linuxppc-...@lists.ozlabs.org; > ag...@suse.de; Wood Scott-B07421; Bhushan Bharat-R65777 > Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages > > On Fri, 2013-07-26 at 11:16 +0530, Bharat Bhushan wrote: > > If the page is RAM then map this as cacheable and coherent (set "M" > > bit) otherwise this page is treated as I/O and map this as cache > > inhibited and guarded (set "I + G") > > > > This helps setting proper MMU mapping for direct assigned device. > > > > NOTE: There can be devices that require cacheable mapping, which is not yet > supported. > > Why don't you do like server instead and enforce the use of the same I and M > bits as the corresponding qemu PTE ? Ben/Alex, I will look into the code. Can you please describe how this is handled on server? Thanks -Bharat > > Cheers, > Ben. > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/kvm/e500_mmu_host.c | 24 +++- > > 1 files changed, 19 insertions(+), 5 deletions(-) > > > > diff --git a/arch/powerpc/kvm/e500_mmu_host.c > > b/arch/powerpc/kvm/e500_mmu_host.c > > index 1c6a9d7..5cbdc8f 100644 > > --- a/arch/powerpc/kvm/e500_mmu_host.c > > +++ b/arch/powerpc/kvm/e500_mmu_host.c > > @@ -64,13 +64,27 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int > usermode) > > return mas3; > > } > > > > -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) > > +static inline u32 e500_shadow_mas2_attrib(u32 mas2, pfn_t pfn) > > { > > + u32 mas2_attr; > > + > > + mas2_attr = mas2 & MAS2_ATTRIB_MASK; > > + > > + if (kvm_is_mmio_pfn(pfn)) { > > + /* > > +* If page is not RAM then it is treated as I/O page. > > +* Map it with cache inhibited and guarded (set "I" + "G"). 
> > +*/ > > + mas2_attr |= MAS2_I | MAS2_G; > > + return mas2_attr; > > + } > > + > > + /* Map RAM pages as cacheable (Not setting "I" in MAS2) */ > > #ifdef CONFIG_SMP > > - return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; > > -#else > > - return mas2 & MAS2_ATTRIB_MASK; > > + /* Also map as coherent (set "M") in SMP */ > > + mas2_attr |= MAS2_M; > > #endif > > + return mas2_attr; > > } > > > > /* > > @@ -313,7 +327,7 @@ static void kvmppc_e500_setup_stlbe( > > /* Force IPROT=0 for all guest mappings. */ > > stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; > > stlbe->mas2 = (gvaddr & MAS2_EPN) | > > - e500_shadow_mas2_attrib(gtlbe->mas2, pr); > > + e500_shadow_mas2_attrib(gtlbe->mas2, pfn); > > stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | > > e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); > > > >
RE: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Wednesday, July 24, 2013 1:55 PM > To: "“tiejun.chen”" > Cc: Bhushan Bharat-R65777; kvm-...@vger.kernel.org; kvm@vger.kernel.org list; > Wood Scott-B07421; Gleb Natapov; Paolo Bonzini > Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel > managed pages > > > On 24.07.2013, at 04:26, “tiejun.chen” wrote: > > > On 07/18/2013 06:27 PM, Alexander Graf wrote: > >> > >> On 18.07.2013, at 12:19, “tiejun.chen” wrote: > >> > >>> On 07/18/2013 06:12 PM, Alexander Graf wrote: > >>>> > >>>> On 18.07.2013, at 12:08, “tiejun.chen” wrote: > >>>> > >>>>> On 07/18/2013 05:48 PM, Alexander Graf wrote: > >>>>>> > >>>>>> On 18.07.2013, at 10:25, Bhushan Bharat-R65777 wrote: > >>>>>> > >>>>>>> > >>>>>>> > >>>>>>>> -Original Message- > >>>>>>>> From: Bhushan Bharat-R65777 > >>>>>>>> Sent: Thursday, July 18, 2013 1:53 PM > >>>>>>>> To: '" tiejun.chen "' > >>>>>>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; > >>>>>>>> ag...@suse.de; Wood Scott- > >>>>>>>> B07421 > >>>>>>>> Subject: RE: [PATCH 2/2] kvm: powerpc: set cache coherency only > >>>>>>>> for kernel managed pages > >>>>>>>> > >>>>>>>> > >>>>>>>> > >>>>>>>>> -Original Message- > >>>>>>>>> From: " tiejun.chen " [mailto:tiejun.c...@windriver.com] > >>>>>>>>> Sent: Thursday, July 18, 2013 1:52 PM > >>>>>>>>> To: Bhushan Bharat-R65777 > >>>>>>>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; > >>>>>>>>> ag...@suse.de; Wood > >>>>>>>>> Scott- > >>>>>>>>> B07421 > >>>>>>>>> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency > >>>>>>>>> only for kernel managed pages > >>>>>>>>> > >>>>>>>>> On 07/18/2013 04:08 PM, Bhushan Bharat-R65777 wrote: > >>>>>>>>>> > >>>>>>>>>> > >>>>>>>>>>> -Original Message- > >>>>>>>>>>> From: kvm-ppc-ow...@vger.kernel.org > >>>>>>>>>>> [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of " tiejun.chen > >>>>>>>>>>> " > >>>>>>>>>>> Sent: Thursday, July 18, 2013 1:01 PM > >>>>>>>>>>> To: Bhushan 
Bharat-R65777 > >>>>>>>>>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; > >>>>>>>>>>> ag...@suse.de; Wood > >>>>>>>>>>> Scott- > >>>>>>>>>>> B07421 > >>>>>>>>>>> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency > >>>>>>>>>>> only for kernel managed pages > >>>>>>>>>>> > >>>>>>>>>>> On 07/18/2013 03:12 PM, Bhushan Bharat-R65777 wrote: > >>>>>>>>>>>> > >>>>>>>>>>>> > >>>>>>>>>>>>> -Original Message- > >>>>>>>>>>>>> From: " tiejun.chen " [mailto:tiejun.c...@windriver.com] > >>>>>>>>>>>>> Sent: Thursday, July 18, 2013 11:56 AM > >>>>>>>>>>>>> To: Bhushan Bharat-R65777 > >>>>>>>>>>>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; > >>>>>>>>>>>>> ag...@suse.de; Wood > >>>>>>>>>>>>> Scott- B07421; Bhushan Bharat-R65777 > >>>>>>>>>>>>> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency > >>>>>>>>>>>>> only for kernel managed pages > >>>>>>>>>>>>> &g
RE: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, July 23, 2013 11:50 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; Alexander Graf; kvm-...@vger.kernel.org; > kvm@vger.kernel.org > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel > managed pages > > On 07/23/2013 11:50:35 AM, Bhushan Bharat-R65777 wrote: > > > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Tuesday, July 23, 2013 10:15 PM > > > To: Bhushan Bharat-R65777 > > > Cc: Wood Scott-B07421; Alexander Graf; kvm-...@vger.kernel.org; > > > kvm@vger.kernel.org > > > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only > > for kernel > > > managed pages > > > > > > On 07/22/2013 10:39:16 PM, Bhushan Bharat-R65777 wrote: > > > > > > > > > > > > > -Original Message- > > > > > From: Wood Scott-B07421 > > > > > Sent: Tuesday, July 23, 2013 12:18 AM > > > > > To: Bhushan Bharat-R65777 > > > > > Cc: Wood Scott-B07421; Alexander Graf; kvm-...@vger.kernel.org; > > > > > kvm@vger.kernel.org > > > > > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency > > only > > > > for kernel > > > > > managed pages > > > > > > > > > > On 07/21/2013 11:39:45 PM, Bhushan Bharat-R65777 wrote: > > > > > > > > > > > > > > > > > > > -Original Message- > > > > > > > From: Wood Scott-B07421 > > > > > > > Sent: Thursday, July 18, 2013 11:09 PM > > > > > > > To: Alexander Graf > > > > > > > Cc: Bhushan Bharat-R65777; kvm-...@vger.kernel.org; > > > > > > kvm@vger.kernel.org; Bhushan > > > > > > > Bharat-R65777 > > > > > > > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache > > coherency > > > > only > > > > > > for kernel > > > > > > > managed pages > > > > > > > > > > > > > > On 07/18/2013 12:32:18 PM, Alexander Graf wrote: > > > > > > > > > > > > > > > > On 18.07.2013, at 19:17, Scott Wood wrote: > > > > > > > > > > > > > > > > > On 07/18/2013 08:19:03 AM, Bharat Bhushan wrote: > > > > > > > > > Likewise, we want to make sure this matches the 
host > > entry. > > > > > > > > Unfortunately, this is a bit of a mess already. 64-bit > > booke > > > > > > appears > > > > > > > > to always set MAS2_M for TLB0 mappings. The initial > > > > KERNELBASE > > > > > > > > mapping on boot uses M_IF_SMP, and the settlbcam() that > > (IIRC) > > > > > > > > replaces it uses _PAGE_COHERENT. 32-bit always uses > > > > > > _PAGE_COHERENT, > > > > > > > > except that initial KERNELBASE mapping. _PAGE_COHERENT > > > > appears > > > > > > to be > > > > > > > > set based on CONFIG_SMP || CONFIG_PPC_STD_MMU (the latter > > > > config > > > > > > > > clears _PAGE_COHERENT in the non-CPU_FTR_NEED_COHERENT > > case). > > > > > > > > > > > > > > > > > > As for what we actually want to happen, there are cases > > > > when we > > > > > > > > want M to be set for non-SMP. One such case is AMP, where > > > > CPUs > > > > > > may be > > > > > > > > sharing memory even if the Linux instance only runs on > > one CPU > > > > > > (this > > > > > > > > is not hypothetical, BTW). It's also possible that we > > > > encounter a > > > > > > > > hardware bug that requires MAS2_M, similar to what some > > of our > > > > > > > > non-booke chips require. > > > > > > > > > > > > > > > > How about we always set M then for RAM? > > > > > > > > > > > > > > M is like I in that bad things happen if you mix them. > > > > > > > > > > > > I am trying to list the invalid mixing of WIMG: > > > > > > > > > > > > 1) I & M > > > > > > 2) W &
RE: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, July 23, 2013 10:15 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; Alexander Graf; kvm-...@vger.kernel.org; > kvm@vger.kernel.org > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel > managed pages > > On 07/22/2013 10:39:16 PM, Bhushan Bharat-R65777 wrote: > > > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Tuesday, July 23, 2013 12:18 AM > > > To: Bhushan Bharat-R65777 > > > Cc: Wood Scott-B07421; Alexander Graf; kvm-...@vger.kernel.org; > > > kvm@vger.kernel.org > > > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only > > for kernel > > > managed pages > > > > > > On 07/21/2013 11:39:45 PM, Bhushan Bharat-R65777 wrote: > > > > > > > > > > > > > -Original Message- > > > > > From: Wood Scott-B07421 > > > > > Sent: Thursday, July 18, 2013 11:09 PM > > > > > To: Alexander Graf > > > > > Cc: Bhushan Bharat-R65777; kvm-...@vger.kernel.org; > > > > kvm@vger.kernel.org; Bhushan > > > > > Bharat-R65777 > > > > > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency > > only > > > > for kernel > > > > > managed pages > > > > > > > > > > On 07/18/2013 12:32:18 PM, Alexander Graf wrote: > > > > > > > > > > > > On 18.07.2013, at 19:17, Scott Wood wrote: > > > > > > > > > > > > > On 07/18/2013 08:19:03 AM, Bharat Bhushan wrote: > > > > > > > Likewise, we want to make sure this matches the host entry. > > > > > > Unfortunately, this is a bit of a mess already. 64-bit booke > > > > appears > > > > > > to always set MAS2_M for TLB0 mappings. The initial > > KERNELBASE > > > > > > mapping on boot uses M_IF_SMP, and the settlbcam() that (IIRC) > > > > > > replaces it uses _PAGE_COHERENT. 32-bit always uses > > > > _PAGE_COHERENT, > > > > > > except that initial KERNELBASE mapping. 
_PAGE_COHERENT > > appears > > > > to be > > > > > > set based on CONFIG_SMP || CONFIG_PPC_STD_MMU (the latter > > config > > > > > > clears _PAGE_COHERENT in the non-CPU_FTR_NEED_COHERENT case). > > > > > > > > > > > > > > As for what we actually want to happen, there are cases > > when we > > > > > > want M to be set for non-SMP. One such case is AMP, where > > CPUs > > > > may be > > > > > > sharing memory even if the Linux instance only runs on one CPU > > > > (this > > > > > > is not hypothetical, BTW). It's also possible that we > > encounter a > > > > > > hardware bug that requires MAS2_M, similar to what some of our > > > > > > non-booke chips require. > > > > > > > > > > > > How about we always set M then for RAM? > > > > > > > > > > M is like I in that bad things happen if you mix them. > > > > > > > > I am trying to list the invalid mixing of WIMG: > > > > > > > > 1) I & M > > > > 2) W & I > > > > 3) W & M (Scott mentioned that he observed issues when mixing > > these > > > > two) > > > > 4) is there any other? > > > > > > That's not what I was talking about (and I don't think I mentioned > > W at all, > > > though it is also potentially problematic). > > > > Here is cut paste of your one response: > > "The architecture makes it illegal to mix cacheable and > > cache-inhibited mappings to the same physical page. Mixing W or M > > bits is generally bad as well. I've seen it cause machine checks, > > error interrupts, etc. > > -- not just corrupting the page in question." > > > > So I added not mixing W & M. But at that time I missed to understood > > why mixing M & I for same physical address can be issue :). > > "W or M", not "W and M". I meant that each one, separately, is in a similar > situation as the I bit. > > None of this is about invalid combinations of attributes on a single TLB entry > (though there are architectural restrictions there as well). Ok, I misread again :(. 
The second part of my comment was (it looks like you missed it, so I have copy-pasted it below): " When we say all RAM (page_is_ram() is true) will be having "M" bit, then same RAM physical address will not have "M" mixed with any other, right? Similarly, For IO (which is not RAM), we will set "I+G", so "I" will not be mixed with "M". Is not that? " -Bharat > > -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: Wood Scott-B07421 > Sent: Thursday, July 18, 2013 10:48 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Bhushan > Bharat- > R65777 > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel > managed pages > > On 07/18/2013 08:19:03 AM, Bharat Bhushan wrote: > > If there is a struct page for the requested mapping then it's normal > > RAM and the mapping is set to "M" bit (coherent, cacheable) otherwise > > this is treated as I/O and we set "I + G" (cache inhibited, guarded) > > > > This helps setting proper TLB mapping for direct assigned device > > > > Signed-off-by: Bharat Bhushan > > --- > > v2: some cleanup and added comment > > - > > arch/powerpc/kvm/e500_mmu_host.c | 23 ++- > > 1 files changed, 18 insertions(+), 5 deletions(-) > > > > diff --git a/arch/powerpc/kvm/e500_mmu_host.c > > b/arch/powerpc/kvm/e500_mmu_host.c > > index 1c6a9d7..02eb973 100644 > > --- a/arch/powerpc/kvm/e500_mmu_host.c > > +++ b/arch/powerpc/kvm/e500_mmu_host.c > > @@ -64,13 +64,26 @@ static inline u32 e500_shadow_mas3_attrib(u32 > > mas3, int usermode) > > return mas3; > > } > > > > -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) > > +static inline u32 e500_shadow_mas2_attrib(u32 mas2, pfn_t pfn) > > { > > + u32 mas2_attr; > > + > > + mas2_attr = mas2 & MAS2_ATTRIB_MASK; > > + > > + /* > > +* RAM is always mappable on e500 systems, so this is identical > > +* to kvm_is_mmio_pfn(), just without its overhead. > > +*/ > > + if (!pfn_valid(pfn)) { > > Please use page_is_ram(), which is what gets used when setting the WIMG for > the > host userspace mapping. We want to make sure the two are consistent. 
> > > + /* Pages not managed by Kernel are treated as I/O, set > > I + G */ > > + mas2_attr |= MAS2_I | MAS2_G; > > #ifdef CONFIG_SMP > > - return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; > > -#else > > - return mas2 & MAS2_ATTRIB_MASK; > > + } else { > > + /* Kernel managed pages are actually RAM so set "M" */ > > + mas2_attr |= MAS2_M; > > #endif > > Likewise, we want to make sure this matches the host entry. > Unfortunately, this is a bit of a mess already. 64-bit booke appears to > always > set MAS2_M for TLB0 mappings. Scott, can you please point to the code where MAS2_M is always set for TLB0? -Bharat > The initial KERNELBASE mapping on boot uses > M_IF_SMP, and the settlbcam() that (IIRC) replaces it uses _PAGE_COHERENT. > 32- > bit always uses _PAGE_COHERENT, except that initial KERNELBASE mapping. > _PAGE_COHERENT appears to be set based on CONFIG_SMP || CONFIG_PPC_STD_MMU > (the > latter config clears _PAGE_COHERENT in the non-CPU_FTR_NEED_COHERENT case). > > As for what we actually want to happen, there are cases when we want M to be > set > for non-SMP. One such case is AMP, where CPUs may be sharing memory even if > the > Linux instance only runs on one CPU (this is not hypothetical, BTW). It's > also > possible that we encounter a hardware bug that requires MAS2_M, similar to > what > some of our non-booke chips require. > > -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, July 23, 2013 12:18 AM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; Alexander Graf; kvm-...@vger.kernel.org; > kvm@vger.kernel.org > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel > managed pages > > On 07/21/2013 11:39:45 PM, Bhushan Bharat-R65777 wrote: > > > > > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Thursday, July 18, 2013 11:09 PM > > > To: Alexander Graf > > > Cc: Bhushan Bharat-R65777; kvm-...@vger.kernel.org; > > kvm@vger.kernel.org; Bhushan > > > Bharat-R65777 > > > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only > > for kernel > > > managed pages > > > > > > On 07/18/2013 12:32:18 PM, Alexander Graf wrote: > > > > > > > > On 18.07.2013, at 19:17, Scott Wood wrote: > > > > > > > > > On 07/18/2013 08:19:03 AM, Bharat Bhushan wrote: > > > > > Likewise, we want to make sure this matches the host entry. > > > > Unfortunately, this is a bit of a mess already. 64-bit booke > > appears > > > > to always set MAS2_M for TLB0 mappings. The initial KERNELBASE > > > > mapping on boot uses M_IF_SMP, and the settlbcam() that (IIRC) > > > > replaces it uses _PAGE_COHERENT. 32-bit always uses > > _PAGE_COHERENT, > > > > except that initial KERNELBASE mapping. _PAGE_COHERENT appears > > to be > > > > set based on CONFIG_SMP || CONFIG_PPC_STD_MMU (the latter config > > > > clears _PAGE_COHERENT in the non-CPU_FTR_NEED_COHERENT case). > > > > > > > > > > As for what we actually want to happen, there are cases when we > > > > want M to be set for non-SMP. One such case is AMP, where CPUs > > may be > > > > sharing memory even if the Linux instance only runs on one CPU > > (this > > > > is not hypothetical, BTW). It's also possible that we encounter a > > > > hardware bug that requires MAS2_M, similar to what some of our > > > > non-booke chips require. > > > > > > > > How about we always set M then for RAM? 
> > > > > > M is like I in that bad things happen if you mix them. > > > > I am trying to list the invalid mixing of WIMG: > > > > 1) I & M > > 2) W & I > > 3) W & M (Scott mentioned that he observed issues when mixing these > > two) > > 4) is there any other? > > That's not what I was talking about (and I don't think I mentioned W at all, > though it is also potentially problematic). Here is cut paste of your one response: "The architecture makes it illegal to mix cacheable and cache-inhibited mappings to the same physical page. Mixing W or M bits is generally bad as well. I've seen it cause machine checks, error interrupts, etc. -- not just corrupting the page in question." So I added not mixing W & M. But at that time I missed to understood why mixing M & I for same physical address can be issue :). > I'm talking about mixing I with > not-I (on two different virtual addresses pointing to the same physical), M > with > not-M, etc. When we say all RAM (page_is_ram() is true) will be having "M" bit, then RAM physical address will not have "M" mixed with any other, right? Similarly, For IO (which is not RAM), we will set "I+G", so "I" will not be mixed with "M". Is not that? -Bharat > > > > So we really want to > > > match exactly what the rest of the kernel is doing. > > > > How the rest of kernel is doing is a bit complex. IIUC, if we forget > > about the boot state then this is how kernel set WIMG bits: > > 1) For Memory always set M if CONFIG_SMP set. > > - So KVM can do same. "M" will not be mixed with "W" and "I". G and E > > are guest control. > > I don't think this is accurate for 64-bit. And what about the AMP case? > > -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: Wood Scott-B07421 > Sent: Thursday, July 18, 2013 11:09 PM > To: Alexander Graf > Cc: Bhushan Bharat-R65777; kvm-...@vger.kernel.org; kvm@vger.kernel.org; > Bhushan > Bharat-R65777 > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel > managed pages > > On 07/18/2013 12:32:18 PM, Alexander Graf wrote: > > > > On 18.07.2013, at 19:17, Scott Wood wrote: > > > > > On 07/18/2013 08:19:03 AM, Bharat Bhushan wrote: > > > Likewise, we want to make sure this matches the host entry. > > Unfortunately, this is a bit of a mess already. 64-bit booke appears > > to always set MAS2_M for TLB0 mappings. The initial KERNELBASE > > mapping on boot uses M_IF_SMP, and the settlbcam() that (IIRC) > > replaces it uses _PAGE_COHERENT. 32-bit always uses _PAGE_COHERENT, > > except that initial KERNELBASE mapping. _PAGE_COHERENT appears to be > > set based on CONFIG_SMP || CONFIG_PPC_STD_MMU (the latter config > > clears _PAGE_COHERENT in the non-CPU_FTR_NEED_COHERENT case). > > > > > > As for what we actually want to happen, there are cases when we > > want M to be set for non-SMP. One such case is AMP, where CPUs may be > > sharing memory even if the Linux instance only runs on one CPU (this > > is not hypothetical, BTW). It's also possible that we encounter a > > hardware bug that requires MAS2_M, similar to what some of our > > non-booke chips require. > > > > How about we always set M then for RAM? > > M is like I in that bad things happen if you mix them. I am trying to list the invalid mixing of WIMG: 1) I & M 2) W & I 3) W & M (Scott mentioned that he observed issues when mixing these two) 4) is there any other? So it mean it is safe to let guest control G and E. > So we really want to > match exactly what the rest of the kernel is doing. How the rest of kernel is doing is a bit complex. IIUC, if we forget about the boot state then this is how kernel set WIMG bits: 1) For Memory always set M if CONFIG_SMP set. 
- So KVM can do same. "M" will not be mixed with "W" and "I". G and E are guest control. 2) For I/O , drivers can pass flags to set M or "I + G". - For KVM; if not memory then it is I/O. For now we can always set "I + G". - Later we can design some mechanism in VFIO interface to let KVM somehow know whether to set "M" or "I+G". -Bharat > > Plus, the performance penalty on some single-core chips can be pretty bad. > > -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 1/2 v2] kvm: powerpc: Do not ignore "E" attribute in mas2
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Thursday, July 18, 2013 8:50 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > Subject: Re: [PATCH 1/2 v2] kvm: powerpc: Do not ignore "E" attribute in mas2 > > > On 18.07.2013, at 17:12, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: kvm-ppc-ow...@vger.kernel.org > >> [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf > >> Sent: Thursday, July 18, 2013 8:18 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > >> Bhushan > >> Bharat-R65777 > >> Subject: Re: [PATCH 1/2 v2] kvm: powerpc: Do not ignore "E" attribute > >> in mas2 > >> > >> > >> This needs a description. Why shouldn't we ignore E? > > > > What I understood is that "there is no reason to stop guest setting "E", so > allow him." > > Please add that to the patch description. Also explain what the bit means. Ok :) -Bharat > > > Alex > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Alexander Graf > Sent: Thursday, July 18, 2013 8:23 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; Bhushan > Bharat-R65777 > Subject: Re: [PATCH 2/2 v2] kvm: powerpc: set cache coherency only for kernel > managed pages > > > On 18.07.2013, at 15:19, Bharat Bhushan wrote: > > > If there is a struct page for the requested mapping then it's normal > > RAM and the mapping is set to "M" bit (coherent, cacheable) otherwise > > this is treated as I/O and we set "I + G" (cache inhibited, guarded) > > > > This helps setting proper TLB mapping for direct assigned device > > > > Signed-off-by: Bharat Bhushan > > --- > > v2: some cleanup and added comment > > - > > arch/powerpc/kvm/e500_mmu_host.c | 23 ++- > > 1 files changed, 18 insertions(+), 5 deletions(-) > > > > diff --git a/arch/powerpc/kvm/e500_mmu_host.c > > b/arch/powerpc/kvm/e500_mmu_host.c > > index 1c6a9d7..02eb973 100644 > > --- a/arch/powerpc/kvm/e500_mmu_host.c > > +++ b/arch/powerpc/kvm/e500_mmu_host.c > > @@ -64,13 +64,26 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int > usermode) > > return mas3; > > } > > > > -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) > > +static inline u32 e500_shadow_mas2_attrib(u32 mas2, pfn_t pfn) > > { > > + u32 mas2_attr; > > + > > + mas2_attr = mas2 & MAS2_ATTRIB_MASK; > > + > > + /* > > +* RAM is always mappable on e500 systems, so this is identical > > +* to kvm_is_mmio_pfn(), just without its overhead. > > +*/ > > + if (!pfn_valid(pfn)) { > > + /* Pages not managed by Kernel are treated as I/O, set I + G */ > > Please also document the intermediate thought that I/O should be mapped non- > cached. I did not get what you mean to document? > > > + mas2_attr |= MAS2_I | MAS2_G; > > #ifdef CONFIG_SMP > > Please separate the SMP case out of the branch. 
Really :) this was looking simple to me. > > > - return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; > > -#else > > - return mas2 & MAS2_ATTRIB_MASK; > > + } else { > > + /* Kernel managed pages are actually RAM so set "M" */ > > This comment doesn't tell me why M can be set ;). RAM in SMP, so setting coherent, is not that obvious? -Bharat > > > Alex > > > + mas2_attr |= MAS2_M; > > #endif > > + } > > + return mas2_attr; > > } > > > > /* > > @@ -313,7 +326,7 @@ static void kvmppc_e500_setup_stlbe( > > /* Force IPROT=0 for all guest mappings. */ > > stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; > > stlbe->mas2 = (gvaddr & MAS2_EPN) | > > - e500_shadow_mas2_attrib(gtlbe->mas2, pr); > > + e500_shadow_mas2_attrib(gtlbe->mas2, pfn); > > stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | > > e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); > > > > -- > > 1.7.0.4 > > > > > > -- > > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in > > the body of a message to majord...@vger.kernel.org More majordomo info > > at http://vger.kernel.org/majordomo-info.html > > -- > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body > of a message to majord...@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 1/2 v2] kvm: powerpc: Do not ignore "E" attribute in mas2
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Alexander Graf > Sent: Thursday, July 18, 2013 8:18 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; Bhushan > Bharat-R65777 > Subject: Re: [PATCH 1/2 v2] kvm: powerpc: Do not ignore "E" attribute in mas2 > > > This needs a description. Why shouldn't we ignore E? What I understood is that "there is no reason to stop guest setting "E", so allow him." -Bharat > > > Alex > > On 18.07.2013, at 15:19, Bharat Bhushan wrote: > > > Signed-off-by: Bharat Bhushan > > --- > > v2: > > - No change > > arch/powerpc/kvm/e500.h |2 +- > > 1 files changed, 1 insertions(+), 1 deletions(-) > > > > diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index > > c2e5e98..277cb18 100644 > > --- a/arch/powerpc/kvm/e500.h > > +++ b/arch/powerpc/kvm/e500.h > > @@ -117,7 +117,7 @@ static inline struct kvmppc_vcpu_e500 > > *to_e500(struct kvm_vcpu *vcpu) #define E500_TLB_USER_PERM_MASK > > (MAS3_UX|MAS3_UR|MAS3_UW) #define E500_TLB_SUPER_PERM_MASK > > (MAS3_SX|MAS3_SR|MAS3_SW) #define MAS2_ATTRIB_MASK \ > > - (MAS2_X0 | MAS2_X1) > > + (MAS2_X0 | MAS2_X1 | MAS2_E) > > #define MAS3_ATTRIB_MASK \ > > (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ > >| E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) > > -- > > 1.7.0.4 > > > > > > -- > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body > of a message to majord...@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Alexander Graf > Sent: Thursday, July 18, 2013 3:19 PM > To: Bhushan Bharat-R65777 > Cc: "“tiejun.chen”"; kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott- > B07421 > Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel > managed pages > > > On 18.07.2013, at 10:25, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Bhushan Bharat-R65777 > >> Sent: Thursday, July 18, 2013 1:53 PM > >> To: '"“tiejun.chen”"' > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood > >> Scott- > >> B07421 > >> Subject: RE: [PATCH 2/2] kvm: powerpc: set cache coherency only for > >> kernel managed pages > >> > >> > >> > >>> -Original Message- > >>> From: "“tiejun.chen”" [mailto:tiejun.c...@windriver.com] > >>> Sent: Thursday, July 18, 2013 1:52 PM > >>> To: Bhushan Bharat-R65777 > >>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; > >>> Wood > >>> Scott- > >>> B07421 > >>> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for > >>> kernel managed pages > >>> > >>> On 07/18/2013 04:08 PM, Bhushan Bharat-R65777 wrote: > >>>> > >>>> > >>>>> -Original Message- > >>>>> From: kvm-ppc-ow...@vger.kernel.org > >>>>> [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of "“tiejun.chen”" > >>>>> Sent: Thursday, July 18, 2013 1:01 PM > >>>>> To: Bhushan Bharat-R65777 > >>>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; > >>>>> Wood > >>>>> Scott- > >>>>> B07421 > >>>>> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only > >>>>> for kernel managed pages > >>>>> > >>>>> On 07/18/2013 03:12 PM, Bhushan Bharat-R65777 wrote: > >>>>>> > >>>>>> > >>>>>>> -Original Message- > >>>>>>> From: "“tiejun.chen”" [mailto:tiejun.c...@windriver.com] > >>>>>>> Sent: Thursday, July 18, 2013 11:56 AM > >>>>>>> To: Bhushan Bharat-R65777 > >>>>>>> Cc: kvm-...@vger.kernel.org; 
kvm@vger.kernel.org; ag...@suse.de; > >>>>>>> Wood > >>>>>>> Scott- B07421; Bhushan Bharat-R65777 > >>>>>>> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only > >>>>>>> for kernel managed pages > >>>>>>> > >>>>>>> On 07/18/2013 02:04 PM, Bharat Bhushan wrote: > >>>>>>>> If there is a struct page for the requested mapping then it's > >>>>>>>> normal DDR and the mapping sets "M" bit (coherent, cacheable) > >>>>>>>> else this is treated as I/O and we set "I + G" (cache > >>>>>>>> inhibited, > >>>>>>>> guarded) > >>>>>>>> > >>>>>>>> This helps setting proper TLB mapping for direct assigned > >>>>>>>> device > >>>>>>>> > >>>>>>>> Signed-off-by: Bharat Bhushan > >>>>>>>> --- > >>>>>>>>arch/powerpc/kvm/e500_mmu_host.c | 17 - > >>>>>>>>1 files changed, 12 insertions(+), 5 deletions(-) > >>>>>>>> > >>>>>>>> diff --git a/arch/powerpc/kvm/e500_mmu_host.c > >>>>>>>> b/arch/powerpc/kvm/e500_mmu_host.c > >>>>>>>> index 1c6a9d7..089c227 100644 > >>>>>>>> --- a/arch/powerpc/kvm/e500_mmu_host.c > >>>>>>>> +++ b/arch/powerpc/kvm/e500_mmu_host.c > >>>>>>>> @@ -64,13 +64,20 @@ static inline u32 > >>>>>>>> e500_shadow_mas3_attrib(u32 mas3, int > >>>>>>> usermode) > >>>>>>>> return mas3; > >>>>>>>>} > >>>>>>>> > >>>>>>>> -static inline u32 e500_sh
RE: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: Bhushan Bharat-R65777 > Sent: Thursday, July 18, 2013 1:53 PM > To: '"“tiejun.chen”"' > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood Scott- > B07421 > Subject: RE: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel > managed pages > > > > > -Original Message- > > From: "“tiejun.chen”" [mailto:tiejun.c...@windriver.com] > > Sent: Thursday, July 18, 2013 1:52 PM > > To: Bhushan Bharat-R65777 > > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood > > Scott- > > B07421 > > Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for > > kernel managed pages > > > > On 07/18/2013 04:08 PM, Bhushan Bharat-R65777 wrote: > > > > > > > > >> -Original Message- > > >> From: kvm-ppc-ow...@vger.kernel.org > > >> [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of "“tiejun.chen”" > > >> Sent: Thursday, July 18, 2013 1:01 PM > > >> To: Bhushan Bharat-R65777 > > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; > > >> Wood > > >> Scott- > > >> B07421 > > >> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for > > >> kernel managed pages > > >> > > >> On 07/18/2013 03:12 PM, Bhushan Bharat-R65777 wrote: > > >>> > > >>> > > >>>> -Original Message- > > >>>> From: "“tiejun.chen”" [mailto:tiejun.c...@windriver.com] > > >>>> Sent: Thursday, July 18, 2013 11:56 AM > > >>>> To: Bhushan Bharat-R65777 > > >>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; > > >>>> Wood > > >>>> Scott- B07421; Bhushan Bharat-R65777 > > >>>> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only > > >>>> for kernel managed pages > > >>>> > > >>>> On 07/18/2013 02:04 PM, Bharat Bhushan wrote: > > >>>>> If there is a struct page for the requested mapping then it's > > >>>>> normal DDR and the mapping sets "M" bit (coherent, cacheable) > > >>>>> else this is treated as I/O and we set "I + G" (cache > > >>>>> inhibited, > > >>>>> guarded) > > >>>>> > > >>>>> This 
helps setting proper TLB mapping for direct assigned device > > >>>>> > > >>>>> Signed-off-by: Bharat Bhushan > > >>>>> --- > > >>>>> arch/powerpc/kvm/e500_mmu_host.c | 17 - > > >>>>> 1 files changed, 12 insertions(+), 5 deletions(-) > > >>>>> > > >>>>> diff --git a/arch/powerpc/kvm/e500_mmu_host.c > > >>>>> b/arch/powerpc/kvm/e500_mmu_host.c > > >>>>> index 1c6a9d7..089c227 100644 > > >>>>> --- a/arch/powerpc/kvm/e500_mmu_host.c > > >>>>> +++ b/arch/powerpc/kvm/e500_mmu_host.c > > >>>>> @@ -64,13 +64,20 @@ static inline u32 > > >>>>> e500_shadow_mas3_attrib(u32 mas3, int > > >>>> usermode) > > >>>>> return mas3; > > >>>>> } > > >>>>> > > >>>>> -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int > > >>>>> usermode) > > >>>>> +static inline u32 e500_shadow_mas2_attrib(u32 mas2, pfn_t pfn) > > >>>>> { > > >>>>> + u32 mas2_attr; > > >>>>> + > > >>>>> + mas2_attr = mas2 & MAS2_ATTRIB_MASK; > > >>>>> + > > >>>>> + if (!pfn_valid(pfn)) { > > >>>> > > >>>> Why not directly use kvm_is_mmio_pfn()? > > >>> > > >>> What I understand from this function (someone can correct me) is > > >>> that it > > >> returns "false" when the page is managed by kernel and is not > > >> marked as RESERVED (for some reason). For us it does not matter > > >> whether the page is reserved or not, if it is kernel visible page then it > is DDR. > > >>> > > >> > > >> I think you are setting I|G by addressing all mmio pages, right? If > >
RE: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: "“tiejun.chen”" [mailto:tiejun.c...@windriver.com] > Sent: Thursday, July 18, 2013 1:52 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood Scott- > B07421 > Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel > managed pages > > On 07/18/2013 04:08 PM, Bhushan Bharat-R65777 wrote: > > > > > >> -Original Message- > >> From: kvm-ppc-ow...@vger.kernel.org > >> [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of "“tiejun.chen”" > >> Sent: Thursday, July 18, 2013 1:01 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood > >> Scott- > >> B07421 > >> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for > >> kernel managed pages > >> > >> On 07/18/2013 03:12 PM, Bhushan Bharat-R65777 wrote: > >>> > >>> > >>>> -Original Message- > >>>> From: "“tiejun.chen”" [mailto:tiejun.c...@windriver.com] > >>>> Sent: Thursday, July 18, 2013 11:56 AM > >>>> To: Bhushan Bharat-R65777 > >>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; > >>>> Wood > >>>> Scott- B07421; Bhushan Bharat-R65777 > >>>> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for > >>>> kernel managed pages > >>>> > >>>> On 07/18/2013 02:04 PM, Bharat Bhushan wrote: > >>>>> If there is a struct page for the requested mapping then it's > >>>>> normal DDR and the mapping sets "M" bit (coherent, cacheable) else > >>>>> this is treated as I/O and we set "I + G" (cache inhibited, > >>>>> guarded) > >>>>> > >>>>> This helps setting proper TLB mapping for direct assigned device > >>>>> > >>>>> Signed-off-by: Bharat Bhushan > >>>>> --- > >>>>> arch/powerpc/kvm/e500_mmu_host.c | 17 - > >>>>> 1 files changed, 12 insertions(+), 5 deletions(-) > >>>>> > >>>>> diff --git a/arch/powerpc/kvm/e500_mmu_host.c > >>>>> b/arch/powerpc/kvm/e500_mmu_host.c > >>>>> index 1c6a9d7..089c227 100644 > >>>>> --- 
a/arch/powerpc/kvm/e500_mmu_host.c > >>>>> +++ b/arch/powerpc/kvm/e500_mmu_host.c > >>>>> @@ -64,13 +64,20 @@ static inline u32 e500_shadow_mas3_attrib(u32 > >>>>> mas3, int > >>>> usermode) > >>>>> return mas3; > >>>>> } > >>>>> > >>>>> -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) > >>>>> +static inline u32 e500_shadow_mas2_attrib(u32 mas2, pfn_t pfn) > >>>>> { > >>>>> + u32 mas2_attr; > >>>>> + > >>>>> + mas2_attr = mas2 & MAS2_ATTRIB_MASK; > >>>>> + > >>>>> + if (!pfn_valid(pfn)) { > >>>> > >>>> Why not directly use kvm_is_mmio_pfn()? > >>> > >>> What I understand from this function (someone can correct me) is > >>> that it > >> returns "false" when the page is managed by kernel and is not marked > >> as RESERVED (for some reason). For us it does not matter whether the > >> page is reserved or not, if it is kernel visible page then it is DDR. > >>> > >> > >> I think you are setting I|G by addressing all mmio pages, right? If > >> so, > >> > >> KVM: direct mmio pfn check > >> > >> Userspace may specify memory slots that are backed by mmio > >> pages rather than > >> normal RAM. In some cases it is not enough to identify these mmio > pages > >> by pfn_valid(). This patch adds checking the PageReserved as well. > > > > Do you know what are those "some cases" and how checking PageReserved helps > > in > those cases? > > No, myself didn't see these actual cases in qemu,too. 
But this should be > chronically persistent as I understand ;-) Then I will wait till someone educate me :) -Bharat > > Tiejun > > > > > -Bharat > > > >> > >> Tiejun > >> > >>> -Bharat > >>> > >>>> > >>>> Tiejun > >>>> > >>>>> + mas2_attr |= MAS2_I | MAS2_G; > >>>>> + } else { > >>>>> #ifdef CONFIG_SMP > >>>>> - return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; > >>>>> -#else > >>>>> - return mas2 & MAS2_ATTRIB_MASK; > >>>>> + mas2_attr |= MAS2_M; > >>>>> #endif > >>>>> + } > >>>>> + return mas2_attr; > >>>>> } > >>>>> > >>>>> /* > >>>>> @@ -313,7 +320,7 @@ static void kvmppc_e500_setup_stlbe( > >>>>> /* Force IPROT=0 for all guest mappings. */ > >>>>> stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | > >>>>> MAS1_VALID; > >>>>> stlbe->mas2 = (gvaddr & MAS2_EPN) | > >>>>> - e500_shadow_mas2_attrib(gtlbe->mas2, pr); > >>>>> + e500_shadow_mas2_attrib(gtlbe->mas2, pfn); > >>>>> stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | > >>>>> e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); > >>>>> > >>>>> > >>>> > >>> > >> > >> -- > >> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in > >> the body of a message to majord...@vger.kernel.org More majordomo > >> info at http://vger.kernel.org/majordomo-info.html > > >
RE: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of "“tiejun.chen”" > Sent: Thursday, July 18, 2013 1:01 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood Scott- > B07421 > Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel > managed pages > > On 07/18/2013 03:12 PM, Bhushan Bharat-R65777 wrote: > > > > > >> -Original Message- > >> From: "“tiejun.chen”" [mailto:tiejun.c...@windriver.com] > >> Sent: Thursday, July 18, 2013 11:56 AM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood > >> Scott- B07421; Bhushan Bharat-R65777 > >> Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for > >> kernel managed pages > >> > >> On 07/18/2013 02:04 PM, Bharat Bhushan wrote: > >>> If there is a struct page for the requested mapping then it's normal > >>> DDR and the mapping sets "M" bit (coherent, cacheable) else this is > >>> treated as I/O and we set "I + G" (cache inhibited, guarded) > >>> > >>> This helps setting proper TLB mapping for direct assigned device > >>> > >>> Signed-off-by: Bharat Bhushan > >>> --- > >>>arch/powerpc/kvm/e500_mmu_host.c | 17 - > >>>1 files changed, 12 insertions(+), 5 deletions(-) > >>> > >>> diff --git a/arch/powerpc/kvm/e500_mmu_host.c > >>> b/arch/powerpc/kvm/e500_mmu_host.c > >>> index 1c6a9d7..089c227 100644 > >>> --- a/arch/powerpc/kvm/e500_mmu_host.c > >>> +++ b/arch/powerpc/kvm/e500_mmu_host.c > >>> @@ -64,13 +64,20 @@ static inline u32 e500_shadow_mas3_attrib(u32 > >>> mas3, int > >> usermode) > >>> return mas3; > >>>} > >>> > >>> -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) > >>> +static inline u32 e500_shadow_mas2_attrib(u32 mas2, pfn_t pfn) > >>>{ > >>> + u32 mas2_attr; > >>> + > >>> + mas2_attr = mas2 & MAS2_ATTRIB_MASK; > >>> + > >>> + if (!pfn_valid(pfn)) { > >> > >> Why not directly use kvm_is_mmio_pfn()? 
> > > > What I understand from this function (someone can correct me) is that it > returns "false" when the page is managed by kernel and is not marked as > RESERVED > (for some reason). For us it does not matter whether the page is reserved or > not, if it is kernel visible page then it is DDR. > > > > I think you are setting I|G by addressing all mmio pages, right? If so, > > KVM: direct mmio pfn check > > Userspace may specify memory slots that are backed by mmio pages rather > than > normal RAM. In some cases it is not enough to identify these mmio pages > by pfn_valid(). This patch adds checking the PageReserved as well. Do you know what are those "some cases" and how checking PageReserved helps in those cases? -Bharat > > Tiejun > > > -Bharat > > > >> > >> Tiejun > >> > >>> + mas2_attr |= MAS2_I | MAS2_G; > >>> + } else { > >>>#ifdef CONFIG_SMP > >>> - return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; > >>> -#else > >>> - return mas2 & MAS2_ATTRIB_MASK; > >>> + mas2_attr |= MAS2_M; > >>>#endif > >>> + } > >>> + return mas2_attr; > >>>} > >>> > >>>/* > >>> @@ -313,7 +320,7 @@ static void kvmppc_e500_setup_stlbe( > >>> /* Force IPROT=0 for all guest mappings. */ > >>> stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | > >>> MAS1_VALID; > >>> stlbe->mas2 = (gvaddr & MAS2_EPN) | > >>> - e500_shadow_mas2_attrib(gtlbe->mas2, pr); > >>> + e500_shadow_mas2_attrib(gtlbe->mas2, pfn); > >>> stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | > >>> e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); > >>> > >>> > >> > > > > -- > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body > of a message to majord...@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel managed pages
> -Original Message- > From: "“tiejun.chen”" [mailto:tiejun.c...@windriver.com] > Sent: Thursday, July 18, 2013 11:56 AM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood Scott- > B07421; Bhushan Bharat-R65777 > Subject: Re: [PATCH 2/2] kvm: powerpc: set cache coherency only for kernel > managed pages > > On 07/18/2013 02:04 PM, Bharat Bhushan wrote: > > If there is a struct page for the requested mapping then it's normal > > DDR and the mapping sets "M" bit (coherent, cacheable) else this is > > treated as I/O and we set "I + G" (cache inhibited, guarded) > > > > This helps setting proper TLB mapping for direct assigned device > > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/kvm/e500_mmu_host.c | 17 - > > 1 files changed, 12 insertions(+), 5 deletions(-) > > > > diff --git a/arch/powerpc/kvm/e500_mmu_host.c > > b/arch/powerpc/kvm/e500_mmu_host.c > > index 1c6a9d7..089c227 100644 > > --- a/arch/powerpc/kvm/e500_mmu_host.c > > +++ b/arch/powerpc/kvm/e500_mmu_host.c > > @@ -64,13 +64,20 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int > usermode) > > return mas3; > > } > > > > -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) > > +static inline u32 e500_shadow_mas2_attrib(u32 mas2, pfn_t pfn) > > { > > + u32 mas2_attr; > > + > > + mas2_attr = mas2 & MAS2_ATTRIB_MASK; > > + > > + if (!pfn_valid(pfn)) { > > Why not directly use kvm_is_mmio_pfn()? What I understand from this function (someone can correct me) is that it returns "false" when the page is managed by kernel and is not marked as RESERVED (for some reason). For us it does not matter whether the page is reserved or not, if it is kernel visible page then it is DDR. 
-Bharat > > Tiejun > > > + mas2_attr |= MAS2_I | MAS2_G; > > + } else { > > #ifdef CONFIG_SMP > > - return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; > > -#else > > - return mas2 & MAS2_ATTRIB_MASK; > > + mas2_attr |= MAS2_M; > > #endif > > + } > > + return mas2_attr; > > } > > > > /* > > @@ -313,7 +320,7 @@ static void kvmppc_e500_setup_stlbe( > > /* Force IPROT=0 for all guest mappings. */ > > stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; > > stlbe->mas2 = (gvaddr & MAS2_EPN) | > > - e500_shadow_mas2_attrib(gtlbe->mas2, pr); > > + e500_shadow_mas2_attrib(gtlbe->mas2, pfn); > > stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | > > e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); > > > > >
RE: [PATCH 3/5] booke: define reset and shutdown hcalls
> On 17.07.2013, at 13:00, Gleb Natapov wrote: > > > On Tue, Jul 16, 2013 at 06:04:34PM -0500, Scott Wood wrote: > >> On 07/16/2013 01:35:55 AM, Gleb Natapov wrote: > >>> On Mon, Jul 15, 2013 at 01:17:33PM -0500, Scott Wood wrote: > On 07/15/2013 06:30:20 AM, Gleb Natapov wrote: > > There is no much sense to share hypercalls between > > architectures. > > There > > is zero probability x86 will implement those for instance > > This is similar to the question of whether to keep device > API enumerations per-architecture... It costs very little > to keep it in a common place, and it's hard to go back in > the other direction if we later realize there are things > that should be > >> shared. > > >>> This is different from device API since with device API all > >>> arches have to create/destroy devices, so it make sense to > >>> put device lifecycle management into the common code, and > >>> device API has single entry point to the code - device fd > >>> ioctl - where it makes sense to handle common tasks, if any, > >>> and despatch others to specific device implementation. > >>> > >>> This is totally unlike hypercalls which are, by definition, > >>> very architecture specific (the way they are triggered, the > >>> way parameter are passed from guest to host, what hypercalls > >>> arch > >> needs...). > >> > >> The ABI is architecture specific. The API doesn't need to > >> be, any more than it does with syscalls (I consider the > >> architecture-specific definition of syscall numbers and > >> similar constants in Linux to be unfortunate, especially for > >> tools such as strace or QEMU's linux-user emulation). > >> > > Unlike syscalls different arches have very different ideas > > what hypercalls they need to implement, so while with unified > > syscall space I can see how it may benefit (very) small number > > of tools, I do not see what advantage it will give us. The > > disadvantage is one more global name space to manage. 
> > > Keeping it in a common place also makes it more visible to > people looking to add new hcalls, which could cut down on > reinventing the wheel. > >>> I do not want other arches to start using hypercalls in the > >>> way powerpc started to use them: separate device io space, > >>> so it is better to hide this as far away from common code as > >>> possible :) But on a more serious note hypercalls should be > >>> a last resort and added only when no other possibility > >>> exists, so people should not look what hcalls others > >>> implemented, so they can add them to their favorite arch, > >>> but they should have a problem at hand that they cannot > >>> solve without hcall, but at this point they will have pretty good > idea what this hcall should do. > >> > >> Why are hcalls such a bad thing? > >> > > Because they often used to do non architectural things making > > OSes behave different from how they runs on real HW and real > > HW is what OSes are designed and tested for. Example: there > > once was a KVM (XEN have/had similar one) hypercall to > > accelerate MMU > >> operation. > > One thing it allowed is to to flush tlb without doing IPI if > > vcpu is not running. Later optimization was added to Linux MMU > > code that _relies_ on those IPIs for synchronisation. Good > > that at that point those hypercalls were already deprecated on > > KVM (IIRC XEN was broke for some time in that regard). Which > > brings me to another point: they often get obsoleted by code > > improvement and HW advancement (happened to aforementioned MMU > > hypercalls), but they hard to deprecate if hypervisor supports > > live migration, without live migration it is less of a problem. > > Next point is that people often try to use them instead of > > emulate PV or real device just because they think it is > > easier, but it > >> is often not so. Example: > > pvpanic device was initially proposed as hypercall, so lets > > say we would implement it as such. 
It would have been KVM > > specific, implementation would touch core guest KVM code and > > would have been Linux guest specific. Instead it was > > implemented as platform device with very small platform driver > > confined in drivers/ directory, immediately usable by XEN and > > QEMU tcg in addition > > This is actually a very good p
RE: [PATCH 3/5] booke: define reset and shutdown hcalls
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Wednesday, July 17, 2013 9:22 PM > To: Bhushan Bharat-R65777 > Cc: Yoder Stuart-B08248; Wood Scott-B07421; kvm@vger.kernel.org; kvm- > p...@vger.kernel.org; Gleb Natapov > Subject: Re: [PATCH 3/5] booke: define reset and shutdown hcalls > > > On 17.07.2013, at 17:47, Bhushan Bharat-R65777 wrote: > > > > >>>>>> On 17.07.2013, at 13:00, Gleb Natapov wrote: > >>>>>> > >>>>>>> On Tue, Jul 16, 2013 at 06:04:34PM -0500, Scott Wood wrote: > >>>>>>>> On 07/16/2013 01:35:55 AM, Gleb Natapov wrote: > >>>>>>>>> On Mon, Jul 15, 2013 at 01:17:33PM -0500, Scott Wood wrote: > >>>>>>>>>> On 07/15/2013 06:30:20 AM, Gleb Natapov wrote: > >>>>>>>>>>> There is no much sense to share hypercalls between architectures. > >>>>>>>>>>> There > >>>>>>>>>>> is zero probability x86 will implement those for instance > >>>>>>>>>> > >>>>>>>>>> This is similar to the question of whether to keep device API > >>>>>>>>>> enumerations per-architecture... It costs very little to > >>>>>>>>>> keep it in a common place, and it's hard to go back in the > >>>>>>>>>> other direction if we later realize there are things that should be > shared. > >>>>>>>>>> > >>>>>>>>> This is different from device API since with device API all > >>>>>>>>> arches have to create/destroy devices, so it make sense to put > >>>>>>>>> device lifecycle management into the common code, and device > >>>>>>>>> API has single entry point to the code - device fd ioctl - > >>>>>>>>> where it makes sense to handle common tasks, if any, and > >>>>>>>>> despatch others to specific device implementation. > >>>>>>>>> > >>>>>>>>> This is totally unlike hypercalls which are, by definition, > >>>>>>>>> very architecture specific (the way they are triggered, the > >>>>>>>>> way parameter are passed from guest to host, what hypercalls arch > needs...). > >>>>>>>> > >>>>>>>> The ABI is architecture specific. 
The API doesn't need to be, > >>>>>>>> any more than it does with syscalls (I consider the > >>>>>>>> architecture-specific definition of syscall numbers and similar > >>>>>>>> constants in Linux to be unfortunate, especially for tools such > >>>>>>>> as strace or QEMU's linux-user emulation). > >>>>>>>> > >>>>>>> Unlike syscalls different arches have very different ideas what > >>>>>>> hypercalls they need to implement, so while with unified syscall > >>>>>>> space I can see how it may benefit (very) small number of tools, > >>>>>>> I do not see what advantage it will give us. The disadvantage is > >>>>>>> one more global name space to manage. > >>>>>>> > >>>>>>>>>> Keeping it in a common place also makes it more visible to > >>>>>>>>>> people looking to add new hcalls, which could cut down on > >>>>>>>>>> reinventing the wheel. > >>>>>>>>> I do not want other arches to start using hypercalls in the > >>>>>>>>> way powerpc started to use them: separate device io space, so > >>>>>>>>> it is better to hide this as far away from common code as > >>>>>>>>> possible :) But on a more serious note hypercalls should be a > >>>>>>>>> last resort and added only when no other possibility exists, > >>>>>>>>> so people should not look what hcalls others implemented, so > >>>>>>>>> they can add them to their favorite arch, but they should have > >>>>>>>>> a problem at hand that they cannot solve without hcall, but at > >>>>>>>>> this point they will have pretty good idea what this hcall should > >>>&
RE: [PATCH 3/5] booke: define reset and shutdown hcalls
> On 17.07.2013, at 13:00, Gleb Natapov wrote: > > > On Tue, Jul 16, 2013 at 06:04:34PM -0500, Scott Wood wrote: > >> On 07/16/2013 01:35:55 AM, Gleb Natapov wrote: > >>> On Mon, Jul 15, 2013 at 01:17:33PM -0500, Scott Wood wrote: > On 07/15/2013 06:30:20 AM, Gleb Natapov wrote: > > There is no much sense to share hypercalls between architectures. > > There > > is zero probability x86 will implement those for instance > > This is similar to the question of whether to keep device API > enumerations per-architecture... It costs very little to keep > it in a common place, and it's hard to go back in the other > direction if we later realize there are things that should be shared. > > >>> This is different from device API since with device API all > >>> arches have to create/destroy devices, so it make sense to put > >>> device lifecycle management into the common code, and device API > >>> has single entry point to the code - device fd ioctl - where it > >>> makes sense to handle common tasks, if any, and despatch others > >>> to specific device implementation. > >>> > >>> This is totally unlike hypercalls which are, by definition, very > >>> architecture specific (the way they are triggered, the way > >>> parameter are passed from guest to host, what hypercalls arch > >>> needs...). > >> > >> The ABI is architecture specific. The API doesn't need to be, > >> any more than it does with syscalls (I consider the > >> architecture-specific definition of syscall numbers and similar > >> constants in Linux to be unfortunate, especially for tools such > >> as strace or QEMU's linux-user emulation). > >> > > Unlike syscalls different arches have very different ideas what > > hypercalls they need to implement, so while with unified syscall > > space I can see how it may benefit (very) small number of tools, I > > do not see what advantage it will give us. The disadvantage is one > > more global name space to manage. 
> > > Keeping it in a common place also makes it more visible to > people looking to add new hcalls, which could cut down on > reinventing the wheel. > >>> I do not want other arches to start using hypercalls in the way > >>> powerpc started to use them: separate device io space, so it is > >>> better to hide this as far away from common code as possible :) > >>> But on a more serious note hypercalls should be a last resort > >>> and added only when no other possibility exists, so people > >>> should not look what hcalls others implemented, so they can add > >>> them to their favorite arch, but they should have a problem at > >>> hand that they cannot solve without hcall, but at this point > >>> they will have pretty good idea what this hcall should do. > >> > >> Why are hcalls such a bad thing? > >> > > Because they often used to do non architectural things making OSes > > behave different from how they runs on real HW and real HW is what > > OSes are designed and tested for. Example: there once was a KVM > > (XEN have/had similar one) hypercall to accelerate MMU operation. > > One thing it allowed is to to flush tlb without doing IPI if vcpu > > is not running. Later optimization was added to Linux MMU code > > that _relies_ on those IPIs for synchronisation. Good that at that > > point those hypercalls were already deprecated on KVM (IIRC XEN > > was broke for some time in that regard). Which brings me to > > another point: they often get obsoleted by code improvement and HW > > advancement (happened to aforementioned MMU hypercalls), but they > > hard to deprecate if hypervisor supports live migration, without > > live migration it is less of a problem. Next point is that people > > often try to use them instead of emulate PV or real device just > > because they think it is easier, but it is often not so. Example: > > pvpanic device was initially proposed as hypercall, so lets say we > > would implement it as such. 
It would have been KVM specific, > > implementation would touch core guest KVM code and would have been > > Linux guest specific. Instead it was implemented as platform > > device with very small platform driver confined in drivers/ > > directory, immediately usable by XEN and QEMU tcg in addition > > This is actually a very good point. How do we support reboot and > shutdown for TCG guests? We surely don't want to expose TCG as KVM > hypervisor. > >>> > >>> Hmm...so are you proposing that we abandon the current approach, and > >>> switch to a device-based mechanism for reboot/shutdown? > >> > >> Reading Gleb's email it sounds like the more future proof approach, > >> yes. I'm not quite sure yet where we should plu
RE: [PATCH 2/5] booke: exit to guest userspace for unimplemented hcalls in kvm
> -Original Message- > From: Wood Scott-B07421 > Sent: Monday, July 15, 2013 11:38 PM > To: Bhushan Bharat-R65777 > Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; ag...@suse.de; Yoder Stuart- > B08248; Bhushan Bharat-R65777; Bhushan Bharat-R65777 > Subject: Re: [PATCH 2/5] booke: exit to guest userspace for unimplemented > hcalls > in kvm > > On 07/15/2013 06:11:16 AM, Bharat Bhushan wrote: > > Exit to guest user space if kvm does not implement the hcall. > > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/kvm/booke.c | 47 > > +-- > > arch/powerpc/kvm/powerpc.c |1 + > > include/uapi/linux/kvm.h |1 + > > 3 files changed, 42 insertions(+), 7 deletions(-) > > > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index > > 17722d8..c8b41b4 100644 > > --- a/arch/powerpc/kvm/booke.c > > +++ b/arch/powerpc/kvm/booke.c > > @@ -1005,9 +1005,25 @@ int kvmppc_handle_exit(struct kvm_run *run, > > struct kvm_vcpu *vcpu, > > break; > > > > #ifdef CONFIG_KVM_BOOKE_HV > > - case BOOKE_INTERRUPT_HV_SYSCALL: > > + case BOOKE_INTERRUPT_HV_SYSCALL: { > > + int i; > > if (!(vcpu->arch.shared->msr & MSR_PR)) { > > - kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); > > + r = kvmppc_kvm_pv(vcpu); > > + if (r != EV_UNIMPLEMENTED) { > > + /* except unimplemented return to guest > > */ > > + kvmppc_set_gpr(vcpu, 3, r); > > + kvmppc_account_exit(vcpu, > > SYSCALL_EXITS); > > + r = RESUME_GUEST; > > + break; > > + } > > + /* Exit to userspace for unimplemented hcalls > > in kvm */ > > + run->epapr_hcall.nr = kvmppc_get_gpr(vcpu, 11); > > + run->epapr_hcall.ret = 0; > > + for (i = 0; i < 8; i++) > > + run->epapr_hcall.args[i] = > > kvmppc_get_gpr(vcpu, 3 + i); > > You need to clear the upper half of each register if CONFIG_PPC64=y and MSR_CM > is not set. > > > + vcpu->arch.hcall_needed = 1; > > The existing code for hcall_needed restores 9 return arguments, rather than > the > 8 that are defined for this interface. 
Thus, you'll be restoring one word of > padding into the guest -- which could be arbitrary userspace data that > shouldn't > be leaked. r12 is volatile in the ePAPR hcall ABI so simply clobbering it > isn't > a problem, though. Oops; not just that: currently this uses the struct type "papr_hcall", while on booke we should use "epapr_hcall". I will add a function, defined in both book3s.c and booke.c, to set up the hcall return registers accordingly. -Bharat > > -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/5] booke: exit to guest userspace for unimplemented hcalls in kvm
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, July 15, 2013 8:59 PM > To: Bhushan Bharat-R65777 > Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; Yoder > Stuart-B08248 > Subject: Re: [PATCH 2/5] booke: exit to guest userspace for unimplemented > hcalls > in kvm > > > On 15.07.2013, at 17:13, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Monday, July 15, 2013 8:27 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; > >> Yoder > >> Stuart-B08248 > >> Subject: Re: [PATCH 2/5] booke: exit to guest userspace for > >> unimplemented hcalls in kvm > >> > >> > >> On 15.07.2013, at 16:50, Bhushan Bharat-R65777 wrote: > >> > >>> > >>> > >>>> -Original Message- > >>>> From: Alexander Graf [mailto:ag...@suse.de] > >>>> Sent: Monday, July 15, 2013 5:16 PM > >>>> To: Bhushan Bharat-R65777 > >>>> Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood > >>>> Scott-B07421; Yoder > >>>> Stuart-B08248 > >>>> Subject: Re: [PATCH 2/5] booke: exit to guest userspace for > >>>> unimplemented hcalls in kvm > >>>> > >>>> > >>>> On 15.07.2013, at 13:38, Bhushan Bharat-R65777 wrote: > >>>> > >>>>> > >>>>> > >>>>>> -Original Message- > >>>>>> From: Alexander Graf [mailto:ag...@suse.de] > >>>>>> Sent: Monday, July 15, 2013 5:02 PM > >>>>>> To: Bhushan Bharat-R65777 > >>>>>> Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood > >>>>>> Scott-B07421; Yoder Stuart-B08248; Bhushan Bharat-R65777 > >>>>>> Subject: Re: [PATCH 2/5] booke: exit to guest userspace for > >>>>>> unimplemented hcalls in kvm > >>>>>> > >>>>>> > >>>>>> On 15.07.2013, at 13:11, Bharat Bhushan wrote: > >>>>>> > >>>>>>> Exit to guest user space if kvm does not implement the hcall. 
> >>>>>>> > >>>>>>> Signed-off-by: Bharat Bhushan > >>>>>>> --- > >>>>>>> arch/powerpc/kvm/booke.c | 47 > >>>>>>> +- > -- > >> -- > >>>> - > >>>>>>> arch/powerpc/kvm/powerpc.c |1 + > >>>>>>> include/uapi/linux/kvm.h |1 + > >>>>>>> 3 files changed, 42 insertions(+), 7 deletions(-) > >>>>>>> > >>>>>>> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > >>>>>>> index > >>>>>>> 17722d8..c8b41b4 100644 > >>>>>>> --- a/arch/powerpc/kvm/booke.c > >>>>>>> +++ b/arch/powerpc/kvm/booke.c > >>>>>>> @@ -1005,9 +1005,25 @@ int kvmppc_handle_exit(struct kvm_run > >>>>>>> *run, struct > >>>>>> kvm_vcpu *vcpu, > >>>>>>> break; > >>>>>>> > >>>>>>> #ifdef CONFIG_KVM_BOOKE_HV > >>>>>>> - case BOOKE_INTERRUPT_HV_SYSCALL: > >>>>>>> + case BOOKE_INTERRUPT_HV_SYSCALL: { > >>>>>> > >>>>>> This is getting large. Please extract hcall handling into its own > function. > >>>>>> Maybe you can merge the HV and non-HV case then too. > >>>>>> > >>>>>>> + int i; > >>>>>>> if (!(vcpu->arch.shared->msr & MSR_PR)) { > >>>>>>> - kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); > >>>>>>> + r = kvmppc_kvm_pv(vcpu); > >>>>>>> + if (r != EV_UNIMPLEMENTED) { > >>>>>>> + /* except unimplemented return to guest > >>>>>>> */ > >>>>>>> + kvmppc_set_gpr(vcpu, 3, r); > >>>>>>> + kvmppc_account_exit(vcpu, > >>>>
RE: [PATCH 5/5] powerpc: using reset hcall when kvm,has-reset
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, July 15, 2013 8:40 PM > To: Bhushan Bharat-R65777 > Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; Yoder > Stuart-B08248 > Subject: Re: [PATCH 5/5] powerpc: using reset hcall when kvm,has-reset > > > On 15.07.2013, at 17:05, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Monday, July 15, 2013 5:20 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; > >> Yoder Stuart-B08248; Bhushan Bharat-R65777 > >> Subject: Re: [PATCH 5/5] powerpc: using reset hcall when > >> kvm,has-reset > >> > >> > >> On 15.07.2013, at 13:11, Bharat Bhushan wrote: > >> > >>> Detect the availability of the reset hcalls by looking at > >>> kvm,has-reset property on the /hypervisor node in the device tree > >>> passed to the VM and patches the reset mechanism to use reset hcall. > >>> > >>> This patch uses the reser hcall when kvm,has-reset is there in > >> > >> Your patch description is pretty broken :). > >> > >>> > >>> Signed-off-by: Bharat Bhushan > >>> --- > >>> arch/powerpc/kernel/epapr_paravirt.c | 12 > >>> 1 files changed, 12 insertions(+), 0 deletions(-) > >>> > >>> diff --git a/arch/powerpc/kernel/epapr_paravirt.c > >>> b/arch/powerpc/kernel/epapr_paravirt.c > >>> index d44a571..651d701 100644 > >>> --- a/arch/powerpc/kernel/epapr_paravirt.c > >>> +++ b/arch/powerpc/kernel/epapr_paravirt.c > >>> @@ -22,6 +22,8 @@ > >>> #include > >>> #include > >>> #include > >>> +#include > >>> +#include > >> > >> Why would we need kvm_host.h? This is guest code. 
> >> > >>> > >>> #if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) extern > >>> void epapr_ev_idle(void); @@ -30,6 +32,14 @@ extern u32 > >>> epapr_ev_idle_start[]; > >>> > >>> bool epapr_paravirt_enabled; > >>> > >>> +void epapr_hypercall_reset(char *cmd) { > >>> + long ret; > >>> + ret = kvm_hypercall0(KVM_HC_VM_RESET); > >> > >> Is this available without CONFIG_KVM_GUEST? kvm_hypercall() simply > >> returns "unimplemented" for everything when that config option is not set. > > > > We are here because we patched the ppc_md.restart to point to new handler. > > So I think we should patch the ppc_md.restart only if CONFIG_KVM_GUEST is > true. > > We should only patch it if kvm_para_available(). That should guard us against > everything. > > > > > > >> > >>> + printk("error: system reset returned with error %ld\n", ret); > >> > >> So we should fall back to the normal reset handler here. > > > > Do you mean return normally from here, no BUG() etc? > > If we guard the patching against everything, we can treat a broken hcall as > BUG. > However, if we don't we want to fall back to the normal guts based reset. I will let Scott comment on this. But ppc_md.restart can point to only one handler, and during paravirt patching we changed it to the new handler, so we cannot jump back to the guts-type handler. -Bharat -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/5] booke: exit to guest userspace for unimplemented hcalls in kvm
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, July 15, 2013 8:27 PM > To: Bhushan Bharat-R65777 > Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; Yoder > Stuart-B08248 > Subject: Re: [PATCH 2/5] booke: exit to guest userspace for unimplemented > hcalls > in kvm > > > On 15.07.2013, at 16:50, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Monday, July 15, 2013 5:16 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; > >> Yoder > >> Stuart-B08248 > >> Subject: Re: [PATCH 2/5] booke: exit to guest userspace for > >> unimplemented hcalls in kvm > >> > >> > >> On 15.07.2013, at 13:38, Bhushan Bharat-R65777 wrote: > >> > >>> > >>> > >>>> -Original Message- > >>>> From: Alexander Graf [mailto:ag...@suse.de] > >>>> Sent: Monday, July 15, 2013 5:02 PM > >>>> To: Bhushan Bharat-R65777 > >>>> Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood > >>>> Scott-B07421; Yoder Stuart-B08248; Bhushan Bharat-R65777 > >>>> Subject: Re: [PATCH 2/5] booke: exit to guest userspace for > >>>> unimplemented hcalls in kvm > >>>> > >>>> > >>>> On 15.07.2013, at 13:11, Bharat Bhushan wrote: > >>>> > >>>>> Exit to guest user space if kvm does not implement the hcall. 
> >>>>> > >>>>> Signed-off-by: Bharat Bhushan > >>>>> --- > >>>>> arch/powerpc/kvm/booke.c | 47 > >>>>> +--- > -- > >> - > >>>>> arch/powerpc/kvm/powerpc.c |1 + > >>>>> include/uapi/linux/kvm.h |1 + > >>>>> 3 files changed, 42 insertions(+), 7 deletions(-) > >>>>> > >>>>> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > >>>>> index > >>>>> 17722d8..c8b41b4 100644 > >>>>> --- a/arch/powerpc/kvm/booke.c > >>>>> +++ b/arch/powerpc/kvm/booke.c > >>>>> @@ -1005,9 +1005,25 @@ int kvmppc_handle_exit(struct kvm_run *run, > >>>>> struct > >>>> kvm_vcpu *vcpu, > >>>>> break; > >>>>> > >>>>> #ifdef CONFIG_KVM_BOOKE_HV > >>>>> - case BOOKE_INTERRUPT_HV_SYSCALL: > >>>>> + case BOOKE_INTERRUPT_HV_SYSCALL: { > >>>> > >>>> This is getting large. Please extract hcall handling into its own > >>>> function. > >>>> Maybe you can merge the HV and non-HV case then too. > >>>> > >>>>> + int i; > >>>>> if (!(vcpu->arch.shared->msr & MSR_PR)) { > >>>>> - kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); > >>>>> + r = kvmppc_kvm_pv(vcpu); > >>>>> + if (r != EV_UNIMPLEMENTED) { > >>>>> + /* except unimplemented return to guest > >>>>> */ > >>>>> + kvmppc_set_gpr(vcpu, 3, r); > >>>>> + kvmppc_account_exit(vcpu, > >>>>> SYSCALL_EXITS); > >>>>> + r = RESUME_GUEST; > >>>>> + break; > >>>>> + } > >>>>> + /* Exit to userspace for unimplemented hcalls > >>>>> in kvm > */ > >>>>> + run->epapr_hcall.nr = kvmppc_get_gpr(vcpu, 11); > >>>>> + run->epapr_hcall.ret = 0; > >>>>> + for (i = 0; i < 8; i++) > >>>>> + run->epapr_hcall.args[i] = > >>>>> kvmppc_get_gpr(vcpu, > 3 + > >>>> i); > >>>>> + vcpu->arch.hcall_needed = 1; > >>>>> + kvmppc_account_exit(vcpu, SYSCALL_EXITS); > >>>>> + r = RESUME_HOST; > >>>>> } else
RE: [PATCH 5/5] powerpc: using reset hcall when kvm,has-reset
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, July 15, 2013 5:20 PM > To: Bhushan Bharat-R65777 > Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; Yoder > Stuart-B08248; Bhushan Bharat-R65777 > Subject: Re: [PATCH 5/5] powerpc: using reset hcall when kvm,has-reset > > > On 15.07.2013, at 13:11, Bharat Bhushan wrote: > > > Detect the availability of the reset hcalls by looking at > > kvm,has-reset property on the /hypervisor node in the device tree > > passed to the VM and patches the reset mechanism to use reset hcall. > > > > This patch uses the reser hcall when kvm,has-reset is there in > > Your patch description is pretty broken :). > > > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/kernel/epapr_paravirt.c | 12 > > 1 files changed, 12 insertions(+), 0 deletions(-) > > > > diff --git a/arch/powerpc/kernel/epapr_paravirt.c > > b/arch/powerpc/kernel/epapr_paravirt.c > > index d44a571..651d701 100644 > > --- a/arch/powerpc/kernel/epapr_paravirt.c > > +++ b/arch/powerpc/kernel/epapr_paravirt.c > > @@ -22,6 +22,8 @@ > > #include > > #include > > #include > > +#include > > +#include > > Why would we need kvm_host.h? This is guest code. > > > > > #if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) extern > > void epapr_ev_idle(void); @@ -30,6 +32,14 @@ extern u32 > > epapr_ev_idle_start[]; > > > > bool epapr_paravirt_enabled; > > > > +void epapr_hypercall_reset(char *cmd) { > > + long ret; > > + ret = kvm_hypercall0(KVM_HC_VM_RESET); > > Is this available without CONFIG_KVM_GUEST? kvm_hypercall() simply returns > "unimplemented" for everything when that config option is not set. We are here because we patched the ppc_md.restart to point to new handler. So I think we should patch the ppc_md.restart only if CONFIG_KVM_GUEST is true. > > > + printk("error: system reset returned with error %ld\n", ret); > > So we should fall back to the normal reset handler here. 
Do you mean return normally from here, no BUG() etc? -Bharat > > > Alex > > > + BUG(); > > +} > > + > > static int __init epapr_paravirt_init(void) { > > struct device_node *hyper_node; > > @@ -58,6 +68,8 @@ static int __init epapr_paravirt_init(void) > > if (of_get_property(hyper_node, "has-idle", NULL)) > > ppc_md.power_save = epapr_ev_idle; > > #endif > > + if (of_get_property(hyper_node, "kvm,has-reset", NULL)) > > + ppc_md.restart = epapr_hypercall_reset; > > > > epapr_paravirt_enabled = true; > > > > -- > > 1.7.0.4 > > > > > > -- > > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in > > the body of a message to majord...@vger.kernel.org More majordomo info > > at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/5] booke: exit to guest userspace for unimplemented hcalls in kvm
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, July 15, 2013 5:16 PM > To: Bhushan Bharat-R65777 > Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; Yoder > Stuart-B08248 > Subject: Re: [PATCH 2/5] booke: exit to guest userspace for unimplemented > hcalls > in kvm > > > On 15.07.2013, at 13:38, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Monday, July 15, 2013 5:02 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; > >> Yoder Stuart-B08248; Bhushan Bharat-R65777 > >> Subject: Re: [PATCH 2/5] booke: exit to guest userspace for > >> unimplemented hcalls in kvm > >> > >> > >> On 15.07.2013, at 13:11, Bharat Bhushan wrote: > >> > >>> Exit to guest user space if kvm does not implement the hcall. > >>> > >>> Signed-off-by: Bharat Bhushan > >>> --- > >>> arch/powerpc/kvm/booke.c | 47 > >>> +- > - > >>> arch/powerpc/kvm/powerpc.c |1 + > >>> include/uapi/linux/kvm.h |1 + > >>> 3 files changed, 42 insertions(+), 7 deletions(-) > >>> > >>> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > >>> index > >>> 17722d8..c8b41b4 100644 > >>> --- a/arch/powerpc/kvm/booke.c > >>> +++ b/arch/powerpc/kvm/booke.c > >>> @@ -1005,9 +1005,25 @@ int kvmppc_handle_exit(struct kvm_run *run, > >>> struct > >> kvm_vcpu *vcpu, > >>> break; > >>> > >>> #ifdef CONFIG_KVM_BOOKE_HV > >>> - case BOOKE_INTERRUPT_HV_SYSCALL: > >>> + case BOOKE_INTERRUPT_HV_SYSCALL: { > >> > >> This is getting large. Please extract hcall handling into its own function. > >> Maybe you can merge the HV and non-HV case then too. 
> >> > >>> + int i; > >>> if (!(vcpu->arch.shared->msr & MSR_PR)) { > >>> - kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); > >>> + r = kvmppc_kvm_pv(vcpu); > >>> + if (r != EV_UNIMPLEMENTED) { > >>> + /* except unimplemented return to guest */ > >>> + kvmppc_set_gpr(vcpu, 3, r); > >>> + kvmppc_account_exit(vcpu, SYSCALL_EXITS); > >>> + r = RESUME_GUEST; > >>> + break; > >>> + } > >>> + /* Exit to userspace for unimplemented hcalls in kvm */ > >>> + run->epapr_hcall.nr = kvmppc_get_gpr(vcpu, 11); > >>> + run->epapr_hcall.ret = 0; > >>> + for (i = 0; i < 8; i++) > >>> + run->epapr_hcall.args[i] = kvmppc_get_gpr(vcpu, > >>> 3 + > >> i); > >>> + vcpu->arch.hcall_needed = 1; > >>> + kvmppc_account_exit(vcpu, SYSCALL_EXITS); > >>> + r = RESUME_HOST; > >>> } else { > >>> /* > >>>* hcall from guest userspace -- send privileged @@ > >>> -1016,22 > >>> +1032,39 @@ int kvmppc_handle_exit(struct kvm_run *run, struct > >>> +kvm_vcpu *vcpu, > >>> kvmppc_core_queue_program(vcpu, ESR_PPR); > >>> } > >>> > >>> - r = RESUME_GUEST; > >>> + run->exit_reason = KVM_EXIT_EPAPR_HCALL; > > > > > > Oops, what I have done, I wanted this to be kvmppc_account_exit(vcpu, > > SYSCALL_EXITS); > > > > s/ run->exit_reason = KVM_EXIT_EPAPR_HCALL;/ kvmppc_account_exit(vcpu, > > SYSCALL_EXITS); > > > > -Bharat > > > >> > >> This looks odd. Your exit reason only changes when you do the hcall > >> exiting, right? > >> > >> You also need to guard user space hcall exits with an ENABLE_CAP. > >> Otherwise older user space will break, as it doesn't know about the exit > >> type > yet. > > > > So the user space so make enable_cap also? > > User space needs to call enable_cap on this cap, yes. Otherwise a guest can > confuse user space with an hcall exit it can't handle. We do not have enable_cap for book3s, any specific reason why ? 
-Bharat > > > Alex > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/5] booke: exit to guest userspace for unimplemented hcalls in kvm
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, July 15, 2013 5:02 PM > To: Bhushan Bharat-R65777 > Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; Yoder > Stuart-B08248; Bhushan Bharat-R65777 > Subject: Re: [PATCH 2/5] booke: exit to guest userspace for unimplemented > hcalls > in kvm > > > On 15.07.2013, at 13:11, Bharat Bhushan wrote: > > > Exit to guest user space if kvm does not implement the hcall. > > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/kvm/booke.c | 47 > > +-- > > arch/powerpc/kvm/powerpc.c |1 + > > include/uapi/linux/kvm.h |1 + > > 3 files changed, 42 insertions(+), 7 deletions(-) > > > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index > > 17722d8..c8b41b4 100644 > > --- a/arch/powerpc/kvm/booke.c > > +++ b/arch/powerpc/kvm/booke.c > > @@ -1005,9 +1005,25 @@ int kvmppc_handle_exit(struct kvm_run *run, struct > kvm_vcpu *vcpu, > > break; > > > > #ifdef CONFIG_KVM_BOOKE_HV > > - case BOOKE_INTERRUPT_HV_SYSCALL: > > + case BOOKE_INTERRUPT_HV_SYSCALL: { > > This is getting large. Please extract hcall handling into its own function. > Maybe you can merge the HV and non-HV case then too. 
> > > + int i; > > if (!(vcpu->arch.shared->msr & MSR_PR)) { > > - kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); > > + r = kvmppc_kvm_pv(vcpu); > > + if (r != EV_UNIMPLEMENTED) { > > + /* except unimplemented return to guest */ > > + kvmppc_set_gpr(vcpu, 3, r); > > + kvmppc_account_exit(vcpu, SYSCALL_EXITS); > > + r = RESUME_GUEST; > > + break; > > + } > > + /* Exit to userspace for unimplemented hcalls in kvm */ > > + run->epapr_hcall.nr = kvmppc_get_gpr(vcpu, 11); > > + run->epapr_hcall.ret = 0; > > + for (i = 0; i < 8; i++) > > + run->epapr_hcall.args[i] = kvmppc_get_gpr(vcpu, > > 3 + > i); > > + vcpu->arch.hcall_needed = 1; > > + kvmppc_account_exit(vcpu, SYSCALL_EXITS); > > + r = RESUME_HOST; > > } else { > > /* > > * hcall from guest userspace -- send privileged @@ > > -1016,22 > > +1032,39 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu > > *vcpu, > > kvmppc_core_queue_program(vcpu, ESR_PPR); > > } > > > > - r = RESUME_GUEST; > > + run->exit_reason = KVM_EXIT_EPAPR_HCALL; Oops, what have I done? I wanted this to be kvmppc_account_exit(vcpu, SYSCALL_EXITS); s/ run->exit_reason = KVM_EXIT_EPAPR_HCALL;/ kvmppc_account_exit(vcpu, SYSCALL_EXITS); -Bharat > > This looks odd. Your exit reason only changes when you do the hcall exiting, > right? > > You also need to guard user space hcall exits with an ENABLE_CAP. Otherwise > older user space will break, as it doesn't know about the exit type yet. So user space should call enable_cap as well?
-Bharat > > > Alex > > > break; > > + } > > #else > > - case BOOKE_INTERRUPT_SYSCALL: > > + case BOOKE_INTERRUPT_SYSCALL: { > > + int i; > > + r = RESUME_GUEST; > > if (!(vcpu->arch.shared->msr & MSR_PR) && > > (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { > > /* KVM PV hypercalls */ > > - kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); > > - r = RESUME_GUEST; > > + r = kvmppc_kvm_pv(vcpu); > > + if (r != EV_UNIMPLEMENTED) { > > + /* except unimplemented return to guest */ > > + kvmppc_set_gpr(vcpu, 3, r); > > + kvmppc_account_exit(vcpu, SYSCALL_EXITS); > > + r = RESUME_GUEST; > > + break; > > + } > > + /* Exit to userspace for unimplemented hcalls in kvm */ > > + run->epapr_hcall.nr = kvmppc_get_gpr(vcpu, 11); > > + run->epapr_hcall.re
RE: [PATCH 1/5] powerpc: define ePAPR hcall exit interface
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, July 15, 2013 4:51 PM > To: Bhushan Bharat-R65777 > Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org; Wood Scott-B07421; Yoder > Stuart-B08248; Bhushan Bharat-R65777 > Subject: Re: [PATCH 1/5] powerpc: define ePAPR hcall exit interface > > > On 15.07.2013, at 13:11, Bharat Bhushan wrote: > > > This patch defines the ePAPR hcall exit interface to guest user space. > > The subject line is misleading. This is a kvm patch. Same applies for most > other > patches. Ok, will make this "kvm: powerpc: define ePAPR hcall exit interface" > > > > > Signed-off-by: Bharat Bhushan > > --- > > Documentation/virtual/kvm/api.txt | 20 > > include/uapi/linux/kvm.h |7 +++ > > 2 files changed, 27 insertions(+), 0 deletions(-) > > > > diff --git a/Documentation/virtual/kvm/api.txt > > b/Documentation/virtual/kvm/api.txt > > index 66dd2aa..054f2f4 100644 > > --- a/Documentation/virtual/kvm/api.txt > > +++ b/Documentation/virtual/kvm/api.txt > > @@ -2597,6 +2597,26 @@ The possible hypercalls are defined in the > > Power Architecture Platform Requirements (PAPR) document available > > from www.power.org (free developer registration required to access it). > > > > + /* KVM_EXIT_EPAPR_HCALL */ > > + struct { > > + __u64 nr; > > + __u64 ret; > > + __u64 args[8]; > > + } epapr_hcall; > > + > > +This is used on PowerPC platforms that support ePAPR hcalls. > > +It occurs when a guest does a hypercall (as defined in the ePAPR 1.1) > > +and the hcall is not handled by the kernel. > > + > > +The 'nr' field contains the hypercall number (from the guest R11), > > +and 'args' contains the arguments (from the guest R3 - R10). > > +Userspace should put the return code in 'ret' and any extra returned > > +values in args[]. If the VM is not in 64-bit mode KVM zeros the > > +upper half of each field in the struct. 
> > + > > +As per the ePAPR hcall ABI, the return value is returned to the guest > > +in R3 and output return values in R4 - R10. > > + > > /* KVM_EXIT_S390_TSCH */ > > struct { > > __u16 subchannel_id; > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index > > acccd08..01ee50e 100644 > > --- a/include/uapi/linux/kvm.h > > +++ b/include/uapi/linux/kvm.h > > @@ -171,6 +171,7 @@ struct kvm_pit_config { > > #define KVM_EXIT_WATCHDOG 21 > > #define KVM_EXIT_S390_TSCH22 > > #define KVM_EXIT_EPR 23 > > +#define KVM_EXIT_EPAPR_HCALL 24 > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > /* Emulate instruction failed. */ > > @@ -288,6 +289,12 @@ struct kvm_run { > > __u64 ret; > > __u64 args[9]; > > } papr_hcall; > > + /* KVM_EXIT_EPAPR_HCALL */ > > + struct { > > + __u64 nr; > > + __u64 ret; > > + __u64 args[8]; > > + } epapr_hcall; > > This should be at the end of the union. Ok. -Bharat > > > Alex > > > /* KVM_EXIT_S390_TSCH */ > > struct { > > __u16 subchannel_id; > > -- > > 1.7.0.4 > > > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 6/6 v5] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, June 24, 2013 4:13 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > tiejun.c...@windriver.com; Bhushan Bharat-R65777 > Subject: Re: [PATCH 6/6 v5] KVM: PPC: Add userspace debug stub support > > > On 24.06.2013, at 11:08, Bharat Bhushan wrote: > > > This patch adds the debug stub support on booke/bookehv. > > Now QEMU debug stub can use hw breakpoint, watchpoint and software > > breakpoint to debug guest. > > > > This is how we save/restore debug register context when switching > > between guest, userspace and kernel user-process: > > > > When QEMU is running > > -> thread->debug_reg == QEMU debug register context. > > -> Kernel will handle switching the debug register on context switch. > > -> no vcpu_load() called > > > > QEMU makes ioctls (except RUN) > > -> This will call vcpu_load() > > -> should not change context. > > -> Some ioctls can change vcpu debug register, context saved in > > -> vcpu->debug_regs > > > > QEMU Makes RUN ioctl > > -> Save thread->debug_reg on STACK > > -> Store thread->debug_reg == vcpu->debug_reg load thread->debug_reg > > -> RUN VCPU ( So thread points to vcpu context ) > > > > Context switch happens When VCPU running > > -> makes vcpu_load() should not load any context kernel loads the vcpu > > -> context as thread->debug_regs points to vcpu context. > > > > On heavyweight_exit > > -> Load the context saved on stack in thread->debug_reg > > > > Currently we do not support debug resource emulation to guest, On > > debug exception, always exit to user space irrespective of user space > > is expecting the debug exception or not. If this is unexpected > > exception (breakpoint/watchpoint event not set by > > userspace) then let us leave the action on user space. This is similar > > to what it was before, only thing is that now we have proper exit > > state available to user space. 
> > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/include/asm/kvm_host.h |3 + > > arch/powerpc/include/uapi/asm/kvm.h |1 + > > arch/powerpc/kvm/booke.c| 233 > > --- > > arch/powerpc/kvm/booke.h|5 + > > 4 files changed, 224 insertions(+), 18 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/kvm_host.h > > b/arch/powerpc/include/asm/kvm_host.h > > index 838a577..aeb490d 100644 > > --- a/arch/powerpc/include/asm/kvm_host.h > > +++ b/arch/powerpc/include/asm/kvm_host.h > > @@ -524,7 +524,10 @@ struct kvm_vcpu_arch { > > u32 eptcfg; > > u32 epr; > > u32 crit_save; > > + /* guest debug registers*/ > > struct debug_reg dbg_reg; > > + /* hardware visible debug registers when in guest state */ > > + struct debug_reg shadow_dbg_reg; > > #endif > > gpa_t paddr_accessed; > > gva_t vaddr_accessed; > > diff --git a/arch/powerpc/include/uapi/asm/kvm.h > > b/arch/powerpc/include/uapi/asm/kvm.h > > index ded0607..f5077c2 100644 > > --- a/arch/powerpc/include/uapi/asm/kvm.h > > +++ b/arch/powerpc/include/uapi/asm/kvm.h > > @@ -27,6 +27,7 @@ > > #define __KVM_HAVE_PPC_SMT > > #define __KVM_HAVE_IRQCHIP > > #define __KVM_HAVE_IRQ_LINE > > +#define __KVM_HAVE_GUEST_DEBUG > > > > struct kvm_regs { > > __u64 pc; > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index > > 3e9fc1d..8be3502 100644 > > --- a/arch/powerpc/kvm/booke.c > > +++ b/arch/powerpc/kvm/booke.c > > @@ -133,6 +133,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu > > *vcpu) #endif } > > > > +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) { > > + /* Synchronize guest's desire to get debug interrupts into shadow > > +MSR */ #ifndef CONFIG_KVM_BOOKE_HV > > + vcpu->arch.shadow_msr &= ~MSR_DE; > > + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE; #endif > > + > > + /* Force enable debug interrupts when user space wants to debug */ > > + if (vcpu->guest_debug) { > > +#ifdef CONFIG_KVM_BOOKE_HV > > + /* > > +* Since there is no shadow MSR, sync MSR_DE into the 
guest > > +* visible MSR. > > +*/ > > + vcpu->arch.shared->msr |= MSR_DE; > > +#else > > +
RE: [PATCH 3/6 v5] powerpc: export debug register save function for KVM
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Monday, June 24, 2013 3:03 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > tiejun.c...@windriver.com; Bhushan Bharat-R65777 > Subject: Re: [PATCH 3/6 v5] powerpc: export debug register save function for > KVM > > > On 24.06.2013, at 11:08, Bharat Bhushan wrote: > > > KVM need this function when switching from vcpu to user-space thread. > > My subsequent patch will use this function. > > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/include/asm/switch_to.h |4 > > arch/powerpc/kernel/process.c|3 ++- > > 2 files changed, 6 insertions(+), 1 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/switch_to.h > > b/arch/powerpc/include/asm/switch_to.h > > index 200d763..50b357f 100644 > > --- a/arch/powerpc/include/asm/switch_to.h > > +++ b/arch/powerpc/include/asm/switch_to.h > > @@ -30,6 +30,10 @@ extern void enable_kernel_spe(void); extern void > > giveup_spe(struct task_struct *); extern void load_up_spe(struct > > task_struct *); > > > > +#ifdef CONFIG_PPC_ADV_DEBUG_REGS > > +extern void switch_booke_debug_regs(struct thread_struct > > +*new_thread); #endif > > + > > #ifndef CONFIG_SMP > > extern void discard_lazy_cpu_state(void); #else diff --git > > a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index > > 01ff496..3375cb7 100644 > > --- a/arch/powerpc/kernel/process.c > > +++ b/arch/powerpc/kernel/process.c > > @@ -362,12 +362,13 @@ static void prime_debug_regs(struct > > thread_struct *thread) > > * debug registers, set the debug registers from the values > > * stored in the new thread. 
> > */ > > -static void switch_booke_debug_regs(struct thread_struct *new_thread) > > +void switch_booke_debug_regs(struct thread_struct *new_thread) > > { > > if ((current->thread.debug.dbcr0 & DBCR0_IDM) > > || (new_thread->debug.dbcr0 & DBCR0_IDM)) > > prime_debug_regs(new_thread); > > } > > +EXPORT_SYMBOL(switch_booke_debug_regs); > > EXPORT_SYMBOL_GPL? Oops, I missed this comment. Will correct in next version. -Bharat > > > Alex > > > #else /* !CONFIG_PPC_ADV_DEBUG_REGS */ > > #ifndef CONFIG_HAVE_HW_BREAKPOINT > > static void set_debug_reg_defaults(struct thread_struct *thread) > > -- > > 1.7.0.4 > > > > > > -- > > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in > > the body of a message to majord...@vger.kernel.org More majordomo info > > at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Friday, May 10, 2013 11:14 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > tiejun.c...@windriver.com > Subject: Re: [PATCH] KVM: PPC: Add userspace debug stub support > > > On 10.05.2013, at 19:31, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Friday, May 10, 2013 3:48 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > >> tiejun.c...@windriver.com; Bhushan Bharat-R65777 > >> Subject: Re: [PATCH] KVM: PPC: Add userspace debug stub support > >> > >> > >> On 07.05.2013, at 11:40, Bharat Bhushan wrote: > >> > >>> This patch adds the debug stub support on booke/bookehv. > >>> Now QEMU debug stub can use hw breakpoint, watchpoint and software > >>> breakpoint to debug guest. > >>> > >>> This is how we save/restore debug register context when switching > >>> between guest, userspace and kernel user-process: > >>> > >>> When QEMU is running > >>> -> thread->debug_reg == QEMU debug register context. > >>> -> Kernel will handle switching the debug register on context switch. > >>> -> no vcpu_load() called > >>> > >>> QEMU makes ioctls (except RUN) > >>> -> This will call vcpu_load() > >>> -> should not change context. > >>> -> Some ioctls can change vcpu debug register, context saved in > >>> -> vcpu->debug_regs > >>> > >>> QEMU Makes RUN ioctl > >>> -> Save thread->debug_reg on STACK > >>> -> Store thread->debug_reg == vcpu->debug_reg load thread->debug_reg > >>> -> RUN VCPU ( So thread points to vcpu context ) > >>> > >>> Context switch happens When VCPU running > >>> -> makes vcpu_load() should not load any context kernel loads the > >>> -> vcpu context as thread->debug_regs points to vcpu context. 
> >>> > >>> On heavyweight_exit > >>> -> Load the context saved on stack in thread->debug_reg > >>> > >>> Currently we do not support debug resource emulation to guest, On > >>> debug exception, always exit to user space irrespective of user > >>> space is expecting the debug exception or not. If this is unexpected > >>> exception (breakpoint/watchpoint event not set by > >>> userspace) then let us leave the action on user space. This is > >>> similar to what it was before, only thing is that now we have proper > >>> exit state available to user space. > >>> > >>> Signed-off-by: Bharat Bhushan > >>> --- > >>> arch/powerpc/include/asm/kvm_host.h |3 + > >>> arch/powerpc/include/uapi/asm/kvm.h |1 + > >>> arch/powerpc/kvm/booke.c| 242 > >>> - > -- > >>> arch/powerpc/kvm/booke.h|5 + > >>> 4 files changed, 233 insertions(+), 18 deletions(-) > >>> > >>> diff --git a/arch/powerpc/include/asm/kvm_host.h > >>> b/arch/powerpc/include/asm/kvm_host.h > >>> index 838a577..1b29945 100644 > >>> --- a/arch/powerpc/include/asm/kvm_host.h > >>> +++ b/arch/powerpc/include/asm/kvm_host.h > >>> @@ -524,7 +524,10 @@ struct kvm_vcpu_arch { > >>> u32 eptcfg; > >>> u32 epr; > >>> u32 crit_save; > >>> + /* guest debug registers*/ > >>> struct debug_reg dbg_reg; > >>> + /* shadow debug registers */ > >> > >> Please be more verbose here. What exactly does this contain? Why do > >> we need shadow and non-shadow registers? The comment as it is reads > >> like > >> > >> /* Add one plus one */ > >> x = 1 + 1; > > > > > > /* > > * Shadow debug registers hold the debug register content > > * to be written in h/w debug register on behalf of guest > > * written value or user space written value. > > */ > > /* hardware visible debug registers when in guest state */ > > > > > > >> > >>> + struct debug_reg shadow_dbg_reg; > >>> #endif > >>> gpa_t paddr_accessed; > >>> gva_t vaddr_accessed; > >>&
RE: [PATCH] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Friday, May 10, 2013 3:48 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > tiejun.c...@windriver.com; Bhushan Bharat-R65777 > Subject: Re: [PATCH] KVM: PPC: Add userspace debug stub support > > > On 07.05.2013, at 11:40, Bharat Bhushan wrote: > > > This patch adds the debug stub support on booke/bookehv. > > Now QEMU debug stub can use hw breakpoint, watchpoint and software > > breakpoint to debug guest. > > > > This is how we save/restore debug register context when switching > > between guest, userspace and kernel user-process: > > > > When QEMU is running > > -> thread->debug_reg == QEMU debug register context. > > -> Kernel will handle switching the debug register on context switch. > > -> no vcpu_load() called > > > > QEMU makes ioctls (except RUN) > > -> This will call vcpu_load() > > -> should not change context. > > -> Some ioctls can change vcpu debug register, context saved in > > -> vcpu->debug_regs > > > > QEMU Makes RUN ioctl > > -> Save thread->debug_reg on STACK > > -> Store thread->debug_reg == vcpu->debug_reg load thread->debug_reg > > -> RUN VCPU ( So thread points to vcpu context ) > > > > Context switch happens When VCPU running > > -> makes vcpu_load() should not load any context kernel loads the vcpu > > -> context as thread->debug_regs points to vcpu context. > > > > On heavyweight_exit > > -> Load the context saved on stack in thread->debug_reg > > > > Currently we do not support debug resource emulation to guest, On > > debug exception, always exit to user space irrespective of user space > > is expecting the debug exception or not. If this is unexpected > > exception (breakpoint/watchpoint event not set by > > userspace) then let us leave the action on user space. This is similar > > to what it was before, only thing is that now we have proper exit > > state available to user space. 
> > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/include/asm/kvm_host.h |3 + > > arch/powerpc/include/uapi/asm/kvm.h |1 + > > arch/powerpc/kvm/booke.c| 242 > > --- > > arch/powerpc/kvm/booke.h|5 + > > 4 files changed, 233 insertions(+), 18 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/kvm_host.h > > b/arch/powerpc/include/asm/kvm_host.h > > index 838a577..1b29945 100644 > > --- a/arch/powerpc/include/asm/kvm_host.h > > +++ b/arch/powerpc/include/asm/kvm_host.h > > @@ -524,7 +524,10 @@ struct kvm_vcpu_arch { > > u32 eptcfg; > > u32 epr; > > u32 crit_save; > > + /* guest debug registers*/ > > struct debug_reg dbg_reg; > > + /* shadow debug registers */ > > Please be more verbose here. What exactly does this contain? Why do we need > shadow and non-shadow registers? The comment as it is reads like > > /* Add one plus one */ > x = 1 + 1; /* * Shadow debug registers hold the debug register content * to be written in h/w debug register on behalf of guest * written value or user space written value. 
*/ > > > + struct debug_reg shadow_dbg_reg; > > #endif > > gpa_t paddr_accessed; > > gva_t vaddr_accessed; > > diff --git a/arch/powerpc/include/uapi/asm/kvm.h > > b/arch/powerpc/include/uapi/asm/kvm.h > > index ded0607..f5077c2 100644 > > --- a/arch/powerpc/include/uapi/asm/kvm.h > > +++ b/arch/powerpc/include/uapi/asm/kvm.h > > @@ -27,6 +27,7 @@ > > #define __KVM_HAVE_PPC_SMT > > #define __KVM_HAVE_IRQCHIP > > #define __KVM_HAVE_IRQ_LINE > > +#define __KVM_HAVE_GUEST_DEBUG > > > > struct kvm_regs { > > __u64 pc; > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index > > ef99536..6a44ad4 100644 > > --- a/arch/powerpc/kvm/booke.c > > +++ b/arch/powerpc/kvm/booke.c > > @@ -133,6 +133,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu > > *vcpu) #endif } > > > > +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) { > > + /* Synchronize guest's desire to get debug interrupts into shadow > > +MSR */ #ifndef CONFIG_KVM_BOOKE_HV > > + vcpu->arch.shadow_msr &= ~MSR_DE; > > + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE; #endif > > + > > + /* Force enable debug interrupts when user space wants to debug */ > >
RE: [PATCH v2 4/4] kvm/ppc: IRQ disabling cleanup
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Scott Wood > Sent: Friday, May 10, 2013 8:40 AM > To: Alexander Graf; Benjamin Herrenschmidt > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; > linuxppc-...@lists.ozlabs.org; > Wood Scott-B07421 > Subject: [PATCH v2 4/4] kvm/ppc: IRQ disabling cleanup > > Simplify the handling of lazy EE by going directly from fully-enabled > to hard-disabled. This replaces the lazy_irq_pending() check > (including its misplaced kvm_guest_exit() call). > > As suggested by Tiejun Chen, move the interrupt disabling into > kvmppc_prepare_to_enter() rather than have each caller do it. Also > move the IRQ enabling on heavyweight exit into > kvmppc_prepare_to_enter(). > > Don't move kvmppc_fix_ee_before_entry() into kvmppc_prepare_to_enter(), > so that the caller can avoid marking interrupts enabled earlier than > necessary (e.g. book3s_pr waits until after FP save/restore is done). > > Signed-off-by: Scott Wood > --- > arch/powerpc/include/asm/kvm_ppc.h |6 ++ > arch/powerpc/kvm/book3s_pr.c | 12 +++- > arch/powerpc/kvm/booke.c |9 ++--- > arch/powerpc/kvm/powerpc.c | 21 - > 4 files changed, 19 insertions(+), 29 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_ppc.h > b/arch/powerpc/include/asm/kvm_ppc.h > index 6885846..e4474f8 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -404,6 +404,12 @@ static inline void kvmppc_fix_ee_before_entry(void) > trace_hardirqs_on(); > > #ifdef CONFIG_PPC64 > + /* > + * To avoid races, the caller must have gone directly from having > + * interrupts fully-enabled to hard-disabled. 
> + */ > + WARN_ON(local_paca->irq_happened != PACA_IRQ_HARD_DIS); > + > /* Only need to enable IRQs by hard enabling them after this */ > local_paca->irq_happened = 0; > local_paca->soft_enabled = 1; > diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c > index 0b97ce4..e61e39e 100644 > --- a/arch/powerpc/kvm/book3s_pr.c > +++ b/arch/powerpc/kvm/book3s_pr.c > @@ -884,14 +884,11 @@ program_interrupt: >* and if we really did time things so badly, then we just exit >* again due to a host external interrupt. >*/ > - local_irq_disable(); > s = kvmppc_prepare_to_enter(vcpu); > - if (s <= 0) { > - local_irq_enable(); > + if (s <= 0) > r = s; > - } else { > + else > kvmppc_fix_ee_before_entry(); > - } > } > > trace_kvm_book3s_reenter(r, vcpu); > @@ -1121,12 +1118,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct > kvm_vcpu *vcpu) >* really did time things so badly, then we just exit again due to >* a host external interrupt. >*/ > - local_irq_disable(); > ret = kvmppc_prepare_to_enter(vcpu); > - if (ret <= 0) { > - local_irq_enable(); > + if (ret <= 0) > goto out; > - } > > /* Save FPU state in stack */ > if (current->thread.regs->msr & MSR_FP) > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > index eb89b83..f7c0111 100644 > --- a/arch/powerpc/kvm/booke.c > +++ b/arch/powerpc/kvm/booke.c > @@ -666,10 +666,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct > kvm_vcpu *vcpu) > return -EINVAL; > } > > - local_irq_disable(); > s = kvmppc_prepare_to_enter(vcpu); > if (s <= 0) { > - local_irq_enable(); > ret = s; > goto out; > } > @@ -1148,14 +1146,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct > kvm_vcpu *vcpu, >* aren't already exiting to userspace for some other reason. >*/ > if (!(r & RESUME_HOST)) { > - local_irq_disable(); Ok, Now we do not soft disable before kvmppc_prapare_to_enter(). 
> s = kvmppc_prepare_to_enter(vcpu); > - if (s <= 0) { > - local_irq_enable(); > + if (s <= 0) > r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); > - } else { > + else > kvmppc_fix_ee_before_entry(); > - } > } > > return r; > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c > index 4e05f8c..f8659aa 100644 > --- a/arch/powerpc/kvm/powerpc.c > +++ b/arch/powerpc/kvm/powerpc.c > @@ -64,12 +64,14 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) > { > int r = 1; > > - WARN_ON_ONCE(!irqs_disabled()); > + WARN_ON(irqs_disabled()); > + hard_irq_disable(); Here we hard disable in kvmppc_prepare_to_enter(), so my comment on the other patch about interrupt loss is no longer valid. S
RE: [PATCH v2 2/4] kvm/ppc/booke64: Fix lazy ee handling in kvmppc_handle_exit()
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Scott Wood > Sent: Friday, May 10, 2013 8:40 AM > To: Alexander Graf; Benjamin Herrenschmidt > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; > linuxppc-...@lists.ozlabs.org; > Wood Scott-B07421 > Subject: [PATCH v2 2/4] kvm/ppc/booke64: Fix lazy ee handling in > kvmppc_handle_exit() > > EE is hard-disabled on entry to kvmppc_handle_exit(), so call > hard_irq_disable() so that PACA_IRQ_HARD_DIS is set, and soft_enabled > is unset. > > Without this, we get warnings such as arch/powerpc/kernel/time.c:300, > and sometimes host kernel hangs. > > Signed-off-by: Scott Wood > --- > arch/powerpc/kvm/booke.c |5 + > 1 file changed, 5 insertions(+) > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > index 1020119..705fc5c 100644 > --- a/arch/powerpc/kvm/booke.c > +++ b/arch/powerpc/kvm/booke.c > @@ -833,6 +833,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct > kvm_vcpu > *vcpu, > int r = RESUME_HOST; > int s; > > +#ifdef CONFIG_PPC64 > + WARN_ON(local_paca->irq_happened != 0); > +#endif > + hard_irq_disable(); This does not actually hard-disable interrupts, since EE is already clear; it only makes the state look hard-disabled to the host. Right? If so, should we write a comment here on why we are doing this? -Bharat > + > /* update before a new last_exit_type is rewritten */ > kvmppc_update_timing_stats(vcpu); > > -- > 1.7.10.4 > > > -- > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v2 3/4] kvm/ppc: Call trace_hardirqs_on before entry
> -Original Message- > From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf > Of > Scott Wood > Sent: Friday, May 10, 2013 8:40 AM > To: Alexander Graf; Benjamin Herrenschmidt > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; > linuxppc-...@lists.ozlabs.org; > Wood Scott-B07421 > Subject: [PATCH v2 3/4] kvm/ppc: Call trace_hardirqs_on before entry > > Currently this is only being done on 64-bit. Rather than just move it > out of the 64-bit ifdef, move it to kvm_lazy_ee_enable() so that it is > consistent with lazy ee state, and so that we don't track more host > code as interrupts-enabled than necessary. > > Rename kvm_lazy_ee_enable() to kvm_fix_ee_before_entry() to reflect > that this function now has a role on 32-bit as well. > > Signed-off-by: Scott Wood > --- > arch/powerpc/include/asm/kvm_ppc.h | 11 --- > arch/powerpc/kvm/book3s_pr.c |4 ++-- > arch/powerpc/kvm/booke.c |4 ++-- > arch/powerpc/kvm/powerpc.c |2 -- > 4 files changed, 12 insertions(+), 9 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_ppc.h > b/arch/powerpc/include/asm/kvm_ppc.h > index a5287fe..6885846 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -394,10 +394,15 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn) > } > } > > -/* Please call after prepare_to_enter. This function puts the lazy ee state > - back to normal mode, without actually enabling interrupts. */ > -static inline void kvmppc_lazy_ee_enable(void) > +/* > + * Please call after prepare_to_enter. This function puts the lazy ee and irq > + * disabled tracking state back to normal mode, without actually enabling > + * interrupts. 
> + */ > +static inline void kvmppc_fix_ee_before_entry(void) > { > + trace_hardirqs_on(); > + > #ifdef CONFIG_PPC64 > /* Only need to enable IRQs by hard enabling them after this */ > local_paca->irq_happened = 0; > diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c > index bdc40b8..0b97ce4 100644 > --- a/arch/powerpc/kvm/book3s_pr.c > +++ b/arch/powerpc/kvm/book3s_pr.c > @@ -890,7 +890,7 @@ program_interrupt: > local_irq_enable(); > r = s; > } else { > - kvmppc_lazy_ee_enable(); > + kvmppc_fix_ee_before_entry(); > } > } > > @@ -1161,7 +1161,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct > kvm_vcpu *vcpu) > if (vcpu->arch.shared->msr & MSR_FP) > kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); > > - kvmppc_lazy_ee_enable(); > + kvmppc_fix_ee_before_entry(); > > ret = __kvmppc_vcpu_run(kvm_run, vcpu); > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > index 705fc5c..eb89b83 100644 > --- a/arch/powerpc/kvm/booke.c > +++ b/arch/powerpc/kvm/booke.c > @@ -673,7 +673,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct > kvm_vcpu > *vcpu) > ret = s; > goto out; > } > - kvmppc_lazy_ee_enable(); > + kvmppc_fix_ee_before_entry(); local_irq_disable() is called before kvmppc_prepare_to_enter(). Now we put the irq_happend and soft_enabled back to previous state without checking for any interrupt happened in between. If any interrupt happens in between, will not that be lost? 
-Bharat > > kvm_guest_enter(); > > @@ -1154,7 +1154,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct > kvm_vcpu *vcpu, > local_irq_enable(); > r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); > } else { > - kvmppc_lazy_ee_enable(); > + kvmppc_fix_ee_before_entry(); > } > } > > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c > index 6316ee3..4e05f8c 100644 > --- a/arch/powerpc/kvm/powerpc.c > +++ b/arch/powerpc/kvm/powerpc.c > @@ -117,8 +117,6 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) > kvm_guest_exit(); > continue; > } > - > - trace_hardirqs_on(); > #endif > > kvm_guest_enter(); > -- > 1.7.10.4 > > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf > Of > Alexander Graf > Sent: Friday, May 03, 2013 6:48 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > Subject: Re: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub support > > > On 03.05.2013, at 15:11, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Friday, May 03, 2013 6:00 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > >> Subject: Re: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub > >> support > >> > >> > >> On 03.05.2013, at 13:08, Alexander Graf wrote: > >> > >>> > >>> > >>> Am 03.05.2013 um 12:48 schrieb Bhushan Bharat-R65777 > >>> : > >>> > >>>>>>>> +static void kvmppc_booke_vcpu_load_debug_regs(struct kvm_vcpu > >>>>>>>> +*vcpu) { > >>>>>>>> +if (!vcpu->arch.debug_active) > >>>>>>>> +return; > >>>>>>>> + > >>>>>>>> +/* Disable all debug events and clead pending debug events */ > >>>>>>>> +mtspr(SPRN_DBCR0, 0x0); > >>>>>>>> +kvmppc_clear_dbsr(); > >>>>>>>> + > >>>>>>>> +/* > >>>>>>>> + * Check whether guest still need debug resource, if not then > there > >>>>>>>> + * is no need to restore guest context. > >>>>>>>> + */ > >>>>>>>> +if (!vcpu->arch.shadow_dbg_reg.dbcr0) > >>>>>>>> +return; > >>>>>>>> + > >>>>>>>> +/* Load Guest Context */ > >>>>>>>> +mtspr(SPRN_DBCR1, vcpu->arch.shadow_dbg_reg.dbcr1); > >>>>>>>> +mtspr(SPRN_DBCR2, vcpu->arch.shadow_dbg_reg.dbcr2); #ifdef > >>>>>>>> +CONFIG_KVM_E500MC > >>>>>>>> +mtspr(SPRN_DBCR4, vcpu->arch.shadow_dbg_reg.dbcr4); > >>>>>>> > >>>>>>> You need to make sure DBCR4 is 0 when you leave things back to > >>>>>>> normal user space. Otherwise guest debug can interfere with host > >>>>>>> debug. 
> >>>>>> > >>>>>> > >>>>>> ok > >>>>>> > >>>>>>> > >>>>>>>> +#endif > >>>>>>>> +mtspr(SPRN_IAC1, vcpu->arch.shadow_dbg_reg.iac[0]); > >>>>>>>> +mtspr(SPRN_IAC2, vcpu->arch.shadow_dbg_reg.iac[1]); > >>>>>>>> +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 > >>>>>>>> +mtspr(SPRN_IAC3, vcpu->arch.shadow_dbg_reg.iac[2]); > >>>>>>>> +mtspr(SPRN_IAC4, vcpu->arch.shadow_dbg_reg.iac[3]); > >>>>>>>> +#endif > >>>>>>>> +mtspr(SPRN_DAC1, vcpu->arch.shadow_dbg_reg.dac[0]); > >>>>>>>> +mtspr(SPRN_DAC2, vcpu->arch.shadow_dbg_reg.dac[1]); > >>>>>>>> + > >>>>>>>> +/* Enable debug events after other debug registers restored */ > >>>>>>>> +mtspr(SPRN_DBCR0, vcpu->arch.shadow_dbg_reg.dbcr0); } > >>>>>>> > >>>>>>> All of the code above looks suspiciously similar to > >>>>>>> prime_debug_regs();. Can't we somehow reuse that? > >>>>>> > >>>>>> I think we can if > >>>>>> - Save thread->debug_regs in local data structure > >>>>> > >>>>> Yes, it can even be on the stack. > >>>>> > >>>>>> - Load vcpu->arch->debug_regs in thread->debug_regs > >>>>>> - Call prime_debug_regs(); > >>>>>> - Restore thread->debug_regs from local save values in first step > >>>>> > >>>>> On heavyweight exit, based on the values on stack, yes. > >>>> > >>>> This is how I think we can save/restore debug context. Please > >>>> correct if I am > >> missing something. > >>> > >>> Sounds about right :) > >> > >> Actually, what happens if a guest breakpoint is set to a kernel > >> address that happens to be within the scope of kvm code? > > > > You mean address of kvm code in guest or host? > > > > If host, we already mentioned that we do not support that. Right? > > QEMU wants to debug the guest at address 0xc123. kvm_run happens to be at > that address. We switch the debug registers through prime_debug_regs. Will the > host kernel receive a debug interrupt when it runs kvm_run()? 
No. On e500v2, we use DBCR1 and DBCR2 to not allow debug events when MSR.PR = 0. On e500mc+, we use EPCR.DUVD to not allow debug events when in hypervisor mode. -Bharat > > > Alex > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in the body of > a > message to majord...@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Friday, May 03, 2013 6:00 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > Subject: Re: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub support > > > On 03.05.2013, at 13:08, Alexander Graf wrote: > > > > > > > Am 03.05.2013 um 12:48 schrieb Bhushan Bharat-R65777 : > > > >>>>>> +static void kvmppc_booke_vcpu_load_debug_regs(struct kvm_vcpu > >>>>>> +*vcpu) { > >>>>>> +if (!vcpu->arch.debug_active) > >>>>>> +return; > >>>>>> + > >>>>>> +/* Disable all debug events and clead pending debug events */ > >>>>>> +mtspr(SPRN_DBCR0, 0x0); > >>>>>> +kvmppc_clear_dbsr(); > >>>>>> + > >>>>>> +/* > >>>>>> + * Check whether guest still need debug resource, if not then > >>>>>> there > >>>>>> + * is no need to restore guest context. > >>>>>> + */ > >>>>>> +if (!vcpu->arch.shadow_dbg_reg.dbcr0) > >>>>>> +return; > >>>>>> + > >>>>>> +/* Load Guest Context */ > >>>>>> +mtspr(SPRN_DBCR1, vcpu->arch.shadow_dbg_reg.dbcr1); > >>>>>> +mtspr(SPRN_DBCR2, vcpu->arch.shadow_dbg_reg.dbcr2); #ifdef > >>>>>> +CONFIG_KVM_E500MC > >>>>>> +mtspr(SPRN_DBCR4, vcpu->arch.shadow_dbg_reg.dbcr4); > >>>>> > >>>>> You need to make sure DBCR4 is 0 when you leave things back to > >>>>> normal user space. Otherwise guest debug can interfere with host debug. 
> >>>> > >>>> > >>>> ok > >>>> > >>>>> > >>>>>> +#endif > >>>>>> +mtspr(SPRN_IAC1, vcpu->arch.shadow_dbg_reg.iac[0]); > >>>>>> +mtspr(SPRN_IAC2, vcpu->arch.shadow_dbg_reg.iac[1]); > >>>>>> +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 > >>>>>> +mtspr(SPRN_IAC3, vcpu->arch.shadow_dbg_reg.iac[2]); > >>>>>> +mtspr(SPRN_IAC4, vcpu->arch.shadow_dbg_reg.iac[3]); > >>>>>> +#endif > >>>>>> +mtspr(SPRN_DAC1, vcpu->arch.shadow_dbg_reg.dac[0]); > >>>>>> +mtspr(SPRN_DAC2, vcpu->arch.shadow_dbg_reg.dac[1]); > >>>>>> + > >>>>>> +/* Enable debug events after other debug registers restored */ > >>>>>> +mtspr(SPRN_DBCR0, vcpu->arch.shadow_dbg_reg.dbcr0); } > >>>>> > >>>>> All of the code above looks suspiciously similar to > >>>>> prime_debug_regs();. Can't we somehow reuse that? > >>>> > >>>> I think we can if > >>>> - Save thread->debug_regs in local data structure > >>> > >>> Yes, it can even be on the stack. > >>> > >>>> - Load vcpu->arch->debug_regs in thread->debug_regs > >>>> - Call prime_debug_regs(); > >>>> - Restore thread->debug_regs from local save values in first step > >>> > >>> On heavyweight exit, based on the values on stack, yes. > >> > >> This is how I think we can save/restore debug context. Please correct if I > >> am > missing something. > > > > Sounds about right :) > > Actually, what happens if a guest breakpoint is set to a kernel address that > happens to be within the scope of kvm code? You mean address of kvm code in guest or host? If host, we already mentioned that we do not support that. Right? -Bharat > We do accept debug events between > vcpu_run and the assembly code, right? > > > Alex > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub support
> >>> +static void kvmppc_booke_vcpu_load_debug_regs(struct kvm_vcpu > >>> +*vcpu) { > >>> + if (!vcpu->arch.debug_active) > >>> + return; > >>> + > >>> + /* Disable all debug events and clead pending debug events */ > >>> + mtspr(SPRN_DBCR0, 0x0); > >>> + kvmppc_clear_dbsr(); > >>> + > >>> + /* > >>> + * Check whether guest still need debug resource, if not then there > >>> + * is no need to restore guest context. > >>> + */ > >>> + if (!vcpu->arch.shadow_dbg_reg.dbcr0) > >>> + return; > >>> + > >>> + /* Load Guest Context */ > >>> + mtspr(SPRN_DBCR1, vcpu->arch.shadow_dbg_reg.dbcr1); > >>> + mtspr(SPRN_DBCR2, vcpu->arch.shadow_dbg_reg.dbcr2); #ifdef > >>> +CONFIG_KVM_E500MC > >>> + mtspr(SPRN_DBCR4, vcpu->arch.shadow_dbg_reg.dbcr4); > >> > >> You need to make sure DBCR4 is 0 when you leave things back to normal > >> user space. Otherwise guest debug can interfere with host debug. > > > > > > ok > > > >> > >>> +#endif > >>> + mtspr(SPRN_IAC1, vcpu->arch.shadow_dbg_reg.iac[0]); > >>> + mtspr(SPRN_IAC2, vcpu->arch.shadow_dbg_reg.iac[1]); > >>> +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 > >>> + mtspr(SPRN_IAC3, vcpu->arch.shadow_dbg_reg.iac[2]); > >>> + mtspr(SPRN_IAC4, vcpu->arch.shadow_dbg_reg.iac[3]); > >>> +#endif > >>> + mtspr(SPRN_DAC1, vcpu->arch.shadow_dbg_reg.dac[0]); > >>> + mtspr(SPRN_DAC2, vcpu->arch.shadow_dbg_reg.dac[1]); > >>> + > >>> + /* Enable debug events after other debug registers restored */ > >>> + mtspr(SPRN_DBCR0, vcpu->arch.shadow_dbg_reg.dbcr0); } > >> > >> All of the code above looks suspiciously similar to > >> prime_debug_regs();. Can't we somehow reuse that? > > > > I think we can if > > - Save thread->debug_regs in local data structure > > Yes, it can even be on the stack. > > > - Load vcpu->arch->debug_regs in thread->debug_regs > > - Call prime_debug_regs(); > > - Restore thread->debug_regs from local save values in first step > > On heavyweight exit, based on the values on stack, yes. This is how I think we can save/restore debug context. 
Please correct me if I am missing something. 1) When QEMU is running -> thread->debug_reg == QEMU debug register context. -> Kernel will handle switching the debug registers on context switch. -> no vcpu_load() called 2) QEMU makes ioctls (except RUN) -> This will call vcpu_load() -> should not change context. -> Some ioctls can change the vcpu debug registers; that context is saved in vcpu->debug_regs 3) QEMU makes RUN ioctl -> Save thread->debug_reg on STACK -> Store vcpu->debug_reg into thread->debug_reg -> load thread->debug_reg -> RUN VCPU (so thread points to vcpu context) 4) Context switch happens while the VCPU is running -> vcpu_load() is called but should not load any context -> kernel loads the vcpu context as thread->debug_regs points to vcpu context. 5) On heavyweight_exit -> Load the context saved on stack into thread->debug_reg Thanks -Bharat -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Thursday, May 02, 2013 4:35 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > Subject: Re: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub support > > > On 02.05.2013, at 11:46, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Friday, April 26, 2013 4:46 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > >> Bhushan > >> Bharat-R65777 > >> Subject: Re: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub > >> support > >> > >> > >> On 08.04.2013, at 12:32, Bharat Bhushan wrote: > >> > >>> From: Bharat Bhushan > >>> > >>> This patch adds the debug stub support on booke/bookehv. > >>> Now QEMU debug stub can use hw breakpoint, watchpoint and software > >>> breakpoint to debug guest. > >>> > >>> Debug registers are saved/restored on vcpu_put()/vcpu_get(). > >>> Also the debug registers are saved restored only if guest is using > >>> debug resources. > >>> > >>> Currently we do not support debug resource emulation to guest, so > >>> always exit to user space irrespective of user space is expecting > >>> the debug exception or not. This is unexpected event and let us > >>> leave the action on user space. This is similar to what it was > >>> before, only thing is that now we have proper exit state available to user > space. 
> >>> > >>> Signed-off-by: Bharat Bhushan > >>> --- > >>> arch/powerpc/include/asm/kvm_host.h |8 + > >>> arch/powerpc/include/uapi/asm/kvm.h | 22 +++- > >>> arch/powerpc/kvm/booke.c| 242 > >>> - > -- > >>> arch/powerpc/kvm/booke.h|5 + > >>> 4 files changed, 255 insertions(+), 22 deletions(-) > >>> > >>> diff --git a/arch/powerpc/include/asm/kvm_host.h > >> b/arch/powerpc/include/asm/kvm_host.h > >>> index e34f8fe..b9ad20f 100644 > >>> --- a/arch/powerpc/include/asm/kvm_host.h > >>> +++ b/arch/powerpc/include/asm/kvm_host.h > >>> @@ -505,7 +505,15 @@ struct kvm_vcpu_arch { > >>> u32 mmucfg; > >>> u32 epr; > >>> u32 crit_save; > >>> + > >>> + /* Flag indicating that debug registers are used by guest */ > >>> + bool debug_active; > >>> + /* for save/restore thread->dbcr0 on vcpu run/heavyweight_exit */ > >>> + u32 saved_dbcr0; > >>> + /* guest debug registers*/ > >>> struct kvmppc_booke_debug_reg dbg_reg; > >>> + /* shadow debug registers */ > >>> + struct kvmppc_booke_debug_reg shadow_dbg_reg; > >>> #endif > >>> gpa_t paddr_accessed; > >>> gva_t vaddr_accessed; > >>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h > >> b/arch/powerpc/include/uapi/asm/kvm.h > >>> index c0c38ed..d7ce449 100644 > >>> --- a/arch/powerpc/include/uapi/asm/kvm.h > >>> +++ b/arch/powerpc/include/uapi/asm/kvm.h > >>> @@ -25,6 +25,7 @@ > >>> /* Select powerpc specific features in */ #define > >>> __KVM_HAVE_SPAPR_TCE #define __KVM_HAVE_PPC_SMT > >>> +#define __KVM_HAVE_GUEST_DEBUG > >>> > >>> struct kvm_regs { > >>> __u64 pc; > >>> @@ -267,7 +268,24 @@ struct kvm_fpu { > >>> __u64 fpr[32]; > >>> }; > >>> > >>> +/* > >>> + * Defines for h/w breakpoint, watchpoint (read, write or both) and > >>> + * software breakpoint. > >>> + * These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status" > >>> + * for KVM_DEBUG_EXIT. 
> >>> + */ > >>> +#define KVMPPC_DEBUG_NONE0x0 > >>> +#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1) > >>> +#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) > >>> +#define KVMPPC_DEBUG_WATCH_READ (1UL << 3) > >>> struct kvm_debug_exit_arch { > >>> + __u64 address; > >>> + /* > >>> + * exiting to userspace because of h/w breakpoint, watchpoint > >>> + * (read, write or both) and software breakpoint. > &g
RE: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Friday, April 26, 2013 4:46 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; Bhushan > Bharat-R65777 > Subject: Re: [PATCH 7/7 v3] KVM: PPC: Add userspace debug stub support > > > On 08.04.2013, at 12:32, Bharat Bhushan wrote: > > > From: Bharat Bhushan > > > > This patch adds the debug stub support on booke/bookehv. > > Now QEMU debug stub can use hw breakpoint, watchpoint and software > > breakpoint to debug guest. > > > > Debug registers are saved/restored on vcpu_put()/vcpu_get(). > > Also the debug registers are saved restored only if guest > > is using debug resources. > > > > Currently we do not support debug resource emulation to guest, > > so always exit to user space irrespective of user space is expecting > > the debug exception or not. This is unexpected event and let us > > leave the action on user space. This is similar to what it was before, > > only thing is that now we have proper exit state available to user space. 
> > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/include/asm/kvm_host.h |8 + > > arch/powerpc/include/uapi/asm/kvm.h | 22 +++- > > arch/powerpc/kvm/booke.c| 242 > > --- > > arch/powerpc/kvm/booke.h|5 + > > 4 files changed, 255 insertions(+), 22 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/kvm_host.h > b/arch/powerpc/include/asm/kvm_host.h > > index e34f8fe..b9ad20f 100644 > > --- a/arch/powerpc/include/asm/kvm_host.h > > +++ b/arch/powerpc/include/asm/kvm_host.h > > @@ -505,7 +505,15 @@ struct kvm_vcpu_arch { > > u32 mmucfg; > > u32 epr; > > u32 crit_save; > > + > > + /* Flag indicating that debug registers are used by guest */ > > + bool debug_active; > > + /* for save/restore thread->dbcr0 on vcpu run/heavyweight_exit */ > > + u32 saved_dbcr0; > > + /* guest debug registers*/ > > struct kvmppc_booke_debug_reg dbg_reg; > > + /* shadow debug registers */ > > + struct kvmppc_booke_debug_reg shadow_dbg_reg; > > #endif > > gpa_t paddr_accessed; > > gva_t vaddr_accessed; > > diff --git a/arch/powerpc/include/uapi/asm/kvm.h > b/arch/powerpc/include/uapi/asm/kvm.h > > index c0c38ed..d7ce449 100644 > > --- a/arch/powerpc/include/uapi/asm/kvm.h > > +++ b/arch/powerpc/include/uapi/asm/kvm.h > > @@ -25,6 +25,7 @@ > > /* Select powerpc specific features in */ > > #define __KVM_HAVE_SPAPR_TCE > > #define __KVM_HAVE_PPC_SMT > > +#define __KVM_HAVE_GUEST_DEBUG > > > > struct kvm_regs { > > __u64 pc; > > @@ -267,7 +268,24 @@ struct kvm_fpu { > > __u64 fpr[32]; > > }; > > > > +/* > > + * Defines for h/w breakpoint, watchpoint (read, write or both) and > > + * software breakpoint. > > + * These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status" > > + * for KVM_DEBUG_EXIT. 
> > + */ > > +#define KVMPPC_DEBUG_NONE 0x0 > > +#define KVMPPC_DEBUG_BREAKPOINT(1UL << 1) > > +#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) > > +#define KVMPPC_DEBUG_WATCH_READ(1UL << 3) > > struct kvm_debug_exit_arch { > > + __u64 address; > > + /* > > +* exiting to userspace because of h/w breakpoint, watchpoint > > +* (read, write or both) and software breakpoint. > > +*/ > > + __u32 status; > > + __u32 reserved; > > }; > > > > /* for KVM_SET_GUEST_DEBUG */ > > @@ -279,10 +297,6 @@ struct kvm_guest_debug_arch { > > * Type denotes h/w breakpoint, read watchpoint, write > > * watchpoint or watchpoint (both read and write). > > */ > > -#define KVMPPC_DEBUG_NONE 0x0 > > -#define KVMPPC_DEBUG_BREAKPOINT(1UL << 1) > > -#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) > > -#define KVMPPC_DEBUG_WATCH_READ(1UL << 3) > > __u32 type; > > __u32 reserved; > > } bp[16]; > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > > index 97ae158..0e93416 100644 > > --- a/arch/powerpc/kvm/booke.c > > +++ b/arch/powerpc/kvm/booke.c > > @@ -133,6 +133,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) > > #endif > > } > > > > +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vc
RE: [PATCH] ppc: initialize GPRs as per epapr
This was supposed to go to qemu-devel. Please ignore this patch. Thanks -Bharat > -Original Message- > From: Bhushan Bharat-R65777 > Sent: Friday, April 26, 2013 11:44 AM > To: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood Scott- > B07421 > Cc: Bhushan Bharat-R65777; Bhushan Bharat-R65777; Yoder Stuart-B08248 > Subject: [PATCH] ppc: initialize GPRs as per epapr > > ePAPR defines the initial values of cpu registers. This patch initialize the > GPRs as per ePAPR specification. > > This resolves the issue of guest reboot/reset (guest hang on reboot). > > Signed-off-by: Bharat Bhushan > Signed-off-by: Stuart Yoder > --- > hw/ppc/e500.c |7 +++ > 1 files changed, 7 insertions(+), 0 deletions(-) > > diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index c1bdb6b..a47f976 100644 > --- a/hw/ppc/e500.c > +++ b/hw/ppc/e500.c > @@ -37,6 +37,7 @@ > #include "qemu/host-utils.h" > #include "hw/pci-host/ppce500.h" > > +#define EPAPR_MAGIC(0x45504150) > #define BINARY_DEVICE_TREE_FILE"mpc8544ds.dtb" > #define UIMAGE_LOAD_BASE 0 > #define DTC_LOAD_PAD 0x180 > @@ -444,6 +445,12 @@ static void ppce500_cpu_reset(void *opaque) > cs->halted = 0; > env->gpr[1] = (16<<20) - 8; > env->gpr[3] = bi->dt_base; > +env->gpr[4] = 0; > +env->gpr[5] = 0; > +env->gpr[6] = EPAPR_MAGIC; > +env->gpr[7] = (64 * 1024 * 1024); > +env->gpr[8] = 0; > +env->gpr[9] = 0; > env->nip = bi->entry; > mmubooke_create_initial_mapping(env); > } > -- > 1.7.0.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] KVM : PPC : cache flush for kernel managed pages
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Thursday, April 25, 2013 8:36 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; Bhushan > Bharat-R65777 > Subject: Re: [PATCH] KVM : PPC : cache flush for kernel managed pages > > > On 23.04.2013, at 08:39, Bharat Bhushan wrote: > > > Kernel should only try flushing pages which are managed by kernel. > > pfn_to_page will returns junk struct page for pages not managed by > > kernel, so if kernel will try to flush direct mapped memory or direct > > assigned device mapping then it will work on junk struct page. > > > > Signed-off-by: Bharat Bhushan > > --- > > arch/powerpc/kvm/e500_mmu_host.c |3 ++- > > 1 files changed, 2 insertions(+), 1 deletions(-) > > > > diff --git a/arch/powerpc/kvm/e500_mmu_host.c > > b/arch/powerpc/kvm/e500_mmu_host.c > > index 1c6a9d7..e07da21 100644 > > --- a/arch/powerpc/kvm/e500_mmu_host.c > > +++ b/arch/powerpc/kvm/e500_mmu_host.c > > @@ -455,7 +455,8 @@ static inline int kvmppc_e500_shadow_map(struct > kvmppc_vcpu_e500 *vcpu_e500, > > ref, gvaddr, stlbe); > > > > /* Clear i-cache for new pages */ > > - kvmppc_mmu_flush_icache(pfn); > > + if (pfn_valid(pfn)) > > + kvmppc_mmu_flush_icache(pfn); > > Could you please move the check into kvmppc_mmu_flush_icache()? That way we're > guaranteed we can't screw up cache flushes ever :). > > Also, please add a comment saying why we need this. Ok -Bharat > > > Alex > > > > > /* Drop refcount on page, so that mmu notifiers can clear it */ > > kvm_release_pfn_clean(pfn); > > -- > > 1.7.0.4 > > > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] KVM/PPC: emulate ehpriv
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Friday, April 19, 2013 5:44 PM > To: Tiejun Chen > Cc: kvm@vger.kernel.org mailing list; kvm-...@vger.kernel.org; Bhushan Bharat- > R65777 > Subject: Re: [PATCH] KVM/PPC: emulate ehpriv > > > On 19.04.2013, at 04:44, Tiejun Chen wrote: > > > We can provide this emulation to simplify more extension later. > > Works for me, but this should really be part of a series that makes use of > ehpriv. Alex, this already planned to be in my debug patches. I know you are busy and I am just waiting for other patches to be reviewed :) -Bharat > > > Alex > > > > > Signed-off-by: Tiejun Chen > > --- > > arch/powerpc/include/asm/disassemble.h |4 > > arch/powerpc/kvm/e500_emulate.c| 17 + > > 2 files changed, 21 insertions(+) > > > > diff --git a/arch/powerpc/include/asm/disassemble.h > > b/arch/powerpc/include/asm/disassemble.h > > index 9b198d1..856f8de 100644 > > --- a/arch/powerpc/include/asm/disassemble.h > > +++ b/arch/powerpc/include/asm/disassemble.h > > @@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst) > > return inst & 0x; > > } > > > > +static inline unsigned int get_oc(u32 inst) { > > + return (inst >> 11) & 0x7fff; > > +} > > #endif /* __ASM_PPC_DISASSEMBLE_H__ */ diff --git > > a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c > > index e78f353..36492cf 100644 > > --- a/arch/powerpc/kvm/e500_emulate.c > > +++ b/arch/powerpc/kvm/e500_emulate.c > > @@ -26,6 +26,7 @@ > > #define XOP_TLBRE 946 > > #define XOP_TLBWE 978 > > #define XOP_TLBILX 18 > > +#define XOP_EHPRIV 270 > > > > #ifdef CONFIG_KVM_E500MC > > static int dbell2prio(ulong param) > > @@ -80,6 +81,18 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu > > *vcpu, int rb) > > > > return EMULATE_DONE; > > } > > + > > +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu > *vcpu, > > + unsigned int inst) > > +{ > > + int emulated = EMULATE_DONE; > > + > > + switch (get_oc(inst)) { > > + 
default: > > + emulated = EMULATE_FAIL; > > + } > > + return emulated; > > +} > > #endif > > > > int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, > > @@ -130,6 +143,10 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct > kvm_vcpu *vcpu, > > emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); > > break; > > > > + case XOP_EHPRIV: > > + emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst); > > + break; > > + > > default: > > emulated = EMULATE_FAIL; > > } > > -- > > 1.7.9.5 > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: RFC: vfio API changes needed for powerpc (v3)
So now the sequence would be something like: 1) VFIO_GROUP_SET_CONTAINER // add groups to the container 2) VFIO_SET_IOMMU(VFIO_FSL_PAMU) // set iommu model 3) count = VFIO_IOMMU_GET_MSI_BANK_COUNT // returns max # of MSI banks 4) VFIO_IOMMU_SET_ATTR(ATTR_GEOMETRY) // set overall aperture 5) VFIO_IOMMU_SET_ATTR(ATTR_WINDOWS) // set # of windows, including MSI banks 6) for (int i = 0; i < count; i++) VFIO_IOMMU_PAMU_MAP_MSI_BANK() // map the MSI banks, do not enable aperture here. 7) Memory Listener will call -> VFIO_IOMMU_MAP_DMA // map the guest's memory ---> kernel enables aperture here on first VFIO_IOMMU_MAP_DMA 8) VFIO_DEVICE_SET_IRQS ---> VFIO in kernel makes pci_enable_msix()/pci_enable_msi_block() calls; this sets the actual MSI addr/data in the physical device. ---> The address set by the above APIs is not what we want, so: -> if using MSI-X, VFIO will update the address in the MSI-X table -> if using MSI, VFIO will update the MSI address in PCI configuration space. Thanks -Bharat > -Original Message- > From: Yoder Stuart-B08248 > Sent: Friday, April 05, 2013 3:40 AM > To: Alex Williamson > Cc: Wood Scott-B07421; ag...@suse.de; Bhushan Bharat-R65777; Sethi > Varun-B16395; > kvm@vger.kernel.org; qemu-de...@nongnu.org; io...@lists.linux-foundation.org > Subject: RFC: vfio API changes needed for powerpc (v3) > > -v3 updates >-made vfio_pamu_attr a union, added flags >-s/VFIO_PAMU_/VFIO_IOMMU_PAMU_/ for the ioctls to make it more > clear which fd is being operated on >-added flags to vfio_pamu_msi_bank_map/umap >-VFIO_PAMU_GET_MSI_BANK_COUNT now just returns a __u32 > not a struct >-fixed some typos > > > > The Freescale PAMU is an aperture-based IOMMU with the following > characteristics. Each device has an entry in a table in memory > describing the iova->phys mapping.
The mapping has: >-an overall aperture that is power of 2 sized, and has a start iova that > is naturally aligned >-has 1 or more windows within the aperture > -number of windows must be power of 2, max is 256 > -size of each window is determined by aperture size / # of windows > -iova of each window is determined by aperture start iova / # of windows > -the mapped region in each window can be different than >the window size...mapping must power of 2 > -physical address of the mapping must be naturally aligned >with the mapping size > > These ioctls operate on the VFIO file descriptor (/dev/vfio/vfio). > > /* > * VFIO_IOMMU_PAMU_GET_ATTR > * > * Gets the iommu attributes for the current vfio container. This > * ioctl is applicable to an iommu type of VFIO_PAMU only. > * Caller sets argsz and attribute. The ioctl fills in > * the provided struct vfio_pamu_attr based on the attribute > * value that was set. > > * Return: 0 on success, -errno on failure > */ > struct vfio_pamu_attr { > __u32 argsz; > __u32 flags;/* no flags currently */ > __u32 attribute; > > union { > /* VFIO_ATTR_GEOMETRY */ > struct { > __u64 aperture_start; /* first addr that can be mapped > */ > __u64 aperture_end; /* last addr that can be mapped > */ > } attr; > > /* VFIO_ATTR_WINDOWS */ > __u32 windows; /* number of windows in the aperture */ > /* initially this will be the max number > * of windows that can be set > */ > > /* VFIO_ATTR_PAMU_STASH */ > struct { > __u32 cpu; /* CPU number for stashing */ > __u32 cache; /* cache ID for stashing */ > } stash; > } > }; > #define VFIO_IOMMU_PAMU_GET_ATTR _IO(VFIO_TYPE, VFIO_BASE + x, > struct vfio_pamu_attr) > > /* > * VFIO_IOMMU_PAMU_SET_ATTR > * > * Sets the iommu attributes for the current vfio container. This > * ioctl is applicable to an iommu type of VFIO_PAMU only. > * Caller sets struct vfio_pamu attr, including argsz and attribute and > * setting any fields that are valid for the attribute. 
> * Return: 0 on success, -errno on failure > */ > #define VFIO_IOMMU_PAMU_SET_ATTR _IO(VFIO_TYPE, VFIO_BASE + x, > struct vfio_pamu_attr) > > /* > * VFIO_IOMMU
RE: [PATCH] bookehv: Handle debug exception on guest exit
Hi Kumar/Benh, After looking further into the code, I think that if we correct the vector range below in the DebugDebug handler, then we do not need the change I provided in this patch. Here is the snapshot for 32 bit (head_booke.h, same will be true for 64 bit): #define DEBUG_DEBUG_EXCEPTION \ START_EXCEPTION(DebugDebug); \ DEBUG_EXCEPTION_PROLOG; \ \ /*\ * If there is a single step or branch-taken exception in an \ * exception entry sequence, it was probably meant to apply to\ * the code where the exception occurred (since exception entry \ * doesn't turn off DE automatically). We simulate the effect\ * of turning off DE on entry to an exception handler by turning \ * off DE in the DSRR1 value and clearing the debug status. \ */ \ mfspr r10,SPRN_DBSR; /* check single-step/branch taken */ \ andis. r10,r10,(DBSR_IC|DBSR_BT)@h; \ beq+2f; \ \ lis r10,KERNELBASE@h; /* check if exception in vectors */ \ ori r10,r10,KERNELBASE@l; \ cmplw r12,r10; \ blt+2f; /* addr below exception vectors */\ \ lis r10,DebugDebug@h;\ ori r10,r10,DebugDebug@l; \ Here we assume all exception vectors end at DebugDebug, which is not correct. We probably should get the proper end by using some start_vector and end_vector labels, or at least use the end at Ehvpriv (which is the last one defined in head_fsl_booke.S for PowerPC). Is that correct?
cmplw r12,r10; \ bgt+2f; /* addr above exception vectors */\ Thanks -Bharat > -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Bhushan Bharat-R65777 > Sent: Thursday, April 04, 2013 8:29 PM > To: Alexander Graf > Cc: linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org; > kvm-...@vger.kernel.org; > Wood Scott-B07421 > Subject: RE: [PATCH] bookehv: Handle debug exception on guest exit > > > > > -Original Message- > > From: Alexander Graf [mailto:ag...@suse.de] > > Sent: Thursday, April 04, 2013 6:55 PM > > To: Bhushan Bharat-R65777 > > Cc: linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org; > > kvm-...@vger.kernel.org; Wood Scott-B07421; Bhushan Bharat-R65777 > > Subject: Re: [PATCH] bookehv: Handle debug exception on guest exit > > > > > > On 20.03.2013, at 18:45, Bharat Bhushan wrote: > > > > > EPCR.DUVD controls whether the debug events can come in hypervisor > > > mode or not. When KVM guest is using the debug resource then we do > > > not want debug events to be captured in guest entry/exit path. So we > > > set EPCR.DUVD when entering and clears EPCR.DUVD when exiting from guest. > > > > > > Debug instruction complete is a post-completion debug exception but > > > debug event gets posted on the basis of MSR before the instruction > > > is executed. Now if the instruction switches the context from guest > > > mode (MSR.GS = 1) to hypervisor mode (MSR.GS = 0) then the xSRR0 > > > points to first instruction of KVM handler and xSRR1 points that > > > MSR.GS is clear (hypervisor context). Now as xSRR1.GS is used to > > > decide whether KVM handler will be invoked to handle the exception > > > or host host kernel debug handler will be invoked to handle the exception. > > > This leads to host kernel debug handler handling the exception which > > > should either be handled by KVM. 
> > > > > > This is tested on e500mc in 32 bit mode > > > > > > Signed-off-by: Bharat Bhushan > > > --- > > > v0: > > > - Do not apply this change for debug_crit as we do not know those > > > chips have > > issue or not. > > > - corrected 64bit case branch
RE: [PATCH] bookehv: Handle debug exception on guest exit
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Thursday, April 04, 2013 6:55 PM > To: Bhushan Bharat-R65777 > Cc: linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org; > kvm-...@vger.kernel.org; > Wood Scott-B07421; Bhushan Bharat-R65777 > Subject: Re: [PATCH] bookehv: Handle debug exception on guest exit > > > On 20.03.2013, at 18:45, Bharat Bhushan wrote: > > > EPCR.DUVD controls whether the debug events can come in hypervisor > > mode or not. When KVM guest is using the debug resource then we do not > > want debug events to be captured in guest entry/exit path. So we set > > EPCR.DUVD when entering and clears EPCR.DUVD when exiting from guest. > > > > Debug instruction complete is a post-completion debug exception but > > debug event gets posted on the basis of MSR before the instruction is > > executed. Now if the instruction switches the context from guest mode > > (MSR.GS = 1) to hypervisor mode (MSR.GS = 0) then the xSRR0 points to > > first instruction of KVM handler and xSRR1 points that MSR.GS is clear > > (hypervisor context). Now as xSRR1.GS is used to decide whether KVM > > handler will be invoked to handle the exception or host host kernel > > debug handler will be invoked to handle the exception. > > This leads to host kernel debug handler handling the exception which > > should either be handled by KVM. > > > > This is tested on e500mc in 32 bit mode > > > > Signed-off-by: Bharat Bhushan > > --- > > v0: > > - Do not apply this change for debug_crit as we do not know those chips have > issue or not. 
> > - corrected 64bit case branching > > > > arch/powerpc/kernel/exceptions-64e.S | 29 - > > arch/powerpc/kernel/head_booke.h | 26 ++ > > 2 files changed, 54 insertions(+), 1 deletions(-) > > > > diff --git a/arch/powerpc/kernel/exceptions-64e.S > > b/arch/powerpc/kernel/exceptions-64e.S > > index 4684e33..8b26294 100644 > > --- a/arch/powerpc/kernel/exceptions-64e.S > > +++ b/arch/powerpc/kernel/exceptions-64e.S > > @@ -516,6 +516,33 @@ kernel_dbg_exc: > > andis. r15,r14,DBSR_IC@h > > beq+1f > > > > +#ifdef CONFIG_KVM_BOOKE_HV > > + /* > > +* EPCR.DUVD controls whether the debug events can come in > > +* hypervisor mode or not. When KVM guest is using the debug > > +* resource then we do not want debug events to be captured > > +* in guest entry/exit path. So we set EPCR.DUVD when entering > > +* and clears EPCR.DUVD when exiting from guest. > > +* Debug instruction complete is a post-completion debug > > +* exception but debug event gets posted on the basis of MSR > > +* before the instruction is executed. Now if the instruction > > +* switches the context from guest mode (MSR.GS = 1) to hypervisor > > +* mode (MSR.GS = 0) then the xSRR0 points to first instruction of > > Can't we just execute that code path with MSR.DE=0? Single stepping uses DBCR0.IC (instruction complete). Can you describe how MSR.DE = 0 will work? > > > Alex > > > +* KVM handler and xSRR1 points that MSR.GS is clear > > +* (hypervisor context). Now as xSRR1.GS is used to decide whether > > +* KVM handler will be invoked to handle the exception or host > > +* host kernel debug handler will be invoked to handle the exception. > > +* This leads to host kernel debug handler handling the exception > > +* which should either be handled by KVM. > > +*/ > > + mfspr r10, SPRN_EPCR > > + andis. r10,r10,SPRN_EPCR_DUVD@h > > + beq+2f > > + > > + andis. 
r10,r9,MSR_GS@h > > + beq+3f > > +2: > > +#endif > > LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) > > LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e) > > cmpld cr0,r10,r14 > > @@ -523,7 +550,7 @@ kernel_dbg_exc: > > blt+cr0,1f > > bge+cr1,1f > > > > - /* here it looks like we got an inappropriate debug exception. */ > > +3: /* here it looks like we got an inappropriate debug exception. */ > > lis r14,DBSR_IC@h /* clear the IC event */ > > rlwinm r11,r11,0,~MSR_DE /* clear DE in the DSRR1 value */ > > mtspr SPRN_DBSR,r14 > > diff --git a/arch/powerpc/kernel/head_booke.h > > b/arch/powerpc/kernel/head_booke.h > > index 5f051ee..edc6a3b 100644 > > --- a/arch/powerpc/kernel/head_booke.h > > +++ b/arch/powerpc/kernel/head_booke.h > > @@ -285,7 +285,33 @@ l
RE: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf > Of > Alexander Graf > Sent: Wednesday, April 03, 2013 11:26 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support > > > > Am 03.04.2013 um 19:47 schrieb Bhushan Bharat-R65777 : > > >>>>>>>>> +dbg_reg =&(vcpu->arch.shadow_dbg_reg); > >>>>>>>>> + > >>>>>>>>> +/* > >>>>>>>>> + * On BOOKE (e500v2); Set DBCR1 and DBCR2 to allow debug events > >>>>>>>>> + * to occur when MSR.PR is set. > >>>>>>>>> + * On BOOKE-HV (e500mc+); MSR.PR = 0 when guest is running. So > >>>>>>>>> we > >>>>>>>>> + * should clear DBCR1 and DBCR2. > >>>>>>>>> + */ > >>>>>>>>> +#ifdef CONFIG_KVM_BOOKE_HV > >>>>>>>>> +dbg_reg->dbcr1 = 0; > >>>>>>>>> +dbg_reg->dbcr2 = 0; > >>>>>>>> Does that mean we can't debug guest user space? > >>>>>>> Yes > >>>>>> This is wrong. > >>>>> Really, So far I am assuming qemu debug stub is not mean for > >>>>> debugging guest > >>>> application. > >>>> > >>>> Ok, let me rephrase: This is confusing. You do trap in PR mode on > >>>> e500v2. IIRC > >>>> x86 also traps in kernel and user space. I don't see why e500 hv > >>>> should be different. > >>> > >>> I am sorry, I think did not read the document correctly. > >>> > >>> DBCR1 = 0 ; means the "00 IAC1 debug conditions unaffected by > MSR[PR],MSR[GS]. > >>> > >>> Similarly for dbcr2. > >>> > >>> So yes the guest user space can be debugged. > >> > >> So why is this conditional on BOOKE_HV then? Wouldn't it make things > >> easier to treat HV and PR identical? > > > > On BOOKE-HV we have to keep these to 0, so guest and guest application both > can be debugged. Also on HV we have EPCR.DUVD to control that debug events > will > not come in hypervisor (GS = 0). > > > > On BOOKE; guest and guest application both runs in PR = 1 and hypervisor in > > PR > = 0. 
So with dbcr1/dbcr2 on booke we control debug exception not to come in > hypervisor mode still allow guest and its application debugging. > > Ah, can we group these 2 overrides next to each other with an #ifdef ... #else > to make this obvious from the code? I will try :) Thanks -Bharat -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support
> >>> + dbg_reg =&(vcpu->arch.shadow_dbg_reg); > >>> + > >>> + /* > >>> + * On BOOKE (e500v2); Set DBCR1 and DBCR2 to allow debug events > >>> + * to occur when MSR.PR is set. > >>> + * On BOOKE-HV (e500mc+); MSR.PR = 0 when guest is running. So > >>> we > >>> + * should clear DBCR1 and DBCR2. > >>> + */ > >>> +#ifdef CONFIG_KVM_BOOKE_HV > >>> + dbg_reg->dbcr1 = 0; > >>> + dbg_reg->dbcr2 = 0; > >> Does that mean we can't debug guest user space? > > Yes > This is wrong. > >>> Really, So far I am assuming qemu debug stub is not mean for > >>> debugging guest > >> application. > >> > >> Ok, let me rephrase: This is confusing. You do trap in PR mode on > >> e500v2. IIRC > >> x86 also traps in kernel and user space. I don't see why e500 hv > >> should be different. > > > > I am sorry, I think did not read the document correctly. > > > > DBCR1 = 0 ; means the "00 IAC1 debug conditions unaffected by > > MSR[PR],MSR[GS]. > > > > Similarly for dbcr2. > > > > So yes the guest user space can be debugged. > > So why is this conditional on BOOKE_HV then? Wouldn't it make things easier to > treat HV and PR identical? > On BOOKE-HV we have to keep these at 0 so that both the guest and guest applications can be debugged. Also, on HV we have EPCR.DUVD to ensure that debug events do not occur in the hypervisor (GS = 0). On BOOKE, the guest and guest applications both run with PR = 1 and the hypervisor with PR = 0. So with dbcr1/dbcr2 on BOOKE we prevent debug exceptions from occurring in hypervisor mode while still allowing debugging of the guest and its applications. Thanks -Bharat -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Tuesday, April 02, 2013 9:11 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support > > On 04/02/2013 04:09 PM, Bhushan Bharat-R65777 wrote: > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Tuesday, April 02, 2013 1:57 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > >> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >> support > >> > >> > >> On 29.03.2013, at 07:04, Bhushan Bharat-R65777 wrote: > >> > >>> > >>>> -Original Message- > >>>> From: Alexander Graf [mailto:ag...@suse.de] > >>>> Sent: Thursday, March 28, 2013 10:06 PM > >>>> To: Bhushan Bharat-R65777 > >>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood > >>>> Scott-B07421; Bhushan > >>>> Bharat-R65777 > >>>> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >>>> support > >>>> > >>>> > >>>> On 21.03.2013, at 07:25, Bharat Bhushan wrote: > >>>> > >>>>> From: Bharat Bhushan > >>>>> > >>>>> This patch adds the debug stub support on booke/bookehv. > >>>>> Now QEMU debug stub can use hw breakpoint, watchpoint and software > >>>>> breakpoint to debug guest. > >>>>> > >>>>> Debug registers are saved/restored on vcpu_put()/vcpu_get(). > >>>>> Also the debug registers are saved restored only if guest is using > >>>>> debug resources. > >>>>> > >>>>> Signed-off-by: Bharat Bhushan > >>>>> --- > >>>>> v2: > >>>>> - save/restore in vcpu_get()/vcpu_put() > >>>>> - some more minor cleanup based on review comments. 
> >>>>> > >>>>> arch/powerpc/include/asm/kvm_host.h | 10 ++ > >>>>> arch/powerpc/include/uapi/asm/kvm.h | 22 +++- > >>>>> arch/powerpc/kvm/booke.c| 252 > - > >> -- > >>>>> arch/powerpc/kvm/e500_emulate.c | 10 ++ > >>>>> 4 files changed, 272 insertions(+), 22 deletions(-) > >>>>> > >>>>> diff --git a/arch/powerpc/include/asm/kvm_host.h > >>>>> b/arch/powerpc/include/asm/kvm_host.h > >>>>> index f4ba881..8571952 100644 > >>>>> --- a/arch/powerpc/include/asm/kvm_host.h > >>>>> +++ b/arch/powerpc/include/asm/kvm_host.h > >>>>> @@ -504,7 +504,17 @@ struct kvm_vcpu_arch { > >>>>> u32 mmucfg; > >>>>> u32 epr; > >>>>> u32 crit_save; > >>>>> + /* guest debug registers*/ > >>>>> struct kvmppc_booke_debug_reg dbg_reg; > >>>>> + /* shadow debug registers */ > >>>>> + struct kvmppc_booke_debug_reg shadow_dbg_reg; > >>>>> + /* host debug registers*/ > >>>>> + struct kvmppc_booke_debug_reg host_dbg_reg; > >>>>> + /* > >>>>> +* Flag indicating that debug registers are used by guest > >>>>> +* and requires save restore. > >>>>> + */ > >>>>> + bool debug_save_restore; > >>>>> #endif > >>>>> gpa_t paddr_accessed; > >>>>> gva_t vaddr_accessed; > >>>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h > >>>>> b/arch/powerpc/include/uapi/asm/kvm.h > >>>>> index 15f9a00..d7ce449 100644 > >>>>> --- a/arch/powerpc/include/uapi/asm/kvm.h > >>>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h > >>>>> @@ -25,6 +25,7 @@ > >>>>> /* Select powerpc specific features in */ #define > >>>>> __KVM_HAVE_SPAPR_TCE #define __KVM_HAVE_PPC_SMT > >>>>> +#define __KVM_HAVE_GUEST_DEBUG > >>>>> > >>>>> struct kvm_regs { > >>>>> __u64 pc; > >>>>> @@ -267,7 +268,24 @@ struct kvm_fpu { > >>
RE: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Wednesday, April 03, 2013 7:39 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; kvm-...@vger.kernel.org; kvm@vger.kernel.org > Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support > > > On 03.04.2013, at 15:50, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: kvm-ppc-ow...@vger.kernel.org > >> [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf > >> Sent: Wednesday, April 03, 2013 3:58 PM > >> To: Bhushan Bharat-R65777 > >> Cc: Wood Scott-B07421; kvm-...@vger.kernel.org; kvm@vger.kernel.org > >> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >> support > >> > >> > >> > >> Am 03.04.2013 um 12:03 schrieb Bhushan Bharat-R65777 > >> : > >> > >>> > >>> > >>>> -Original Message- > >>>> From: Wood Scott-B07421 > >>>> Sent: Tuesday, April 02, 2013 11:30 PM > >>>> To: Bhushan Bharat-R65777 > >>>> Cc: Alexander Graf; kvm-...@vger.kernel.org; kvm@vger.kernel.org; > >>>> Wood Scott- > >>>> B07421 > >>>> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >>>> support > >>>> > >>>> On 04/02/2013 09:09:34 AM, Bhushan Bharat-R65777 wrote: > >>>>> > >>>>> > >>>>>> -Original Message- > >>>>>> From: Alexander Graf [mailto:ag...@suse.de] > >>>>>> Sent: Tuesday, April 02, 2013 1:57 PM > >>>>>> To: Bhushan Bharat-R65777 > >>>>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood > >>>>>> Scott-B07421 > >>>>>> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >>>>> support > >>>>>> > >>>>>> > >>>>>> On 29.03.2013, at 07:04, Bhushan Bharat-R65777 wrote: > >>>>>> > >>>>>>> > >>>>>>> > >>>>>>>> -Original Message- > >>>>>>>> From: Alexander Graf [mailto:ag...@suse.de] > >>>>>>>> Sent: Thursday, March 28, 2013 10:06 PM > >>>>>>>> To: Bhushan Bharat-R65777 > >>>>>>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood > >>>>> Scott-B07421; > >>>>>>>> Bhushan > >>>>>>>> Bharat-R65777 > >>>>>>>> Subject: 
Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >>>>>>>> support > >>>>>>>> > >>>>>>>> > >>>>>>>> How does the normal debug register switching code work in Linux? > >>>>>>>> Can't we just reuse that? Or rely on it to restore working > >>>>>>>> state > >>>>> when > >>>>>>>> another process gets scheduled in? > >>>>>>> > >>>>>>> Good point, I can see debug registers loading in function > >>>>> __switch_to()- > >>>>>>> switch_booke_debug_regs() in file arch/powerpc/kernel/process.c. > >>>>>>> So as long as assume that host will not use debug resources we > >>>>> can rely on > >>>>>> this restore. But I am not sure that this is a fare assumption. > >>>>>> As > >>>>> Scott earlier > >>>>>> mentioned someone can use debug resource for kernel debugging also. > >>>>>> > >>>>>> Someone in the kernel can also use floating point registers. But > >>>>> then it's his > >>>>>> responsibility to clean up the mess he leaves behind. > >>>>> > >>>>> I am neither convinced by what you said and nor even have much > >>>>> reason to oppose :) > >>>>> > >>>>> Scott, > >>>>> I remember you mentioned that host can use debug resources, you > >>>>> comment on this ? > >>>> > >>>> I thought the conclusion we reached was that it was OK as long as > >>>> KVM waits until it actually needs the debug resources to mess with the > registers.
RE: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Alexander Graf > Sent: Wednesday, April 03, 2013 3:58 PM > To: Bhushan Bharat-R65777 > Cc: Wood Scott-B07421; kvm-...@vger.kernel.org; kvm@vger.kernel.org > Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support > > > > Am 03.04.2013 um 12:03 schrieb Bhushan Bharat-R65777 : > > > > > > >> -Original Message- > >> From: Wood Scott-B07421 > >> Sent: Tuesday, April 02, 2013 11:30 PM > >> To: Bhushan Bharat-R65777 > >> Cc: Alexander Graf; kvm-...@vger.kernel.org; kvm@vger.kernel.org; > >> Wood Scott- > >> B07421 > >> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >> support > >> > >> On 04/02/2013 09:09:34 AM, Bhushan Bharat-R65777 wrote: > >>> > >>> > >>>> -Original Message- > >>>> From: Alexander Graf [mailto:ag...@suse.de] > >>>> Sent: Tuesday, April 02, 2013 1:57 PM > >>>> To: Bhushan Bharat-R65777 > >>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > >>>> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >>> support > >>>> > >>>> > >>>> On 29.03.2013, at 07:04, Bhushan Bharat-R65777 wrote: > >>>> > >>>>> > >>>>> > >>>>>> -Original Message- > >>>>>> From: Alexander Graf [mailto:ag...@suse.de] > >>>>>> Sent: Thursday, March 28, 2013 10:06 PM > >>>>>> To: Bhushan Bharat-R65777 > >>>>>> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood > >>> Scott-B07421; > >>>>>> Bhushan > >>>>>> Bharat-R65777 > >>>>>> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >>>>>> support > >>>>>> > >>>>>> > >>>>>> How does the normal debug register switching code work in Linux? > >>>>>> Can't we just reuse that? Or rely on it to restore working state > >>> when > >>>>>> another process gets scheduled in? > >>>>> > >>>>> Good point, I can see debug registers loading in function > >>> __switch_to()- > >>>>> switch_booke_debug_regs() in file arch/powerpc/kernel/process.c. 
> >>>>> So as long as assume that host will not use debug resources we > >>> can rely on > >>>> this restore. But I am not sure that this is a fare assumption. As > >>> Scott earlier > >>>> mentioned someone can use debug resource for kernel debugging also. > >>>> > >>>> Someone in the kernel can also use floating point registers. But > >>> then it's his > >>>> responsibility to clean up the mess he leaves behind. > >>> > >>> I am neither convinced by what you said and nor even have much > >>> reason to oppose :) > >>> > >>> Scott, > >>>I remember you mentioned that host can use debug resources, you > >>> comment on this ? > >> > >> I thought the conclusion we reached was that it was OK as long as KVM > >> waits until it actually needs the debug resources to mess with the > >> registers. > > > > Right, Are we also agreeing on that KVM will not save/restore host debug > context on vcpu_load/vcpu_put()? KVM will load its context in vcpu_load() if > needed and on vcpu_put() it will clear DBCR0 and DBSR. > > That depends on whether the kernel restores the debug registers. Please > double- > check that. Currently the kernel code restore the debug state of new schedule process in context_switch(). switch_booke_debug_regs() from __switch_to() and defined as : /* * Unless neither the old or new thread are making use of the * debug registers, set the debug registers from the values * stored in the new thread. */ static void switch_booke_debug_regs(struct thread_struct *new_thread) { if ((current->thread.dbcr0 & DBCR0_IDM) || (new_thread->dbcr0 & DBCR0_IDM)) prime_debug_regs(new_thread); } static void prime_debug_regs(struct thread_struct *thread) { mtspr(SPRN_IAC1, thread->iac1); mtspr(SPRN_IAC2, thread->iac2); #if CONFIG_PPC_ADV_DEBUG_IACS > 2 mtspr(SPRN_IAC3, thread->iac3); mtspr(SPRN_IAC4,
RE: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, April 02, 2013 11:30 PM > To: Bhushan Bharat-R65777 > Cc: Alexander Graf; kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott- > B07421 > Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support > > On 04/02/2013 09:09:34 AM, Bhushan Bharat-R65777 wrote: > > > > > > > -Original Message- > > > From: Alexander Graf [mailto:ag...@suse.de] > > > Sent: Tuesday, April 02, 2013 1:57 PM > > > To: Bhushan Bharat-R65777 > > > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > > > Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > > support > > > > > > > > > On 29.03.2013, at 07:04, Bhushan Bharat-R65777 wrote: > > > > > > > > > > > > > > >> -Original Message- > > > >> From: Alexander Graf [mailto:ag...@suse.de] > > > >> Sent: Thursday, March 28, 2013 10:06 PM > > > >> To: Bhushan Bharat-R65777 > > > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood > > Scott-B07421; > > > >> Bhushan > > > >> Bharat-R65777 > > > >> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > > > >> support > > > >> > > > >> > > > >> How does the normal debug register switching code work in Linux? > > > >> Can't we just reuse that? Or rely on it to restore working state > > when > > > >> another process gets scheduled in? > > > > > > > > Good point, I can see debug registers loading in function > > __switch_to()- > > > >switch_booke_debug_regs() in file arch/powerpc/kernel/process.c. > > > > So as long as assume that host will not use debug resources we > > can rely on > > > this restore. But I am not sure that this is a fare assumption. As > > Scott earlier > > > mentioned someone can use debug resource for kernel debugging also. > > > > > > Someone in the kernel can also use floating point registers. But > > then it's his > > > responsibility to clean up the mess he leaves behind. 
> > > > I am neither convinced by what you said and nor even have much reason > > to oppose :) > > > > Scott, > > I remember you mentioned that host can use debug resources, you > > comment on this ? > > I thought the conclusion we reached was that it was OK as long as KVM waits > until it actually needs the debug resources to mess with the registers. Right, Are we also agreeing on that KVM will not save/restore host debug context on vcpu_load/vcpu_put()? KVM will load its context in vcpu_load() if needed and on vcpu_put() it will clear DBCR0 and DBSR. Thanks -Bharat -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Tuesday, April 02, 2013 1:57 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support > > > On 29.03.2013, at 07:04, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Thursday, March 28, 2013 10:06 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > >> Bhushan > >> Bharat-R65777 > >> Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub > >> support > >> > >> > >> On 21.03.2013, at 07:25, Bharat Bhushan wrote: > >> > >>> From: Bharat Bhushan > >>> > >>> This patch adds the debug stub support on booke/bookehv. > >>> Now QEMU debug stub can use hw breakpoint, watchpoint and software > >>> breakpoint to debug guest. > >>> > >>> Debug registers are saved/restored on vcpu_put()/vcpu_get(). > >>> Also the debug registers are saved restored only if guest is using > >>> debug resources. > >>> > >>> Signed-off-by: Bharat Bhushan > >>> --- > >>> v2: > >>> - save/restore in vcpu_get()/vcpu_put() > >>> - some more minor cleanup based on review comments. 
> >>> > >>> arch/powerpc/include/asm/kvm_host.h | 10 ++ > >>> arch/powerpc/include/uapi/asm/kvm.h | 22 +++- > >>> arch/powerpc/kvm/booke.c| 252 > >>> - > -- > >>> arch/powerpc/kvm/e500_emulate.c | 10 ++ > >>> 4 files changed, 272 insertions(+), 22 deletions(-) > >>> > >>> diff --git a/arch/powerpc/include/asm/kvm_host.h > >>> b/arch/powerpc/include/asm/kvm_host.h > >>> index f4ba881..8571952 100644 > >>> --- a/arch/powerpc/include/asm/kvm_host.h > >>> +++ b/arch/powerpc/include/asm/kvm_host.h > >>> @@ -504,7 +504,17 @@ struct kvm_vcpu_arch { > >>> u32 mmucfg; > >>> u32 epr; > >>> u32 crit_save; > >>> + /* guest debug registers*/ > >>> struct kvmppc_booke_debug_reg dbg_reg; > >>> + /* shadow debug registers */ > >>> + struct kvmppc_booke_debug_reg shadow_dbg_reg; > >>> + /* host debug registers*/ > >>> + struct kvmppc_booke_debug_reg host_dbg_reg; > >>> + /* > >>> + * Flag indicating that debug registers are used by guest > >>> + * and requires save restore. > >>> + */ > >>> + bool debug_save_restore; > >>> #endif > >>> gpa_t paddr_accessed; > >>> gva_t vaddr_accessed; > >>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h > >>> b/arch/powerpc/include/uapi/asm/kvm.h > >>> index 15f9a00..d7ce449 100644 > >>> --- a/arch/powerpc/include/uapi/asm/kvm.h > >>> +++ b/arch/powerpc/include/uapi/asm/kvm.h > >>> @@ -25,6 +25,7 @@ > >>> /* Select powerpc specific features in */ #define > >>> __KVM_HAVE_SPAPR_TCE #define __KVM_HAVE_PPC_SMT > >>> +#define __KVM_HAVE_GUEST_DEBUG > >>> > >>> struct kvm_regs { > >>> __u64 pc; > >>> @@ -267,7 +268,24 @@ struct kvm_fpu { > >>> __u64 fpr[32]; > >>> }; > >>> > >>> +/* > >>> + * Defines for h/w breakpoint, watchpoint (read, write or both) and > >>> + * software breakpoint. > >>> + * These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status" > >>> + * for KVM_DEBUG_EXIT. 
> >>> + */ > >>> +#define KVMPPC_DEBUG_NONE0x0 > >>> +#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1) > >>> +#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) > >>> +#define KVMPPC_DEBUG_WATCH_READ (1UL << 3) > >>> struct kvm_debug_exit_arch { > >>> + __u64 address; > >>> + /* > >>> + * exiting to userspace because of h/w breakpoint, watchpoint > >>> + * (read, write or both) and software breakpoint. > >>> + */ > >>> + __u32 status; > >>> + __u32 reserved; > >>> }; > >>> > >>> /* for KVM_SET_GUEST_DEBUG */ > >>> @@ -279
RE: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Thursday, March 28, 2013 10:06 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; Bhushan > Bharat-R65777 > Subject: Re: [PATCH 4/4 v2] KVM: PPC: Add userspace debug stub support > > > On 21.03.2013, at 07:25, Bharat Bhushan wrote: > > > From: Bharat Bhushan > > > > This patch adds the debug stub support on booke/bookehv. > > Now QEMU debug stub can use hw breakpoint, watchpoint and software > > breakpoint to debug guest. > > > > Debug registers are saved/restored on vcpu_put()/vcpu_get(). > > Also the debug registers are saved restored only if guest is using > > debug resources. > > > > Signed-off-by: Bharat Bhushan > > --- > > v2: > > - save/restore in vcpu_get()/vcpu_put() > > - some more minor cleanup based on review comments. > > > > arch/powerpc/include/asm/kvm_host.h | 10 ++ > > arch/powerpc/include/uapi/asm/kvm.h | 22 +++- > > arch/powerpc/kvm/booke.c| 252 > > --- > > arch/powerpc/kvm/e500_emulate.c | 10 ++ > > 4 files changed, 272 insertions(+), 22 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/kvm_host.h > > b/arch/powerpc/include/asm/kvm_host.h > > index f4ba881..8571952 100644 > > --- a/arch/powerpc/include/asm/kvm_host.h > > +++ b/arch/powerpc/include/asm/kvm_host.h > > @@ -504,7 +504,17 @@ struct kvm_vcpu_arch { > > u32 mmucfg; > > u32 epr; > > u32 crit_save; > > + /* guest debug registers*/ > > struct kvmppc_booke_debug_reg dbg_reg; > > + /* shadow debug registers */ > > + struct kvmppc_booke_debug_reg shadow_dbg_reg; > > + /* host debug registers*/ > > + struct kvmppc_booke_debug_reg host_dbg_reg; > > + /* > > +* Flag indicating that debug registers are used by guest > > +* and requires save restore. 
> > + */ > > + bool debug_save_restore; > > #endif > > gpa_t paddr_accessed; > > gva_t vaddr_accessed; > > diff --git a/arch/powerpc/include/uapi/asm/kvm.h > > b/arch/powerpc/include/uapi/asm/kvm.h > > index 15f9a00..d7ce449 100644 > > --- a/arch/powerpc/include/uapi/asm/kvm.h > > +++ b/arch/powerpc/include/uapi/asm/kvm.h > > @@ -25,6 +25,7 @@ > > /* Select powerpc specific features in */ #define > > __KVM_HAVE_SPAPR_TCE #define __KVM_HAVE_PPC_SMT > > +#define __KVM_HAVE_GUEST_DEBUG > > > > struct kvm_regs { > > __u64 pc; > > @@ -267,7 +268,24 @@ struct kvm_fpu { > > __u64 fpr[32]; > > }; > > > > +/* > > + * Defines for h/w breakpoint, watchpoint (read, write or both) and > > + * software breakpoint. > > + * These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status" > > + * for KVM_DEBUG_EXIT. > > + */ > > +#define KVMPPC_DEBUG_NONE 0x0 > > +#define KVMPPC_DEBUG_BREAKPOINT(1UL << 1) > > +#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) > > +#define KVMPPC_DEBUG_WATCH_READ(1UL << 3) > > struct kvm_debug_exit_arch { > > + __u64 address; > > + /* > > +* exiting to userspace because of h/w breakpoint, watchpoint > > +* (read, write or both) and software breakpoint. > > +*/ > > + __u32 status; > > + __u32 reserved; > > }; > > > > /* for KVM_SET_GUEST_DEBUG */ > > @@ -279,10 +297,6 @@ struct kvm_guest_debug_arch { > > * Type denotes h/w breakpoint, read watchpoint, write > > * watchpoint or watchpoint (both read and write). 
> > */ > > -#define KVMPPC_DEBUG_NOTYPE0x0 > > -#define KVMPPC_DEBUG_BREAKPOINT(1UL << 1) > > -#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) > > -#define KVMPPC_DEBUG_WATCH_READ(1UL << 3) > > __u32 type; > > __u32 reserved; > > } bp[16]; > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index > > 1de93a8..bf20056 100644 > > --- a/arch/powerpc/kvm/booke.c > > +++ b/arch/powerpc/kvm/booke.c > > @@ -133,6 +133,30 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu > > *vcpu) #endif } > > > > +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) { > > + /* Synchronize guest's desire to get debug interrupts into shadow > > +MSR */ #ifndef CONFIG_KVM_BOOKE_HV > > + vcpu->arch.shadow_msr &= ~MSR_DE; > > +
RE: [PATCH 2/4 v2] KVM: PPC: debug stub interface parameter defined
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Friday, March 29, 2013 7:26 AM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; Bhushan > Bharat-R65777 > Subject: Re: [PATCH 2/4 v2] KVM: PPC: debug stub interface parameter defined > > > On 21.03.2013, at 07:24, Bharat Bhushan wrote: > > > From: Bharat Bhushan > > > > This patch defines the interface parameter for KVM_SET_GUEST_DEBUG > > ioctl support. Follow up patches will use this for setting up hardware > > breakpoints, watchpoints and software breakpoints. > > > > Also kvm_arch_vcpu_ioctl_set_guest_debug() is brought one level below. > > This is because I am not sure what is required for book3s. So this > > ioctl behaviour will not change for book3s. > > > > Signed-off-by: Bharat Bhushan > > --- > > v2: > > - No Change > > > > arch/powerpc/include/uapi/asm/kvm.h | 23 +++ > > arch/powerpc/kvm/book3s.c |6 ++ > > arch/powerpc/kvm/booke.c|6 ++ > > arch/powerpc/kvm/powerpc.c |6 -- > > 4 files changed, 35 insertions(+), 6 deletions(-) > > > > diff --git a/arch/powerpc/include/uapi/asm/kvm.h > > b/arch/powerpc/include/uapi/asm/kvm.h > > index c2ff99c..15f9a00 100644 > > --- a/arch/powerpc/include/uapi/asm/kvm.h > > +++ b/arch/powerpc/include/uapi/asm/kvm.h > > @@ -272,8 +272,31 @@ struct kvm_debug_exit_arch { > > > > /* for KVM_SET_GUEST_DEBUG */ > > struct kvm_guest_debug_arch { > > + struct { > > + /* H/W breakpoint/watchpoint address */ > > + __u64 addr; > > + /* > > +* Type denotes h/w breakpoint, read watchpoint, write > > +* watchpoint or watchpoint (both read and write). > > +*/ > > +#define KVMPPC_DEBUG_NOTYPE0x0 > > +#define KVMPPC_DEBUG_BREAKPOINT(1UL << 1) > > +#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) > > +#define KVMPPC_DEBUG_WATCH_READ(1UL << 3) > > Are you sure you want to introduce these here, just to remove them again in a > later patch? Up to this patch the scope was limited to this structure. 
So for clarity I defined them here, and later, when the scope expands, they are moved out of this structure. I do not think this really matters; let me know how you would like to see it. -Bharat > > > Alex > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 7/7] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: Wood Scott-B07421 > Sent: Thursday, March 14, 2013 9:36 PM > To: Bhushan Bharat-R65777 > Cc: Alexander Graf; kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott- > B07421 > Subject: Re: [PATCH 7/7] KVM: PPC: Add userspace debug stub support > > On 03/14/2013 08:57:53 AM, Bhushan Bharat-R65777 wrote: > > > >>> diff --git a/arch/powerpc/kvm/e500mc.c > > b/arch/powerpc/kvm/e500mc.c > > > >>> index 1f89d26..f5fc6f5 100644 > > > >>> --- a/arch/powerpc/kvm/e500mc.c > > > >>> +++ b/arch/powerpc/kvm/e500mc.c > > > >>> @@ -182,8 +182,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu > > > >>> *vcpu) { > > > >>> struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); > > > >>> > > > >>> - vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | > > SPRN_EPCR_DGTMI | \ > > > >>> - SPRN_EPCR_DUVD; > > > >>> + vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | > > SPRN_EPCR_DGTMI; > > > >> > > > >> Doesn't this route all debug events through the host? > > > > > > > > No; This means that debug events can occur in hypervisor state or > > not. > > > > > > > > EPCR.DUVD = 0 ; Debug events can occur in the hypervisor state. > > > > > > > > EPCR.DUVD = 1 ; Debug events cannot occur in the hypervisor state. > > > > > > > > So we allow debug events to occur in hypervisor state. > > > > > > Why do we care about debug events in our entry/exit code and didn't > > care about > > > them before? > > > > We care for single stepping in guest to not step in KVM code. > > > > > If anything, this is a completely separate patch, orthogonal to this > > > patch series, and requires a good bit of explanation. > > > > Not sure why you think separate patch; this patch add support for > > single stepping and also takes care that debug event does not comes in > > host when doing single stepping. > > How does *removing* DUVD ensure that? By default we clear DUVD, so debug events can come in hypervisor state. 
But on lightweight exit, when restoring the guest debug context, we set DUVD so that debug interrupts will not occur in hypervisor state, as the debug resources are taken by the guest. On guest exit, when restoring the host context, we clear DUVD, so the debug resources now hold the host context. With the proposed change of save and restore on vcpu_get/vcpu_put, this switching will be done in vcpu_get()/vcpu_put(). Thanks -Bharat > > -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 4/7] booke: Save and restore debug registers on guest entry and exit
> >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Thursday, March 07, 2013 6:56 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > >> Bhushan > >> Bharat-R65777 > >> Subject: Re: [PATCH 4/7] booke: Save and restore debug registers on > >> guest entry and exit > >> > >> > >> On 28.02.2013, at 05:13, Bharat Bhushan wrote: > >> > >>> On Guest entry: if guest is wants to use the debug register then > >>> save h/w debug register in host_dbg_reg and load the debug registers > >>> with shadow_dbg_reg. Otherwise leave h/w debug registers as is. > >> > >> Why can't we switch the majority of registers on vcpu_put/get and > >> only enable or disable debugging on guest entry/exit? > > > > > > One of the reason for not doing this is that the KVM is a host kernel > > module and let this be debugged by host (I do not this how much useful this > > is > :)) So I am not able to recall the specific reason, maybe we have just coded > this like this and tried to keep overhead as low as possible by switching > registers only when they are used. > > My point is that the overhead is _higher_ this way, because we need to do > checks > and switches on every guest entry/exit, which happens a _lot_ more often than > a > host context switch. > > > As we discussed before, we can keep this option open for future. > > What future? Just ignore debug events in the entry/exit code path and > suddenly a > lot of the code becomes a lot easier. Just to summarize what we agreed upon: - Save/restore will happen on vcpu_get()/vcpu_put(). This will happen only if guest is using debug registers. Probably using a flag to indicate guest is using debug APU. - On debug register access from QEMU, always set value in h/w debug register. 
- On guest access of a debug register, also save the xxx h/w register in vcpu->host_debug_reg.xxx and load the guest-provided value into the h/w debug register; ensure this happens on first access only, probably for all debug registers once debug events are enabled in dbcr0. Direct access from the guest was not part of this patchset, and support for it will be done separately. Thanks -Bharat > > > Alex > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 7/7] KVM: PPC: Add userspace debug stub support
> -Original Message- > From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On > Behalf Of Alexander Graf > Sent: Thursday, March 14, 2013 5:20 PM > To: Bhushan Bharat-R65777 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421 > Subject: Re: [PATCH 7/7] KVM: PPC: Add userspace debug stub support > > > On 14.03.2013, at 06:18, Bhushan Bharat-R65777 wrote: > > > > > > >> -Original Message- > >> From: Alexander Graf [mailto:ag...@suse.de] > >> Sent: Thursday, March 07, 2013 7:09 PM > >> To: Bhushan Bharat-R65777 > >> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; > >> Bhushan > >> Bharat-R65777 > >> Subject: Re: [PATCH 7/7] KVM: PPC: Add userspace debug stub support > >> > >> > >> On 28.02.2013, at 05:13, Bharat Bhushan wrote: > >> > >>> This patch adds the debug stub support on booke/bookehv. > >>> Now QEMU debug stub can use hw breakpoint, watchpoint and software > >>> breakpoint to debug guest. > >>> > >>> Signed-off-by: Bharat Bhushan > >>> --- > >>> arch/powerpc/include/uapi/asm/kvm.h | 22 +- > >>> arch/powerpc/kvm/booke.c| 143 > >>> +++-- > - > >>> arch/powerpc/kvm/e500_emulate.c |6 ++ > >>> arch/powerpc/kvm/e500mc.c |3 +- > >>> 4 files changed, 155 insertions(+), 19 deletions(-) > >>> > >>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h > >>> b/arch/powerpc/include/uapi/asm/kvm.h > >>> index 15f9a00..d7ce449 100644 > >>> --- a/arch/powerpc/include/uapi/asm/kvm.h > >>> +++ b/arch/powerpc/include/uapi/asm/kvm.h > >>> @@ -25,6 +25,7 @@ > >>> /* Select powerpc specific features in */ #define > >>> __KVM_HAVE_SPAPR_TCE #define __KVM_HAVE_PPC_SMT > >>> +#define __KVM_HAVE_GUEST_DEBUG > >>> > >>> struct kvm_regs { > >>> __u64 pc; > >>> @@ -267,7 +268,24 @@ struct kvm_fpu { > >>> __u64 fpr[32]; > >>> }; > >>> > >>> +/* > >>> + * Defines for h/w breakpoint, watchpoint (read, write or both) and > >>> + * software breakpoint. 
> >>> + * These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status" > >>> + * for KVM_DEBUG_EXIT. > >>> + */ > >>> +#define KVMPPC_DEBUG_NONE0x0 > >>> +#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1) > >>> +#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) > >>> +#define KVMPPC_DEBUG_WATCH_READ (1UL << 3) > >>> struct kvm_debug_exit_arch { > >>> + __u64 address; > >>> + /* > >>> + * exiting to userspace because of h/w breakpoint, watchpoint > >>> + * (read, write or both) and software breakpoint. > >>> + */ > >>> + __u32 status; > >>> + __u32 reserved; > >>> }; > >>> > >>> /* for KVM_SET_GUEST_DEBUG */ > >>> @@ -279,10 +297,6 @@ struct kvm_guest_debug_arch { > >>>* Type denotes h/w breakpoint, read watchpoint, write > >>>* watchpoint or watchpoint (both read and write). > >>>*/ > >>> -#define KVMPPC_DEBUG_NOTYPE 0x0 > >>> -#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1) > >>> -#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) > >>> -#define KVMPPC_DEBUG_WATCH_READ (1UL << 3) > >>> __u32 type; > >>> __u32 reserved; > >>> } bp[16]; > >>> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > >>> index > >>> 1de93a8..21b0313 100644 > >>> --- a/arch/powerpc/kvm/booke.c > >>> +++ b/arch/powerpc/kvm/booke.c > >>> @@ -133,6 +133,30 @@ static void kvmppc_vcpu_sync_fpu(struct > >>> kvm_vcpu > >>> *vcpu) #endif } > >>> > >>> +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) { > >>> + /* Synchronize guest's desire to get debug interrupts into shadow > >>> +MSR */ #ifndef CONFIG_KVM_BOOKE_HV > >>> + vcpu->arch.shadow_msr &= ~MSR_DE; > >>> + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE; #endif > >>> + > >>> + /* Force enable debug interrupts when user spa