[PATCH kernel v11 08/34] vfio: powerpc/spapr: Use it_page_size

2015-05-29 Thread Alexey Kardashevskiy
This makes use of the it_page_size from the iommu_table struct
as page size can differ.

This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
as recently introduced IOMMU_PAGE_XXX macros do not include
IOMMU_PAGE_SHIFT.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 735b308..64300cc 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container *container)
 * enforcing the limit based on the max that the guest can map.
 */
down_write(&current->mm->mmap_sem);
-   npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+   npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
locked = current->mm->locked_vm + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
@@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container 
*container)
 
down_write(&current->mm->mmap_sem);
current->mm->locked_vm -= (container->tbl->it_size <<
-   IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+   container->tbl->it_page_shift) >> PAGE_SHIFT;
up_write(&current->mm->mmap_sem);
 }
 
@@ -215,7 +215,7 @@ static long tce_iommu_build(struct tce_container *container,
tce, ret);
break;
}
-   tce += IOMMU_PAGE_SIZE_4K;
+   tce += IOMMU_PAGE_SIZE(tbl);
}
 
if (ret)
@@ -260,8 +260,8 @@ static long tce_iommu_ioctl(void *iommu_data,
if (info.argsz  minsz)
return -EINVAL;
 
-   info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
-   info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
+   info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
+   info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
info.flags = 0;
 
if (copy_to_user((void __user *)arg, info, minsz))
@@ -291,8 +291,8 @@ static long tce_iommu_ioctl(void *iommu_data,
VFIO_DMA_MAP_FLAG_WRITE))
return -EINVAL;
 
-   if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
-   (param.vaddr & ~IOMMU_PAGE_MASK_4K))
+   if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
+   (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
return -EINVAL;
 
/* iova is checked by the IOMMU API */
@@ -307,8 +307,8 @@ static long tce_iommu_ioctl(void *iommu_data,
return ret;
 
ret = tce_iommu_build(container, tbl,
-   param.iova >> IOMMU_PAGE_SHIFT_4K,
-   tce, param.size >> IOMMU_PAGE_SHIFT_4K);
+   param.iova >> tbl->it_page_shift,
+   tce, param.size >> tbl->it_page_shift);
 
iommu_flush_tce(tbl);
 
@@ -334,17 +334,17 @@ static long tce_iommu_ioctl(void *iommu_data,
if (param.flags)
return -EINVAL;
 
-   if (param.size & ~IOMMU_PAGE_MASK_4K)
+   if (param.size & ~IOMMU_PAGE_MASK(tbl))
return -EINVAL;
 
ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
-   param.size >> IOMMU_PAGE_SHIFT_4K);
+   param.size >> tbl->it_page_shift);
if (ret)
return ret;
 
ret = tce_iommu_clear(container, tbl,
-   param.iova >> IOMMU_PAGE_SHIFT_4K,
-   param.size >> IOMMU_PAGE_SHIFT_4K);
+   param.iova >> tbl->it_page_shift,
+   param.size >> tbl->it_page_shift);
iommu_flush_tce(tbl);
 
return ret;
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group

2015-05-29 Thread Alexey Kardashevskiy
Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds a iommu_table_group container
for TCE tables. Right now just one table is supported.

This defines iommu_table_group struct which stores pointers to
iommu_group and iommu_table(s). This replaces iommu_table with
iommu_table_group where iommu_table was used to identify a group:
- iommu_register_group();
- iommudata of generic iommu_group;

This removes @data from iommu_table as it_table_group provides
same access to pnv_ioda_pe.

For IODA, instead of embedding iommu_table, the new iommu_table_group
keeps pointers to those. The iommu_table structs are allocated
dynamically.

For P5IOC2, both iommu_table_group and iommu_table are embedded into
PE struct. As there is no EEH and SRIOV support for P5IOC2,
iommu_free_table() should not be called on iommu_table struct pointers
so we can keep it embedded in pnv_phb::p5ioc2.

For pSeries, this replaces multiple calls of kzalloc_node() with a new
iommu_pseries_alloc_group() helper and stores the table group struct
pointer into the pci_dn struct. For release, a iommu_table_free_group()
helper is added.

This moves iommu_table struct allocation from SR-IOV code to
the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and
pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized.
This change is here because those lines had to be changed anyway.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v11:
* iommu_table_group moved outside #ifdef CONFIG_IOMMU_API as iommu_table
is dynamically allocated and it needs a pointer to PE and
iommu_table_group is this pointer

v10:
* new to the series, separated from
powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
* iommu_table is not embedded into iommu_table_group but allocated
dynamically in most cases
* iommu_table allocation is moved to a single place for IODA2's
pnv_pci_ioda_setup_dma_pe where it belongs to
* added list of groups into iommu_table; most of the code just looks at
the first item to keep the patch simpler
---
 arch/powerpc/include/asm/iommu.h|  19 ++---
 arch/powerpc/include/asm/pci-bridge.h   |   2 +-
 arch/powerpc/kernel/iommu.c |  17 ++---
 arch/powerpc/platforms/powernv/pci-ioda.c   |  55 +++---
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  18 +++--
 arch/powerpc/platforms/powernv/pci.h|   3 +-
 arch/powerpc/platforms/pseries/iommu.c  | 107 +++-
 drivers/vfio/vfio_iommu_spapr_tce.c |  23 +++---
 8 files changed, 152 insertions(+), 92 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e2a45c3..5a7267f 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -91,14 +91,9 @@ struct iommu_table {
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
unsigned long  it_page_shift;/* table iommu page size */
-#ifdef CONFIG_IOMMU_API
-   struct iommu_group *it_group;
-#endif
+   struct iommu_table_group *it_table_group;
struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
-#ifdef CONFIG_PPC_POWERNV
-   void   *data;
-#endif
 };
 
 /* Pure 2^n version of get_order */
@@ -129,14 +124,22 @@ extern void iommu_free_table(struct iommu_table *tbl, 
const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);
+#define IOMMU_TABLE_GROUP_MAX_TABLES   1
+
+struct iommu_table_group {
+   struct iommu_group *group;
+   struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+};
+
 #ifdef CONFIG_IOMMU_API
-extern void iommu_register_group(struct iommu_table *tbl,
+
+extern void iommu_register_group(struct iommu_table_group *table_group,
 int pci_domain_number, unsigned long pe_num);
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 #else
-static inline void iommu_register_group(struct iommu_table *tbl,
+static inline void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number,
unsigned long pe_num)
 {
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 1811c44..e2d7479 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -185,7 +185,7 @@ struct pci_dn {
 
struct  pci_dn *parent;
struct  pci_controller *phb;/* for pci devices */
-   struct  iommu_table *iommu_table; 

[PATCH kernel v11 31/34] vfio: powerpc/spapr: powerpc/powernv/ioda2: Use DMA windows API in ownership control

2015-05-29 Thread Alexey Kardashevskiy
Before the IOMMU user (VFIO) would take control over the IOMMU table
belonging to a specific IOMMU group. This approach did not allow sharing
tables between IOMMU groups attached to the same container.

This introduces a new IOMMU ownership flavour when the user can not
just control the existing IOMMU table but remove/create tables on demand.
If an IOMMU implements take/release_ownership() callbacks, this lets
the user have full control over the IOMMU group. When the ownership
is taken, the platform code removes all the windows so the caller must
create them.
Before returning the ownership back to the platform code, VFIO
unprograms and removes all the tables it created.

This changes IODA2's ownership handler to remove the existing table
rather than manipulating with the existing one. From now on,
iommu_take_ownership() and iommu_release_ownership() are only called
from the vfio_iommu_spapr_tce driver.

Old-style ownership is still supported allowing VFIO to run on older
P5IOC2 and IODA IO controllers.

No change in userspace-visible behaviour is expected. Since it recreates
TCE tables on each ownership change, related kernel traces will appear
more often.

This adds a pnv_pci_ioda2_setup_default_config() which is called
when PE is being configured at boot time and when the ownership is
passed from VFIO to the platform code.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v10:
* created pnv_pci_ioda2_setup_default_config() helper

v9:
* fixed crash in tce_iommu_detach_group() on tbl-it_ops-free as
tce_iommu_attach_group() used to initialize the table from a descriptor
on stack (it does not matter for the series as this bit is changed later anyway
but it ruins bisectability)

v6:
* fixed commit log that VFIO removes tables before passing ownership
back to the platform code, not userspace
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 101 --
 drivers/vfio/vfio_iommu_spapr_tce.c   |  88 +-
 2 files changed, 141 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c77c85e..6057ca4 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2072,6 +2072,49 @@ static long pnv_pci_ioda2_create_table(struct 
iommu_table_group *table_group,
return 0;
 }
 
+static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
+{
+   struct iommu_table *tbl = NULL;
+   long rc;
+
+   rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+   IOMMU_PAGE_SHIFT_4K,
+   pe->table_group.tce32_size,
+   POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
+   if (rc) {
+   pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
+   rc);
+   return rc;
+   }
+
+   iommu_init_table(tbl, pe->phb->hose->node);
+
+   rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+   if (rc) {
+   pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
+   rc);
+   pnv_ioda2_table_free(tbl);
+   return rc;
+   }
+
+   if (!pnv_iommu_bypass_disabled)
+   pnv_pci_ioda2_set_bypass(pe, true);
+
+   /* OPAL variant of PHB3 invalidated TCEs */
+   if (pe->phb->ioda.tce_inval_reg)
+   tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+
+   /*
+    * Setting table base here only for carrying iommu_group
+    * further down to let iommu_add_device() do the job.
+    * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+    */
+   if (pe->flags & PNV_IODA_PE_DEV)
+   set_iommu_table_base(&pe->pdev->dev, tbl);
+
+   return 0;
+}
+
 #ifdef CONFIG_IOMMU_API
 static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
__u64 window_size, __u32 levels)
@@ -2133,9 +2176,12 @@ static void pnv_ioda2_take_ownership(struct 
iommu_table_group *table_group)
 {
struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
table_group);
+   /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
+   struct iommu_table *tbl = pe->table_group.tables[0];
 
-   iommu_take_ownership(table_group->tables[0]);
pnv_pci_ioda2_set_bypass(pe, false);
+   pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+   pnv_ioda2_table_free(tbl);
 }
 
 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -2143,8 +2189,7 @@ static void pnv_ioda2_release_ownership(struct 
iommu_table_group *table_group)
struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
table_group);
 
-   

[PATCH kernel v11 12/34] vfio: powerpc/spapr: Rework groups attaching

2015-05-29 Thread Alexey Kardashevskiy
This is to make extended ownership and multiple groups support patches
simpler for review.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 40 ++---
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 115d5e6..0fbe03e 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -460,16 +460,21 @@ static int tce_iommu_attach_group(void *iommu_data,
iommu_group_id(container->tbl->it_group),
iommu_group_id(iommu_group));
ret = -EBUSY;
-   } else if (container-enabled) {
+   goto unlock_exit;
+   }
+
+   if (container-enabled) {
pr_err(tce_vfio: attaching group #%u to enabled container\n,
iommu_group_id(iommu_group));
ret = -EBUSY;
-   } else {
-   ret = iommu_take_ownership(tbl);
-   if (!ret)
-   container-tbl = tbl;
+   goto unlock_exit;
}
 
+   ret = iommu_take_ownership(tbl);
+   if (!ret)
+   container->tbl = tbl;
+
+unlock_exit:
mutex_unlock(&container->lock);
 
return ret;
@@ -487,19 +492,22 @@ static void tce_iommu_detach_group(void *iommu_data,
pr_warn(tce_vfio: detaching group #%u, expected group is 
#%u\n,
iommu_group_id(iommu_group),
iommu_group_id(tbl-it_group));
-   } else {
-   if (container-enabled) {
-   pr_warn(tce_vfio: detaching group #%u from enabled 
container, forcing disable\n,
-   iommu_group_id(tbl-it_group));
-   tce_iommu_disable(container);
-   }
+   goto unlock_exit;
+   }
 
-   /* pr_debug(tce_vfio: detaching group #%u from iommu %p\n,
-   iommu_group_id(iommu_group), iommu_group); */
-   container-tbl = NULL;
-   tce_iommu_clear(container, tbl, tbl-it_offset, tbl-it_size);
-   iommu_release_ownership(tbl);
+   if (container->enabled) {
+   pr_warn("tce_vfio: detaching group #%u from enabled container, 
forcing disable\n",
+   iommu_group_id(tbl->it_group));
+   tce_iommu_disable(container);
}
+
+   /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+  iommu_group_id(iommu_group), iommu_group); */
+   container->tbl = NULL;
+   tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+   iommu_release_ownership(tbl);
+
+unlock_exit:
mutex_unlock(&container->lock);
 }
 
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 07/34] vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page

2015-05-29 Thread Alexey Kardashevskiy
This checks that the TCE table page size is not bigger than the size of
a page we just pinned and going to put its physical address to the table.

Otherwise the hardware gets unwanted access to physical memory between
the end of the actual page and the end of the aligned up TCE page.

Since compound_order() and compound_head() work correctly on non-huge
pages, there is no need for additional check whether the page is huge.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v8: changed subject

v6:
* the helper is simplified to one line

v4:
* s/tce_check_page_size/tce_page_is_contained/
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index b95fa2b..735b308 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -47,6 +47,16 @@ struct tce_container {
bool enabled;
 };
 
+static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+{
+   /*
+    * Check that the TCE table granularity is not bigger than the size of
+    * a page we just found. Otherwise the hardware can get access to
+    * a bigger memory chunk that it should.
+    */
+   return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
@@ -189,6 +199,12 @@ static long tce_iommu_build(struct tce_container 
*container,
ret = -EFAULT;
break;
}
+
+   if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+   ret = -EPERM;
+   break;
+   }
+
hva = (unsigned long) page_address(page) + offset;
 
ret = iommu_tce_build(tbl, entry + i, hva, direction);
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 29/34] powerpc/powernv/ioda2: Use new helpers to do proper cleanup on PE release

2015-05-29 Thread Alexey Kardashevskiy
The existing code programmed TVT#0 with some address and then
immediately released that memory.

This makes use of pnv_pci_ioda2_unset_window() and
pnv_pci_ioda2_set_bypass() which do correct resource release and
TVT update.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 1059bf6..00739883 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1287,34 +1287,21 @@ m64_failed:
return -EBUSY;
 }
 
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+   int num);
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+
 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct 
pnv_ioda_pe *pe)
 {
-   struct pci_bus*bus;
-   struct pci_controller *hose;
-   struct pnv_phb*phb;
struct iommu_table*tbl;
-   unsigned long addr;
int64_t   rc;
 
-   bus = dev-bus;
-   hose = pci_bus_to_host(bus);
-   phb = hose-private_data;
tbl = pe->table_group.tables[0];
-   addr = tbl->it_base;
-
-   opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-  pe->pe_number << 1, 1, __pa(addr),
-  0, 0x1000);
-
-   rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
-   pe->pe_number,
-   (pe->pe_number << 1) + 1,
-   pe->tce_bypass_base,
-   0);
+   rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
if (rc)
pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
-   pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+   pnv_pci_ioda2_set_bypass(pe, false);
if (pe->table_group.group) {
iommu_group_put(pe->table_group.group);
BUG_ON(pe->table_group.group);
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 18/34] vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control

2015-05-29 Thread Alexey Kardashevskiy
This adds tce_iommu_take_ownership() and tce_iommu_release_ownership
which call in a loop iommu_take_ownership()/iommu_release_ownership()
for every table on the group. As there is just one now, no change in
behaviour is expected.

At the moment the iommu_table struct has a set_bypass() which enables/
disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code
which calls this callback when external IOMMU users such as VFIO are
about to get over a PHB.

The set_bypass() callback is not really an iommu_table function but
IOMMU/PE function. This introduces a iommu_table_group_ops struct and
adds take_ownership()/release_ownership() callbacks to it which are
called when an external user takes/releases control over the IOMMU.

This replaces set_bypass() with ownership callbacks as it is not
necessarily just bypass enabling, it can be something else/more
so let's give it more generic name.

The callbacks is implemented for IODA2 only. Other platforms (P5IOC2,
IODA1) will use the old iommu_take_ownership/iommu_release_ownership API.
The following patches will replace iommu_take_ownership/
iommu_release_ownership calls in IODA2 with full IOMMU table release/
create.

As we are here touching bypass control, this removes
pnv_pci_ioda2_setup_bypass_pe() as it does not do much
more compared to pnv_pci_ioda2_set_bypass. This moves tce_bypass_base
initialization to pnv_pci_ioda2_setup_dma_pe.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v10:
* fixed comments around take_ownership/release_ownership in 
iommu_table_group_ops

v9:
* squashed vfio: powerpc/spapr: powerpc/iommu: Rework IOMMU ownership control
and vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework IOMMU ownership control
into a single patch
* moved helpers with a loop through tables in a group
to vfio_iommu_spapr_tce.c to keep the platform code free of IOMMU table
groups as much as possible
* added missing tce_iommu_clear() to tce_iommu_release_ownership()
* replaced the set_ownership(enable) callback with take_ownership() and
release_ownership()
---
 arch/powerpc/include/asm/iommu.h  | 11 -
 arch/powerpc/kernel/iommu.c   | 12 -
 arch/powerpc/platforms/powernv/pci-ioda.c | 73 ++-
 drivers/vfio/vfio_iommu_spapr_tce.c   | 70 ++---
 4 files changed, 118 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 44a20cc..489133c 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -93,7 +93,6 @@ struct iommu_table {
unsigned long  it_page_shift;/* table iommu page size */
struct list_head it_group_list;/* List of iommu_table_group_link */
struct iommu_table_ops *it_ops;
-   void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
 
 /* Pure 2^n version of get_order */
@@ -126,6 +125,15 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
int nid);
 #define IOMMU_TABLE_GROUP_MAX_TABLES   1
 
+struct iommu_table_group;
+
+struct iommu_table_group_ops {
+   /* Switch ownership from platform code to external user (e.g. VFIO) */
+   void (*take_ownership)(struct iommu_table_group *table_group);
+   /* Switch ownership from external user (e.g. VFIO) back to core */
+   void (*release_ownership)(struct iommu_table_group *table_group);
+};
+
 struct iommu_table_group_link {
struct list_head next;
struct rcu_head rcu;
@@ -135,6 +143,7 @@ struct iommu_table_group_link {
 struct iommu_table_group {
struct iommu_group *group;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+   struct iommu_table_group_ops *ops;
 };
 
 #ifdef CONFIG_IOMMU_API
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index e305a8f..b6a397a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1047,14 +1047,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
 
memset(tbl-it_map, 0xff, sz);
 
-   /*
-* Disable iommu bypass, otherwise the user can DMA to all of
-* our physical memory via the bypass window instead of just
-* the pages that has been explicitly mapped into the iommu
-*/
-   if (tbl->set_bypass)
-   tbl->set_bypass(tbl, false);
-
return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
@@ -1068,10 +1060,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
/* Restore bit#0 set by iommu_init_table() */
if (tbl-it_offset == 0)
set_bit(0, tbl-it_map);
-
-   /* The kernel owns the device now, we can restore the iommu bypass */
-   if (tbl->set_bypass)
-   tbl->set_bypass(tbl, true);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);

[PATCH kernel v11 34/34] vfio: powerpc/spapr: Support Dynamic DMA windows

2015-05-29 Thread Alexey Kardashevskiy
This adds create/remove window ioctls to create and remove DMA windows.
sPAPR defines a Dynamic DMA windows capability which allows
para-virtualized guests to create additional DMA windows on a PCI bus.
The existing linux kernels use this new window to map the entire guest
memory and switch to the direct DMA operations saving time on map/unmap
requests which would normally happen in a big amounts.

This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
Up to 2 windows are supported now by the hardware and by this driver.

This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional
information such as a number of supported windows and maximum number
levels of TCE tables.

DDW is added as a capability, not as a SPAPR TCE IOMMU v2 unique feature
as we still want to support v2 on platforms which cannot do DDW for
the sake of TCE acceleration in KVM (coming soon).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v7:
* s/VFIO_IOMMU_INFO_DDW/VFIO_IOMMU_SPAPR_INFO_DDW/
* fixed typos in and updated vfio.txt
* fixed VFIO_IOMMU_SPAPR_TCE_GET_INFO handler
* moved ddw properties to vfio_iommu_spapr_tce_ddw_info

v6:
* added explicit VFIO_IOMMU_INFO_DDW flag to vfio_iommu_spapr_tce_info,
it used to be page mask flags from platform code
* added explicit pgsizes field
* added cleanup if tce_iommu_create_window() failed in a middle
* added checks for callbacks in tce_iommu_create_window and remove those
from tce_iommu_remove_window when it is too late to test anyway
* spapr_tce_find_free_table returns sensible error code now
* updated description of VFIO_IOMMU_SPAPR_TCE_CREATE/
VFIO_IOMMU_SPAPR_TCE_REMOVE

v4:
* moved code to tce_iommu_create_window()/tce_iommu_remove_window()
helpers
* added docs
---
 Documentation/vfio.txt  |  19 
 arch/powerpc/include/asm/iommu.h|   2 +-
 drivers/vfio/vfio_iommu_spapr_tce.c | 196 +++-
 include/uapi/linux/vfio.h   |  61 ++-
 4 files changed, 273 insertions(+), 5 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 7dcf2b5..8b1ec51 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -452,6 +452,25 @@ address is from pre-registered range.
 
 This separation helps in optimizing DMA for guests.
 
+6) sPAPR specification allows guests to have an additional DMA window(s) on
+a PCI bus with a variable page size. Two ioctls have been added to support
+this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE.
+The platform has to support the functionality or error will be returned to
+the userspace. The existing hardware supports up to 2 DMA windows, one is
+2GB long, uses 4K pages and called default 32bit window; the other can
+be as big as entire RAM, use different page size, it is optional - guests
+create those in run-time if the guest driver supports 64bit DMA.
+
+VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and
+a number of TCE table levels (if a TCE table is going to be big enough and
+the kernel may not be able to allocate enough of physically contiguous memory).
+It creates a new window in the available slot and returns the bus address where
+the new window starts. Due to hardware limitation, the user space cannot choose
+the location of DMA windows.
+
+VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window
+and removes it.
+
 ---
 
 [1] VFIO was originally an acronym for Virtual Function I/O in its
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index f9957eb..ca18cff 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -149,7 +149,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const 
char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);
-#define IOMMU_TABLE_GROUP_MAX_TABLES   1
+#define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
 struct iommu_table_group;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index cadd9f8..199d5db 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -211,6 +211,18 @@ static long tce_iommu_find_table(struct tce_container 
*container,
return -1;
 }
 
+static int tce_iommu_find_free_table(struct tce_container *container)
+{
+   int i;
+
+   for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+   if (!container->tables[i])
+   return i;
+   }
+
+   return -ENOSPC;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
@@ -593,11 +605,115 @@ static void tce_iommu_free_table(struct iommu_table *tbl)

[PATCH kernel v11 28/34] vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API

2015-05-29 Thread Alexey Kardashevskiy
This extends iommu_table_group_ops by a set of callbacks to support
dynamic DMA windows management.

create_table() creates a TCE table with specific parameters.
it receives iommu_table_group to know nodeid in order to allocate
TCE table memory closer to the PHB. The exact format of allocated
multi-level table might be also specific to the PHB model (not
the case now though).
This callback calculated the DMA window offset on a PCI bus from @num
and stores it in a just created table.

set_window() sets the window at specified TVT index + @num on PHB.

unset_window() unsets the window from specified TVT.

This adds a free() callback to iommu_table_ops to free the memory
(potentially a tree of tables) allocated for the TCE table.

create_table() and free() are supposed to be called once per
VFIO container and set_window()/unset_window() are supposed to be
called for every group in a container.

This adds IOMMU capabilities to iommu_table_group such as default
32bit window parameters and others. This makes use of new values in
vfio_iommu_spapr_tce. IODA1/P5IOC2 do not support DDW so they do not
advertise pagemasks to the userspace.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v10:
* squashed vfio: powerpc/spapr: Use 32bit DMA window properties from 
table_group
into this
* shortened the subject

v9:
* new in the series - to make the next patch simpler
---
 arch/powerpc/include/asm/iommu.h| 19 ++
 arch/powerpc/platforms/powernv/pci-ioda.c   | 96 ++---
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 ++-
 drivers/vfio/vfio_iommu_spapr_tce.c | 19 +++---
 4 files changed, 124 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 706cfc0..e554175 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -70,6 +70,7 @@ struct iommu_table_ops {
/* get() returns a physical address */
unsigned long (*get)(struct iommu_table *tbl, long index);
void (*flush)(struct iommu_table *tbl);
+   void (*free)(struct iommu_table *tbl);
 };
 
 /* These are used by VIO */
@@ -146,6 +147,17 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
 struct iommu_table_group;
 
 struct iommu_table_group_ops {
+   long (*create_table)(struct iommu_table_group *table_group,
+   int num,
+   __u32 page_shift,
+   __u64 window_size,
+   __u32 levels,
+   struct iommu_table **ptbl);
+   long (*set_window)(struct iommu_table_group *table_group,
+   int num,
+   struct iommu_table *tblnew);
+   long (*unset_window)(struct iommu_table_group *table_group,
+   int num);
/* Switch ownership from platform code to external user (e.g. VFIO) */
void (*take_ownership)(struct iommu_table_group *table_group);
/* Switch ownership from external user (e.g. VFIO) back to core */
@@ -159,6 +171,13 @@ struct iommu_table_group_link {
 };
 
 struct iommu_table_group {
+   /* IOMMU properties */
+   __u32 tce32_start;
+   __u32 tce32_size;
+   __u64 pgsizes; /* Bitmap of supported page sizes */
+   __u32 max_dynamic_windows_supported;
+   __u32 max_levels;
+
struct iommu_group *group;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct iommu_table_group_ops *ops;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 68ffc7a..1059bf6 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -25,6 +25,7 @@
 #include linux/memblock.h
 #include linux/iommu.h
 #include linux/rculist.h
+#include <linux/sizes.h>
 
 #include asm/sections.h
 #include asm/io.h
@@ -1868,6 +1869,12 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, 
long index,
pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 }
 
+static void pnv_ioda2_table_free(struct iommu_table *tbl)
+{
+   pnv_pci_ioda2_table_free_pages(tbl);
+   iommu_free_table(tbl, "pnv");
+}
+
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
@@ -1875,6 +1882,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 #endif
.clear = pnv_ioda2_tce_free,
.get = pnv_tce_get,
+   .free = pnv_ioda2_table_free,
 };
 
 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
@@ -1945,6 +1953,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 TCE_PCI_SWINV_PAIR);
 
tbl-it_ops = pnv_ioda1_iommu_ops;
+   pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
+   pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;

[PATCH kernel v11 09/34] vfio: powerpc/spapr: Move locked_vm accounting to helpers

2015-05-29 Thread Alexey Kardashevskiy
This moves locked pages accounting to helpers.
Later they will be reused for Dynamic DMA windows (DDW).

This reworks debug messages to show the current value and the limit.

This stores the locked pages number in the container so when unlocking
the iommu table pointer won't be needed. This does not have an effect
now but it will with the multiple tables per container as then we will
allow attaching/detaching groups on fly and we may end up having
a container with no group attached but with the counter incremented.

While we are here, update the comment explaining why RLIMIT_MEMLOCK
might be required to be bigger than the guest RAM. This also prints
pid of the current process in pr_warn/pr_debug.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v4:
* new helpers do nothing if @npages == 0
* tce_iommu_disable() now can decrement the counter if the group was
detached (not possible now but will be in the future)
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 82 -
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 64300cc..40583f9 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -29,6 +29,51 @@
 static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);
 
+static long try_increment_locked_vm(long npages)
+{
+   long ret = 0, locked, lock_limit;
+
+   if (!current || !current->mm)
+   return -ESRCH; /* process exited */
+
+   if (!npages)
+   return 0;
+
+   down_write(&current->mm->mmap_sem);
+   locked = current->mm->locked_vm + npages;
+   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   ret = -ENOMEM;
+   else
+   current->mm->locked_vm += npages;
+
+   pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
+   npages << PAGE_SHIFT,
+   current->mm->locked_vm << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK),
+   ret ? " - exceeded" : "");
+
+   up_write(&current->mm->mmap_sem);
+
+   return ret;
+}
+
+static void decrement_locked_vm(long npages)
+{
+   if (!current || !current->mm || !npages)
+   return; /* process exited */
+
+   down_write(&current->mm->mmap_sem);
+   if (npages > current->mm->locked_vm)
+   npages = current->mm->locked_vm;
+   current->mm->locked_vm -= npages;
+   pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
+   npages << PAGE_SHIFT,
+   current->mm->locked_vm << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK));
+   up_write(&current->mm->mmap_sem);
+}
+
 /*
  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  *
@@ -45,6 +90,7 @@ struct tce_container {
struct mutex lock;
struct iommu_table *tbl;
bool enabled;
+   unsigned long locked_pages;
 };
 
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, 
unsigned page_shift)
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
-   unsigned long locked, lock_limit, npages;
+   unsigned long locked;
struct iommu_table *tbl = container-tbl;
 
if (!container-tbl)
@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container 
*container)
 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 * that would effectively kill the guest at random points, much better
 * enforcing the limit based on the max that the guest can map.
+*
+* Unfortunately at the moment it counts whole tables, no matter how
+* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
+* each with 2GB DMA window, 8GB will be counted here. The reason for
+* this is that we cannot tell here the amount of RAM used by the guest
+* as this information is only available from KVM and VFIO is
+* KVM agnostic.
 */
-   down_write(current-mm-mmap_sem);
-   npages = (tbl-it_size  tbl-it_page_shift)  PAGE_SHIFT;
-   locked = current-mm-locked_vm + npages;
-   lock_limit = rlimit(RLIMIT_MEMLOCK)  PAGE_SHIFT;
-   if (locked  lock_limit  !capable(CAP_IPC_LOCK)) {
-   pr_warn(RLIMIT_MEMLOCK (%ld) exceeded\n,
-   rlimit(RLIMIT_MEMLOCK));
-   ret = -ENOMEM;
-   } else {
+   locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+   ret = try_increment_locked_vm(locked);
+   if (ret)
+   return 

[PATCH kernel v11 04/34] powerpc/iommu: Put IOMMU group explicitly

2015-05-29 Thread Alexey Kardashevskiy
So far an iommu_table lifetime was the same as PE. Dynamic DMA windows
will change this and iommu_free_table() will not always require
the group to be released.

This moves iommu_group_put() out of iommu_free_table().

This adds a iommu_pseries_free_table() helper which does
iommu_group_put() and iommu_free_table(). Later it will be
changed to receive a table_group and we will have to change less
lines then.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/kernel/iommu.c   |  7 ---
 arch/powerpc/platforms/powernv/pci-ioda.c |  5 +
 arch/powerpc/platforms/pseries/iommu.c| 16 +++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b054f33..3d47eb3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -726,13 +726,6 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
if (tbl-it_offset == 0)
clear_bit(0, tbl-it_map);
 
-#ifdef CONFIG_IOMMU_API
-   if (tbl-it_group) {
-   iommu_group_put(tbl-it_group);
-   BUG_ON(tbl-it_group);
-   }
-#endif
-
/* verify that table contains no entries */
if (!bitmap_empty(tbl-it_map, tbl-it_size))
pr_warn(%s: Unexpected TCEs for %s\n, __func__, node_name);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8ca7abd..8c3c4bf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -23,6 +23,7 @@
 #include linux/io.h
 #include linux/msi.h
 #include linux/memblock.h
+#include <linux/iommu.h>
 
 #include asm/sections.h
 #include asm/io.h
@@ -1310,6 +1311,10 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev 
*dev, struct pnv_ioda_pe
if (rc)
pe_warn(pe, OPAL error %ld release DMA window\n, rc);
 
+   if (tbl->it_group) {
+   iommu_group_put(tbl->it_group);
+   BUG_ON(tbl->it_group);
+   }
iommu_free_table(tbl, of_node_full_name(dev-dev.of_node));
free_pages(addr, get_order(TCE32_TABLE_SIZE));
pe-tce32_table = NULL;
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 05ab06d..fe5117b 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,7 @@
 #include linux/crash_dump.h
 #include linux/memory.h
 #include linux/of.h
+#include <linux/iommu.h>
 #include asm/io.h
 #include asm/prom.h
 #include asm/rtas.h
@@ -51,6 +52,18 @@
 
 #include pseries.h
 
+static void iommu_pseries_free_table(struct iommu_table *tbl,
+   const char *node_name)
+{
+#ifdef CONFIG_IOMMU_API
+   if (tbl->it_group) {
+   iommu_group_put(tbl->it_group);
+   BUG_ON(tbl->it_group);
+   }
+#endif
+   iommu_free_table(tbl, node_name);
+}
+
 static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
  __be64 *startp, __be64 *endp)
 {
@@ -1271,7 +1284,8 @@ static int iommu_reconfig_notifier(struct notifier_block 
*nb, unsigned long acti
 */
remove_ddw(np, false);
if (pci  pci-iommu_table)
-   iommu_free_table(pci-iommu_table, np-full_name);
+   iommu_pseries_free_table(pci-iommu_table,
+   np-full_name);
 
spin_lock(direct_window_list_lock);
list_for_each_entry(window, direct_window_list, list) {
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 30/34] powerpc/iommu/ioda2: Add get_table_size() to calculate the size of future table

2015-05-29 Thread Alexey Kardashevskiy
This adds a way for the IOMMU user to know how much a new table will
use so it can be accounted in the locked_vm limit before allocation
happens.

This stores the allocated table size in pnv_pci_ioda2_get_table_size()
so the locked_vm counter can be updated correctly when a table is
being disposed.

This defines an iommu_table_group_ops callback to let VFIO know
how much memory will be locked if a table is created.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v10:
* s/ROUND_UP/_ALIGN_UP/
* fixed rounding up for @entries_shift (used to use ROUND_UP)

v9:
* reimplemented the whole patch
---
 arch/powerpc/include/asm/iommu.h  |  5 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 35 +++
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e554175..9d37492 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -99,6 +99,7 @@ struct iommu_table {
unsigned long  it_size;  /* Size of iommu table in entries */
unsigned long  it_indirect_levels;
unsigned long  it_level_size;
+   unsigned long  it_allocated_size;
unsigned long  it_offset;/* Offset into global table */
unsigned long  it_base;  /* mapped address of tce table */
unsigned long  it_index; /* which iommu table this is */
@@ -147,6 +148,10 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
 struct iommu_table_group;
 
 struct iommu_table_group_ops {
+   unsigned long (*get_table_size)(
+   __u32 page_shift,
+   __u64 window_size,
+   __u32 levels);
long (*create_table)(struct iommu_table_group *table_group,
int num,
__u32 page_shift,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 00739883..c77c85e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -41,6 +41,7 @@
 #include asm/debug.h
 #include asm/firmware.h
 #include asm/pnv-pci.h
+#include <asm/mmzone.h>
 
 #include misc/cxl.h
 
@@ -2072,6 +2073,38 @@ static long pnv_pci_ioda2_create_table(struct 
iommu_table_group *table_group,
 }
 
 #ifdef CONFIG_IOMMU_API
+static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+   __u64 window_size, __u32 levels)
+{
+   unsigned long bytes = 0;
+   const unsigned window_shift = ilog2(window_size);
+   unsigned entries_shift = window_shift - page_shift;
+   unsigned table_shift = entries_shift + 3;
+   unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
+   unsigned long direct_table_size;
+
+   if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
+   (window_size > memory_hotplug_max()) ||
+   !is_power_of_2(window_size))
+   return 0;
+
+   /* Calculate a direct table size from window_size and levels */
+   entries_shift = (entries_shift + levels - 1) / levels;
+   table_shift = entries_shift + 3;
+   table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
+   direct_table_size =  1UL << table_shift;
+
+   for ( ; levels; --levels) {
+   bytes += _ALIGN_UP(tce_table_size, direct_table_size);
+
+   tce_table_size /= direct_table_size;
+   tce_table_size <<= 3;
+   tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
+   }
+
+   return bytes;
+}
+
 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
int num)
 {
@@ -2115,6 +2148,7 @@ static void pnv_ioda2_release_ownership(struct 
iommu_table_group *table_group)
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+   .get_table_size = pnv_pci_ioda2_get_table_size,
.create_table = pnv_pci_ioda2_create_table,
.set_window = pnv_pci_ioda2_set_window,
.unset_window = pnv_pci_ioda2_unset_window,
@@ -2219,6 +2253,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, 
__u64 bus_offset,
page_shift);
tbl-it_level_size = 1ULL  (level_shift - 3);
tbl-it_indirect_levels = levels - 1;
+   tbl-it_allocated_size = tce_table_allocated;
 
pr_devel(Created TCE table: ws=%08llx ts=%lx @%08llx\n,
window_size, tce_table_size, bus_offset);
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 10/34] vfio: powerpc/spapr: Disable DMA mappings on disabled container

2015-05-29 Thread Alexey Kardashevskiy
At the moment DMA map/unmap requests are handled irrespective to
the container's state. This allows the user space to pin memory which
it might not be allowed to pin.

This adds checks to MAP/UNMAP that the container is enabled, otherwise
-EPERM is returned.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 40583f9..e21479c 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -318,6 +318,9 @@ static long tce_iommu_ioctl(void *iommu_data,
struct iommu_table *tbl = container-tbl;
unsigned long tce;
 
+   if (!container-enabled)
+   return -EPERM;
+
if (!tbl)
return -ENXIO;
 
@@ -362,6 +365,9 @@ static long tce_iommu_ioctl(void *iommu_data,
struct vfio_iommu_type1_dma_unmap param;
struct iommu_table *tbl = container-tbl;
 
+   if (!container-enabled)
+   return -EPERM;
+
if (WARN_ON(!tbl))
return -ENXIO;
 
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 03/34] powerpc/powernv/ioda: Clean up IOMMU group registration

2015-05-29 Thread Alexey Kardashevskiy
The existing code has 3 calls to iommu_register_group() and
all 3 branches actually cover all possible cases.

This replaces 3 calls with one and moves the registration earlier;
the latter will make more sense when we add TCE table sharing.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 28 
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9a77f3c..8ca7abd 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1784,6 +1784,9 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
if (WARN_ON(pe-tce32_seg = 0))
return;
 
+   tbl = pe-tce32_table;
+   iommu_register_group(tbl, phb-hose-global_number, pe-pe_number);
+
/* Grab a 32-bit TCE table */
pe-tce32_seg = base;
pe_info(pe,  Setting up 32-bit TCE table at %08x..%08x\n,
@@ -1818,7 +1821,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
}
 
/* Setup linux iommu table */
-   tbl = pe-tce32_table;
pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
  base  28, IOMMU_PAGE_SHIFT_4K);
 
@@ -1840,8 +1842,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
iommu_init_table(tbl, phb-hose-node);
 
if (pe-flags  PNV_IODA_PE_DEV) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
/*
 * Setting table base here only for carrying iommu_group
 * further down to let iommu_add_device() do the job.
@@ -1849,14 +1849,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
 */
set_iommu_table_base(pe-pdev-dev, tbl);
iommu_add_device(pe-pdev-dev);
-   } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
+   } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe-pbus);
-   } else if (pe-flags  PNV_IODA_PE_VF) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
-   }
 
return;
  fail:
@@ -1923,6 +1917,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
if (WARN_ON(pe-tce32_seg = 0))
return;
 
+   tbl = pe-tce32_table;
+   iommu_register_group(tbl, phb-hose-global_number, pe-pe_number);
+
/* The PE will reserve all possible 32-bits space */
pe-tce32_seg = 0;
end = (1  ilog2(phb-ioda.m32_pci_base));
@@ -1954,7 +1951,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
}
 
/* Setup linux iommu table */
-   tbl = pe-tce32_table;
pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
IOMMU_PAGE_SHIFT_4K);
 
@@ -1974,8 +1970,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
iommu_init_table(tbl, phb-hose-node);
 
if (pe-flags  PNV_IODA_PE_DEV) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
/*
 * Setting table base here only for carrying iommu_group
 * further down to let iommu_add_device() do the job.
@@ -1983,14 +1977,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 */
set_iommu_table_base(pe-pdev-dev, tbl);
iommu_add_device(pe-pdev-dev);
-   } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
+   } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe-pbus);
-   } else if (pe-flags  PNV_IODA_PE_VF) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
-   }
 
/* Also create a bypass window */
if (!pnv_iommu_bypass_disabled)
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 20/34] powerpc/powernv/ioda2: Move TCE kill register address to PE

2015-05-29 Thread Alexey Kardashevskiy
At the moment the DMA setup code looks for the ibm,opal-tce-kill
property which contains the TCE kill register address. Writing to
this register invalidates TCE cache on IODA/IODA2 hub.

This moves the register address from iommu_table to pnv_pnb as this
register belongs to PHB and invalidates TCE cache for all tables of
all attached PEs.

This moves the property reading/remapping code to a helper which is
called when DMA is being configured for PE and which does DMA setup
for both IODA1 and IODA2.

This adds a new pnv_pci_ioda2_tce_invalidate_entire() helper which
invalidates cache for the entire table. It should be called after
every call to opal_pci_map_pe_dma_window(). It was not required before
because there was just a single TCE table and 64bit DMA was handled via
bypass window (which has no table so no cache was used) but this is going
to change with Dynamic DMA windows (DDW).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v11:
* s/pnv_pci_ioda2_tvt_invalidate/pnv_pci_ioda2_tce_invalidate_entire/g
(cannot think of better-and-shorter name)
* moved tce_inval_reg_phys/tce_inval_reg to pnv_phb

v10:
* fixed error from checkpatch.pl
* removed comment at ibm,opal-tce-kill parsing as irrelevant
* s/addr/val/ in pnv_pci_ioda2_tvt_invalidate() as it was not a kernel address

v9:
* new in the series
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 66 ++-
 arch/powerpc/platforms/powernv/pci.h  |  7 +++-
 2 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 1d0bb5b..3fd8b18 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1679,8 +1679,8 @@ static void pnv_pci_ioda1_tce_invalidate(struct 
iommu_table *tbl,
struct pnv_ioda_pe *pe = container_of(tgl-table_group,
struct pnv_ioda_pe, table_group);
__be64 __iomem *invalidate = rm ?
-   (__be64 __iomem *)pe-tce_inval_reg_phys :
-   (__be64 __iomem *)tbl-it_index;
+   (__be64 __iomem *)pe-phb-ioda.tce_inval_reg_phys :
+   pe-phb-ioda.tce_inval_reg;
unsigned long start, end, inc;
const unsigned shift = tbl-it_page_shift;
 
@@ -1751,6 +1751,19 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
.get = pnv_tce_get,
 };
 
+static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
+{
+   /* 01xb - invalidate TCEs that match the specified PE# */
+   unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
+   struct pnv_phb *phb = pe->phb;
+
+   if (!phb->ioda.tce_inval_reg)
+   return;
+
+   mb(); /* Ensure above stores are visible */
+   __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
+}
+
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
unsigned long index, unsigned long npages, bool rm)
 {
@@ -1761,8 +1774,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
iommu_table *tbl,
struct pnv_ioda_pe, table_group);
unsigned long start, end, inc;
__be64 __iomem *invalidate = rm ?
-   (__be64 __iomem *)pe-tce_inval_reg_phys :
-   (__be64 __iomem *)tbl-it_index;
+   (__be64 __iomem *)pe-phb-ioda.tce_inval_reg_phys :
+   pe-phb-ioda.tce_inval_reg;
const unsigned shift = tbl-it_page_shift;
 
/* We'll invalidate DMA address in PE scope */
@@ -1820,7 +1833,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 {
 
struct page *tce_mem = NULL;
-   const __be64 *swinvp;
struct iommu_table *tbl;
unsigned int i;
int64_t rc;
@@ -1877,20 +1889,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
  base  28, IOMMU_PAGE_SHIFT_4K);
 
/* OPAL variant of P7IOC SW invalidated TCEs */
-   swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL);
-   if (swinvp) {
-   /* We need a couple more fields -- an address and a data
-* to or.  Since the bus is only printed out on table free
-* errors, and on the first pass the data will be a relative
-* bus number, print that out instead.
-*/
-   pe-tce_inval_reg_phys = be64_to_cpup(swinvp);
-   tbl-it_index = (unsigned long)ioremap(pe-tce_inval_reg_phys,
-   8);
+   if (phb-ioda.tce_inval_reg)
tbl-it_type |= (TCE_PCI_SWINV_CREATE |
 TCE_PCI_SWINV_FREE   |
 TCE_PCI_SWINV_PAIR);
-   }
+
tbl-it_ops = pnv_ioda1_iommu_ops;
iommu_init_table(tbl, phb-hose-node);
 
@@ -1971,12 +1974,24 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = 
{
 };
 #endif
 
+static void 

[PATCH kernel v11 25/34] powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages

2015-05-29 Thread Alexey Kardashevskiy
This is a part of moving TCE table allocation into an iommu_ops
callback to support multiple IOMMU groups per one VFIO container.

This moves the code which allocates the actual TCE tables to helpers:
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages().
These do not allocate/free the iommu_table struct.

This enforces window size to be a power of two.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v10:
* removed @table_group parameter from pnv_pci_create_table as it was not used
* removed *tce_table_allocated from pnv_alloc_tce_table_pages()
* pnv_pci_create_table/pnv_pci_free_table renamed to
pnv_pci_ioda2_table_alloc_pages/pnv_pci_ioda2_table_free_pages and moved
back to pci-ioda.c as these only allocate pages for IODA2 and there is
no chance they will be reused for IODA1/P5IOC2
* shortened subject line

v9:
* moved helpers to the common powernv pci.c file from pci-ioda.c
* moved bits from pnv_pci_create_table() to pnv_alloc_tce_table_pages()
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 82 +++
 1 file changed, 62 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 0e88241..3d29fe3 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -49,6 +49,8 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE   ((0x1000 / 0x1000) * 8)
 
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
 {
@@ -1313,8 +1315,8 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev 
*dev, struct pnv_ioda_pe
iommu_group_put(pe-table_group.group);
BUG_ON(pe-table_group.group);
}
+   pnv_pci_ioda2_table_free_pages(tbl);
iommu_free_table(tbl, of_node_full_name(dev-dev.of_node));
-   free_pages(addr, get_order(TCE32_TABLE_SIZE));
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -2032,13 +2034,62 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct 
pnv_phb *phb)
phb-ioda.tce_inval_reg = ioremap(phb-ioda.tce_inval_reg_phys, 8);
 }
 
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
-  struct pnv_ioda_pe *pe)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
 {
struct page *tce_mem = NULL;
+   __be64 *addr;
+   unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+
+   tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+   if (!tce_mem) {
+   pr_err("Failed to allocate a TCE memory, order=%d\n", order);
+   return NULL;
+   }
+   addr = page_address(tce_mem);
+   memset(addr, 0, 1UL << (order + PAGE_SHIFT));
+
+   return addr;
+}
+
+static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+   __u32 page_shift, __u64 window_size, struct iommu_table *tbl)
+{
void *addr;
+   const unsigned window_shift = ilog2(window_size);
+   unsigned entries_shift = window_shift - page_shift;
+   unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
+   const unsigned long tce_table_size = 1UL  table_shift;
+
+   if ((window_size  memory_hotplug_max()) || !is_power_of_2(window_size))
+   return -EINVAL;
+
+   /* Allocate TCE table */
+   addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
+   if (!addr)
+   return -ENOMEM;
+
+   /* Setup linux iommu table */
+   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
+   page_shift);
+
+   pr_devel(Created TCE table: ws=%08llx ts=%lx @%08llx\n,
+   window_size, tce_table_size, bus_offset);
+
+   return 0;
+}
+
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
+{
+   if (!tbl->it_size)
+   return;
+
+   free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+  struct pnv_ioda_pe *pe)
+{
struct iommu_table *tbl;
-   unsigned int tce_table_size, end;
int64_t rc;
 
/* We shouldn't already have a 32-bit DMA associated */
@@ -2055,24 +2106,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 
/* The PE will reserve all possible 32-bits space */
pe-tce32_seg = 0;
-   end = (1  ilog2(phb-ioda.m32_pci_base));
-   tce_table_size = (end / 0x1000) * 8;
pe_info(pe, Setting up 32-bit TCE table at 0..%08x\n,
-   end);
+   phb-ioda.m32_pci_base);
 
-   /* Allocate TCE table */
-   tce_mem = 

[PATCH kernel v11 27/34] powerpc/powernv: Implement multilevel TCE tables

2015-05-29 Thread Alexey Kardashevskiy
TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM) so the kernel might be unable to
allocate contiguous chunk of physical memory to store the TCE table.

To address this, POWER8 CPU (actually, IODA2) supports multi-level
TCE tables, up to 5 levels which splits the table into a tree of
smaller subtables.

This adds multi-level TCE tables support to
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages()
helpers.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v10:
* fixed multiple comments received for v9

v9:
* moved from ioda2 to common powernv pci code
* fixed cleanup if allocation fails in a middle
* removed check for the size - all boundary checks happen in the calling code
anyway
---
 arch/powerpc/include/asm/iommu.h  |  2 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 98 ---
 arch/powerpc/platforms/powernv/pci.c  | 13 
 3 files changed, 104 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4636734..706cfc0 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -96,6 +96,8 @@ struct iommu_pool {
 struct iommu_table {
unsigned long  it_busno; /* Bus number this table belongs to */
unsigned long  it_size;  /* Size of iommu table in entries */
+   unsigned long  it_indirect_levels;
+   unsigned long  it_level_size;
unsigned long  it_offset;/* Offset into global table */
unsigned long  it_base;  /* mapped address of tce table */
unsigned long  it_index; /* which iommu table this is */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index fda01c1..68ffc7a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -49,6 +49,9 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE   ((0x1000 / 0x1000) * 8)
 
+#define POWERNV_IOMMU_DEFAULT_LEVELS   1
+#define POWERNV_IOMMU_MAX_LEVELS   5
+
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
 
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -1975,6 +1978,8 @@ static long pnv_pci_ioda2_set_window(struct 
iommu_table_group *table_group,
table_group);
struct pnv_phb *phb = pe-phb;
int64_t rc;
+   const unsigned long size = tbl-it_indirect_levels ?
+   tbl-it_level_size : tbl-it_size;
const __u64 start_addr = tbl-it_offset  tbl-it_page_shift;
const __u64 win_size = tbl-it_size  tbl-it_page_shift;
 
@@ -1989,9 +1994,9 @@ static long pnv_pci_ioda2_set_window(struct 
iommu_table_group *table_group,
rc = opal_pci_map_pe_dma_window(phb-opal_id,
pe-pe_number,
pe-pe_number  1,
-   1,
+   tbl-it_indirect_levels + 1,
__pa(tbl-it_base),
-   tbl-it_size  3,
+   size  3,
IOMMU_PAGE_SIZE(tbl));
if (rc) {
pe_err(pe, Failed to configure TCE table, err %ld\n, rc);
@@ -2071,11 +2076,19 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct 
pnv_phb *phb)
phb-ioda.tce_inval_reg = ioremap(phb-ioda.tce_inval_reg_phys, 8);
 }
 
-static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
+   unsigned levels, unsigned long limit,
+   unsigned long *tce_table_allocated)
 {
struct page *tce_mem = NULL;
-   __be64 *addr;
+   __be64 *addr, *tmp;
unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+   unsigned long local_allocated = 1UL  (order + PAGE_SHIFT);
+   unsigned entries = 1UL  (shift - 3);
+   long i;
+
+   if (*tce_table_allocated = limit)
+   return NULL;
 
tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
if (!tce_mem) {
@@ -2083,31 +2096,69 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int 
nid, unsigned shift)
return NULL;
}
addr = page_address(tce_mem);
-   memset(addr, 0, 1UL  (order + PAGE_SHIFT));
+   memset(addr, 0, local_allocated);
+
+   --levels;
+   if (!levels) {
+   *tce_table_allocated += local_allocated;
+   return addr;
+   }
+
+   for (i = 0; i  entries; ++i) {
+   tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
+   levels, limit, tce_table_allocated);
+   if (!tmp)
+   break;
+
+   addr[i] = cpu_to_be64(__pa(tmp) |
+   TCE_PCI_READ | TCE_PCI_WRITE);
+   }
 
return addr;
 }
 

[PATCH kernel v11 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows

2015-05-29 Thread Alexey Kardashevskiy

This enables sPAPR defined feature called Dynamic DMA windows (DDW).

Each Partitionable Endpoint (IOMMU group) has an address range on a PCI bus
where devices are allowed to do DMA. These ranges are called DMA windows.
By default, there is a single DMA window, 1 or 2GB big, mapped at zero
on a PCI bus.

Hi-speed devices may suffer from the limited size of the window.
The recent host kernels use a TCE bypass window on POWER8 CPU which implements
direct PCI bus address range mapping (with offset of 1<<59) to the host memory.

For guests, PAPR defines a DDW RTAS API which allows pseries guests
querying the hypervisor about DDW support and capabilities (page size mask
for now). A pseries guest may request additional (to the default)
DMA windows using this RTAS API.
The existing pseries Linux guests request an additional window as big as
the guest RAM and map the entire guest window which effectively creates
direct mapping of the guest memory to a PCI bus.

The multiple DMA windows feature is supported by POWER7/POWER8 CPUs; however
this patchset only adds support for POWER8 as TCE tables are implemented
in POWER7 in a quite different way and POWER7 is not the highest priority.

This patchset reworks PPC64 IOMMU code and adds necessary structures
to support big windows.

Once a Linux guest discovers the presence of DDW, it does:
1. query hypervisor about number of available windows and page size masks;
2. create a window with the biggest possible page size (today 4K/64K/16M);
3. map the entire guest RAM via H_PUT_TCE* hypercalls;
4. switches dma_ops to direct_dma_ops on the selected PE.

Once this is done, H_PUT_TCE is not called anymore for 64bit devices and
the guest does not waste time on DMA map/unmap operations.

Note that 32bit devices won't use DDW and will keep using the default
DMA window so KVM optimizations will be required (to be posted later).

This is pushed to g...@github.com:aik/linux.git
 + 0ea0348...93b347697 vfio-for-github - vfio-for-github (forced update)

The pushed branch contains all patches from this patchset and KVM
acceleration patches as well to give an idea about the current state
of in-kernel acceleration support.


Please comment. Thanks!


Changes:
v11:
* reworked locking in pinned pages cache

v10:
* fixed/tested on SRIOV system
* fixed multiple comments from David
* added bunch of iommu device attachment reworks

v9:
* rebased on top of SRIOV (which is in upstream now)
* fixed multiple comments from David
* reworked ownership patches
* removed vfio: powerpc/spapr: Do cleanup when releasing the group (used to be 
#2)
as updated #1 should do this
* moved powerpc/powernv: Implement accessor to TCE entry to a separate patch
* added a patch which moves TCE Kill register address to PE from IOMMU table

v8:
* fixed a bug in error fallback in powerpc/mmu: Add userspace-to-physical
addresses translation cache
* fixed subject in vfio: powerpc/spapr: Check that IOMMU page is fully
contained by system page
* moved v2 documentation to the correct patch
* added checks for failed vzalloc() in powerpc/iommu: Add userspace view
of TCE table

v7:
* moved memory preregistration to the current process's MMU context
* added code preventing unregistration if some pages are still mapped;
for this, there is a userspace view of the table is stored in iommu_table
* added locked_vm counting for DDW tables (including userspace view of those)

v6:
* fixed a bunch of errors in vfio: powerpc/spapr: Support Dynamic DMA windows
* moved static IOMMU properties from iommu_table_group to iommu_table_group_ops

v5:
* added SPAPR_TCE_IOMMU_v2 to tell the userspace that there is a memory
pre-registration feature
* added backward compatibility
* renamed a few things (mostly powerpc_iommu -> iommu_table_group)

v4:
* moved patches around to have VFIO and PPC patches separated as much as
possible
* now works with the existing upstream QEMU

v3:
* redesigned the whole thing
* multiple IOMMU groups per PHB - one PHB is needed for VFIO in the guest -
no problems with locked_vm counting; also we save memory on actual tables
* guest RAM preregistration is required for DDW
* PEs (IOMMU groups) are passed to VFIO with no DMA windows at all so
we do not bother with iommu_table::it_map anymore
* added multilevel TCE tables support to support really huge guests

v2:
* added missing __pa() in powerpc/powernv: Release replaced TCE
* reposted to make some noise




Alexey Kardashevskiy (34):
  powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group
  powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group
  powerpc/powernv/ioda: Clean up IOMMU group registration
  powerpc/iommu: Put IOMMU group explicitly
  powerpc/iommu: Always release iommu_table in iommu_free_table()
  vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU
driver
  vfio: powerpc/spapr: Check that IOMMU page is fully contained by
system page
  vfio: powerpc/spapr: Use it_page_size
  vfio: powerpc/spapr: Move locked_vm 

[PATCH kernel v11 23/34] powerpc/iommu/powernv: Release replaced TCE

2015-05-29 Thread Alexey Kardashevskiy
At the moment writing new TCE value to the IOMMU table fails with EBUSY
if there is a valid entry already. However PAPR specification allows
the guest to write new TCE value without clearing it first.

Another problem this patch is addressing is the use of pool locks for
external IOMMU users such as VFIO. The pool locks are to protect
DMA page allocator rather than entries and since the host kernel does
not control what pages are in use, there is no point in pool locks and
exchange()+put_page(oldtce) is sufficient to avoid possible races.

This adds an exchange() callback to iommu_table_ops which does the same
thing as set() plus it returns replaced TCE and DMA direction so
the caller can release the pages afterwards. The exchange() receives
a physical address unlike set() which receives linear mapping address;
and returns a physical address as the clear() does.

This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement
for a platform to have exchange() implemented in order to support VFIO.

This replaces iommu_tce_build() and iommu_clear_tce() with
a single iommu_tce_xchg().

This makes sure that TCE permission bits are not set in TCE passed to
IOMMU API as those are to be calculated by platform code from
DMA direction.

This moves SetPageDirty() to the IOMMU code to make it work for both
the VFIO ioctl interface and in-kernel TCE acceleration (when it becomes
available later).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v10:
* did s/tce/hpa/ in iommu_table_ops::exchange and tce_iommu_unuse_page()
* removed permission bits check from iommu_tce_put_param_check as
permission bits are not allowed in the address
* added BUG_ON(*hpa  ~IOMMU_PAGE_MASK(tbl)) to pnv_tce_xchg()

v9:
* changed exchange() to work with physical addresses as these addresses
are never accessed by the code and physical addresses are actual values
we put into the IOMMU table
---
 arch/powerpc/include/asm/iommu.h| 22 --
 arch/powerpc/kernel/iommu.c | 59 +--
 arch/powerpc/platforms/powernv/pci-ioda.c   | 34 
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  3 ++
 arch/powerpc/platforms/powernv/pci.c| 18 +
 arch/powerpc/platforms/powernv/pci.h|  2 +
 drivers/vfio/vfio_iommu_spapr_tce.c | 63 +
 7 files changed, 132 insertions(+), 69 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 489133c..4636734 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -45,13 +45,29 @@ extern int iommu_is_off;
 extern int iommu_force_on;
 
 struct iommu_table_ops {
+   /*
+* When called with direction==DMA_NONE, it is equal to clear().
+* uaddr is a linear map address.
+*/
int (*set)(struct iommu_table *tbl,
long index, long npages,
unsigned long uaddr,
enum dma_data_direction direction,
struct dma_attrs *attrs);
+#ifdef CONFIG_IOMMU_API
+   /*
+* Exchanges existing TCE with new TCE plus direction bits;
+* returns old TCE and DMA direction mask.
+* @tce is a physical address.
+*/
+   int (*exchange)(struct iommu_table *tbl,
+   long index,
+   unsigned long *hpa,
+   enum dma_data_direction *direction);
+#endif
void (*clear)(struct iommu_table *tbl,
long index, long npages);
+   /* get() returns a physical address */
unsigned long (*get)(struct iommu_table *tbl, long index);
void (*flush)(struct iommu_table *tbl);
 };
@@ -153,6 +169,8 @@ extern void iommu_register_group(struct iommu_table_group 
*table_group,
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
+extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+   unsigned long *hpa, enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number,
@@ -225,10 +243,6 @@ extern int iommu_tce_clear_param_check(struct iommu_table 
*tbl,
unsigned long npages);
 extern int iommu_tce_put_param_check(struct iommu_table *tbl,
unsigned long ioba, unsigned long tce);
-extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-   unsigned long hwaddr, enum dma_data_direction direction);
-extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
-   unsigned long entry);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table 

[PATCH kernel v11 26/34] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window

2015-05-29 Thread Alexey Kardashevskiy
This is a part of moving DMA window programming to an iommu_ops
callback. pnv_pci_ioda2_set_window() takes an iommu_table_group as
a first parameter (not pnv_ioda_pe) as it is going to be used as
a callback for VFIO DDW code.

This adds pnv_pci_ioda2_tvt_invalidate() to invalidate TVT as it is
a good thing to do. It does not have immediate effect now as the table
is never recreated after reboot but it will in the following patches.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v11:
* replaced some 1<<it_page_shift with IOMMU_PAGE_SIZE() macro

v9:
* initialize pe-table_group.tables[0] at the very end when
tbl is fully initialized
* moved pnv_pci_ioda2_tvt_invalidate() from earlier patch
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 47 +--
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3d29fe3..fda01c1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1968,6 +1968,43 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
}
 }
 
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+   int num, struct iommu_table *tbl)
+{
+   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
+   struct pnv_phb *phb = pe-phb;
+   int64_t rc;
+   const __u64 start_addr = tbl-it_offset  tbl-it_page_shift;
+   const __u64 win_size = tbl-it_size  tbl-it_page_shift;
+
+   pe_info(pe, Setting up window %llx..%llx pg=%x\n,
+   start_addr, start_addr + win_size - 1,
+   IOMMU_PAGE_SIZE(tbl));
+
+   /*
+* Map TCE table through TVT. The TVE index is the PE number
+* shifted by 1 bit for 32-bits DMA space.
+*/
+   rc = opal_pci_map_pe_dma_window(phb-opal_id,
+   pe-pe_number,
+   pe-pe_number  1,
+   1,
+   __pa(tbl-it_base),
+   tbl-it_size  3,
+   IOMMU_PAGE_SIZE(tbl));
+   if (rc) {
+   pe_err(pe, Failed to configure TCE table, err %ld\n, rc);
+   return rc;
+   }
+
+   pnv_pci_link_table_and_group(phb-hose-node, num,
+   tbl, pe-table_group);
+   pnv_pci_ioda2_tce_invalidate_entire(pe);
+
+   return 0;
+}
+
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
uint16_t window_id = (pe-pe_number  1 ) + 1;
@@ -2123,21 +2160,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
pe-table_group.ops = pnv_pci_ioda2_ops;
 #endif
 
-   /*
-* Map TCE table through TVT. The TVE index is the PE number
-* shifted by 1 bit for 32-bits DMA space.
-*/
-   rc = opal_pci_map_pe_dma_window(phb-opal_id, pe-pe_number,
-   pe-pe_number  1, 1, __pa(tbl-it_base),
-   tbl-it_size  3, 1ULL  tbl-it_page_shift);
+   rc = pnv_pci_ioda2_set_window(pe-table_group, 0, tbl);
if (rc) {
pe_err(pe, Failed to configure 32-bit TCE table,
err %ld\n, rc);
goto fail;
}
 
-   pnv_pci_ioda2_tce_invalidate_entire(pe);
-
/* OPAL variant of PHB3 invalidated TCEs */
if (phb-ioda.tce_inval_reg)
tbl-it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 24/34] powerpc/powernv/ioda2: Rework iommu_table creation

2015-05-29 Thread Alexey Kardashevskiy
This moves iommu_table creation to the beginning to make following changes
easier to review. This starts using table parameters from the iommu_table
struct.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* updated commit log and did minor cleanup
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index d7ac2d4..0e88241 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2070,13 +2070,23 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
addr = page_address(tce_mem);
memset(addr, 0, tce_table_size);
 
+   /* Setup linux iommu table */
+   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
+   IOMMU_PAGE_SHIFT_4K);
+
+   tbl-it_ops = pnv_ioda2_iommu_ops;
+   iommu_init_table(tbl, phb-hose-node);
+#ifdef CONFIG_IOMMU_API
+   pe-table_group.ops = pnv_pci_ioda2_ops;
+#endif
+
/*
 * Map TCE table through TVT. The TVE index is the PE number
 * shifted by 1 bit for 32-bits DMA space.
 */
rc = opal_pci_map_pe_dma_window(phb-opal_id, pe-pe_number,
-   pe-pe_number  1, 1, __pa(addr),
-   tce_table_size, 0x1000);
+   pe-pe_number  1, 1, __pa(tbl-it_base),
+   tbl-it_size  3, 1ULL  tbl-it_page_shift);
if (rc) {
pe_err(pe, Failed to configure 32-bit TCE table,
err %ld\n, rc);
@@ -2085,20 +2095,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 
pnv_pci_ioda2_tce_invalidate_entire(pe);
 
-   /* Setup linux iommu table */
-   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-   IOMMU_PAGE_SHIFT_4K);
-
/* OPAL variant of PHB3 invalidated TCEs */
if (phb-ioda.tce_inval_reg)
tbl-it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
 
-   tbl-it_ops = pnv_ioda2_iommu_ops;
-   iommu_init_table(tbl, phb-hose-node);
-#ifdef CONFIG_IOMMU_API
-   pe-table_group.ops = pnv_pci_ioda2_ops;
-#endif
-
if (pe-flags  PNV_IODA_PE_DEV) {
/*
 * Setting table base here only for carrying iommu_group
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 22/34] powerpc/powernv: Implement accessor to TCE entry

2015-05-29 Thread Alexey Kardashevskiy
This replaces direct accesses to TCE table with a helper which
returns a TCE entry address. This does not make a difference now but will
when multi-level TCE tables get introduced.

No change in behavior is expected.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* new patch in the series to separate this mechanical change from
functional changes; this is not right before
powerpc/powernv: Implement multilevel TCE tables but here in order
to let the next patch - powerpc/iommu/powernv: Release replaced TCE -
use pnv_tce() and avoid changing the same code twice
---
 arch/powerpc/platforms/powernv/pci.c | 34 +-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 4b4c583..b2a32d0 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -572,38 +572,46 @@ struct pci_ops pnv_pci_ops = {
.write = pnv_pci_write_config,
 };
 
+static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
+{
+   __be64 *tmp = ((__be64 *)tbl-it_base);
+
+   return tmp + idx;
+}
+
 int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
unsigned long uaddr, enum dma_data_direction direction,
struct dma_attrs *attrs)
 {
u64 proto_tce = iommu_direction_to_tce_perm(direction);
-   __be64 *tcep;
-   u64 rpn;
+   u64 rpn = __pa(uaddr)  tbl-it_page_shift;
+   long i;
 
-   tcep = ((__be64 *)tbl-it_base) + index - tbl-it_offset;
-   rpn = __pa(uaddr)  tbl-it_page_shift;
-
-   while (npages--)
-   *(tcep++) = cpu_to_be64(proto_tce |
-   (rpn++  tbl-it_page_shift));
+   for (i = 0; i  npages; i++) {
+   unsigned long newtce = proto_tce |
+   ((rpn + i)  tbl-it_page_shift);
+   unsigned long idx = index - tbl-it_offset + i;
 
+   *(pnv_tce(tbl, idx)) = cpu_to_be64(newtce);
+   }
 
return 0;
 }
 
 void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
-   __be64 *tcep;
+   long i;
 
-   tcep = ((__be64 *)tbl-it_base) + index - tbl-it_offset;
+   for (i = 0; i  npages; i++) {
+   unsigned long idx = index - tbl-it_offset + i;
 
-   while (npages--)
-   *(tcep++) = cpu_to_be64(0);
+   *(pnv_tce(tbl, idx)) = cpu_to_be64(0);
+   }
 }
 
 unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 {
-   return ((u64 *)tbl-it_base)[index - tbl-it_offset];
+   return *(pnv_tce(tbl, index - tbl-it_offset));
 }
 
 struct iommu_table *pnv_pci_table_alloc(int nid)
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver

2015-05-29 Thread Alexey Kardashevskiy
This moves page pinning (get_user_pages_fast()/put_page()) code out of
the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs
to as the platform code does not deal with page pinning.

This makes iommu_take_ownership()/iommu_release_ownership() deal with
the IOMMU table bitmap only.

This removes page unpinning from iommu_take_ownership() as the actual
TCE table might contain garbage and doing put_page() on it is undefined
behaviour.

Besides the last part, the rest of the patch is mechanical.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* added missing tce_iommu_clear call after iommu_release_ownership()
* brought @offset (a local variable) back to make patch even more
mechanical

v4:
* s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
---
 arch/powerpc/include/asm/iommu.h|  4 --
 arch/powerpc/kernel/iommu.c | 55 -
 drivers/vfio/vfio_iommu_spapr_tce.c | 80 +++--
 3 files changed, 67 insertions(+), 72 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 8353c86..e94a5e3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -194,10 +194,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, 
unsigned long entry,
unsigned long hwaddr, enum dma_data_direction direction);
 extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
unsigned long entry);
-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-   unsigned long entry, unsigned long pages);
-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
-   unsigned long entry, unsigned long tce);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 73eb39a..0019c80 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -986,30 +986,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, 
unsigned long entry)
 }
 EXPORT_SYMBOL_GPL(iommu_clear_tce);
 
-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-   unsigned long entry, unsigned long pages)
-{
-   unsigned long oldtce;
-   struct page *page;
-
-   for ( ; pages; --pages, ++entry) {
-   oldtce = iommu_clear_tce(tbl, entry);
-   if (!oldtce)
-   continue;
-
-   page = pfn_to_page(oldtce  PAGE_SHIFT);
-   WARN_ON(!page);
-   if (page) {
-   if (oldtce  TCE_PCI_WRITE)
-   SetPageDirty(page);
-   put_page(page);
-   }
-   }
-
-   return 0;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
-
 /*
  * hwaddr is a kernel virtual address here (0xc... bazillion),
  * tce_build converts it to a physical address.
@@ -1039,35 +1015,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned 
long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_build);
 
-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
-   unsigned long tce)
-{
-   int ret;
-   struct page *page = NULL;
-   unsigned long hwaddr, offset = tce  IOMMU_PAGE_MASK(tbl)  ~PAGE_MASK;
-   enum dma_data_direction direction = iommu_tce_direction(tce);
-
-   ret = get_user_pages_fast(tce  PAGE_MASK, 1,
-   direction != DMA_TO_DEVICE, page);
-   if (unlikely(ret != 1)) {
-   /* pr_err(iommu_tce: get_user_pages_fast failed tce=%lx 
ioba=%lx ret=%d\n,
-   tce, entry  tbl-it_page_shift, ret); */
-   return -EFAULT;
-   }
-   hwaddr = (unsigned long) page_address(page) + offset;
-
-   ret = iommu_tce_build(tbl, entry, hwaddr, direction);
-   if (ret)
-   put_page(page);
-
-   if (ret  0)
-   pr_err(iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n,
-   __func__, entry  tbl-it_page_shift, tce, ret);
-
-   return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
-
 int iommu_take_ownership(struct iommu_table *tbl)
 {
unsigned long sz = (tbl-it_size + 7)  3;
@@ -1081,7 +1028,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
}
 
memset(tbl-it_map, 0xff, sz);
-   iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size);
 
/*
 * Disable iommu bypass, otherwise the user can DMA to all of
@@ -1099,7 +1045,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
 {
unsigned long sz = (tbl-it_size + 7)  3;
 
-   iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size);
 

[PATCH kernel v11 13/34] powerpc/powernv: Do not set read flag if direction==DMA_NONE

2015-05-29 Thread Alexey Kardashevskiy
Normally a bitmap from the iommu_table is used to track what TCE entry
is in use. Since we are going to use iommu_table without its locks and
do xchg() instead, it becomes essential not to put bits which are not
implied in the direction flag as the old TCE value (more precisely -
the permission bits) will be used to decide whether to put the page or not.

This adds iommu_direction_to_tce_perm() (its counterpart is there already)
and uses it for powernv's pnv_tce_build().

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* added comment why we must put only valid permission bits
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 15 +++
 arch/powerpc/platforms/powernv/pci.c |  7 +--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e94a5e3..d91bd69 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -200,6 +200,7 @@ extern int iommu_take_ownership(struct iommu_table *tbl);
 extern void iommu_release_ownership(struct iommu_table *tbl);
 
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0019c80..ac2f959 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -866,6 +866,21 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
}
 }
 
+unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
+{
+   switch (dir) {
+   case DMA_BIDIRECTIONAL:
+   return TCE_PCI_READ | TCE_PCI_WRITE;
+   case DMA_FROM_DEVICE:
+   return TCE_PCI_WRITE;
+   case DMA_TO_DEVICE:
+   return TCE_PCI_READ;
+   default:
+   return 0;
+   }
+}
+EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
+
 #ifdef CONFIG_IOMMU_API
 /*
  * SPAPR TCE API
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index bca2aeb..b7ea245 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -576,15 +576,10 @@ static int pnv_tce_build(struct iommu_table *tbl, long 
index, long npages,
 unsigned long uaddr, enum dma_data_direction direction,
 struct dma_attrs *attrs, bool rm)
 {
-   u64 proto_tce;
+   u64 proto_tce = iommu_direction_to_tce_perm(direction);
__be64 *tcep, *tces;
u64 rpn;
 
-   proto_tce = TCE_PCI_READ; // Read allowed
-
-   if (direction != DMA_TO_DEVICE)
-   proto_tce |= TCE_PCI_WRITE;
-
tces = tcep = ((__be64 *)tbl-it_base) + index - tbl-it_offset;
rpn = __pa(uaddr)  tbl-it_page_shift;
 
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 14/34] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table

2015-05-29 Thread Alexey Kardashevskiy
This adds a iommu_table_ops struct and puts pointer to it into
the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush
callbacks from ppc_md to the new struct where they really belong to.

This adds the requirement for @it_ops to be initialized before calling
iommu_init_table() to make sure that we do not leave any IOMMU table
with iommu_table_ops uninitialized. This is not a parameter of
iommu_init_table() though as there will be cases when iommu_init_table()
will not be called on TCE tables, for example - VFIO.

This does s/tce_build/set/, s/tce_free/clear/ and removes tce_
redundant prefixes.

This removes tce_xxx_rm handlers from ppc_md but does not add
them to iommu_table_ops as this will be done later if we decide to
support TCE hypercalls in real mode. This removes _vm callbacks as
only virtual mode is supported by now so this also removes @rm parameter.

For pSeries, this always uses tce_buildmulti_pSeriesLP/
tce_buildmulti_pSeriesLP. This changes multi callback to fall back to
tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not
present. The reason for this is we still have to support multitce=off
boot parameter in disable_multitce() and we do not want to walk through
all IOMMU tables in the system and replace multi callbacks with single
ones.

For powernv, this defines _ops per PHB type which are P5IOC2/IODA1/IODA2.
This makes the callbacks for them public. Later patches will extend
callbacks for IODA1/2.

No change in behaviour is expected.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* pnv_tce_build/pnv_tce_free/pnv_tce_get have been made public and lost
rm parameters to make following patches simpler (realmode is not
supported here anyway)
* got rid of _vm versions of callbacks
---
 arch/powerpc/include/asm/iommu.h| 17 +++
 arch/powerpc/include/asm/machdep.h  | 25 ---
 arch/powerpc/kernel/iommu.c | 46 ++--
 arch/powerpc/kernel/vio.c   |  5 +++
 arch/powerpc/platforms/cell/iommu.c |  8 +++--
 arch/powerpc/platforms/pasemi/iommu.c   |  7 +++--
 arch/powerpc/platforms/powernv/pci-ioda.c   | 14 +
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 +
 arch/powerpc/platforms/powernv/pci.c| 47 +
 arch/powerpc/platforms/powernv/pci.h|  5 +++
 arch/powerpc/platforms/pseries/iommu.c  | 34 -
 arch/powerpc/sysdev/dart_iommu.c| 12 +---
 12 files changed, 116 insertions(+), 111 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index d91bd69..e2a45c3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -44,6 +44,22 @@
 extern int iommu_is_off;
 extern int iommu_force_on;
 
+struct iommu_table_ops {
+   int (*set)(struct iommu_table *tbl,
+   long index, long npages,
+   unsigned long uaddr,
+   enum dma_data_direction direction,
+   struct dma_attrs *attrs);
+   void (*clear)(struct iommu_table *tbl,
+   long index, long npages);
+   unsigned long (*get)(struct iommu_table *tbl, long index);
+   void (*flush)(struct iommu_table *tbl);
+};
+
+/* These are used by VIO */
+extern struct iommu_table_ops iommu_table_lpar_multi_ops;
+extern struct iommu_table_ops iommu_table_pseries_ops;
+
 /*
  * IOMAP_MAX_ORDER defines the largest contiguous block
  * of dma space we can get.  IOMAP_MAX_ORDER = 13
@@ -78,6 +94,7 @@ struct iommu_table {
 #ifdef CONFIG_IOMMU_API
struct iommu_group *it_group;
 #endif
+   struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
 #ifdef CONFIG_PPC_POWERNV
void   *data;
diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index ef889943..ab721b4 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -65,31 +65,6 @@ struct machdep_calls {
 * destroyed as well */
void(*hpte_clear_all)(void);
 
-   int (*tce_build)(struct iommu_table *tbl,
-long index,
-long npages,
-unsigned long uaddr,
-enum dma_data_direction direction,
-struct dma_attrs *attrs);
-   void(*tce_free)(struct iommu_table *tbl,
-   long index,
-   long npages);
-   unsigned long   (*tce_get)(struct iommu_table *tbl,
-   long index);
-   void(*tce_flush)(struct iommu_table *tbl);
-
-   

[PATCH kernel v11 19/34] powerpc/iommu: Fix IOMMU ownership control functions

2015-05-29 Thread Alexey Kardashevskiy
This adds missing locks in iommu_take_ownership()/
iommu_release_ownership().

This marks all pages busy in iommu_table::it_map in order to catch
errors if there is an attempt to use this table while ownership over it
is taken.

This only clears TCE content if there is no page marked busy in it_map.
Clearing must be done outside of the table locks as iommu_clear_tce()
called from iommu_clear_tces_and_put_pages() does this.

In order to use bitmap_empty(), the existing code clears bit#0 which
is set even in an empty table if it is bus-mapped at 0 as
iommu_init_table() reserves page#0 to prevent buggy drivers
from crashing when allocated page is bus-mapped at zero
(which is correct). This restores the bit in the case of failure
to bring the it_map to the state it was in when we called
iommu_take_ownership().

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* iommu_table_take_ownership() did not return @ret (and ignored EBUSY),
now it does return correct error.
* updated commit log about setting bit#0 in the case of failure

v5:
* do not store bit#0 value, it has to be set for zero-based table
anyway
* removed test_and_clear_bit
---
 arch/powerpc/kernel/iommu.c | 30 +-
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b6a397a..1ae1034 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1035,31 +1035,51 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
 
 int iommu_take_ownership(struct iommu_table *tbl)
 {
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+	int ret = 0;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
 	if (tbl->it_offset == 0)
 		clear_bit(0, tbl->it_map);
 
 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
 		pr_err("iommu_tce: it_map is not empty");
-		return -EBUSY;
+		ret = -EBUSY;
+		/* Restore bit#0 set by iommu_init_table() */
+		if (tbl->it_offset == 0)
+			set_bit(0, tbl->it_map);
+	} else {
+		memset(tbl->it_map, 0xff, sz);
 	}
 
-	memset(tbl->it_map, 0xff, sz);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
 
 void iommu_release_ownership(struct iommu_table *tbl)
 {
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
 	memset(tbl->it_map, 0, sz);
 
 	/* Restore bit#0 set by iommu_init_table() */
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
+
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group

2015-05-29 Thread Alexey Kardashevskiy
The set_iommu_table_base_and_group() name suggests that the function
sets table base and add a device to an IOMMU group.

The actual purpose for table base setting is to put some reference
into a device so later iommu_add_device() can get the IOMMU group
reference and the device to the group.

At the moment a group cannot be explicitly passed to iommu_add_device()
as we want it to work from the bus notifier, we can fix it later and
remove confusing calls of set_iommu_table_base().

This replaces set_iommu_table_base_and_group() with a couple of
set_iommu_table_base() + iommu_add_device() which makes reading the code
easier.

This adds few comments why set_iommu_table_base() and iommu_add_device()
are called where they are called.

For IODA1/2, this essentially removes iommu_add_device() call from
the pnv_pci_ioda_dma_dev_setup() as it will always fail at this particular
place:
- for physical PE, the device is already attached by iommu_add_device()
in pnv_pci_ioda_setup_dma_pe();
- for virtual PE, the sysfs entries are not ready to create all symlinks
so actual adding is happening in tce_iommu_bus_notifier.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v10:
* new to the series
---
 arch/powerpc/include/asm/iommu.h|  7 ---
 arch/powerpc/platforms/powernv/pci-ioda.c   | 27 +++
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  3 ++-
 arch/powerpc/platforms/pseries/iommu.c  | 15 ---
 4 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 1e27d63..8353c86 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -140,13 +140,6 @@ static inline int __init tce_iommu_bus_notifier_init(void)
 }
 #endif /* !CONFIG_IOMMU_API */
 
-static inline void set_iommu_table_base_and_group(struct device *dev,
- void *base)
-{
-   set_iommu_table_base(dev, base);
-   iommu_add_device(dev);
-}
-
 extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
struct scatterlist *sglist, int nelems,
unsigned long mask,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2f092bb..9a77f3c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1598,7 +1598,13 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb 
*phb, struct pci_dev *pdev
 
 	pe = &phb->ioda.pe_array[pdn->pe_number];
 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-	set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
+	set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+   /*
+* Note: iommu_add_device() will fail here as
+* for physical PE: the device is already added by now;
+* for virtual PE: sysfs entries are not ready yet and
+* tce_iommu_bus_notifier will add the device to a group later.
+*/
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
@@ -1659,7 +1665,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
struct pci_dev *dev;
 
list_for_each_entry(dev, bus-devices, bus_list) {
-   set_iommu_table_base_and_group(dev-dev, pe-tce32_table);
+   set_iommu_table_base(dev-dev, pe-tce32_table);
+   iommu_add_device(dev-dev);
 
if (dev-subordinate)
pnv_ioda_setup_bus_dma(pe, dev-subordinate);
@@ -1835,7 +1842,13 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
if (pe-flags  PNV_IODA_PE_DEV) {
iommu_register_group(tbl, phb-hose-global_number,
 pe-pe_number);
-   set_iommu_table_base_and_group(pe-pdev-dev, tbl);
+   /*
+* Setting table base here only for carrying iommu_group
+* further down to let iommu_add_device() do the job.
+* pnv_pci_ioda_dma_dev_setup will override it later anyway.
+*/
+   set_iommu_table_base(pe-pdev-dev, tbl);
+   iommu_add_device(pe-pdev-dev);
} else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
iommu_register_group(tbl, phb-hose-global_number,
 pe-pe_number);
@@ -1963,7 +1976,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
if (pe-flags  PNV_IODA_PE_DEV) {
iommu_register_group(tbl, phb-hose-global_number,
 pe-pe_number);
-   set_iommu_table_base_and_group(pe-pdev-dev, tbl);
+   /*
+* Setting table base here only for carrying iommu_group
+* further down to let iommu_add_device() do the job.
+* 

[PATCH kernel v11 01/34] powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group

2015-05-29 Thread Alexey Kardashevskiy
This relies on the fact that a PCI device always has an IOMMU table
which may not be the case when we get dynamic DMA windows so
let's use more reliable check for IOMMU group here.

As we do not rely on the table presence here, remove the workaround
from pnv_pci_ioda2_set_bypass(); also remove the @add_to_iommu_group
parameter from pnv_ioda_setup_bus_dma().

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Acked-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/kernel/eeh.c |  4 +---
 arch/powerpc/platforms/powernv/pci-ioda.c | 27 +--
 2 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9ee61d1..defd874 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1412,13 +1412,11 @@ static int dev_has_iommu_table(struct device *dev, void 
*data)
 {
struct pci_dev *pdev = to_pci_dev(dev);
struct pci_dev **ppdev = data;
-   struct iommu_table *tbl;
 
if (!dev)
return 0;
 
-   tbl = get_iommu_table_base(dev);
-	if (tbl && tbl->it_group) {
+	if (dev->iommu_group) {
*ppdev = pdev;
return 1;
}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index f8bc950..2f092bb 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1654,21 +1654,15 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct 
pnv_phb *phb,
 }
 
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-  struct pci_bus *bus,
-  bool add_to_iommu_group)
+  struct pci_bus *bus)
 {
struct pci_dev *dev;
 
list_for_each_entry(dev, bus-devices, bus_list) {
-   if (add_to_iommu_group)
-   set_iommu_table_base_and_group(dev-dev,
-  pe-tce32_table);
-   else
-   set_iommu_table_base(dev-dev, pe-tce32_table);
+   set_iommu_table_base_and_group(dev-dev, pe-tce32_table);
 
if (dev-subordinate)
-   pnv_ioda_setup_bus_dma(pe, dev-subordinate,
-  add_to_iommu_group);
+   pnv_ioda_setup_bus_dma(pe, dev-subordinate);
}
 }
 
@@ -1845,7 +1839,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
} else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
iommu_register_group(tbl, phb-hose-global_number,
 pe-pe_number);
-   pnv_ioda_setup_bus_dma(pe, pe-pbus, true);
+   pnv_ioda_setup_bus_dma(pe, pe-pbus);
} else if (pe-flags  PNV_IODA_PE_VF) {
iommu_register_group(tbl, phb-hose-global_number,
 pe-pe_number);
@@ -1882,17 +1876,6 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table 
*tbl, bool enable)
 window_id,
 pe-tce_bypass_base,
 0);
-
-   /*
-* EEH needs the mapping between IOMMU table and group
-* of those VFIO/KVM pass-through devices. We can postpone
-* resetting DMA ops until the DMA mask is configured in
-* host side.
-*/
-   if (pe-pdev)
-   set_iommu_table_base(pe-pdev-dev, tbl);
-   else
-   pnv_ioda_setup_bus_dma(pe, pe-pbus, false);
}
if (rc)
 		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
@@ -1984,7 +1967,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
} else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
iommu_register_group(tbl, phb-hose-global_number,
 pe-pe_number);
-   pnv_ioda_setup_bus_dma(pe, pe-pbus, true);
+   pnv_ioda_setup_bus_dma(pe, pe-pbus);
} else if (pe-flags  PNV_IODA_PE_VF) {
iommu_register_group(tbl, phb-hose-global_number,
 pe-pe_number);
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups

2015-05-29 Thread Alexey Kardashevskiy
The iommu_table struct keeps a list of IOMMU groups it is used for.
At the moment there is just a single group attached but further
patches will add TCE table sharing. When sharing is enabled, TCE cache
in each PE needs to be invalidated so does the patch.

This does not change pnv_pci_ioda1_tce_invalidate() as there is no plan
to enable TCE table sharing on PHBs older than IODA2.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v10:
* new to the series
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 35 ---
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3fd8b18..94fccc8 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -24,6 +24,7 @@
 #include <linux/msi.h>
 #include <linux/memblock.h>
 #include <linux/iommu.h>
+#include <linux/rculist.h>
 
 #include asm/sections.h
 #include asm/io.h
@@ -1764,23 +1765,15 @@ static inline void 
pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
__raw_writeq(cpu_to_be64(val), phb-ioda.tce_inval_reg);
 }
 
-static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
-   unsigned long index, unsigned long npages, bool rm)
+static void pnv_pci_ioda2_tce_do_invalidate(unsigned pe_number, bool rm,
+   __be64 __iomem *invalidate, unsigned shift,
+   unsigned long index, unsigned long npages)
 {
-   struct iommu_table_group_link *tgl = list_first_entry_or_null(
-   tbl-it_group_list, struct iommu_table_group_link,
-   next);
-   struct pnv_ioda_pe *pe = container_of(tgl-table_group,
-   struct pnv_ioda_pe, table_group);
unsigned long start, end, inc;
-   __be64 __iomem *invalidate = rm ?
-   (__be64 __iomem *)pe-phb-ioda.tce_inval_reg_phys :
-   pe-phb-ioda.tce_inval_reg;
-   const unsigned shift = tbl-it_page_shift;
 
/* We'll invalidate DMA address in PE scope */
 	start = 0x2ull << 60;
-	start |= (pe->pe_number & 0xFF);
+	start |= (pe_number & 0xFF);
end = start;
 
/* Figure out the start, end and step */
@@ -1798,6 +1791,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
iommu_table *tbl,
}
 }
 
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool rm)
+{
+	struct iommu_table_group_link *tgl;
+
+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+				struct pnv_ioda_pe, table_group);
+		__be64 __iomem *invalidate = rm ?
+			(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
+			pe->phb->ioda.tce_inval_reg;
+
+		pnv_pci_ioda2_tce_do_invalidate(pe->pe_number, rm,
+				invalidate, tbl->it_page_shift,
+				index, npages);
+	}
+}
+
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
long npages, unsigned long uaddr,
enum dma_data_direction direction,
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2

2015-05-29 Thread Alexey Kardashevskiy
The existing implementation accounts the whole DMA window in
the locked_vm counter. This is going to be worse with multiple
containers and huge DMA windows. Also, real-time accounting would requite
additional tracking of accounted pages due to the page size difference -
IOMMU uses 4K pages and system uses 4K or 64K pages.

Another issue is that actual pages pinning/unpinning happens on every
DMA map/unmap request. This does not affect the performance much now as
we spend way too much time now on switching context between
guest/userspace/host but this will start to matter when we add in-kernel
DMA map/unmap acceleration.

This introduces a new IOMMU type for SPAPR - VFIO_SPAPR_TCE_v2_IOMMU.
New IOMMU deprecates VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE and introduces
2 new ioctls to register/unregister DMA memory -
VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY -
which receive user space address and size of a memory region which
needs to be pinned/unpinned and counted in locked_vm.
New IOMMU splits physical pages pinning and TCE table update
into 2 different operations. It requires:
1) guest pages to be registered first
2) consequent map/unmap requests to work only with pre-registered memory.
For the default single window case this means that the entire guest
(instead of 2GB) needs to be pinned before using VFIO.
When a huge DMA window is added, no additional pinning will be
required, otherwise it would be guest RAM + 2GB.

The new memory registration ioctls are not supported by
VFIO_SPAPR_TCE_IOMMU. Dynamic DMA window and in-kernel acceleration
will require memory to be preregistered in order to work.

The accounting is done per the user process.

This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
can do with v1 or v2 IOMMUs.

In order to support memory pre-registration, we need a way to track
the use of every registered memory region and only allow unregistration
if a region is not in use anymore. So we need a way to tell from what
region the just cleared TCE was from.

This adds a userspace view of the TCE table into iommu_table struct.
It contains userspace address, one per TCE entry. The table is only
allocated when the ownership over an IOMMU group is taken which means
it is only used from outside of the powernv code (such as VFIO).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v11:
* mm_iommu_put() does not return a code so this does not check it
* moved v2 in tce_container to pack the struct

v10:
* moved it_userspace allocation to vfio_iommu_spapr_tce as it VFIO
specific thing
* squashed powerpc/iommu: Add userspace view of TCE table into this as
it is
a part of IOMMU v2
* s/tce_iommu_use_page_v2/tce_iommu_prereg_ua_to_hpa/
* fixed some function names to have tce_iommu_ in the beginning rather
just tce_
* as mm_iommu_mapped_inc() can now fail, check for the return code

v9:
* s/tce_get_hva_cached/tce_iommu_use_page_v2/

v7:
* now memory is registered per mm (i.e. process)
* moved memory registration code to powerpc/mmu
* merged vfio: powerpc/spapr: Define v2 IOMMU into this
* limited new ioctls to v2 IOMMU
* updated doc
* unsupported ioclts return -ENOTTY instead of -EPERM

v6:
* tce_get_hva_cached() returns hva via a pointer

v4:
* updated docs
* s/kzmalloc/vzalloc/
* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
replaced offset with index
* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
and removed duplicating vfio_iommu_spapr_register_memory
---
 Documentation/vfio.txt  |  31 ++-
 arch/powerpc/include/asm/iommu.h|   6 +
 drivers/vfio/vfio_iommu_spapr_tce.c | 512 ++--
 include/uapi/linux/vfio.h   |  27 ++
 4 files changed, 487 insertions(+), 89 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 96978ec..7dcf2b5 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
 
 This implementation has some specifics:
 
-1) Only one IOMMU group per container is supported as an IOMMU group
-represents the minimal entity which isolation can be guaranteed for and
-groups are allocated statically, one per a Partitionable Endpoint (PE)
+1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
+container is supported as an IOMMU table is allocated at the boot time,
+one table per a IOMMU group which is a Partitionable Endpoint (PE)
 (PE is often a PCI domain but not always).
+Newer systems (POWER8 with IODA2) have improved hardware design which allows
+to remove this limitation and have multiple IOMMU groups per a VFIO container.
 
 2) The hardware supports so called DMA windows - the PCI address range
 within which DMA transfer is allowed, any attempt to access address space
@@ -427,6 +429,29 @@ The code flow from the example above should be slightly 
changed:
 
 

[PATCH kernel v11 32/34] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-05-29 Thread Alexey Kardashevskiy
We are adding support for DMA memory pre-registration to be used in
conjunction with VFIO. The idea is that the userspace which is going to
run a guest may want to pre-register a user space memory region so
it all gets pinned once and never goes away. Having this done,
a hypervisor will not have to pin/unpin pages on every DMA map/unmap
request. This is going to help with multiple pinning of the same memory.

Another use of it is in-kernel real mode (mmu off) acceleration of
DMA requests where real time translation of guest physical to host
physical addresses is non-trivial and may fail as linux ptes may be
temporarily invalid. Also, having cached host physical addresses
(compared to just pinning at the start and then walking the page table
again on every H_PUT_TCE), we can be sure that the addresses which we put
into TCE table are the ones we already pinned.

This adds a list of memory regions to mm_context_t. Each region consists
of a header and a list of physical addresses. This adds API to:
1. register/unregister memory regions;
2. do final cleanup (which puts all pre-registered pages);
3. do userspace to physical address translation;
4. manage usage counters; multiple registration of the same memory
is allowed (once per container).

This implements 2 counters per registered memory region:
- @mapped: incremented on every DMA mapping; decremented on unmapping;
initialized to 1 when a region is just registered; once it becomes zero,
no more mappings allowed;
- @used: incremented on every register ioctl; decremented on
unregister; unregistration is allowed for DMA mapped regions unless
it is the very last reference. For the very last reference this checks
that the region is still mapped and returns -EBUSY so the userspace
gets to know that memory is still pinned and unregistration needs to
be retried; @used remains 1.

Host physical addresses are stored in vmalloc'ed array. In order to
access these in the real mode (mmu off), there is a real_vmalloc_addr()
helper. In-kernel acceleration patchset will move it from KVM to MMU code.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v11:
* added mutex to protect adding and removing
* added mm_iommu_init() helper
* kref is removed, now there are an atomic counter (@mapped) and a mutex
(for @used)
* merged mm_iommu_alloc into mm_iommu_get and do check-and-alloc under
one mutex lock; mm_iommu_get() returns old @used value so the caller can
know if it needs to elevate locked_vm counter
* do locked_vm counting in mmu_context_hash64_iommu.c

v10:
* split mm_iommu_mapped_update into mm_iommu_mapped_dec + mm_iommu_mapped_inc
* mapped counter now keep one reference for itself and mm_iommu_mapped_inc()
can tell if the region is being released
* updated commit log

v8:
* s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
* fixed error fallback look (s/[i]/[j]/)
---
 arch/powerpc/include/asm/mmu-hash64.h  |   3 +
 arch/powerpc/include/asm/mmu_context.h |  16 ++
 arch/powerpc/kernel/setup_64.c |   3 +
 arch/powerpc/mm/Makefile   |   1 +
 arch/powerpc/mm/mmu_context_hash64.c   |   6 +
 arch/powerpc/mm/mmu_context_hash64_iommu.c | 297 +
 6 files changed, 326 insertions(+)
 create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c

diff --git a/arch/powerpc/include/asm/mmu-hash64.h 
b/arch/powerpc/include/asm/mmu-hash64.h
index 1da6a81..a82f534 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -536,6 +536,9 @@ typedef struct {
/* for 4K PTE fragment support */
void *pte_frag;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+   struct list_head iommu_group_mem_list;
+#endif
 } mm_context_t;
 
 
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 73382eb..70a4f2a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -16,6 +16,22 @@
  */
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 extern void destroy_context(struct mm_struct *mm);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+struct mm_iommu_table_group_mem_t;
+
+extern bool mm_iommu_preregistered(void);
+extern long mm_iommu_get(unsigned long ua, unsigned long entries,
+   struct mm_iommu_table_group_mem_t **pmem);
+extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_init(mm_context_t *ctx);
+extern void mm_iommu_cleanup(mm_context_t *ctx);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+   unsigned long size);
+extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+   unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
+#endif
 
 extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
 

[PATCH kernel v11 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()

2015-05-29 Thread Alexey Kardashevskiy
At the moment iommu_free_table() only releases memory if
the table was initialized for the platform code use, i.e. it had
it_map initialized (which purpose is to track DMA memory space use).

With dynamic DMA windows, we will need to be able to release
iommu_table even if it was used for VFIO in which case it_map is NULL
so does the patch.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v11:
* fixed parameter checks
---
 arch/powerpc/kernel/iommu.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 3d47eb3..73eb39a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -713,9 +713,11 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
unsigned long bitmap_sz;
unsigned int order;
 
-	if (!tbl || !tbl->it_map) {
-		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
-				node_name);
+	if (!tbl)
+		return;
+
+	if (!tbl->it_map) {
+		kfree(tbl);
 		return;
 	}
 
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel v11 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-05-29 Thread Alexey Kardashevskiy
Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds a iommu_table_group container
for TCE tables. Right now just one table is supported.

For IODA, instead of embedding iommu_table, the new iommu_table_group
keeps pointers to those. The iommu_table structs are allocated
dynamically now by a pnv_pci_table_alloc() helper as PCI hotplug
code (for EEH recovery) and SRIOV are supported there.

For P5IOC2, both iommu_table_group and iommu_table are embedded into
PE struct. As there is no EEH and SRIOV support for P5IOC2,
iommu_free_table() should not be called on iommu_table struct pointers
so we can keep it embedded in pnv_phb::p5ioc2.

For pSeries, this replaces multiple calls of kzalloc_node() with a new
iommu_pseries_group_alloc() helper and stores the table group struct
pointer into the pci_dn struct. For release, a iommu_table_group_free()
helper is added.

This moves iommu_table struct allocation from SR-IOV code to
the generic DMA initialization code in pnv_pci_ioda2_setup_dma_pe.

This replaces a single pointer to iommu_group with a list of
iommu_table_group structs. For now it is just a single iommu_table_group
in this list but later with TCE table sharing enabled, the list will
keep all the IOMMU groups which use the particular table. The list
uses iommu_table_group_link structs rather than iommu_table_group::next
as a VFIO container may have 2 IOMMU tables, each will have its own list
head pointer as it is mainly for TCE invalidation code which should
walk through all attached groups and invalidate TCE cache so
the table has to keep the list head pointer. The other option would
be storing list head in a VFIO container but it would not work as
the platform code (which does TCE table update and invalidation) has
no idea about VFIO.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v10:
* iommu_table is not embedded into iommu_table_group but allocated
dynamically
* iommu_table allocation is moved to a single place for IODA2's
pnv_pci_ioda_setup_dma_pe where it belongs to
* added list of groups into iommu_table; most of the code just looks at
the first item to keep the patch simpler

v9:
* s/it_group/it_table_group/
* added and used iommu_table_group_free(), from now iommu_free_table()
is only used for VIO
* added iommu_pseries_group_alloc()
* squashed powerpc/iommu: Introduce iommu_table_alloc() helper into this
---
 arch/powerpc/include/asm/iommu.h|   8 +-
 arch/powerpc/kernel/iommu.c |   9 +-
 arch/powerpc/platforms/powernv/pci-ioda.c   |  45 ++
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |   3 +
 arch/powerpc/platforms/powernv/pci.c|  76 +
 arch/powerpc/platforms/powernv/pci.h|   7 ++
 arch/powerpc/platforms/pseries/iommu.c  |  33 +++-
 drivers/vfio/vfio_iommu_spapr_tce.c | 122 
 8 files changed, 242 insertions(+), 61 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5a7267f..44a20cc 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -91,7 +91,7 @@ struct iommu_table {
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
unsigned long  it_page_shift;/* table iommu page size */
-   struct iommu_table_group *it_table_group;
+   struct list_head it_group_list;/* List of iommu_table_group_link */
struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
@@ -126,6 +126,12 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
int nid);
 #define IOMMU_TABLE_GROUP_MAX_TABLES   1
 
+struct iommu_table_group_link {
+   struct list_head next;
+   struct rcu_head rcu;
+   struct iommu_table_group *table_group;
+};
+
 struct iommu_table_group {
struct iommu_group *group;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 719f048..e305a8f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1078,6 +1078,7 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership);
 int iommu_add_device(struct device *dev)
 {
struct iommu_table *tbl;
+   struct iommu_table_group_link *tgl;
 
/*
 * The sysfs entries should be populated before
@@ -1095,15 +1096,17 @@ int iommu_add_device(struct device *dev)
}
 
tbl = get_iommu_table_base(dev);
-	if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
+	if (!tbl || list_empty(&tbl->it_group_list)) {

[PATCH kernel v11 15/34] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free()

2015-05-29 Thread Alexey Kardashevskiy
The pnv_pci_ioda_tce_invalidate() helper invalidates TCE cache. It is
supposed to be called on IODA1/2 and not called on p5ioc2. It receives
start and end host addresses of TCE table.

IODA2 actually needs PCI addresses to invalidate the cache. Those
can be calculated from host addresses but since we are going
to implement multi-level TCE tables, calculating PCI address from
a host address might get either tricky or ugly as TCE table remains flat
on PCI bus but not in RAM.

This moves pnv_pci_ioda_tce_invalidate() from generic pnv_tce_build/
pnt_tce_free and defines IODA1/2-specific callbacks which call generic
ones and do PHB-model-specific TCE cache invalidation. P5IOC2 keeps
using generic callbacks as before.

This changes pnv_pci_ioda2_tce_invalidate() to receive a TCE index and
a number of pages, which are PCI addresses shifted by the IOMMU page shift.

No change in behaviour is expected.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v11:
* changed type of some ret to int as everywhere else

v10:
* moved before "Switch from iommu_table to new iommu_table_group" as that patch
adds a list of groups to iommu_table, and TCE invalidation depends on it

v9:
* removed confusing comment from commit log about unintentional calling of
pnv_pci_ioda_tce_invalidate()
* moved mechanical changes away to "powerpc/iommu: Move tce_xxx callbacks from
ppc_md to iommu_table"
* fixed a bug with broken invalidation in pnv_pci_ioda2_tce_invalidate:
@index already includes @tbl->it_offset, but the old code added it again, which
later broke DDW
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 81 ++-
 arch/powerpc/platforms/powernv/pci.c  | 17 ++-
 2 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2924abe..3d32c37 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1678,18 +1678,19 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe 
*pe,
}
 }
 
-static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
-struct iommu_table *tbl,
-__be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool rm)
 {
+   struct pnv_ioda_pe *pe = tbl-data;
__be64 __iomem *invalidate = rm ?
(__be64 __iomem *)pe-tce_inval_reg_phys :
(__be64 __iomem *)tbl-it_index;
unsigned long start, end, inc;
const unsigned shift = tbl-it_page_shift;
 
-   start = __pa(startp);
-   end = __pa(endp);
+   start = __pa(((__be64 *)tbl-it_base) + index - tbl-it_offset);
+   end = __pa(((__be64 *)tbl-it_base) + index - tbl-it_offset +
+   npages - 1);
 
/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
if (tbl-it_busno) {
@@ -1725,16 +1726,39 @@ static void pnv_pci_ioda1_tce_invalidate(struct 
pnv_ioda_pe *pe,
 */
 }
 
+static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
+   long npages, unsigned long uaddr,
+   enum dma_data_direction direction,
+   struct dma_attrs *attrs)
+{
+   int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+   attrs);
+
+   if (!ret  (tbl-it_type  TCE_PCI_SWINV_CREATE))
+   pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+
+   return ret;
+}
+
+static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
+   long npages)
+{
+   pnv_tce_free(tbl, index, npages);
+
+   if (tbl-it_type  TCE_PCI_SWINV_FREE)
+   pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+}
+
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
-   .set = pnv_tce_build,
-   .clear = pnv_tce_free,
+   .set = pnv_ioda1_tce_build,
+   .clear = pnv_ioda1_tce_free,
.get = pnv_tce_get,
 };
 
-static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
-struct iommu_table *tbl,
-__be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool rm)
 {
+   struct pnv_ioda_pe *pe = tbl-data;
unsigned long start, end, inc;
__be64 __iomem *invalidate = rm ?
(__be64 __iomem *)pe-tce_inval_reg_phys :
@@ -1747,10 +1771,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
pnv_ioda_pe *pe,
end = start;
 
/* Figure out the start, end and step */
-   inc = tbl-it_offset + (((u64)startp - tbl-it_base) / sizeof(u64));
-   start |= (inc  shift);

[PATCH kernel v11 11/34] vfio: powerpc/spapr: Moving pinning/unpinning to helpers

2015-05-29 Thread Alexey Kardashevskiy
This is a pretty mechanical patch to make next patches simpler.

The new tce_iommu_unuse_page() helper does put_page() now but it might skip
that once the memory registering patch is applied.

As we are here, this removes unnecessary checks for a value returned
by pfn_to_page() as it cannot possibly return NULL.

This moves tce_iommu_disable() later to let tce_iommu_clear() know if
the container has been enabled because if it has not been, then
put_page() must not be called on TCEs from the TCE table. This situation
is not yet possible but it will after KVM acceleration patchset is
applied.

This changes code to work with physical addresses rather than linear
mapping addresses for better code readability. Following patches will
add an xchg() callback for an IOMMU table which will accept/return
physical addresses (unlike current tce_build()) which will eliminate
redundant conversions.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* changed helpers to work with physical addresses rather than linear
(for simplicity - later ::xchg() will receive physical and avoid
additional convertions)

v6:
* tce_get_hva() returns hva via a pointer
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 61 +
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index e21479c..115d5e6 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -191,69 +191,90 @@ static void tce_iommu_release(void *iommu_data)
struct tce_container *container = iommu_data;
 
WARN_ON(container-tbl  !container-tbl-it_group);
-   tce_iommu_disable(container);
 
if (container-tbl  container-tbl-it_group)
tce_iommu_detach_group(iommu_data, container-tbl-it_group);
 
+   tce_iommu_disable(container);
mutex_destroy(container-lock);
 
kfree(container);
 }
 
+static void tce_iommu_unuse_page(struct tce_container *container,
+   unsigned long oldtce)
+{
+   struct page *page;
+
+   if (!(oldtce  (TCE_PCI_READ | TCE_PCI_WRITE)))
+   return;
+
+   page = pfn_to_page(oldtce  PAGE_SHIFT);
+
+   if (oldtce  TCE_PCI_WRITE)
+   SetPageDirty(page);
+
+   put_page(page);
+}
+
 static int tce_iommu_clear(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long pages)
 {
unsigned long oldtce;
-   struct page *page;
 
for ( ; pages; --pages, ++entry) {
oldtce = iommu_clear_tce(tbl, entry);
if (!oldtce)
continue;
 
-   page = pfn_to_page(oldtce  PAGE_SHIFT);
-   WARN_ON(!page);
-   if (page) {
-   if (oldtce  TCE_PCI_WRITE)
-   SetPageDirty(page);
-   put_page(page);
-   }
+   tce_iommu_unuse_page(container, oldtce);
}
 
return 0;
 }
 
+static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
+{
+   struct page *page = NULL;
+   enum dma_data_direction direction = iommu_tce_direction(tce);
+
+   if (get_user_pages_fast(tce  PAGE_MASK, 1,
+   direction != DMA_TO_DEVICE, page) != 1)
+   return -EFAULT;
+
+   *hpa = __pa((unsigned long) page_address(page));
+
+   return 0;
+}
+
 static long tce_iommu_build(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long tce, unsigned long pages)
 {
long i, ret = 0;
-   struct page *page = NULL;
-   unsigned long hva;
+   struct page *page;
+   unsigned long hpa;
enum dma_data_direction direction = iommu_tce_direction(tce);
 
for (i = 0; i  pages; ++i) {
unsigned long offset = tce  IOMMU_PAGE_MASK(tbl)  ~PAGE_MASK;
 
-   ret = get_user_pages_fast(tce  PAGE_MASK, 1,
-   direction != DMA_TO_DEVICE, page);
-   if (unlikely(ret != 1)) {
-   ret = -EFAULT;
+   ret = tce_iommu_use_page(tce, hpa);
+   if (ret)
break;
-   }
 
+   page = pfn_to_page(hpa  PAGE_SHIFT);
if (!tce_page_is_contained(page, tbl-it_page_shift)) {
ret = -EPERM;
break;
}
 
-   hva = (unsigned long) page_address(page) + offset;
-
-   ret = iommu_tce_build(tbl, entry + i, hva, direction);
+   hpa |= offset;
+   ret = iommu_tce_build(tbl, entry + i, (unsigned long) 

[PATCH 00/13] arm64: KVM: GICv3 ITS emulation

2015-05-29 Thread Andre Przywara
The GICv3 ITS (Interrupt Translation Service) is a part of the
ARM GICv3 interrupt controller used for implementing MSIs.
It specifies a new kind of interrupts (LPIs), which are mapped to
establish a connection between a device, its MSI payload value and
the target processor the IRQ is eventually delivered to.
In order to allow using MSIs in an ARM64 KVM guest, we emulate this
ITS widget in the kernel.
The ITS works by reading commands written by software (from the guest
in our case) into a (guest allocated) memory region and establishing
the mapping between a device, the MSI payload and the target CPU.
We parse these commands and update our internal data structures to
reflect those changes. On an MSI injection we iterate those
structures to learn the LPI number we have to inject.
For the time being we use simple lists to hold the data, this is
good enough for the small number of entries each of the components
currently have. Should this become a performance bottleneck in the
future, those can be extended to arrays or trees if needed.

Most of the code lives in a separate source file (its-emul.c), though
there are some changes necessary both in vgic.c and gic-v3-emul.c.
Patch 01/13 gets rid of the internal tracking of the used LR for
an injected IRQ.
Patch 02/13 extends the KVM MSI ioctl to hold a device ID.
Patch 03/13 introduces an emulation model specific destroy function
to let the ITS be teared down correctly later.
The rest of the patches implement the ITS functionality step by step.
For more details see the respective commit messages.

For the time being this series gives us the ability to use emulated
PCI devices that can use MSIs in the guest. Those have to be
triggered by letting the userland device emulation simulate the MSI
write with the KVM_SIGNAL_MSI ioctl. This will be translated into
the proper LPI by the ITS emulation and injected into the guest in
the usual way (just with a higher IRQ number).

This series is based on 4.1-rc5 and can be found at the its-emul/v1
branch of this repository [1].
For this to be used you need a GICv3 host machine, though it does not
rely on any host ITS bits (neither hardware or software).

To test this you can use the kvmtool patches available in the its
branch here [2].
Start a guest with: $ lkvm run --irqchip=gicv3-its --force-pci
and see the ITS being used for instance by the virtio devices.

[1]: git://linux-arm.org/linux-ap.git
 http://www.linux-arm.org/git?p=linux-ap.git;a=log;h=refs/heads/its-emul/v1
[2]: git://linux-arm.org/kvmtool.git
 http://www.linux-arm.org/git?p=kvmtool.git;a=log;h=refs/heads/its

Andre Przywara (13):
  KVM: arm/arm64: VGIC: don't track used LRs in the distributor
  KVM: extend struct kvm_msi to hold a 32-bit device ID
  KVM: arm/arm64: add emulation model specific destroy function
  KVM: arm64: Introduce new MMIO region for the ITS base address
  KVM: arm64: handle ITS related GICv3 redistributor registers
  KVM: arm64: introduce ITS emulation file with stub functions
  KVM: arm64: implement basic ITS register handlers
  KVM: arm64: add data structures to model ITS interrupt translation
  KVM: arm64: handle pending bit for LPIs in ITS emulation
  KVM: arm64: sync LPI properties and status between guest and KVM
  KVM: arm64: implement ITS command queue command handlers
  KVM: arm64: implement MSI injection in ITS emulation
  KVM: arm64: enable ITS emulation as a virtual MSI controller

 Documentation/virtual/kvm/api.txt  |   10 +-
 Documentation/virtual/kvm/devices/arm-vgic.txt |7 +
 arch/arm64/include/uapi/asm/kvm.h  |3 +
 arch/arm64/kvm/Kconfig |1 +
 arch/arm64/kvm/Makefile|1 +
 include/kvm/arm_vgic.h |   39 +-
 include/linux/irqchip/arm-gic-v3.h |   10 +
 include/uapi/linux/kvm.h   |4 +-
 virt/kvm/arm/its-emul.c| 1026 
 virt/kvm/arm/its-emul.h|   52 ++
 virt/kvm/arm/vgic-v2.c |1 +
 virt/kvm/arm/vgic-v3-emul.c|   98 ++-
 virt/kvm/arm/vgic-v3.c |1 +
 virt/kvm/arm/vgic.c|  280 ---
 virt/kvm/arm/vgic.h|5 +
 15 files changed, 1426 insertions(+), 112 deletions(-)
 create mode 100644 virt/kvm/arm/its-emul.c
 create mode 100644 virt/kvm/arm/its-emul.h

-- 
2.3.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/13] KVM: arm/arm64: VGIC: don't track used LRs in the distributor

2015-05-29 Thread Andre Przywara
Currently we track which IRQ has been mapped to which VGIC list
register and also have to synchronize both. We used to do this
to hold some extra state (for instance the active bit).
It turns out that this extra state in the LRs is no longer needed and
this extra tracking causes some pain later.
Remove the tracking feature (lr_map and lr_used) and get rid of
quite some code on the way.
On a guest exit we pick up all still pending IRQs from the LRs and put
them back in the distributor. We don't care about active-only IRQs,
so we keep them in the LRs. They will be retired either by our
vgic_process_maintenance() routine or by the GIC hardware in case of
edge triggered interrupts.
In places where we scan LRs we now use our shadow copy of the ELRSR
register directly.
This code change means we lose the piggy-back optimization, which
would re-use an active-only LR to inject the pending state on top of
it. Tracing with various workloads shows that this actually occurred
very rarely; the ballpark figure is about once every 10,000 exits
in a disk I/O heavy workload. Also, the list registers don't seem to
be as scarce as assumed, with all 4 LRs on the popular implementations
used less than once every 100,000 exits.

This has been briefly tested on Midway, Juno and the model (the latter
both with GICv2 and GICv3 guests).

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 include/kvm/arm_vgic.h |   6 ---
 virt/kvm/arm/vgic-v2.c |   1 +
 virt/kvm/arm/vgic-v3.c |   1 +
 virt/kvm/arm/vgic.c| 143 ++---
 4 files changed, 66 insertions(+), 85 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 133ea00..2ccfa9a 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -279,9 +279,6 @@ struct vgic_v3_cpu_if {
 };
 
 struct vgic_cpu {
-   /* per IRQ to LR mapping */
-   u8  *vgic_irq_lr_map;
-
/* Pending/active/both interrupts on this VCPU */
DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS);
DECLARE_BITMAP( active_percpu, VGIC_NR_PRIVATE_IRQS);
@@ -292,9 +289,6 @@ struct vgic_cpu {
unsigned long   *active_shared;
unsigned long   *pend_act_shared;
 
-   /* Bitmap of used/free list registers */
-   DECLARE_BITMAP( lr_used, VGIC_V2_MAX_LRS);
-
/* Number of list registers on this CPU */
int nr_lr;
 
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
index f9b9c7c..f723710 100644
--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -144,6 +144,7 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
 * anyway.
 */
vcpu-arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
+   vcpu-arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
 
/* Get the show on the road... */
vcpu-arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index dff0602..21e5d28 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -178,6 +178,7 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
 * anyway.
 */
vgic_v3-vgic_vmcr = 0;
+   vgic_v3-vgic_elrsr = ~0;
 
/*
 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 78fb820..037b723 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -81,7 +81,6 @@
 #include vgic.h
 
 static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
-static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
 static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
 static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
 
@@ -649,6 +648,17 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio 
*mmio,
return false;
 }
 
+static void vgic_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
+  struct vgic_lr vlr)
+{
+   vgic_ops-sync_lr_elrsr(vcpu, lr, vlr);
+}
+
+static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
+{
+   return vgic_ops-get_elrsr(vcpu);
+}
+
 /**
  * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
  * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
@@ -660,9 +670,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio 
*mmio,
 void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
 {
struct vgic_cpu *vgic_cpu = vcpu-arch.vgic_cpu;
+   u64 elrsr = vgic_get_elrsr(vcpu);
+   unsigned long *elrsr_ptr = u64_to_bitmask(elrsr);
int i;
 
-   for_each_set_bit(i, vgic_cpu-lr_used, vgic_cpu-nr_lr) {
+   for_each_clear_bit(i, elrsr_ptr, vgic_cpu-nr_lr) {
struct vgic_lr lr = vgic_get_lr(vcpu, i);
 
/*
@@ -705,7 +717,7 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
 * Mark the LR as free for other use.
 */
BUG_ON(lr.state  LR_STATE_MASK);
-   vgic_retire_lr(i, 

[PATCH v5 05/12] KVM: arm: introduce kvm_arm_init/setup/clear_debug

2015-05-29 Thread Alex Bennée
This is a precursor for later patches which will need to do more to
setup debug state before entering the hyp.S switch code. The existing
functionality for setting mdcr_el2 has been moved out of hyp.S and now
uses the value kept in vcpu-arch.mdcr_el2.

As the assembler previously masked and preserved MDCR_EL2.HPMN, I've
had to add a mechanism to save the value of mdcr_el2 as a per-cpu
variable during the initialisation code. The kernel never sets this
number so we are assuming the bootcode has set up the correct value
here.

This also moves the conditional setting of the TDA bit from the hyp code
into the C code which is currently used for the lazy debug register
context switch code.

Signed-off-by: Alex Bennée alex.ben...@linaro.org

---
v3
  - rename fns from arch-arm
  - preserve MDCR_EL2.HPMN setting
  - re-word some of the comments
  - fix some minor grammar nits
  - merge setting of mdcr_el2
  - introduce trap_debug flag
  - move setup/clear within the irq lock section
v4
  - fix TDOSA desc
  - rm un-needed else leg
  - s/arch/arm/
---
 arch/arm/include/asm/kvm_host.h   |  4 ++
 arch/arm/kvm/arm.c|  9 -
 arch/arm64/include/asm/kvm_asm.h  |  2 +
 arch/arm64/include/asm/kvm_host.h |  5 +++
 arch/arm64/kernel/asm-offsets.c   |  1 +
 arch/arm64/kvm/Makefile   |  2 +-
 arch/arm64/kvm/debug.c| 81 +++
 arch/arm64/kvm/hyp.S  | 19 -
 8 files changed, 110 insertions(+), 13 deletions(-)
 create mode 100644 arch/arm64/kvm/debug.c

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index d71607c..746c0c69 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -236,4 +236,8 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+static inline void kvm_arm_init_debug(void) {}
+static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
+
 #endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 52a1d4d38..4a274e1 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -570,6 +570,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
continue;
}
 
+   kvm_arm_setup_debug(vcpu);
+
/**
 * Enter the guest
 */
@@ -582,7 +584,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
vcpu-mode = OUTSIDE_GUEST_MODE;
kvm_guest_exit();
trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
-   /*
+
+   kvm_arm_clear_debug(vcpu);
+
+/*
 * We may have taken a host interrupt in HYP mode (ie
 * while executing the guest). This interrupt is still
 * pending, as we haven't serviced it yet!
@@ -930,6 +935,8 @@ static void cpu_init_hyp_mode(void *dummy)
vector_ptr = (unsigned long)__kvm_hyp_vector;
 
__cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+
+   kvm_arm_init_debug();
 }
 
 static int hyp_init_cpu_notify(struct notifier_block *self,
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 4f7310f..d6b507e 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -137,6 +137,8 @@ extern char __restore_vgic_v2_state[];
 extern char __save_vgic_v3_state[];
 extern char __restore_vgic_v3_state[];
 
+extern u32 __kvm_get_mdcr_el2(void);
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index f0f58c9..7cb99b5 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -103,6 +103,7 @@ struct kvm_vcpu_arch {
 
/* HYP configuration */
u64 hcr_el2;
+   u32 mdcr_el2;
 
/* Exception Information */
struct kvm_vcpu_fault_info fault;
@@ -250,4 +251,8 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+void kvm_arm_init_debug(void);
+void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
+void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index da675cc..dfb25a2 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -117,6 +117,7 @@ int main(void)
   DEFINE(VCPU_HPFAR_EL2,   offsetof(struct kvm_vcpu, 
arch.fault.hpfar_el2));
   DEFINE(VCPU_DEBUG_FLAGS,

[PATCH v5 09/12] KVM: arm64: introduce vcpu-arch.debug_ptr

2015-05-29 Thread Alex Bennée
This introduces a level of indirection for the debug registers. Instead
of using the sys_regs[] directly we store registers in a structure in
the vcpu. As we are no longer tied to the layout of the sys_regs[] we
can make the copies size appropriate for control and value registers.

This also entails updating the sys_regs code to access this new
structure. Instead of passing a register index we now pass an offset
into the kvm_guest_debug_arch structure.

We also need to ensure the GET/SET_ONE_REG ioctl operations store the
registers in their correct location.

Signed-off-by: Alex Bennée alex.ben...@linaro.org
---
 arch/arm/kvm/arm.c|   3 +
 arch/arm64/include/asm/kvm_asm.h  |  24 +++-
 arch/arm64/include/asm/kvm_host.h |  12 +++-
 arch/arm64/kernel/asm-offsets.c   |   6 ++
 arch/arm64/kvm/hyp.S  | 107 +---
 arch/arm64/kvm/sys_regs.c | 126 +++---
 6 files changed, 188 insertions(+), 90 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9b3ed6d..0d17c7b 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -279,6 +279,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
/* Set up the timer */
kvm_timer_vcpu_init(vcpu);
 
+   /* Set the debug registers to be the guests */
+   vcpu-arch.debug_ptr = vcpu-arch.vcpu_debug_state;
+
return 0;
 }
 
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index d6b507e..e997404 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -46,24 +46,16 @@
 #defineCNTKCTL_EL1 20  /* Timer Control Register (EL1) */
 #definePAR_EL1 21  /* Physical Address Register */
 #define MDSCR_EL1  22  /* Monitor Debug System Control Register */
-#define DBGBCR0_EL123  /* Debug Breakpoint Control Registers (0-15) */
-#define DBGBCR15_EL1   38
-#define DBGBVR0_EL139  /* Debug Breakpoint Value Registers (0-15) */
-#define DBGBVR15_EL1   54
-#define DBGWCR0_EL155  /* Debug Watchpoint Control Registers (0-15) */
-#define DBGWCR15_EL1   70
-#define DBGWVR0_EL171  /* Debug Watchpoint Value Registers (0-15) */
-#define DBGWVR15_EL1   86
-#define MDCCINT_EL187  /* Monitor Debug Comms Channel Interrupt Enable 
Reg */
+#define MDCCINT_EL123  /* Monitor Debug Comms Channel Interrupt Enable 
Reg */
 
 /* 32bit specific registers. Keep them at the end of the range */
-#defineDACR32_EL2  88  /* Domain Access Control Register */
-#defineIFSR32_EL2  89  /* Instruction Fault Status Register */
-#defineFPEXC32_EL2 90  /* Floating-Point Exception Control 
Register */
-#defineDBGVCR32_EL291  /* Debug Vector Catch Register */
-#defineTEECR32_EL1 92  /* ThumbEE Configuration Register */
-#defineTEEHBR32_EL193  /* ThumbEE Handler Base Register */
-#defineNR_SYS_REGS 94
+#defineDACR32_EL2  24  /* Domain Access Control Register */
+#defineIFSR32_EL2  25  /* Instruction Fault Status Register */
+#defineFPEXC32_EL2 26  /* Floating-Point Exception Control 
Register */
+#defineDBGVCR32_EL227  /* Debug Vector Catch Register */
+#defineTEECR32_EL1 28  /* ThumbEE Configuration Register */
+#defineTEEHBR32_EL129  /* ThumbEE Handler Base Register */
+#defineNR_SYS_REGS 30
 
 /* 32bit mapping */
 #define c0_MPIDR   (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index e2db6a6..e5040b6 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -108,11 +108,21 @@ struct kvm_vcpu_arch {
/* Exception Information */
struct kvm_vcpu_fault_info fault;
 
-   /* Debug state */
+   /* Guest debug state */
u64 debug_flags;
 
+   /*
+* For debugging the guest we need to keep a set of debug
+* registers which can override the guests own debug state
+* while being used. These are set via the KVM_SET_GUEST_DEBUG
+* ioctl.
+*/
+   struct kvm_guest_debug_arch *debug_ptr;
+   struct kvm_guest_debug_arch vcpu_debug_state;
+
/* Pointer to host CPU context */
kvm_cpu_context_t *host_cpu_context;
+   struct kvm_guest_debug_arch host_debug_state;
 
/* VGIC state */
struct vgic_cpu vgic_cpu;
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index dfb25a2..1a8e97c 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -116,10 +116,16 @@ int main(void)
   DEFINE(VCPU_FAR_EL2, offsetof(struct kvm_vcpu, arch.fault.far_el2));
   DEFINE(VCPU_HPFAR_EL2,   offsetof(struct kvm_vcpu, 
arch.fault.hpfar_el2));
   DEFINE(VCPU_DEBUG_FLAGS,   

[PATCH v5 01/12] KVM: add comments for kvm_debug_exit_arch struct

2015-05-29 Thread Alex Bennée
Bring into line with the comments for the other structures and their
KVM_EXIT_* cases. Also update api.txt to reflect use in kvm_run
documentation.

Signed-off-by: Alex Bennée alex.ben...@linaro.org
Reviewed-by: David Hildenbrand d...@linux.vnet.ibm.com
Reviewed-by: Andrew Jones drjo...@redhat.com
Acked-by: Christoffer Dall christoffer.d...@linaro.org

---

v2
  - add comments for other exit types
v3
  - s/commentary/comments/
  - add rb tags
  - update api.txt kvm_run to include KVM_EXIT_DEBUG desc
v4
  - sp fixes
  - add a-b
---
 Documentation/virtual/kvm/api.txt | 4 +++-
 include/uapi/linux/kvm.h  | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 9fa2bf8..c34c32d 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3070,11 +3070,13 @@ data_offset describes where the data is located 
(KVM_EXIT_IO_OUT) or
 where kvm expects application code to place the data for the next
 KVM_RUN invocation (KVM_EXIT_IO_IN).  Data format is a packed array.
 
+   /* KVM_EXIT_DEBUG */
struct {
struct kvm_debug_exit_arch arch;
} debug;
 
-Unused.
+If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event
+for which architecture specific information is returned.
 
/* KVM_EXIT_MMIO */
struct {
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4b60056..70ac641 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -237,6 +237,7 @@ struct kvm_run {
__u32 count;
__u64 data_offset; /* relative to kvm_run start */
} io;
+   /* KVM_EXIT_DEBUG */
struct {
struct kvm_debug_exit_arch arch;
} debug;
@@ -285,6 +286,7 @@ struct kvm_run {
__u32 data;
__u8  is_write;
} dcr;
+   /* KVM_EXIT_INTERNAL_ERROR */
struct {
__u32 suberror;
/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
@@ -295,6 +297,7 @@ struct kvm_run {
struct {
__u64 gprs[32];
} osi;
+   /* KVM_EXIT_PAPR_HCALL */
struct {
__u64 nr;
__u64 ret;
-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 00/12] KVM Guest Debug support for arm64

2015-05-29 Thread Alex Bennée
Here is V5 of the KVM Guest Debug support for arm64.

The changes are fairly minimal from the last round:

  - dropped KVM_GUESTDBG_USE_SW/HW_BP unifying patch (ABI break)
  - new comment patch to fix comments in hyp.S (also sent separately)
  - simplified singlestep code (no longer needs to preserve pstate)
  - don't set MDSCR_EL1.KDE (not needed)

For full details see the changelog on each of the patches.

GIT Repos:

The patches for this series are based off v4.1-rc5 and can be found
at:

https://git.linaro.org/people/alex.bennee/linux.git
branch: guest-debug/4.1-rc5-v5

While adding debug exception injection support into QEMU I ran into
problem with GDB in the guest which relies on working single step
support. So while guest SW BKPTs get delivered (and HW BKPTs if the
host is not using them) GDB tends to get confused as it tries to
single step. If the host isn't doing any debugging of the guest then
everything works as normal.

The actual solution would be to fully emulate single step in QEMU by
creating a new debug event when the guest sets MDSCR_EL1.SS. QEMU
would then need to ensure the correct position is reached while
honouring the guests setting of MDSCR_EL1.KDE. However this would be a
bunch of potentially hairy new code so I've left this as an exercise
for a future patch series.

https://github.com/stsquad/qemu
branch: kvm/guest-debug-v5

Alex Bennée (12):
  KVM: add comments for kvm_debug_exit_arch struct
  KVM: arm64: fix misleading comments in save/restore
  KVM: arm64: guest debug, define API headers
  KVM: arm: guest debug, add stub KVM_SET_GUEST_DEBUG ioctl
  KVM: arm: introduce kvm_arm_init/setup/clear_debug
  KVM: arm64: guest debug, add SW break point support
  KVM: arm64: guest debug, add support for single-step
  KVM: arm64: re-factor hyp.S debug register code
  KVM: arm64: introduce vcpu-arch.debug_ptr
  KVM: arm64: guest debug, HW assisted debug support
  KVM: arm64: enable KVM_CAP_SET_GUEST_DEBUG
  KVM: arm64: add trace points for guest_debug debug

 Documentation/virtual/kvm/api.txt  |  15 +-
 arch/arm/include/asm/kvm_host.h|   4 +
 arch/arm/kvm/arm.c |  46 ++-
 arch/arm/kvm/trace.h   |  17 +
 arch/arm64/include/asm/hw_breakpoint.h |  12 +
 arch/arm64/include/asm/kvm_asm.h   |  26 +-
 arch/arm64/include/asm/kvm_host.h  |  29 +-
 arch/arm64/include/uapi/asm/kvm.h  |  20 ++
 arch/arm64/kernel/asm-offsets.c|   7 +
 arch/arm64/kernel/hw_breakpoint.c  |  12 -
 arch/arm64/kvm/Makefile|   2 +-
 arch/arm64/kvm/debug.c | 194 
 arch/arm64/kvm/handle_exit.c   |  44 +++
 arch/arm64/kvm/hyp.S   | 551 ++---
 arch/arm64/kvm/reset.c |  15 +
 arch/arm64/kvm/sys_regs.c  | 136 ++--
 arch/arm64/kvm/trace.h | 105 +++
 include/uapi/linux/kvm.h   |   5 +
 18 files changed, 788 insertions(+), 452 deletions(-)
 create mode 100644 arch/arm64/kvm/debug.c

-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 10/12] KVM: arm64: guest debug, HW assisted debug support

2015-05-29 Thread Alex Bennée
This adds support for userspace to control the HW debug registers for
guest debug. In the debug ioctl we copy the IMPDEF defined number of
registers into a new register set called host_debug_state. There is now
a new vcpu parameter called debug_ptr which selects which register set
is to be copied into the real registers when world switch occurs.

I've moved some helper functions into the hw_breakpoint.h header for
re-use.

As with single step we need to tweak the guest registers to enable the
exceptions so we need to save and restore those bits.

Two new capabilities have been added to the KVM_EXTENSION ioctl to allow
userspace to query the number of hardware break and watch points
available on the host hardware.

Signed-off-by: Alex Bennée alex.ben...@linaro.org

---
v2
   - switched to C setup
   - replace host debug registers directly into context
   - minor tweak to api docs
   - setup right register for debug
   - add FAR_EL2 to debug exit structure
   - add support for trapping debug register access
v3
   - remove stray trace statement
   - fix spacing around operators (various)
   - clean-up usage of trap_debug
   - introduce debug_ptr, replace excessive memcpy stuff
   - don't use memcpy in ioctl, just assign
   - update cap ioctl documentation
   - reword a number comments
   - rename host_debug_state-external_debug_state
v4
   - use the new u32/u64 split debug_ptr approach
   - fix some wording/comments
v5
   - don't set MDSCR_EL1.KDE (not needed)
---
 Documentation/virtual/kvm/api.txt  |  7 ++-
 arch/arm/kvm/arm.c |  7 +++
 arch/arm64/include/asm/hw_breakpoint.h | 12 +++
 arch/arm64/include/asm/kvm_host.h  |  3 ++-
 arch/arm64/include/uapi/asm/kvm.h  |  2 +-
 arch/arm64/kernel/hw_breakpoint.c  | 12 ---
 arch/arm64/kvm/debug.c | 37 +-
 arch/arm64/kvm/handle_exit.c   |  6 ++
 arch/arm64/kvm/reset.c | 12 +++
 include/uapi/linux/kvm.h   |  2 ++
 10 files changed, 80 insertions(+), 20 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 33c8143..ada57df 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2668,7 +2668,7 @@ The top 16 bits of the control field are architecture 
specific control
 flags which can include the following:
 
   - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64]
-  - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390]
+  - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390, arm64]
   - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
@@ -2683,6 +2683,11 @@ updated to the correct (supplied) values.
 The second part of the structure is architecture specific and
 typically contains a set of debug registers.
 
+For arm64 the number of debug registers is implementation defined and
+can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and
+KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number
+indicating the number of supported registers.
+
 When debug events exit the main run loop with the reason
 KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
 structure containing architecture specific debug information.
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 0d17c7b..6df47c1 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -307,6 +307,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
 #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE |\
KVM_GUESTDBG_USE_SW_BP | \
+   KVM_GUESTDBG_USE_HW_BP | \
KVM_GUESTDBG_SINGLESTEP)
 
 /**
@@ -327,6 +328,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu 
*vcpu,
 
if (dbg-control  KVM_GUESTDBG_ENABLE) {
vcpu-guest_debug = dbg-control;
+
+   /* Hardware assisted Break and Watch points */
+   if (vcpu-guest_debug  KVM_GUESTDBG_USE_HW_BP) {
+   vcpu-arch.external_debug_state = dbg-arch;
+   }
+
} else {
/* If not enabled clear all flags */
vcpu-guest_debug = 0;
diff --git a/arch/arm64/include/asm/hw_breakpoint.h 
b/arch/arm64/include/asm/hw_breakpoint.h
index 52b484b..c450552 100644
--- a/arch/arm64/include/asm/hw_breakpoint.h
+++ b/arch/arm64/include/asm/hw_breakpoint.h
@@ -130,6 +130,18 @@ static inline void ptrace_hw_copy_thread(struct 
task_struct *task)
 }
 #endif
 
+/* Determine number of BRP registers available. */
+static inline int get_num_brps(void)
+{
+   return ((read_cpuid(ID_AA64DFR0_EL1)  12)  0xf) + 1;
+}
+
+/* Determine number of WRP registers available. */
+static inline int get_num_wrps(void)
+{
+   return 

[PATCH v5 12/12] KVM: arm64: add trace points for guest_debug debug

2015-05-29 Thread Alex Bennée
This includes trace points for:
  kvm_arch_setup_guest_debug
  kvm_arch_clear_guest_debug

I've also added some generic register setting trace events and also a
trace point to dump the array of hardware registers.

Signed-off-by: Alex Bennée alex.ben...@linaro.org

---
v3
  - add trace event for debug access.
  - remove short trace #define, rename trace events
  - use __print_array with fixed array instead of own func
  - rationalise trace points (only one per register changed)
  - add vcpu ptr to the debug_setup trace
  - remove :: in prints
v4
  - u32/u64 split on debug registers
  - fix for renames
  - add tracing of traps/set_guest_debug
  - remove handle_guest_debug trace
v5
  - minor print fmt fix
  - rm pstate traces
---
 arch/arm/kvm/arm.c|   2 +
 arch/arm/kvm/trace.h  |  17 
 arch/arm64/kvm/debug.c|  35 +++-
 arch/arm64/kvm/sys_regs.c |  10 +
 arch/arm64/kvm/trace.h| 105 ++
 5 files changed, 168 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 6df47c1..a939a4e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -323,6 +323,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
 {
+   trace_kvm_set_guest_debug(vcpu, dbg-control);
+
if (dbg-control  ~KVM_GUESTDBG_VALID_MASK)
return -EINVAL;
 
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index 0ec3539..3e346a6 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -317,6 +317,23 @@ TRACE_EVENT(kvm_toggle_cache,
  __entry-now ? on : off)
 );
 
+TRACE_EVENT(kvm_set_guest_debug,
+   TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+   TP_ARGS(vcpu, guest_debug),
+
+   TP_STRUCT__entry(
+   __field(struct kvm_vcpu *, vcpu)
+   __field(__u32, guest_debug)
+   ),
+
+   TP_fast_assign(
+   __entry-vcpu = vcpu;
+   __entry-guest_debug = guest_debug;
+   ),
+
+   TP_printk(vcpu: %p, flags: 0x%08x, __entry-vcpu, 
__entry-guest_debug)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 3c0daae..97204b5 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -24,6 +24,8 @@
 #include asm/kvm_arm.h
 #include asm/kvm_emulate.h
 
+#include trace.h
+
 /* These are the bits of MDSCR_EL1 we may manipulate */
 #define MDSCR_EL1_DEBUG_MASK   (DBG_MDSCR_SS | \
DBG_MDSCR_KDE | \
@@ -46,11 +48,17 @@ static DEFINE_PER_CPU(u32, mdcr_el2);
 static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
 {
vcpu-arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, 
MDSCR_EL1);
+
+   trace_kvm_arm_set_dreg32(Saved MDSCR_EL1,
+   vcpu-arch.guest_debug_preserved.mdscr_el1);
 }
 
 static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
 {
vcpu_sys_reg(vcpu, MDSCR_EL1) = 
vcpu-arch.guest_debug_preserved.mdscr_el1;
+
+   trace_kvm_arm_set_dreg32(Restored MDSCR_EL1,
+   vcpu_sys_reg(vcpu, MDSCR_EL1));
 }
 
 /**
@@ -92,6 +100,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
 {
bool trap_debug = !(vcpu-arch.debug_flags  KVM_ARM64_DEBUG_DIRTY);
 
+   trace_kvm_arm_setup_debug(vcpu, vcpu-guest_debug);
+
vcpu-arch.mdcr_el2 = __this_cpu_read(mdcr_el2)  MDCR_EL2_HPMN_MASK;
vcpu-arch.mdcr_el2 |= (MDCR_EL2_TPM |
MDCR_EL2_TPMCR |
@@ -121,6 +131,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
vcpu_sys_reg(vcpu, MDSCR_EL1) = ~DBG_MDSCR_SS;
}
 
+   trace_kvm_arm_set_dreg32(SPSR_EL2, *vcpu_cpsr(vcpu));
+
/*
 * HW Break/Watch points
 *
@@ -137,16 +149,29 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
vcpu-arch.debug_ptr = vcpu-arch.external_debug_state;
vcpu-arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
trap_debug = true;
+
+   trace_kvm_arm_set_regset(BKPTS, get_num_brps(),
+   
vcpu-arch.debug_ptr-dbg_bcr[0],
+   
vcpu-arch.debug_ptr-dbg_bvr[0]);
+
+   trace_kvm_arm_set_regset(WAPTS, get_num_wrps(),
+   
vcpu-arch.debug_ptr-dbg_wcr[0],
+   
vcpu-arch.debug_ptr-dbg_wvr[0]);
}
}
 
/* Trap debug register access */
if (trap_debug)
vcpu-arch.mdcr_el2 |= MDCR_EL2_TDA;
+
+   trace_kvm_arm_set_dreg32(MDCR_EL2, vcpu-arch.mdcr_el2);
+   trace_kvm_arm_set_dreg32(MDSCR_EL1, vcpu_sys_reg(vcpu, 

[PATCH 13/13] KVM: arm64: enable ITS emulation as a virtual MSI controller

2015-05-29 Thread Andre Przywara
If userspace has provided a base address for the ITS register frame,
we enable the bits that advertise LPIs in the GICv3.
When the guest has enabled LPIs and the ITS, we enable the emulation
part by initializing the ITS data structures and trapping on ITS
register frame accesses by the guest.
Also we enable the KVM_SIGNAL_MSI feature to allow userland to inject
MSIs into the guest. Not having enabled the ITS emulation will lead
to a -ENODEV when trying to inject a MSI.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 Documentation/virtual/kvm/api.txt |  2 +-
 arch/arm64/kvm/Kconfig|  1 +
 include/kvm/arm_vgic.h| 10 ++
 virt/kvm/arm/its-emul.c   |  9 -
 virt/kvm/arm/vgic-v3-emul.c   | 20 +++-
 virt/kvm/arm/vgic.c   | 10 ++
 6 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 891d64a..d20fd94 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2108,7 +2108,7 @@ after pausing the vcpu, but before it is resumed.
 4.71 KVM_SIGNAL_MSI
 
 Capability: KVM_CAP_SIGNAL_MSI
-Architectures: x86
+Architectures: x86 arm64
 Type: vm ioctl
 Parameters: struct kvm_msi (in)
 Returns: 0 on delivery, 0 if guest blocked the MSI, and -1 on error
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 5105e29..6c432c0 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -30,6 +30,7 @@ config KVM
select SRCU
select HAVE_KVM_EVENTFD
select HAVE_KVM_IRQFD
+   select HAVE_KVM_MSI
---help---
  Support hosting virtualized guest machines.
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 6bb138d..8f1be6a 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -162,6 +162,7 @@ struct vgic_io_device {
 
 struct vgic_its {
boolenabled;
+   struct vgic_io_device   iodev;
spinlock_t  lock;
u64 cbaser;
int creadr;
@@ -365,4 +366,13 @@ static inline int vgic_v3_probe(struct device_node 
*vgic_node,
 }
 #endif
 
+#ifdef CONFIG_HAVE_KVM_MSI
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
+#else
+static inline int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+   return -ENODEV;
+}
+#endif
+
 #endif
diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
index 35e886c..864de19 100644
--- a/virt/kvm/arm/its-emul.c
+++ b/virt/kvm/arm/its-emul.c
@@ -964,6 +964,7 @@ int vits_init(struct kvm *kvm)
 {
struct vgic_dist *dist = kvm-arch.vgic;
struct vgic_its *its = dist-its;
+   int ret;
 
if (IS_VGIC_ADDR_UNDEF(dist-vgic_its_base))
return -ENXIO;
@@ -977,9 +978,15 @@ int vits_init(struct kvm *kvm)
INIT_LIST_HEAD(its-device_list);
INIT_LIST_HEAD(its-collection_list);
 
+   ret = vgic_register_kvm_io_dev(kvm, dist-vgic_its_base,
+  KVM_VGIC_V3_ITS_SIZE, vgicv3_its_ranges,
+  -1, its-iodev);
+   if (ret)
+   return ret;
+
its-enabled = false;
 
-   return -ENXIO;
+   return 0;
 }
 
 void vits_destroy(struct kvm *kvm)
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
index 4513551..71d0bcf 100644
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -89,10 +89,11 @@ static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
 /*
  * As this implementation does not provide compatibility
  * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
- * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
- * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
+ * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs)
+ * and provide 16 bits worth of LPI number space (to give 8192 LPIs).
  */
-#define INTERRUPT_ID_BITS 10
+#define INTERRUPT_ID_BITS_SPIS 10
+#define INTERRUPT_ID_BITS_ITS 16
 static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
  struct kvm_exit_mmio *mmio, phys_addr_t offset)
 {
@@ -100,7 +101,12 @@ static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
 
reg = (min(vcpu-kvm-arch.vgic.nr_irqs, 1024)  5) - 1;
 
-   reg |= (INTERRUPT_ID_BITS - 1)  19;
+   if (vgic_has_its(vcpu-kvm)) {
+   reg |= GICD_TYPER_LPIS;
+   reg |= (INTERRUPT_ID_BITS_ITS - 1)  19;
+   } else {
+   reg |= (INTERRUPT_ID_BITS_SPIS - 1)  19;
+   }
 
vgic_reg_access(mmio, reg, offset,
ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
@@ -519,7 +525,9 @@ static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
vgic_reg_access(mmio, reg, offset,
ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
if (!dist-lpis_enabled  

[PATCH v5 11/12] KVM: arm64: enable KVM_CAP_SET_GUEST_DEBUG

2015-05-29 Thread Alex Bennée
Finally advertise the KVM capability for SET_GUEST_DEBUG. Once arm
support is added this check can be moved to the common
kvm_vm_ioctl_check_extension() code.

Signed-off-by: Alex Bennée alex.ben...@linaro.org
Acked-by: Christoffer Dall christoffer.d...@linaro.org

---

v3:
 - separated capability check from previous patches
 - moved into arm64 specific ioctl handler.
v4:
 - add a-b-tag
---
 arch/arm64/kvm/reset.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 21d5a62..88e5331 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -76,6 +76,9 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
case KVM_CAP_GUEST_DEBUG_HW_WPS:
r  = get_num_wrps();
break;
+   case KVM_CAP_SET_GUEST_DEBUG:
+   r = 1;
+   break;
default:
r = 0;
}
-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/13] KVM: arm64: sync LPI properties and status between guest and KVM

2015-05-29 Thread Andre Przywara
The properties and status of the GICv3 LPIs are held in tables in
(guest) memory. To achieve reasonable performance, we cache this
data in our own data structures, so we need to sync those two views
from time to time. This behaviour is well described in the GICv3 spec
and is also exercised by hardware, so the sync points are well known.

Provide functions that read the guest memory and store the
information from the property and status table in the kernel.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 virt/kvm/arm/its-emul.c | 140 
 1 file changed, 140 insertions(+)

diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
index f75fb9e..afd440e 100644
--- a/virt/kvm/arm/its-emul.c
+++ b/virt/kvm/arm/its-emul.c
@@ -50,6 +50,7 @@ struct its_itte {
struct its_collection *collection;
u32 lpi;
u32 event_id;
+   u8 priority;
bool enabled;
unsigned long *pending;
 };
@@ -70,7 +71,140 @@ static struct its_itte *find_itte_by_lpi(struct kvm *kvm, 
int lpi)
return NULL;
 }
 
+#define LPI_PROP_ENABLE_BIT(p) ((p)  LPI_PROP_ENABLED)
+#define LPI_PROP_PRIORITY(p)   ((p)  0xfc)
+
+/* stores the priority and enable bit for a given LPI */
+static void update_lpi_property(struct kvm *kvm, struct its_itte *itte, u8 
prop)
+{
+   itte-priority = LPI_PROP_PRIORITY(prop);
+   itte-enabled  = LPI_PROP_ENABLE_BIT(prop);
+}
+
+#define GIC_LPI_OFFSET 8192
+
+/* We scan the table in chunks the size of the smallest page size */
+#define CHUNK_SIZE 4096U
+
 #define BASER_BASE_ADDRESS(x) ((x)  0xf000ULL)
+#define PROPBASE_TSIZE(x) (1U  (x  0x1f))
+
+/*
+ * Scan the whole LPI property table and put the LPI configuration
+ * data in our own data structures. This relies on the LPI being
+ * mapped before.
+ * We scan from two sides:
+ * 1) for each byte in the table we care for the ones being enabled
+ * 2) for each mapped LPI we look into the table to spot LPIs being disabled
+ * Must be called with the ITS lock held.
+ */
+static bool its_update_lpi_properties(struct kvm *kvm)
+{
+   struct vgic_dist *dist = kvm-arch.vgic;
+   u8 *prop;
+   u32 tsize;
+   gpa_t propbase;
+   int lpi = GIC_LPI_OFFSET;
+   struct its_itte *itte;
+   struct its_device *device;
+   int ret;
+
+   propbase = BASER_BASE_ADDRESS(dist-propbaser);
+   tsize = PROPBASE_TSIZE(dist-propbaser);
+
+   prop = kmalloc(CHUNK_SIZE, GFP_KERNEL);
+   if (!prop)
+   return false;
+
+   while (tsize  0) {
+   int chunksize = min(tsize, CHUNK_SIZE);
+
+   ret = kvm_read_guest(kvm, propbase, prop, chunksize);
+   if (ret) {
+   kfree(prop);
+   break;
+   }
+
+   /*
+* Updating the status for all allocated LPIs. We catch
+* those LPIs that get disabled. We really don't care
+* about unmapped LPIs, as they need to be updated
+* later manually anyway once they get mapped.
+*/
+   for_each_lpi(device, itte, kvm) {
+   /*
+* Is the LPI covered by that part of the table we
+* are currently looking at?
+*/
+   if (itte-lpi  lpi)
+   continue;
+   if (itte-lpi = lpi + chunksize)
+   continue;
+
+   update_lpi_property(kvm, itte,
+   prop[itte-lpi - lpi]);
+   }
+   tsize -= chunksize;
+   lpi += chunksize;
+   propbase += chunksize;
+   }
+
+   kfree(prop);
+   return true;
+}
+
+/*
+ * Scan the whole LPI pending table and sync the pending bit in there
+ * with our own data structures. This relies on the LPI being
+ * mapped before.
+ * Must be called with the ITS lock held.
+ */
+static bool its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
+{
+   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
+   unsigned long *pendmask;
+   u32 nr_lpis;
+   gpa_t pendbase;
+   int lpi = GIC_LPI_OFFSET;
+   struct its_itte *itte;
+   struct its_device *device;
+   int ret;
+   int lpi_bit, nr_bits;
+
+   pendbase = BASER_BASE_ADDRESS(dist-pendbaser[vcpu-vcpu_id]);
+   nr_lpis = GIC_LPI_OFFSET;
+
+   pendmask = kmalloc(CHUNK_SIZE, GFP_KERNEL);
+   if (!pendmask)
+   return false;
+
+   while (nr_lpis  0) {
+   nr_bits = min(nr_lpis, CHUNK_SIZE * 8);
+
+   ret = kvm_read_guest(vcpu-kvm, pendbase, pendmask,
+nr_bits / 8);
+   if (ret)
+   break;
+
+   for_each_lpi(device, itte, vcpu-kvm) {
+   lpi_bit = itte-lpi - lpi;
+ 

[PATCH 06/13] KVM: arm64: introduce ITS emulation file with stub functions

2015-05-29 Thread Andre Przywara
The ARM GICv3 ITS emulation code goes into a separate file, but
needs to be connected to the GICv3 emulation, of which it is an
option.
Introduce the skeleton with function stubs to be filled later.
Introduce the basic ITS data structure and initialize it, but don't
return any success yet, as we are not yet ready for the show.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 arch/arm64/kvm/Makefile|   1 +
 include/kvm/arm_vgic.h |   6 ++
 include/linux/irqchip/arm-gic-v3.h |   1 +
 virt/kvm/arm/its-emul.c| 127 +
 virt/kvm/arm/its-emul.h|  35 ++
 virt/kvm/arm/vgic-v3-emul.c|  22 ++-
 6 files changed, 189 insertions(+), 3 deletions(-)
 create mode 100644 virt/kvm/arm/its-emul.c
 create mode 100644 virt/kvm/arm/its-emul.h

diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index d5904f8..0d09189 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -25,5 +25,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
 kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v2-switch.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/its-emul.o
 kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v3-switch.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 9ea0b3b..d76c2d9 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -156,6 +156,11 @@ struct vgic_io_device {
struct kvm_io_device dev;
 };
 
+struct vgic_its {
+   boolenabled;
+   spinlock_t  lock;
+};
+
 struct vgic_dist {
spinlock_t  lock;
boolin_kernel;
@@ -260,6 +265,7 @@ struct vgic_dist {
u64 propbaser;
u64 *pendbaser;
boollpis_enabled;
+   struct vgic_its its;
 };
 
 struct vgic_v2_cpu_if {
diff --git a/include/linux/irqchip/arm-gic-v3.h 
b/include/linux/irqchip/arm-gic-v3.h
index ffbc034..df4e527 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -177,6 +177,7 @@
 #define GITS_CWRITER   0x0088
 #define GITS_CREADR0x0090
 #define GITS_BASER 0x0100
+#define GITS_IDREGS_BASE   0xffd0
 #define GITS_PIDR2 GICR_PIDR2
 
 #define GITS_TRANSLATER0x10040
diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
new file mode 100644
index 000..7b283ce
--- /dev/null
+++ b/virt/kvm/arm/its-emul.c
@@ -0,0 +1,127 @@
+/*
+ * GICv3 ITS emulation
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ * Author: Andre Przywara andre.przyw...@arm.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see http://www.gnu.org/licenses/.
+ */
+
+#include linux/cpu.h
+#include linux/kvm.h
+#include linux/kvm_host.h
+#include linux/interrupt.h
+
+#include linux/irqchip/arm-gic-v3.h
+#include kvm/arm_vgic.h
+
+#include asm/kvm_emulate.h
+#include asm/kvm_arm.h
+#include asm/kvm_mmu.h
+
+#include vgic.h
+#include its-emul.h
+
+static bool handle_mmio_misc_gits(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio,
+ phys_addr_t offset)
+{
+   return false;
+}
+
+static bool handle_mmio_gits_idregs(struct kvm_vcpu *vcpu,
+   struct kvm_exit_mmio *mmio,
+   phys_addr_t offset)
+{
+   return false;
+}
+
+static bool handle_mmio_gits_cbaser(struct kvm_vcpu *vcpu,
+   struct kvm_exit_mmio *mmio,
+   phys_addr_t offset)
+{
+   return false;
+}
+
+static bool handle_mmio_gits_cwriter(struct kvm_vcpu *vcpu,
+struct kvm_exit_mmio *mmio,
+phys_addr_t offset)
+{
+   return false;
+}
+
+static bool handle_mmio_gits_creadr(struct kvm_vcpu *vcpu,
+   struct kvm_exit_mmio *mmio,
+   phys_addr_t offset)
+{
+   return false;
+}
+
+static const struct vgic_io_range vgicv3_its_ranges[] = {
+   {
+   .base   = GITS_CTLR,
+   .len= 0x10,
+   .bits_per_irq   = 0,
+  

[PATCH 09/13] KVM: arm64: handle pending bit for LPIs in ITS emulation

2015-05-29 Thread Andre Przywara
As the actual LPI number in a guest can be quite high, but is mostly
assigned using a very sparse allocation scheme, bitmaps and arrays
for storing the virtual interrupt status are a waste of memory.
We use our equivalent of the Interrupt Translation Table Entry
(ITTE) to hold this extra status information for a virtual LPI.
As the normal VGIC code cannot use its fancy bitmaps to manage
pending interrupts, we provide a hook in the VGIC code to let the
ITS emulation handle the list register queueing itself.
LPIs are located in a separate number range (=8192), so
distinguishing them is easy. With LPIs being only edge-triggered, we
get away with a less complex IRQ handling.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 include/kvm/arm_vgic.h  |  2 ++
 virt/kvm/arm/its-emul.c | 66 +++
 virt/kvm/arm/its-emul.h |  3 ++
 virt/kvm/arm/vgic-v3-emul.c |  2 ++
 virt/kvm/arm/vgic.c | 68 +
 5 files changed, 124 insertions(+), 17 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index fa17df6..de19c34 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -147,6 +147,8 @@ struct vgic_vm_ops {
int (*init_model)(struct kvm *);
void(*destroy_model)(struct kvm *);
int (*map_resources)(struct kvm *, const struct vgic_params *);
+   bool(*queue_lpis)(struct kvm_vcpu *);
+   void(*unqueue_lpi)(struct kvm_vcpu *, int irq);
 };
 
 struct vgic_io_device {
diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
index f0f4a9c..f75fb9e 100644
--- a/virt/kvm/arm/its-emul.c
+++ b/virt/kvm/arm/its-emul.c
@@ -50,8 +50,26 @@ struct its_itte {
struct its_collection *collection;
u32 lpi;
u32 event_id;
+   bool enabled;
+   unsigned long *pending;
 };
 
+#define for_each_lpi(dev, itte, kvm) \
+   list_for_each_entry(dev, (kvm)-arch.vgic.its.device_list, dev_list) \
+   list_for_each_entry(itte, (dev)-itt, itte_list)
+
+static struct its_itte *find_itte_by_lpi(struct kvm *kvm, int lpi)
+{
+   struct its_device *device;
+   struct its_itte *itte;
+
+   for_each_lpi(device, itte, kvm) {
+   if (itte-lpi == lpi)
+   return itte;
+   }
+   return NULL;
+}
+
 #define BASER_BASE_ADDRESS(x) ((x)  0xf000ULL)
 
 /* distributor lock is hold by the VGIC MMIO handler */
@@ -145,6 +163,54 @@ static bool handle_mmio_gits_idregs(struct kvm_vcpu *vcpu,
return false;
 }
 
+/*
+ * Find all enabled and pending LPIs and queue them into the list
+ * registers.
+ * The dist lock is held by the caller.
+ */
+bool vits_queue_lpis(struct kvm_vcpu *vcpu)
+{
+   struct vgic_its *its = vcpu-kvm-arch.vgic.its;
+   struct its_device *device;
+   struct its_itte *itte;
+   bool ret = true;
+
+   spin_lock(its-lock);
+   for_each_lpi(device, itte, vcpu-kvm) {
+   if (!itte-enabled || !test_bit(vcpu-vcpu_id, itte-pending))
+   continue;
+
+   if (!itte-collection)
+   continue;
+
+   if (itte-collection-target_addr != vcpu-vcpu_id)
+   continue;
+
+   clear_bit(vcpu-vcpu_id, itte-pending);
+
+   ret = vgic_queue_irq(vcpu, 0, itte-lpi);
+   }
+
+   spin_unlock(its-lock);
+   return ret;
+}
+
+/* is called with the distributor lock held by the caller */
+void vits_unqueue_lpi(struct kvm_vcpu *vcpu, int lpi)
+{
+   struct vgic_its *its = vcpu-kvm-arch.vgic.its;
+   struct its_itte *itte;
+
+   spin_lock(its-lock);
+
+   /* Find the right ITTE and put the pending state back in there */
+   itte = find_itte_by_lpi(vcpu-kvm, lpi);
+   if (itte)
+   set_bit(vcpu-vcpu_id, itte-pending);
+
+   spin_unlock(its-lock);
+}
+
 static int vits_handle_command(struct kvm_vcpu *vcpu, u64 *its_cmd)
 {
return -ENODEV;
diff --git a/virt/kvm/arm/its-emul.h b/virt/kvm/arm/its-emul.h
index 472a6d0..cc5d5ff 100644
--- a/virt/kvm/arm/its-emul.h
+++ b/virt/kvm/arm/its-emul.h
@@ -33,4 +33,7 @@ void vgic_enable_lpis(struct kvm_vcpu *vcpu);
 int vits_init(struct kvm *kvm);
 void vits_destroy(struct kvm *kvm);
 
+bool vits_queue_lpis(struct kvm_vcpu *vcpu);
+void vits_unqueue_lpi(struct kvm_vcpu *vcpu, int irq);
+
 #endif
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
index fa81c4b..66640c2fa 100644
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -901,6 +901,8 @@ void vgic_v3_init_emulation(struct kvm *kvm)
dist-vm_ops.init_model = vgic_v3_init_model;
dist-vm_ops.destroy_model = vgic_v3_destroy_model;
dist-vm_ops.map_resources = vgic_v3_map_resources;
+   dist-vm_ops.queue_lpis = vits_queue_lpis;
+   dist-vm_ops.unqueue_lpi = vits_unqueue_lpi;
 
kvm-arch.max_vcpus = KVM_MAX_VCPUS;
 }
diff 

[PATCH 05/13] KVM: arm64: handle ITS related GICv3 redistributor registers

2015-05-29 Thread Andre Przywara
In the GICv3 redistributor there are the PENDBASER and PROPBASER
registers which we did not emulate so far, as they only make sense
when having an ITS. In preparation for that emulate those MMIO
accesses by storing the 64-bit data written into it into a variable
which we later read in the ITS emulation.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 include/kvm/arm_vgic.h  |  4 
 virt/kvm/arm/vgic-v3-emul.c | 43 +++
 virt/kvm/arm/vgic.c | 35 +++
 virt/kvm/arm/vgic.h |  4 
 4 files changed, 86 insertions(+)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 37725bb..9ea0b3b 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -256,6 +256,10 @@ struct vgic_dist {
struct vgic_vm_ops  vm_ops;
struct vgic_io_device   dist_iodev;
struct vgic_io_device   *redist_iodevs;
+
+   u64 propbaser;
+   u64 *pendbaser;
+   boollpis_enabled;
 };
 
 struct vgic_v2_cpu_if {
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
index 16c6d8a..04f3aed 100644
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -607,6 +607,37 @@ static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu 
*vcpu,
return vgic_handle_cfg_reg(reg, mmio, offset);
 }
 
+/* We don't trigger any actions here, just store the register value */
+static bool handle_mmio_propbaser_redist(struct kvm_vcpu *vcpu,
+struct kvm_exit_mmio *mmio,
+phys_addr_t offset)
+{
+   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
+   int mode = ACCESS_READ_VALUE;
+
+   mode |= dist-lpis_enabled ? ACCESS_WRITE_IGNORED : ACCESS_WRITE_VALUE;
+   vgic_handle_base_register(vcpu, mmio, offset, dist-propbaser, mode);
+
+   return false;
+}
+
+/* We don't trigger any actions here, just store the register value */
+static bool handle_mmio_pendbaser_redist(struct kvm_vcpu *vcpu,
+struct kvm_exit_mmio *mmio,
+phys_addr_t offset)
+{
+   struct kvm_vcpu *rdvcpu = mmio-private;
+   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
+   int mode = ACCESS_READ_VALUE;
+
+   /* Storing a value with LPIs already enabled is undefined */
+   mode |= dist-lpis_enabled ? ACCESS_WRITE_IGNORED : ACCESS_WRITE_VALUE;
+   vgic_handle_base_register(vcpu, mmio, offset,
+ dist-pendbaser[rdvcpu-vcpu_id], mode);
+
+   return false;
+}
+
 #define SGI_base(x) ((x) + SZ_64K)
 
 static const struct vgic_io_range vgic_redist_ranges[] = {
@@ -635,6 +666,18 @@ static const struct vgic_io_range vgic_redist_ranges[] = {
.handle_mmio= handle_mmio_raz_wi,
},
{
+   .base   = GICR_PENDBASER,
+   .len= 0x08,
+   .bits_per_irq   = 0,
+   .handle_mmio= handle_mmio_pendbaser_redist,
+   },
+   {
+   .base   = GICR_PROPBASER,
+   .len= 0x08,
+   .bits_per_irq   = 0,
+   .handle_mmio= handle_mmio_propbaser_redist,
+   },
+   {
.base   = GICR_IDREGS,
.len= 0x30,
.bits_per_irq   = 0,
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 2e9723aa..0a9236d 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -448,6 +448,41 @@ void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
}
 }
 
+/* handle a 64-bit register access */
+void vgic_handle_base_register(struct kvm_vcpu *vcpu,
+  struct kvm_exit_mmio *mmio,
+  phys_addr_t offset, u64 *basereg,
+  int mode)
+{
+   u32 reg;
+   u64 breg;
+
+   switch (offset  ~3) {
+   case 0x00:
+   breg = *basereg;
+   reg = lower_32_bits(breg);
+   vgic_reg_access(mmio, reg, offset  3, mode);
+   if (mmio-is_write  (mode  ACCESS_WRITE_VALUE)) {
+   breg = GENMASK_ULL(63, 32);
+   breg |= reg;
+   *basereg = breg;
+   }
+   break;
+   case 0x04:
+   breg = *basereg;
+   reg = upper_32_bits(breg);
+   vgic_reg_access(mmio, reg, offset  3, mode);
+   if (mmio-is_write  (mode  ACCESS_WRITE_VALUE)) {
+   breg  = lower_32_bits(breg);
+   breg |= (u64)reg  32;
+   *basereg = breg;
+   }
+   break;
+   }
+}
+
+
+
 bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
phys_addr_t offset)
 {
diff --git 

[PATCH 08/13] KVM: arm64: add data structures to model ITS interrupt translation

2015-05-29 Thread Andre Przywara
The GICv3 Interrupt Translation Service (ITS) uses tables in memory
to allow a sophisticated interrupt routing. It features device tables,
an interrupt table per device and a table connecting collections to
actual CPUs (aka. redistributors in the GICv3 lingo).
Since the interrupt numbers for the LPIs are allocated quite sparsely
and the range can be quite huge (8192 LPIs being the minimum), using
bitmaps or arrays for storing information is a waste of memory.
We use linked lists instead, which we iterate linearly. This works
very well with the actual number of LPIs/MSIs in the guest being
quite low. Should the number of LPIs exceed the number where iterating
the lists becomes painful, we can later revisit this and use more
efficient data structures.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 include/kvm/arm_vgic.h  |  3 +++
 virt/kvm/arm/its-emul.c | 45 +
 2 files changed, 48 insertions(+)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 3b8e3a1..fa17df6 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -25,6 +25,7 @@
 #include linux/spinlock.h
 #include linux/types.h
 #include kvm/iodev.h
+#include linux/list.h
 
 #define VGIC_NR_IRQS_LEGACY256
 #define VGIC_NR_SGIS   16
@@ -162,6 +163,8 @@ struct vgic_its {
u64 cbaser;
int creadr;
int cwriter;
+   struct list_headdevice_list;
+   struct list_headcollection_list;
 };
 
 struct vgic_dist {
diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
index 82bc34a..f0f4a9c 100644
--- a/virt/kvm/arm/its-emul.c
+++ b/virt/kvm/arm/its-emul.c
@@ -21,6 +21,7 @@
 #include linux/kvm.h
 #include linux/kvm_host.h
 #include linux/interrupt.h
+#include linux/list.h
 
 #include linux/irqchip/arm-gic-v3.h
 #include kvm/arm_vgic.h
@@ -32,6 +33,25 @@
 #include vgic.h
 #include its-emul.h
 
+struct its_device {
+   struct list_head dev_list;
+   struct list_head itt;
+   u32 device_id;
+};
+
+struct its_collection {
+   struct list_head coll_list;
+   u32 collection_id;
+   u32 target_addr;
+};
+
+struct its_itte {
+   struct list_head itte_list;
+   struct its_collection *collection;
+   u32 lpi;
+   u32 event_id;
+};
+
 #define BASER_BASE_ADDRESS(x) ((x)  0xf000ULL)
 
 /* distributor lock is hold by the VGIC MMIO handler */
@@ -280,6 +300,9 @@ int vits_init(struct kvm *kvm)
 
spin_lock_init(its-lock);
 
+   INIT_LIST_HEAD(its-device_list);
+   INIT_LIST_HEAD(its-collection_list);
+
its-enabled = false;
 
return -ENXIO;
@@ -289,11 +312,33 @@ void vits_destroy(struct kvm *kvm)
 {
struct vgic_dist *dist = kvm-arch.vgic;
struct vgic_its *its = dist-its;
+   struct its_device *dev;
+   struct its_itte *itte;
+   struct list_head *dev_cur, *dev_temp;
+   struct list_head *cur, *temp;
 
if (!vgic_has_its(kvm))
return;
 
+   spin_lock(its-lock);
+   list_for_each_safe(dev_cur, dev_temp, its-device_list) {
+   dev = container_of(dev_cur, struct its_device, dev_list);
+   list_for_each_safe(cur, temp, dev-itt) {
+   itte = (container_of(cur, struct its_itte, itte_list));
+   list_del(cur);
+   kfree(itte);
+   }
+   list_del(dev_cur);
+   kfree(dev);
+   }
+
+   list_for_each_safe(cur, temp, its-collection_list) {
+   list_del(cur);
+   kfree(container_of(cur, struct its_collection, coll_list));
+   }
+
kfree(dist-pendbaser);
 
its-enabled = false;
+   spin_unlock(its-lock);
 }
-- 
2.3.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/13] KVM: arm64: implement basic ITS register handlers

2015-05-29 Thread Andre Przywara
Add emulation for some basic MMIO registers used in the ITS emulation.
This includes:
- GITS_{CTLR,TYPER,IIDR}
- ID registers
- GITS_{CBASER,CREAD,CWRITER}
  those implement the ITS command buffer handling

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 include/kvm/arm_vgic.h |   3 +
 include/linux/irqchip/arm-gic-v3.h |   8 ++
 virt/kvm/arm/its-emul.c| 172 +
 virt/kvm/arm/its-emul.h|   1 +
 virt/kvm/arm/vgic-v3-emul.c|   2 +
 5 files changed, 186 insertions(+)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index d76c2d9..3b8e3a1 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -159,6 +159,9 @@ struct vgic_io_device {
 struct vgic_its {
boolenabled;
spinlock_t  lock;
+   u64 cbaser;
+   int creadr;
+   int cwriter;
 };
 
 struct vgic_dist {
diff --git a/include/linux/irqchip/arm-gic-v3.h 
b/include/linux/irqchip/arm-gic-v3.h
index df4e527..0b450c7 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -179,15 +179,23 @@
 #define GITS_BASER 0x0100
 #define GITS_IDREGS_BASE   0xffd0
 #define GITS_PIDR2 GICR_PIDR2
+#define GITS_PIDR4 0xffd0
+#define GITS_CIDR0 0xfff0
+#define GITS_CIDR1 0xfff4
+#define GITS_CIDR2 0xfff8
+#define GITS_CIDR3 0xfffc
 
 #define GITS_TRANSLATER0x10040
 
 #define GITS_CTLR_ENABLE   (1U  0)
 #define GITS_CTLR_QUIESCENT(1U  31)
 
+#define GITS_TYPER_PLPIS   (1UL  0)
+#define GITS_TYPER_IDBITS_SHIFT8
 #define GITS_TYPER_DEVBITS_SHIFT   13
 #define GITS_TYPER_DEVBITS(r)  r)  GITS_TYPER_DEVBITS_SHIFT)  
0x1f) + 1)
 #define GITS_TYPER_PTA (1UL  19)
+#define GITS_TYPER_HWCOLLCNT_SHIFT 24
 
 #define GITS_CBASER_VALID  (1UL  63)
 #define GITS_CBASER_nCnB   (0UL  59)
diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
index 7b283ce..82bc34a 100644
--- a/virt/kvm/arm/its-emul.c
+++ b/virt/kvm/arm/its-emul.c
@@ -32,10 +32,62 @@
 #include vgic.h
 #include its-emul.h
 
+#define BASER_BASE_ADDRESS(x) ((x)  0xf000ULL)
+
+/* distributor lock is hold by the VGIC MMIO handler */
 static bool handle_mmio_misc_gits(struct kvm_vcpu *vcpu,
  struct kvm_exit_mmio *mmio,
  phys_addr_t offset)
 {
+   struct vgic_its *its = vcpu-kvm-arch.vgic.its;
+   u32 reg;
+   bool was_enabled;
+
+   switch (offset  ~3) {
+   case 0x00:  /* GITS_CTLR */
+   /* We never defer any command execution. */
+   reg = GITS_CTLR_QUIESCENT;
+   if (its-enabled)
+   reg |= GITS_CTLR_ENABLE;
+   was_enabled = its-enabled;
+   vgic_reg_access(mmio, reg, offset  3,
+   ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
+   its-enabled = !!(reg  GITS_CTLR_ENABLE);
+   return !was_enabled  its-enabled;
+   case 0x04:  /* GITS_IIDR */
+   reg = (PRODUCT_ID_KVM  24) | (IMPLEMENTER_ARM  0);
+   vgic_reg_access(mmio, reg, offset  3,
+   ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+   break;
+   case 0x08:  /* GITS_TYPER */
+   /*
+* We use linear CPU numbers for redistributor addressing,
+* so GITS_TYPER.PTA is 0.
+* To avoid memory waste on the guest side, we keep the
+* number of IDBits and DevBits low for the time being.
+* This could later be made configurable by userland.
+* Since we have all collections in linked list, we claim
+* that we can hold all of the collection tables in our
+* own memory and that the ITT entry size is 1 byte (the
+* smallest possible one).
+*/
+   reg = GITS_TYPER_PLPIS;
+   reg |= 0xff  GITS_TYPER_HWCOLLCNT_SHIFT;
+   reg |= 0x0f  GITS_TYPER_DEVBITS_SHIFT;
+   reg |= 0x0f  GITS_TYPER_IDBITS_SHIFT;
+   vgic_reg_access(mmio, reg, offset  3,
+   ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+   break;
+   case 0x0c:
+   /* The upper 32bits of TYPER are all 0 for the time being.
+* Should we need more than 256 collections, we can enable
+* some bits in here.
+*/
+   vgic_reg_access(mmio, NULL, offset  3,
+   ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+

[PATCH 12/13] KVM: arm64: implement MSI injection in ITS emulation

2015-05-29 Thread Andre Przywara
When userland wants to inject a MSI into the guest, we have to use
our data structures to find the LPI number and the VCPU to receive
the interrupt.
Use the wrapper functions to iterate the linked lists and find the
proper Interrupt Translation Table Entry. Then set the pending bit
in this ITTE to be later picked up by the LR handling code. Kick
the VCPU which is meant to handle this interrupt.
We provide a VGIC emulation model specific routine for the actual
MSI injection. The wrapper functions return an error for models not
(yet) implementing MSIs (like the GICv2 emulation).

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 include/kvm/arm_vgic.h  |  1 +
 virt/kvm/arm/its-emul.c | 49 +
 virt/kvm/arm/its-emul.h |  2 ++
 virt/kvm/arm/vgic-v3-emul.c |  1 +
 4 files changed, 53 insertions(+)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index de19c34..6bb138d 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -149,6 +149,7 @@ struct vgic_vm_ops {
int (*map_resources)(struct kvm *, const struct vgic_params *);
bool(*queue_lpis)(struct kvm_vcpu *);
void(*unqueue_lpi)(struct kvm_vcpu *, int irq);
+   int (*inject_msi)(struct kvm *, struct kvm_msi *);
 };
 
 struct vgic_io_device {
diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
index 574cf05..35e886c 100644
--- a/virt/kvm/arm/its-emul.c
+++ b/virt/kvm/arm/its-emul.c
@@ -340,6 +340,55 @@ static bool handle_mmio_gits_idregs(struct kvm_vcpu *vcpu,
 }
 
 /*
+ * Translates an incoming MSI request into the redistributor (=VCPU) and
+ * the associated LPI number. Sets the LPI pending bit and also marks the
+ * VCPU as having a pending interrupt.
+ */
+int vits_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+   struct vgic_dist *dist = kvm-arch.vgic;
+   struct vgic_its *its = dist-its;
+   struct its_itte *itte;
+   int cpuid;
+   bool inject = false;
+   int ret = 0;
+
+   if (!vgic_has_its(kvm))
+   return -ENODEV;
+
+   if (!(msi-flags  KVM_MSI_VALID_DEVID))
+   return -EINVAL;
+
+   spin_lock(its-lock);
+
+   if (!its-enabled || !dist-lpis_enabled) {
+   ret = -EAGAIN;
+   goto out_unlock;
+   }
+
+   itte = find_itte(kvm, msi-devid, msi-data);
+   /* Triggering an unmapped IRQ gets silently dropped. */
+   if (!itte || !itte-collection)
+   goto out_unlock;
+
+   cpuid = itte-collection-target_addr;
+   set_bit(cpuid, itte-pending);
+   inject = itte-enabled;
+
+out_unlock:
+   spin_unlock(its-lock);
+
+   if (inject) {
+   spin_lock(dist-lock);
+   set_bit(cpuid, dist-irq_pending_on_cpu);
+   spin_unlock(dist-lock);
+   kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
+   }
+
+   return ret;
+}
+
+/*
  * Find all enabled and pending LPIs and queue them into the list
  * registers.
  * The dist lock is held by the caller.
diff --git a/virt/kvm/arm/its-emul.h b/virt/kvm/arm/its-emul.h
index 6152d04..cac1406 100644
--- a/virt/kvm/arm/its-emul.h
+++ b/virt/kvm/arm/its-emul.h
@@ -33,6 +33,8 @@ void vgic_enable_lpis(struct kvm_vcpu *vcpu);
 int vits_init(struct kvm *kvm);
 void vits_destroy(struct kvm *kvm);
 
+int vits_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
+
 bool vits_queue_lpis(struct kvm_vcpu *vcpu);
 void vits_unqueue_lpi(struct kvm_vcpu *vcpu, int irq);
 
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
index 66640c2fa..4513551 100644
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -901,6 +901,7 @@ void vgic_v3_init_emulation(struct kvm *kvm)
dist-vm_ops.init_model = vgic_v3_init_model;
dist-vm_ops.destroy_model = vgic_v3_destroy_model;
dist-vm_ops.map_resources = vgic_v3_map_resources;
+   dist-vm_ops.inject_msi = vits_inject_msi;
dist-vm_ops.queue_lpis = vits_queue_lpis;
dist-vm_ops.unqueue_lpi = vits_unqueue_lpi;
 
-- 
2.3.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/13] KVM: arm64: implement ITS command queue command handlers

2015-05-29 Thread Andre Przywara
The connection between a device, an event ID, the LPI number and the
allocated CPU is stored in in-memory tables in a GICv3, but their
format is not specified by the spec. Instead software uses a command
queue to let the ITS implementation use their own format.
Implement handlers for the various ITS commands and let them store
the requested relation into our own data structures.
Error handling is very basic at this point, as we don't have a good
way of communicating errors to the guest (usually a SError).

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 include/linux/irqchip/arm-gic-v3.h |   1 +
 virt/kvm/arm/its-emul.c| 422 -
 virt/kvm/arm/its-emul.h|  11 +
 3 files changed, 433 insertions(+), 1 deletion(-)

diff --git a/include/linux/irqchip/arm-gic-v3.h 
b/include/linux/irqchip/arm-gic-v3.h
index 0b450c7..651aacc 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -254,6 +254,7 @@
 #define GITS_CMD_MAPD  0x08
 #define GITS_CMD_MAPC  0x09
 #define GITS_CMD_MAPVI 0x0a
+#define GITS_CMD_MAPI  0x0b
 #define GITS_CMD_MOVI  0x01
 #define GITS_CMD_DISCARD   0x0f
 #define GITS_CMD_INV   0x0c
diff --git a/virt/kvm/arm/its-emul.c b/virt/kvm/arm/its-emul.c
index afd440e..574cf05 100644
--- a/virt/kvm/arm/its-emul.c
+++ b/virt/kvm/arm/its-emul.c
@@ -22,6 +22,7 @@
 #include linux/kvm_host.h
 #include linux/interrupt.h
 #include linux/list.h
+#include linux/slab.h
 
 #include linux/irqchip/arm-gic-v3.h
 #include kvm/arm_vgic.h
@@ -55,6 +56,34 @@ struct its_itte {
unsigned long *pending;
 };
 
+static struct its_device *find_its_device(struct kvm *kvm, u32 device_id)
+{
+   struct vgic_its *its = kvm-arch.vgic.its;
+   struct its_device *device;
+
+   list_for_each_entry(device, its-device_list, dev_list)
+   if (device_id == device-device_id)
+   return device;
+
+   return NULL;
+}
+
+static struct its_itte *find_itte(struct kvm *kvm, u32 device_id, u32 event_id)
+{
+   struct its_device *device;
+   struct its_itte *itte;
+
+   device = find_its_device(kvm, device_id);
+   if (device == NULL)
+   return NULL;
+
+   list_for_each_entry(itte, device-itt, itte_list)
+   if (itte-event_id == event_id)
+   return itte;
+
+   return NULL;
+}
+
 #define for_each_lpi(dev, itte, kvm) \
list_for_each_entry(dev, (kvm)-arch.vgic.its.device_list, dev_list) \
list_for_each_entry(itte, (dev)-itt, itte_list)
@@ -71,6 +100,19 @@ static struct its_itte *find_itte_by_lpi(struct kvm *kvm, 
int lpi)
return NULL;
 }
 
+static struct its_collection *find_collection(struct kvm *kvm, int coll_id)
+{
+   struct its_collection *collection;
+
+   list_for_each_entry(collection, kvm-arch.vgic.its.collection_list,
+   coll_list) {
+   if (coll_id == collection-collection_id)
+   return collection;
+   }
+
+   return NULL;
+}
+
 #define LPI_PROP_ENABLE_BIT(p) ((p)  LPI_PROP_ENABLED)
 #define LPI_PROP_PRIORITY(p)   ((p)  0xfc)
 
@@ -345,9 +387,386 @@ void vits_unqueue_lpi(struct kvm_vcpu *vcpu, int lpi)
spin_unlock(its-lock);
 }
 
+static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
+{
+   return (le64_to_cpu(its_cmd[word])  shift)  (BIT_ULL(size) - 1);
+}
+
+#define its_cmd_get_command(cmd)   its_cmd_mask_field(cmd, 0,  0,  8)
+#define its_cmd_get_deviceid(cmd)  its_cmd_mask_field(cmd, 0, 32, 32)
+#define its_cmd_get_id(cmd)its_cmd_mask_field(cmd, 1,  0, 32)
+#define its_cmd_get_physical_id(cmd)   its_cmd_mask_field(cmd, 1, 32, 32)
+#define its_cmd_get_collection(cmd)its_cmd_mask_field(cmd, 2,  0, 16)
+#define its_cmd_get_target_addr(cmd)   its_cmd_mask_field(cmd, 2, 16, 32)
+#define its_cmd_get_validbit(cmd)  its_cmd_mask_field(cmd, 2, 63,  1)
+
+/*
+ * Handles the DISCARD command, which frees an ITTE.
+ * Must be called with the ITS lock held.
+ */
+static int vits_cmd_handle_discard(struct kvm *kvm, u64 *its_cmd)
+{
+   u32 device_id;
+   u32 event_id;
+   struct its_itte *itte;
+
+   device_id = its_cmd_get_deviceid(its_cmd);
+   event_id = its_cmd_get_id(its_cmd);
+
+   itte = find_itte(kvm, device_id, event_id);
+   if (!itte || !itte-collection)
+   return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
+
+   clear_bit(itte-collection-target_addr, itte-pending);
+
+   list_del(itte-itte_list);
+   kfree(itte);
+   return 0;
+}
+
+/*
+ * Handles the MOVI command, which moves an ITTE to a different collection.
+ * Must be called with the ITS lock held.
+ */
+static int vits_cmd_handle_movi(struct kvm *kvm, u64 *its_cmd)
+{
+   u32 device_id = its_cmd_get_deviceid(its_cmd);
+   u32 event_id = 

[PATCH 02/13] KVM: extend struct kvm_msi to hold a 32-bit device ID

2015-05-29 Thread Andre Przywara
The ARM GICv3 ITS MSI controller requires a device ID to be able to
assign the proper interrupt vector. On real hardware, this ID is
sampled from the bus. To be able to emulate an ITS controller, extend
the KVM MSI interface to let userspace provide such a device ID. For
PCI devices, the device ID is simply the 16-bit bus-device-function
triplet, which should be easily available to the userland tool.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 Documentation/virtual/kvm/api.txt | 8 ++--
 include/uapi/linux/kvm.h  | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 9fa2bf8..891d64a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2121,10 +2121,14 @@ struct kvm_msi {
__u32 address_hi;
__u32 data;
__u32 flags;
-   __u8  pad[16];
+   __u32 devid;
+   __u8  pad[12];
 };
 
-No flags are defined so far. The corresponding field must be 0.
+flags: KVM_MSI_VALID_DEVID: devid is valid, otherwise ignored.
+devid: If KVM_MSI_VALID_DEVID is set, contains a value to identify the device
+   that wrote the MSI message. For PCI, this is usually a BFD
+   identifier in the lower 16 bits.
 
 
 4.71 KVM_CREATE_PIT2
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4b60056..2a23705 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -965,12 +965,14 @@ struct kvm_one_reg {
__u64 addr;
 };
 
+#define KVM_MSI_VALID_DEVID(1U  0)
 struct kvm_msi {
__u32 address_lo;
__u32 address_hi;
__u32 data;
__u32 flags;
-   __u8  pad[16];
+   __u32 devid;
+   __u8  pad[12];
 };
 
 struct kvm_arm_device_addr {
-- 
2.3.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/13] KVM: arm/arm64: add emulation model specific destroy function

2015-05-29 Thread Andre Przywara
Currently we destroy the VGIC emulation in one function that cares for
all emulated models. The ITS emulation will require some
differentiation, so introduce a per-emulation-model destroy method.
Use it for a tiny GICv3 specific code already.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 include/kvm/arm_vgic.h  |  1 +
 virt/kvm/arm/vgic-v3-emul.c |  9 +
 virt/kvm/arm/vgic.c | 11 ++-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 2ccfa9a..b18e2c5 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -144,6 +144,7 @@ struct vgic_vm_ops {
bool(*queue_sgi)(struct kvm_vcpu *, int irq);
void(*add_sgi_source)(struct kvm_vcpu *, int irq, int source);
int (*init_model)(struct kvm *);
+   void(*destroy_model)(struct kvm *);
int (*map_resources)(struct kvm *, const struct vgic_params *);
 };
 
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
index e9c3a7a..fbfdd6f 100644
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -818,6 +818,14 @@ static int vgic_v3_init_model(struct kvm *kvm)
return 0;
 }
 
+static void vgic_v3_destroy_model(struct kvm *kvm)
+{
+   struct vgic_dist *dist = kvm-arch.vgic;
+
+   kfree(dist-irq_spi_mpidr);
+   dist-irq_spi_mpidr = NULL;
+}
+
 /* GICv3 does not keep track of SGI sources anymore. */
 static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
 {
@@ -830,6 +838,7 @@ void vgic_v3_init_emulation(struct kvm *kvm)
dist-vm_ops.queue_sgi = vgic_v3_queue_sgi;
dist-vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
dist-vm_ops.init_model = vgic_v3_init_model;
+   dist-vm_ops.destroy_model = vgic_v3_destroy_model;
dist-vm_ops.map_resources = vgic_v3_map_resources;
 
kvm-arch.max_vcpus = KVM_MAX_VCPUS;
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 037b723..6ea30e0 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -102,6 +102,14 @@ int kvm_vgic_map_resources(struct kvm *kvm)
return kvm-arch.vgic.vm_ops.map_resources(kvm, vgic);
 }
 
+static void vgic_destroy_model(struct kvm *kvm)
+{
+   struct vgic_vm_ops *vm_ops = kvm-arch.vgic.vm_ops;
+
+   if (vm_ops-destroy_model)
+   vm_ops-destroy_model(kvm);
+}
+
 /*
  * struct vgic_bitmap contains a bitmap made of unsigned longs, but
  * extracts u32s out of them.
@@ -1631,6 +1639,8 @@ void kvm_vgic_destroy(struct kvm *kvm)
struct kvm_vcpu *vcpu;
int i;
 
+   vgic_destroy_model(kvm);
+
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vgic_vcpu_destroy(vcpu);
 
@@ -1647,7 +1657,6 @@ void kvm_vgic_destroy(struct kvm *kvm)
}
kfree(dist-irq_sgi_sources);
kfree(dist-irq_spi_cpu);
-   kfree(dist-irq_spi_mpidr);
kfree(dist-irq_spi_target);
kfree(dist-irq_pending_on_cpu);
kfree(dist-irq_active_on_cpu);
-- 
2.3.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/13] KVM: arm64: Introduce new MMIO region for the ITS base address

2015-05-29 Thread Andre Przywara
The ARM GICv3 ITS controller requires a separate register frame to
cover ITS specific registers. Add a new VGIC address type and store
the address in a field in the vgic_dist structure.
Provide a function to check whether userland has provided the address,
so ITS functionality can be guarded by that check.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 Documentation/virtual/kvm/devices/arm-vgic.txt |  7 +++
 arch/arm64/include/uapi/asm/kvm.h  |  3 +++
 include/kvm/arm_vgic.h |  3 +++
 virt/kvm/arm/vgic-v3-emul.c|  1 +
 virt/kvm/arm/vgic.c| 17 +
 virt/kvm/arm/vgic.h|  1 +
 6 files changed, 32 insertions(+)

diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt 
b/Documentation/virtual/kvm/devices/arm-vgic.txt
index 3fb9054..1f89001 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -39,6 +39,13 @@ Groups:
   Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
   This address needs to be 64K aligned.
 
+KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit)
+  Base address in the guest physical address space of the GICv3 ITS
+  register frame. The ITS allows MSI(-X) interrupts to be injected
+  into guests. This extension is optional, if the kernel does not
+  support the ITS, the call returns -ENODEV.
+  Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
+  This address needs to be 64K aligned and the region covers 64 KByte.
 
   KVM_DEV_ARM_VGIC_GRP_DIST_REGS
   Attributes:
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index d268320..e42435c 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -82,8 +82,11 @@ struct kvm_regs {
 #define KVM_VGIC_V3_ADDR_TYPE_DIST 2
 #define KVM_VGIC_V3_ADDR_TYPE_REDIST   3
 
+#define KVM_VGIC_V3_ADDR_TYPE_ITS  4
+
 #define KVM_VGIC_V3_DIST_SIZE  SZ_64K
 #define KVM_VGIC_V3_REDIST_SIZE(2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE   SZ_64K
 
 #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index b18e2c5..37725bb 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -178,6 +178,9 @@ struct vgic_dist {
phys_addr_t vgic_redist_base;
};
 
+   /* The base address for the MSI control block (V2M/ITS) */
+   phys_addr_t vgic_its_base;
+
/* Distributor enabled */
u32 enabled;
 
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
index fbfdd6f..16c6d8a 100644
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -1012,6 +1012,7 @@ static int vgic_v3_has_attr(struct kvm_device *dev,
return -ENXIO;
case KVM_VGIC_V3_ADDR_TYPE_DIST:
case KVM_VGIC_V3_ADDR_TYPE_REDIST:
+   case KVM_VGIC_V3_ADDR_TYPE_ITS:
return 0;
}
break;
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 6ea30e0..2e9723aa 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -932,6 +932,16 @@ int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, 
int len,
return ret;
 }
 
+bool vgic_has_its(struct kvm *kvm)
+{
+   struct vgic_dist *dist = kvm-arch.vgic;
+
+   if (dist-vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+   return false;
+
+   return !IS_VGIC_ADDR_UNDEF(dist-vgic_its_base);
+}
+
 static int vgic_nr_shared_irqs(struct vgic_dist *dist)
 {
return dist-nr_irqs - VGIC_NR_PRIVATE_IRQS;
@@ -1835,6 +1845,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
kvm-arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
kvm-arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
kvm-arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
+   kvm-arch.vgic.vgic_its_base = VGIC_ADDR_UNDEF;
 
 out_unlock:
for (; vcpu_lock_idx = 0; vcpu_lock_idx--) {
@@ -1932,6 +1943,12 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, 
u64 *addr, bool write)
block_size = KVM_VGIC_V3_REDIST_SIZE;
alignment = SZ_64K;
break;
+   case KVM_VGIC_V3_ADDR_TYPE_ITS:
+   type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
+   addr_ptr = vgic-vgic_its_base;
+   block_size = KVM_VGIC_V3_ITS_SIZE;
+   alignment = SZ_64K;
+   break;
 #endif
default:
r = -ENODEV;
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
index 0df74cb..a093f5c 100644
--- a/virt/kvm/arm/vgic.h
+++ b/virt/kvm/arm/vgic.h
@@ -136,5 +136,6 @@ int vgic_get_common_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr);
 int vgic_init(struct kvm *kvm);
 void 

[PATCH v5 03/12] KVM: arm64: guest debug, define API headers

2015-05-29 Thread Alex Bennée
This commit defines the API headers for guest debugging. There are two
architecture specific debug structures:

  - kvm_guest_debug_arch, allows us to pass in HW debug registers
  - kvm_debug_exit_arch, signals exception and possible faulting address

The type of debugging being used is controlled by the architecture
specific control bits of the kvm_guest_debug-control flags in the ioctl
structure.

Signed-off-by: Alex Bennée alex.ben...@linaro.org
Reviewed-by: David Hildenbrand d...@linux.vnet.ibm.com
Reviewed-by: Andrew Jones drjo...@redhat.com
Acked-by: Christoffer Dall christoffer.d...@linaro.org

---
v2
   - expose hsr and pc directly to user-space
v3
   - s/control/controlled/ in commit message
   - add v8 to ARM ARM comment (ARM Architecture Reference Manual)
   - add rb tag
   - rm pc, add far
   - re-word comments on alignment
   - rename KVM_ARM_NDBG_REGS - KVM_ARM_MAX_DBG_REGS
v4
   - now uses common HW/SW BP define
   - add a-b-tag
   - use u32 for control regs
v5
   - revert to have arch specific KVM_GUESTDBG_USE_SW/HW_BP
   - rm stale comments dbgctrl was stored as u64
---
 arch/arm64/include/uapi/asm/kvm.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index d268320..43758e7 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -100,12 +100,32 @@ struct kvm_sregs {
 struct kvm_fpu {
 };
 
+/*
+ * See v8 ARM ARM D7.3: Debug Registers
+ *
+ * The architectural limit is 16 debug registers of each type although
+ * in practice there are usually less (see ID_AA64DFR0_EL1).
+ */
+#define KVM_ARM_MAX_DBG_REGS 16
 struct kvm_guest_debug_arch {
+   __u32 dbg_bcr[KVM_ARM_MAX_DBG_REGS];
+   __u64 dbg_bvr[KVM_ARM_MAX_DBG_REGS];
+   __u32 dbg_wcr[KVM_ARM_MAX_DBG_REGS];
+   __u64 dbg_wvr[KVM_ARM_MAX_DBG_REGS];
 };
 
 struct kvm_debug_exit_arch {
+   __u32 hsr;
+   __u64 far;
 };
 
+/*
+ * Architecture specific defines for kvm_guest_debug-control
+ */
+
+#define KVM_GUESTDBG_USE_SW_BP (1  16)
+#define KVM_GUESTDBG_USE_HW_BP (1  17)
+
 struct kvm_sync_regs {
 };
 
-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 06/12] KVM: arm64: guest debug, add SW break point support

2015-05-29 Thread Alex Bennée
This adds support for SW breakpoints inserted by userspace.

We do this by trapping all guest software debug exceptions to the
hypervisor (MDCR_EL2.TDE). The exit handler sets an exit reason of
KVM_EXIT_DEBUG with the kvm_debug_exit_arch structure holding the
exception syndrome information.

It will be up to userspace to extract the PC (via GET_ONE_REG) and
determine if the debug event was for a breakpoint it inserted. If not,
userspace will need to re-inject the correct exception and restart the
hypervisor to deliver the debug exception to the guest.

Any other guest software debug exception (e.g. single step or HW
assisted breakpoints) will cause an error and the VM to be killed. This
is addressed by later patches which add support for the other debug
types.

Signed-off-by: Alex Bennée alex.ben...@linaro.org
Reviewed-by: Christoffer Dall christoffer.d...@linaro.org

---
v2
  - update to use new exit struct
  - tweak for C setup
  - do our setup in debug_setup/clear code
  - fixed up comments
v3:
  - fix spacing in KVM_GUESTDBG_VALID_MASK
  - fix and clarify wording on kvm_handle_guest_debug
  - handle error case in kvm_handle_guest_debug
  - re-word the commit message
v4
  - rm else leg
  - add r-b-tag
---
 Documentation/virtual/kvm/api.txt |  2 +-
 arch/arm/kvm/arm.c|  2 +-
 arch/arm64/kvm/debug.c|  3 +++
 arch/arm64/kvm/handle_exit.c  | 36 
 4 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index ba635c7..33c8143 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2667,7 +2667,7 @@ when running. Common control bits are:
 The top 16 bits of the control field are architecture specific control
 flags which can include the following:
 
-  - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86]
+  - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64]
   - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390]
   - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86]
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 4a274e1..064c105 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -302,7 +302,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
kvm_arm_set_running_vcpu(NULL);
 }
 
-#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE)
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)
 
 /**
  * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index faf0e1f..8d1bfa4 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -73,6 +73,9 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
if (trap_debug)
vcpu-arch.mdcr_el2 |= MDCR_EL2_TDA;
 
+   /* Trap breakpoints? */
+   if (vcpu-guest_debug  KVM_GUESTDBG_USE_SW_BP)
+   vcpu-arch.mdcr_el2 |= MDCR_EL2_TDE;
 }
 
 void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 524fa25..27f38a9 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -82,6 +82,40 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
return 1;
 }
 
+/**
+ * kvm_handle_guest_debug - handle a debug exception instruction
+ *
+ * @vcpu:  the vcpu pointer
+ * @run:   access to the kvm_run structure for results
+ *
+ * We route all debug exceptions through the same handler. If both the
+ * guest and host are using the same debug facilities it will be up to
+ * userspace to re-inject the correct exception for guest delivery.
+ *
+ * @return: 0 (while setting run-exit_reason), -1 for error
+ */
+static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+   u32 hsr = kvm_vcpu_get_hsr(vcpu);
+   int ret = 0;
+
+   run-exit_reason = KVM_EXIT_DEBUG;
+   run-debug.arch.hsr = hsr;
+
+   switch (hsr  ESR_ELx_EC_SHIFT) {
+   case ESR_ELx_EC_BKPT32:
+   case ESR_ELx_EC_BRK64:
+   break;
+   default:
+   kvm_err(%s: un-handled case hsr: %#08x\n,
+   __func__, (unsigned int) hsr);
+   ret = -1;
+   break;
+   }
+
+   return ret;
+}
+
 static exit_handle_fn arm_exit_handlers[] = {
[ESR_ELx_EC_WFx]= kvm_handle_wfx,
[ESR_ELx_EC_CP15_32]= kvm_handle_cp15_32,
@@ -96,6 +130,8 @@ static exit_handle_fn arm_exit_handlers[] = {
[ESR_ELx_EC_SYS64]  = kvm_handle_sys_reg,
[ESR_ELx_EC_IABT_LOW]   = kvm_handle_guest_abort,
[ESR_ELx_EC_DABT_LOW]   = kvm_handle_guest_abort,
+   [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug,
+   [ESR_ELx_EC_BRK64]  = kvm_handle_guest_debug,
 };
 
 static exit_handle_fn kvm_get_exit_handler(struct 

[PATCH v5 02/12] KVM: arm64: fix misleading comments in save/restore

2015-05-29 Thread Alex Bennée
The elr_el2 and spsr_el2 registers in fact contain the processor state
before entry into the hypervisor code. In the case of guest state it
could be in either el0 or el1.

Signed-off-by: Alex Bennée alex.ben...@linaro.org
---
 arch/arm64/kvm/hyp.S | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 5befd01..cb9bdd8 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -50,8 +50,8 @@
stp x29, lr, [x3, #80]
 
mrs x19, sp_el0
-   mrs x20, elr_el2// EL1 PC
-   mrs x21, spsr_el2   // EL1 pstate
+   mrs x20, elr_el2// PC before hyp entry
+   mrs x21, spsr_el2   // pstate before hyp entry
 
stp x19, x20, [x3, #96]
str x21, [x3, #112]
@@ -82,8 +82,8 @@
ldr x21, [x3, #16]
 
msr sp_el0, x19
-   msr elr_el2, x20// EL1 PC
-   msr spsr_el2, x21   // EL1 pstate
+   msr elr_el2, x20// PC to restore
+   msr spsr_el2, x21   // pstate to restore
 
add x3, x2, #CPU_XREG_OFFSET(19)
ldp x19, x20, [x3]
-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 08/12] KVM: arm64: re-factor hyp.S debug register code

2015-05-29 Thread Alex Bennée
This is a pre-cursor to sharing the code with the guest debug support.
This replaces the big macro that fishes data out of a fixed location
with a more general helper macro to restore a set of debug registers. It
uses macro substitution so it can be re-used for debug control and value
registers. It does however rely on the debug registers being 64 bit
aligned (as they happen to be in the hyp ABI).

Signed-off-by: Alex Bennée alex.ben...@linaro.org

---
v3:
  - return to the patch series
  - add save and restore targets
  - change register use and document
v4:
  - keep original setup/restore names
  - don't use split u32/u64 structure yet
---
 arch/arm64/kvm/hyp.S | 519 ++-
 1 file changed, 140 insertions(+), 379 deletions(-)

diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 74e63d8..9c4897d 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -228,199 +228,52 @@
stp x24, x25, [x3, #160]
 .endm
 
-.macro save_debug
-   // x2: base address for cpu context
-   // x3: tmp register
-
-   mrs x26, id_aa64dfr0_el1
-   ubfxx24, x26, #12, #4   // Extract BRPs
-   ubfxx25, x26, #20, #4   // Extract WRPs
-   mov w26, #15
-   sub w24, w26, w24   // How many BPs to skip
-   sub w25, w26, w25   // How many WPs to skip
-
-   add x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
-
-   adr x26, 1f
-   add x26, x26, x24, lsl #2
-   br  x26
-1:
-   mrs x20, dbgbcr15_el1
-   mrs x19, dbgbcr14_el1
-   mrs x18, dbgbcr13_el1
-   mrs x17, dbgbcr12_el1
-   mrs x16, dbgbcr11_el1
-   mrs x15, dbgbcr10_el1
-   mrs x14, dbgbcr9_el1
-   mrs x13, dbgbcr8_el1
-   mrs x12, dbgbcr7_el1
-   mrs x11, dbgbcr6_el1
-   mrs x10, dbgbcr5_el1
-   mrs x9, dbgbcr4_el1
-   mrs x8, dbgbcr3_el1
-   mrs x7, dbgbcr2_el1
-   mrs x6, dbgbcr1_el1
-   mrs x5, dbgbcr0_el1
-
-   adr x26, 1f
-   add x26, x26, x24, lsl #2
-   br  x26
-
-1:
-   str x20, [x3, #(15 * 8)]
-   str x19, [x3, #(14 * 8)]
-   str x18, [x3, #(13 * 8)]
-   str x17, [x3, #(12 * 8)]
-   str x16, [x3, #(11 * 8)]
-   str x15, [x3, #(10 * 8)]
-   str x14, [x3, #(9 * 8)]
-   str x13, [x3, #(8 * 8)]
-   str x12, [x3, #(7 * 8)]
-   str x11, [x3, #(6 * 8)]
-   str x10, [x3, #(5 * 8)]
-   str x9, [x3, #(4 * 8)]
-   str x8, [x3, #(3 * 8)]
-   str x7, [x3, #(2 * 8)]
-   str x6, [x3, #(1 * 8)]
-   str x5, [x3, #(0 * 8)]
-
-   add x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
-
-   adr x26, 1f
-   add x26, x26, x24, lsl #2
-   br  x26
-1:
-   mrs x20, dbgbvr15_el1
-   mrs x19, dbgbvr14_el1
-   mrs x18, dbgbvr13_el1
-   mrs x17, dbgbvr12_el1
-   mrs x16, dbgbvr11_el1
-   mrs x15, dbgbvr10_el1
-   mrs x14, dbgbvr9_el1
-   mrs x13, dbgbvr8_el1
-   mrs x12, dbgbvr7_el1
-   mrs x11, dbgbvr6_el1
-   mrs x10, dbgbvr5_el1
-   mrs x9, dbgbvr4_el1
-   mrs x8, dbgbvr3_el1
-   mrs x7, dbgbvr2_el1
-   mrs x6, dbgbvr1_el1
-   mrs x5, dbgbvr0_el1
-
-   adr x26, 1f
-   add x26, x26, x24, lsl #2
-   br  x26
-
-1:
-   str x20, [x3, #(15 * 8)]
-   str x19, [x3, #(14 * 8)]
-   str x18, [x3, #(13 * 8)]
-   str x17, [x3, #(12 * 8)]
-   str x16, [x3, #(11 * 8)]
-   str x15, [x3, #(10 * 8)]
-   str x14, [x3, #(9 * 8)]
-   str x13, [x3, #(8 * 8)]
-   str x12, [x3, #(7 * 8)]
-   str x11, [x3, #(6 * 8)]
-   str x10, [x3, #(5 * 8)]
-   str x9, [x3, #(4 * 8)]
-   str x8, [x3, #(3 * 8)]
-   str x7, [x3, #(2 * 8)]
-   str x6, [x3, #(1 * 8)]
-   str x5, [x3, #(0 * 8)]
-
-   add x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
-
-   adr x26, 1f
-   add x26, x26, x25, lsl #2
-   br  x26
+.macro save_debug type
+   // x4: pointer to register set
+   // x5: number of registers to skip
+   // x6..x22 trashed
+
+   adr x22, 1f
+   add x22, x22, x5, lsl #2
+   br  x22
 1:
-   mrs x20, dbgwcr15_el1
-   mrs x19, dbgwcr14_el1
-   mrs x18, dbgwcr13_el1
-   mrs x17, dbgwcr12_el1
-   mrs x16, dbgwcr11_el1
-   mrs x15, dbgwcr10_el1
-   mrs x14, dbgwcr9_el1
-   mrs x13, dbgwcr8_el1
-   mrs x12, dbgwcr7_el1
-   mrs x11, dbgwcr6_el1
-   mrs x10, dbgwcr5_el1
-   mrs x9, dbgwcr4_el1
-   mrs x8, dbgwcr3_el1
-   mrs x7, dbgwcr2_el1
-   mrs x6, dbgwcr1_el1
-   mrs x5, dbgwcr0_el1
-
-   adr x26, 1f
-   

[PATCH v5 04/12] KVM: arm: guest debug, add stub KVM_SET_GUEST_DEBUG ioctl

2015-05-29 Thread Alex Bennée
This commit adds a stub function to support the KVM_SET_GUEST_DEBUG
ioctl. Any unsupported flag will return -EINVAL. For now, only
KVM_GUESTDBG_ENABLE is supported, although it won't have any effects.

Signed-off-by: Alex Bennée alex.ben...@linaro.org.
Reviewed-by: Christoffer Dall christoffer.d...@linaro.org

---
v2
  - simplified form of the ioctl (stuff will go into setup_debug)
v3
 - KVM_GUESTDBG_VALID -> KVM_GUESTDBG_VALID_MASK
 - move mask check to the top of function
 - add ioctl doc header
 - split capability into separate patch
 - tweaked commit wording w.r.t return of -EINVAL
v4
 - add r-b-tag
---
 Documentation/virtual/kvm/api.txt |  2 +-
 arch/arm/kvm/arm.c| 23 ++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index c34c32d..ba635c7 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2645,7 +2645,7 @@ handled.
 4.87 KVM_SET_GUEST_DEBUG
 
 Capability: KVM_CAP_SET_GUEST_DEBUG
-Architectures: x86, s390, ppc
+Architectures: x86, s390, ppc, arm64
 Type: vcpu ioctl
 Parameters: struct kvm_guest_debug (in)
 Returns: 0 on success; -1 on error
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index d9631ec..52a1d4d38 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -302,10 +302,31 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
kvm_arm_set_running_vcpu(NULL);
 }
 
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE)
+
+/**
+ * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
+ * @kvm:   pointer to the KVM struct
+ * @kvm_guest_debug: the ioctl data buffer
+ *
+ * This sets up and enables the VM for guest debugging. Userspace
+ * passes in a control flag to enable different debug types and
+ * potentially other architecture specific information in the rest of
+ * the structure.
+ */
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
 {
-   return -EINVAL;
+   if (dbg-control  ~KVM_GUESTDBG_VALID_MASK)
+   return -EINVAL;
+
+   if (dbg-control  KVM_GUESTDBG_ENABLE) {
+   vcpu-guest_debug = dbg-control;
+   } else {
+   /* If not enabled clear all flags */
+   vcpu-guest_debug = 0;
+   }
+   return 0;
 }
 
 
-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 07/12] KVM: arm64: guest debug, add support for single-step

2015-05-29 Thread Alex Bennée
This adds support for single-stepping the guest. To do this we need to
manipulate the guest's PSTATE.SS and MDSCR_EL1.SS bits which we do in the
kvm_arm_setup/clear_debug() so we don't affect the apparent state of the
guest. Additionally while the host is debugging the guest we suppress
the ability of the guest to single-step itself.

Signed-off-by: Alex Bennée alex.ben...@linaro.org

---
v2
  - Move pstate/mdscr manipulation into C
  - don't export guest_debug to assembly
  - add accessor for saved_debug regs
  - tweak save/restore of mdscr_el1
v3
  - don't save PC in debug information struct
  - rename debug_saved_regs -> guest_debug_state
  - save whole value, only use bits in restore
  - add save/restore_guest-debug_regs helper functions
  - simplify commit message for clarity
  - rm vcpu_debug_saved_reg access fn
v4
  - added more comments based on suggestions
  - guest_debug_state -> guest_debug_preserved
  - no point masking restore, we will trap out
v5
  - more comments
  - don't bother preserving pstate.ss
---
 arch/arm/kvm/arm.c|  4 ++-
 arch/arm64/include/asm/kvm_host.h | 11 
 arch/arm64/kvm/debug.c| 58 ---
 arch/arm64/kvm/handle_exit.c  |  2 ++
 4 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 064c105..9b3ed6d 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -302,7 +302,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
kvm_arm_set_running_vcpu(NULL);
 }
 
-#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE |\
+   KVM_GUESTDBG_USE_SW_BP | \
+   KVM_GUESTDBG_SINGLESTEP)
 
 /**
  * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 7cb99b5..e2db6a6 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -123,6 +123,17 @@ struct kvm_vcpu_arch {
 * here.
 */
 
+   /*
+* Guest registers we preserve during guest debugging.
+*
+* These shadow registers are updated by the kvm_handle_sys_reg
+* trap handler if the guest accesses or updates them while we
+* are using guest debug.
+*/
+   struct {
+   u32 mdscr_el1;
+   } guest_debug_preserved;
+
/* Don't run the guest */
bool pause;
 
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 8d1bfa4..10a6baa 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -19,11 +19,41 @@
 
 #include linux/kvm_host.h
 
+#include asm/debug-monitors.h
+#include asm/kvm_asm.h
 #include asm/kvm_arm.h
+#include asm/kvm_emulate.h
+
+/* These are the bits of MDSCR_EL1 we may manipulate */
+#define MDSCR_EL1_DEBUG_MASK   (DBG_MDSCR_SS | \
+   DBG_MDSCR_KDE | \
+   DBG_MDSCR_MDE)
 
 static DEFINE_PER_CPU(u32, mdcr_el2);
 
 /**
+ * save/restore_guest_debug_regs
+ *
+ * For some debug operations we need to tweak some guest registers. As
+ * a result we need to save the state of those registers before we
+ * make those modifications. This does get confused if the guest
+ * attempts to control single step while being debugged. It will start
+ * working again once it is no longer being debugged by the host.
+ *
+ * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled
+ * after we have restored the preserved value to the main context.
+ */
+static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+   vcpu-arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, 
MDSCR_EL1);
+}
+
+static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+   vcpu_sys_reg(vcpu, MDSCR_EL1) = 
vcpu-arch.guest_debug_preserved.mdscr_el1;
+}
+
+/**
  * kvm_arm_init_debug - grab what we need for debug
  *
  * Currently the sole task of this function is to retrieve the initial
@@ -38,7 +68,6 @@ void kvm_arm_init_debug(void)
__this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2));
 }
 
-
 /**
  * kvm_arm_setup_debug - set up debug related stuff
  *
@@ -73,12 +102,33 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
if (trap_debug)
vcpu-arch.mdcr_el2 |= MDCR_EL2_TDA;
 
-   /* Trap breakpoints? */
-   if (vcpu-guest_debug  KVM_GUESTDBG_USE_SW_BP)
+   /* Is Guest debugging in effect? */
+   if (vcpu-guest_debug) {
vcpu-arch.mdcr_el2 |= MDCR_EL2_TDE;
+
+   /* Save guest debug state */
+   save_guest_debug_regs(vcpu);
+
+   /*
+* Single Step (ARM ARM D2.12.3 The software step state
+* machine)
+*
+* If we are doing Single Step we need to manipulate
+* MDSCR_EL1.SS and PSTATE.SS. If not we need to
+

[PATCH] virtio: fix fsync() on a directory

2015-05-29 Thread Russell King
dpkg in the guest fails when it tries to use fsync() on a directory:

openat(AT_FDCWD, /var/lib/dpkg, 
O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY|O_CLOEXEC) = 4
fsync(4)= -1 EINVAL (Invalid argument)

stracing lkvm shows that this is converted to:

openat(AT_FDCWD, /root/rootfs-32//var/lib/dpkg, 
O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 368
fsync(0)= -1 EINVAL (Invalid argument)

In other words, we sync against the wrong file descriptor.  This case
is not handled in the kvmtool code, let's add support for it.

Signed-off-by: Russell King r...@arm.linux.org.uk
---
 virtio/9p.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/virtio/9p.c b/virtio/9p.c
index 2c120fa..5f93e41 100644
--- a/virtio/9p.c
+++ b/virtio/9p.c
@@ -886,17 +886,22 @@ err_out:
 static void virtio_p9_fsync(struct p9_dev *p9dev,
struct p9_pdu *pdu, u32 *outlen)
 {
-   int ret;
+   int ret, fd;
struct p9_fid *fid;
u32 fid_val, datasync;
 
virtio_p9_pdu_readf(pdu, dd, fid_val, datasync);
fid = get_fid(p9dev, fid_val);
 
+   if (fid-dir)
+   fd = dirfd(fid-dir);
+   else
+   fd = fid-fd;
+
if (datasync)
-   ret = fdatasync(fid-fd);
+   ret = fdatasync(fd);
else
-   ret = fsync(fid-fd);
+   ret = fsync(fd);
if (ret  0)
goto err_out;
*outlen = pdu-write_offset;
-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V1 4/5] kvm: arm64: Implement ACPI probing code for GICv2

2015-05-29 Thread Andrew Jones
On Thu, May 28, 2015 at 01:34:33AM -0400, Wei Huang wrote:
 This patch enables ACPI support for KVM virtual GICv2. KVM parses
 ACPI table for virt GIC related information and initializes resources.
 
 Signed-off-by: Alexander Spyridaki a.spyrida...@virtualopensystems.com
 Signed-off-by: Wei Huang w...@redhat.com
 ---
  virt/kvm/arm/vgic-v2.c | 49 -
  1 file changed, 48 insertions(+), 1 deletion(-)
 
 diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
 index 711de82..01ce8a3 100644
 --- a/virt/kvm/arm/vgic-v2.c
 +++ b/virt/kvm/arm/vgic-v2.c
 @@ -264,6 +264,53 @@ int vgic_v2_acpi_probe(struct 
 acpi_madt_generic_interrupt *vgic_acpi,
  const struct vgic_ops **ops,
  const struct vgic_params **params)
  {
 - return -EINVAL;
 + struct vgic_params *vgic = vgic_v2_params;
 + int irq_mode, ret;
 +
 + /* IRQ trigger mode */
 + irq_mode = (vgic_acpi-flags  ACPI_MADT_VGIC_IRQ_MODE) ?
 + ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
 + vgic-maint_irq = acpi_register_gsi(NULL, vgic_acpi-vgic_interrupt,
 + irq_mode, ACPI_ACTIVE_HIGH);
 + if (!vgic-maint_irq) {
 + kvm_err(Cannot register VGIC ACPI maintenance irq\n);
 + ret = -ENXIO;
 + goto out;
 + }
 +
 + /* GICH resource */
 + vgic-vctrl_base = ioremap(vgic_acpi-gich_base_address, SZ_8K);
 + if (!vgic-vctrl_base) {
 + kvm_err(cannot ioremap GICH memory\n);
 + ret = -ENOMEM;
 + goto out;
 + }
 +
 + vgic-nr_lr = readl_relaxed(vgic-vctrl_base + GICH_VTR);
 + vgic-nr_lr = (vgic-nr_lr  0x3f) + 1;
 +
 + ret = create_hyp_io_mappings(vgic-vctrl_base,
 +  vgic-vctrl_base + SZ_8K,
 +  vgic_acpi-gich_base_address);
 + if (ret) {
 + kvm_err(Cannot map GICH into hyp\n);
 + goto out;
 + }
 +
 + vgic-vcpu_base = vgic_acpi-gicv_base_address;
 + vgic-can_emulate_gicv2 = true;
 + kvm_register_device_ops(kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
 +
 + kvm_info(GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n,
 +  (unsigned long long)vgic_acpi-gich_base_address,
 +  (unsigned long long)vgic_acpi-gicv_base_address,
 +  vgic-maint_irq);
 +
 + vgic-type = VGIC_V2;

we're missing max_gic_vcpus here

vgic-max_gic_vcpus = VGIC_V2_MAX_CPUS;

 + *ops = vgic_v2_ops;
 + *params = vgic;
 +
 +out:
 + return ret;
  }
  #endif /* CONFIG_ACPI */
 -- 
 1.8.3.1
 
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: arm: vgic: Drop useless Group0 warning

2015-05-29 Thread Marc Zyngier
If a GICv3-enabled guest tries to configure Group0, we print a
warning on the console (because we don't support Group0 interrupts).

This is fairly pointless, and would allow a guest to spam the
console. Let's just drop the warning.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 virt/kvm/arm/vgic-v3-emul.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
index e9c3a7a..53e6a6d 100644
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -76,8 +76,6 @@ static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
vgic_reg_access(mmio, reg, offset,
ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
if (mmio-is_write) {
-   if (reg  GICD_CTLR_ENABLE_SS_G0)
-   kvm_info(guest tried to enable unsupported Group0 
interrupts\n);
vcpu-kvm-arch.vgic.enabled = !!(reg  GICD_CTLR_ENABLE_SS_G1);
vgic_update_state(vcpu-kvm);
return true;
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 2/6] target-arm: kvm64: introduce kvm_arm_init_debug()

2015-05-29 Thread Alex Bennée
As we haven't always had guest debug support we need to probe for it.
Additionally we don't do this in the start-up capability code so we
don't fall over on old kernels.

Signed-off-by: Alex Bennée alex.ben...@linaro.org
---
 target-arm/kvm64.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/target-arm/kvm64.c b/target-arm/kvm64.c
index 93c1ca8..61592d2 100644
--- a/target-arm/kvm64.c
+++ b/target-arm/kvm64.c
@@ -25,6 +25,22 @@
 #include internals.h
 #include hw/arm/arm.h
 
+static bool have_guest_debug;
+
+/**
+ * kvm_arm_init_debug()
+ * @cs: CPUState
+ *
+ * Check for guest debug capabilities.
+ *
+ */
+static void kvm_arm_init_debug(CPUState *cs)
+{
+have_guest_debug = kvm_check_extension(cs-kvm_state,
+   KVM_CAP_SET_GUEST_DEBUG);
+return;
+}
+
 static inline void set_feature(uint64_t *features, int feature)
 {
 *features |= 1ULL  feature;
@@ -107,6 +123,8 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
+kvm_arm_init_debug(cs);
+
 return kvm_arm_init_cpreg_list(cpu);
 }
 
-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 4/6] target-arm: kvm - support for single step

2015-05-29 Thread Alex Bennée
This adds support for single-step. There isn't much to do on the QEMU
side as after we set-up the request for single step via the debug ioctl
it is all handled within the kernel.

Signed-off-by: Alex Bennée alex.ben...@linaro.org

---
v2
  - convert to using HSR_EC
v3
  - use internals.h definitions
---
 target-arm/kvm.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index c3bad6f..de2865a 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -528,6 +528,13 @@ static int kvm_handle_debug(CPUState *cs, struct kvm_run 
*run)
 kvm_cpu_synchronize_state(cs);
 
 switch (hsr_ec) {
+case EC_SOFTWARESTEP:
+if (cs-singlestep_enabled) {
+return true;
+} else {
+error_report(Came out of SINGLE STEP when not enabled);
+}
+break;
 case EC_AA64_BKPT:
 if (kvm_find_sw_breakpoint(cs, env-pc)) {
 return true;
@@ -588,6 +595,9 @@ int kvm_arch_on_sigbus(int code, void *addr)
 
 void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
 {
+if (cs-singlestep_enabled) {
+dbg-control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
+}
 if (kvm_sw_breakpoints_active(cs)) {
 dbg-control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
 }
-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 0/6] QEMU support for KVM Guest Debug on arm64

2015-05-29 Thread Alex Bennée
Hi,

You may be wondering what happened to v3 and v4. They do exist but
they didn't change much from the original patches as I've been
mostly looking at the kernel side of the equation. So in summary the
changes are:

  - updates to the kernel ABI
  - don't fall over on kernels without debug support
  - better logging, syncing and use of internals.h
  - debug exception re-injection for guest events*

More detailed changelogs are attached to each patch.

* see
  https://lists.cs.columbia.edu/pipermail/kvmarm/2015-May/014807.html

GIT Repos:

The patch series is based off a recent master and can be found at:

https://github.com/stsquad/qemu
branch: kvm/guest-debug-v5

The kernel patches for this series are based off a v4.1-rc5-v5 and can be
found at:

https://git.linaro.org/people/alex.bennee/linux.git
branch: guest-debug/4.1-rc5-v5

Alex Bennée (6):
  linux-headers: sync from my kernel tree (DEV)
  target-arm: kvm64: introduce kvm_arm_init_debug()
  target-arm: kvm - implement software breakpoints
  target-arm: kvm - support for single step
  target-arm: kvm - add support for HW assisted debug
  target-arm: kvm - re-inject guest debug exceptions

 include/standard-headers/linux/virtio_balloon.h |  28 ++-
 include/standard-headers/linux/virtio_blk.h |   8 +-
 include/standard-headers/linux/virtio_ids.h |   1 +
 include/standard-headers/linux/virtio_input.h   |  76 ++
 include/standard-headers/linux/virtio_ring.h|   2 +-
 linux-headers/asm-arm/kvm.h |   9 +-
 linux-headers/asm-arm64/kvm.h   |  29 ++-
 linux-headers/asm-mips/kvm.h| 164 +++-
 linux-headers/asm-s390/kvm.h|   4 +
 linux-headers/asm-x86/hyperv.h  |   2 +
 linux-headers/linux/kvm.h   |  71 +-
 linux-headers/linux/vfio.h  |   2 +
 target-arm/cpu.h|   1 +
 target-arm/helper-a64.c |  17 +-
 target-arm/internals.h  |   1 +
 target-arm/kvm.c| 137 --
 target-arm/kvm64.c  | 318 
 target-arm/kvm_arm.h|  21 ++
 18 files changed, 790 insertions(+), 101 deletions(-)
 create mode 100644 include/standard-headers/linux/virtio_input.h

-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 1/6] linux-headers: sync from my kernel tree (DEV)

2015-05-29 Thread Alex Bennée
I assume I'll properly merge the KVM Headers direct from Linux when
the kernel side is upstream. These headers came from:

https://git.linaro.org/people/alex.bennee/linux.git/shortlog/refs/heads/guest-debug/4.1-rc5-v5

Signed-off-by: Alex Bennée alex.ben...@linaro.org

---
v2
  - update ABI to include ->far
v3
  - update with latest 4.1-rc1 headers
v4
  - update to v4 headers
v5
  - update to v5 headers
---
 include/standard-headers/linux/virtio_balloon.h |  28 +++-
 include/standard-headers/linux/virtio_blk.h |   8 +-
 include/standard-headers/linux/virtio_ids.h |   1 +
 include/standard-headers/linux/virtio_input.h   |  76 +++
 include/standard-headers/linux/virtio_ring.h|   2 +-
 linux-headers/asm-arm/kvm.h |   9 +-
 linux-headers/asm-arm64/kvm.h   |  29 -
 linux-headers/asm-mips/kvm.h| 164 +++-
 linux-headers/asm-s390/kvm.h|   4 +
 linux-headers/asm-x86/hyperv.h  |   2 +
 linux-headers/linux/kvm.h   |  71 +-
 linux-headers/linux/vfio.h  |   2 +
 12 files changed, 325 insertions(+), 71 deletions(-)
 create mode 100644 include/standard-headers/linux/virtio_input.h

diff --git a/include/standard-headers/linux/virtio_balloon.h 
b/include/standard-headers/linux/virtio_balloon.h
index 799376d..88ada1d 100644
--- a/include/standard-headers/linux/virtio_balloon.h
+++ b/include/standard-headers/linux/virtio_balloon.h
@@ -25,6 +25,7 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE. */
+#include standard-headers/linux/types.h
 #include standard-headers/linux/virtio_ids.h
 #include standard-headers/linux/virtio_config.h
 
@@ -51,9 +52,32 @@ struct virtio_balloon_config {
 #define VIRTIO_BALLOON_S_MEMTOT   5   /* Total amount of memory */
 #define VIRTIO_BALLOON_S_NR   6
 
+/*
+ * Memory statistics structure.
+ * Driver fills an array of these structures and passes to device.
+ *
+ * NOTE: fields are laid out in a way that would make compiler add padding
+ * between and after fields, so we have to use compiler-specific attributes to
+ * pack it, to disable this padding. This also often causes compiler to
+ * generate suboptimal code.
+ *
+ * We maintain this statistics structure format for backwards compatibility,
+ * but don't follow this example.
+ *
+ * If implementing a similar structure, do something like the below instead:
+ * struct virtio_balloon_stat {
+ * __virtio16 tag;
+ * uint8_t reserved[6];
+ * __virtio64 val;
+ * };
+ *
+ * In other words, add explicit reserved fields to align field and
+ * structure boundaries at field size, avoiding compiler padding
+ * without the packed attribute.
+ */
 struct virtio_balloon_stat {
-   uint16_t tag;
-   uint64_t val;
+   __virtio16 tag;
+   __virtio64 val;
 } QEMU_PACKED;
 
 #endif /* _LINUX_VIRTIO_BALLOON_H */
diff --git a/include/standard-headers/linux/virtio_blk.h 
b/include/standard-headers/linux/virtio_blk.h
index 12016b4..cd601f4 100644
--- a/include/standard-headers/linux/virtio_blk.h
+++ b/include/standard-headers/linux/virtio_blk.h
@@ -58,7 +58,7 @@ struct virtio_blk_config {
uint32_t size_max;
/* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */
uint32_t seg_max;
-   /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */
+   /* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */
struct virtio_blk_geometry {
uint16_t cylinders;
uint8_t heads;
@@ -117,7 +117,11 @@ struct virtio_blk_config {
 #define VIRTIO_BLK_T_BARRIER   0x8000
 #endif /* !VIRTIO_BLK_NO_LEGACY */
 
-/* This is the first element of the read scatter-gather list. */
+/*
+ * This comes first in the read scatter-gather list.
+ * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated,
+ * this is the first element of the read scatter-gather list.
+ */
 struct virtio_blk_outhdr {
/* VIRTIO_BLK_T* */
__virtio32 type;
diff --git a/include/standard-headers/linux/virtio_ids.h 
b/include/standard-headers/linux/virtio_ids.h
index 284fc3a..5f60aa4 100644
--- a/include/standard-headers/linux/virtio_ids.h
+++ b/include/standard-headers/linux/virtio_ids.h
@@ -39,5 +39,6 @@
 #define VIRTIO_ID_9P   9 /* 9p virtio console */
 #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
 #define VIRTIO_ID_CAIF12 /* Virtio caif */
+#define VIRTIO_ID_INPUT18 /* virtio input */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/standard-headers/linux/virtio_input.h 
b/include/standard-headers/linux/virtio_input.h
new file mode 100644
index 000..a98a797
--- /dev/null
+++ b/include/standard-headers/linux/virtio_input.h
@@ -0,0 +1,76 @@
+#ifndef _LINUX_VIRTIO_INPUT_H
+#define _LINUX_VIRTIO_INPUT_H
+/* This header 

[PATCH v5 6/6] target-arm: kvm - re-inject guest debug exceptions

2015-05-29 Thread Alex Bennée
From: Alex Bennée a...@bennee.com

If we can't find details for the debug exception in our debug state
then we can assume the exception is due to debugging inside the guest.
To inject the exception into the guest state we re-use the TCG exception
code (do_interrupt).

However while guest debugging is in effect we currently can't handle the
guest using single step which is heavily used by GDB.

Signed-off-by: Alex Bennée alex.ben...@linaro.org

---
v5:
  - new for v5
---
 target-arm/cpu.h|  1 +
 target-arm/helper-a64.c | 17 ++---
 target-arm/internals.h  |  1 +
 target-arm/kvm.c| 30 ++
 4 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 083211c..95ae3a8 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -56,6 +56,7 @@
 #define EXCP_SMC13   /* Secure Monitor Call */
 #define EXCP_VIRQ   14
 #define EXCP_VFIQ   15
+#define EXCP_WAPT   16
 
 #define ARMV7M_EXCP_RESET   1
 #define ARMV7M_EXCP_NMI 2
diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 861f6fa..32bd27d 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -25,6 +25,7 @@
 #include qemu/bitops.h
 #include internals.h
 #include qemu/crc32c.h
+#include sysemu/kvm.h
 #include zlib.h /* For crc32 */
 
 /* C2.4.7 Multiply and divide */
@@ -478,10 +479,13 @@ void aarch64_cpu_do_interrupt(CPUState *cs)
 }
 
 arm_log_exception(cs-exception_index);
-qemu_log_mask(CPU_LOG_INT, ...from EL%d\n, arm_current_el(env));
+qemu_log_mask(CPU_LOG_INT, ...from EL%d PC 0x% PRIx64 \n,
+  arm_current_el(env), env-pc);
+
 if (qemu_loglevel_mask(CPU_LOG_INT)
  !excp_is_internal(cs-exception_index)) {
-qemu_log_mask(CPU_LOG_INT, ...with ESR 0x% PRIx32 \n,
+qemu_log_mask(CPU_LOG_INT, ...with ESR %x/0x% PRIx32 \n,
+  env-exception.syndrome  ARM_EL_EC_SHIFT,
   env-exception.syndrome);
 }
 
@@ -494,6 +498,7 @@ void aarch64_cpu_do_interrupt(CPUState *cs)
 switch (cs-exception_index) {
 case EXCP_PREFETCH_ABORT:
 case EXCP_DATA_ABORT:
+case EXCP_WAPT:
 env-cp15.far_el[new_el] = env-exception.vaddress;
 qemu_log_mask(CPU_LOG_INT, ...with FAR 0x% PRIx64 \n,
   env-cp15.far_el[new_el]);
@@ -539,6 +544,12 @@ void aarch64_cpu_do_interrupt(CPUState *cs)
 aarch64_restore_sp(env, new_el);
 
 env-pc = addr;
-cs-interrupt_request |= CPU_INTERRUPT_EXITTB;
+
+qemu_log_mask(CPU_LOG_INT, ...to EL%d PC 0x% PRIx64  PSTATE 0x%x\n,
+  new_el, env-pc, pstate_read(env));
+
+if (!kvm_enabled()) {
+cs-interrupt_request |= CPU_INTERRUPT_EXITTB;
+}
 }
 #endif
diff --git a/target-arm/internals.h b/target-arm/internals.h
index 2cc3017..10e8999 100644
--- a/target-arm/internals.h
+++ b/target-arm/internals.h
@@ -58,6 +58,7 @@ static const char * const excnames[] = {
 [EXCP_SMC] = Secure Monitor Call,
 [EXCP_VIRQ] = Virtual IRQ,
 [EXCP_VFIQ] = Virtual FIQ,
+[EXCP_WAPT] = Watchpoint,
 };
 
 static inline void arm_log_exception(int idx)
diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index e1fccdd..6f608d8 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -523,9 +523,11 @@ static int kvm_handle_debug(CPUState *cs, struct kvm_run 
*run)
 struct kvm_debug_exit_arch *arch_info = run-debug.arch;
 int hsr_ec = arch_info-hsr  ARM_EL_EC_SHIFT;
 ARMCPU *cpu = ARM_CPU(cs);
+CPUClass *cc = CPU_GET_CLASS(cs);
 CPUARMState *env = cpu-env;
+int forward_excp = EXCP_BKPT;
 
-/* Ensure PC is synchronised */
+/* Ensure all state is synchronised */
 kvm_cpu_synchronize_state(cs);
 
 switch (hsr_ec) {
@@ -533,7 +535,14 @@ static int kvm_handle_debug(CPUState *cs, struct kvm_run 
*run)
 if (cs-singlestep_enabled) {
 return true;
 } else {
-error_report(Came out of SINGLE STEP when not enabled);
+/*
+ * The kernel should have supressed the guests ability to
+ * single step at this point so something has gone wrong.
+ */
+error_report(%s: guest single-step while debugging unsupported
+  (%PRIx64, %PRIx32)\n,
+ __func__, env-pc, arch_info-hsr);
+return false;
 }
 break;
 case EC_AA64_BKPT:
@@ -549,19 +558,24 @@ static int kvm_handle_debug(CPUState *cs, struct kvm_run 
*run)
 case EC_WATCHPOINT:
 if (kvm_arm_find_hw_watchpoint(cs, arch_info-far)) {
 return true;
+} else {
+forward_excp = EXCP_WAPT;
 }
 break;
 default:
 error_report(%s: unhandled debug exit (%PRIx32, %PRIx64)\n,
  __func__, arch_info-hsr, env-pc);
+return false;
 }
 
-/* If we don't handle this it could be it really is for the
-   

[PATCH v5 5/6] target-arm: kvm - add support for HW assisted debug

2015-05-29 Thread Alex Bennée
This adds basic support for HW assisted debug. The ioctl interface to
KVM allows us to pass an implementation defined number of break and
watch point registers. When KVM_GUESTDBG_USE_HW_BP is specified these
debug registers will be installed in place on the world switch into the
guest.

The hardware is actually capable of more advanced matching but it is
unclear if this expressiveness is available via the gdbstub protocol.

Signed-off-by: Alex Bennée alex.ben...@linaro.org

---
v2
  - correct setting of PMC/BAS/MASK
  - improved commentary
  - added helper function to check watchpoint in range
  - fix find/deletion of watchpoints
v3
  - use internals.h definitions
---
 target-arm/kvm.c |  35 +++---
 target-arm/kvm64.c   | 304 ++-
 target-arm/kvm_arm.h |  21 
 3 files changed, 338 insertions(+), 22 deletions(-)

diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index de2865a..e1fccdd 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -17,6 +17,7 @@
 
 #include qemu-common.h
 #include qemu/timer.h
+#include qemu/error-report.h
 #include sysemu/sysemu.h
 #include sysemu/kvm.h
 #include kvm_arm.h
@@ -540,6 +541,16 @@ static int kvm_handle_debug(CPUState *cs, struct kvm_run 
*run)
 return true;
 }
 break;
+case EC_BREAKPOINT:
+if (kvm_arm_find_hw_breakpoint(cs, env-pc)) {
+return true;
+}
+break;
+case EC_WATCHPOINT:
+if (kvm_arm_find_hw_watchpoint(cs, arch_info-far)) {
+return true;
+}
+break;
 default:
 error_report(%s: unhandled debug exit (%PRIx32, %PRIx64)\n,
  __func__, arch_info-hsr, env-pc);
@@ -601,6 +612,10 @@ void kvm_arch_update_guest_debug(CPUState *cs, struct 
kvm_guest_debug *dbg)
 if (kvm_sw_breakpoints_active(cs)) {
 dbg-control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
 }
+if (kvm_hw_breakpoints_active(cs)) {
+dbg-control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+kvm_copy_hw_breakpoint_data(dbg-arch);
+}
 }
 
 /* C6.6.29 BRK instruction */
@@ -627,26 +642,6 @@ int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct 
kvm_sw_breakpoint *bp)
 return 0;
 }
 
-int kvm_arch_insert_hw_breakpoint(target_ulong addr,
-  target_ulong len, int type)
-{
-qemu_log_mask(LOG_UNIMP, %s: not implemented\n, __func__);
-return -EINVAL;
-}
-
-int kvm_arch_remove_hw_breakpoint(target_ulong addr,
-  target_ulong len, int type)
-{
-qemu_log_mask(LOG_UNIMP, %s: not implemented\n, __func__);
-return -EINVAL;
-}
-
-
-void kvm_arch_remove_all_hw_breakpoints(void)
-{
-qemu_log_mask(LOG_UNIMP, %s: not implemented\n, __func__);
-}
-
 void kvm_arch_init_irq_routing(KVMState *s)
 {
 }
diff --git a/target-arm/kvm64.c b/target-arm/kvm64.c
index 61592d2..06d4e1e 100644
--- a/target-arm/kvm64.c
+++ b/target-arm/kvm64.c
@@ -2,6 +2,7 @@
  * ARM implementation of KVM hooks, 64 bit specific code
  *
  * Copyright Mian-M. Hamayun 2013, Virtual Open Systems
+ * Copyright Alex Bennée 2014, Linaro
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
@@ -12,12 +13,18 @@
 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/ptrace.h>
+#include <asm/ptrace.h>
 
+#include <linux/elf.h>
 #include <linux/kvm.h>
 
 #include "config-host.h"
 #include "qemu-common.h"
 #include "qemu/timer.h"
+#include <qemu/host-utils.h>
+#include "qemu/error-report.h"
+#include "exec/gdbstub.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/kvm.h"
 #include "kvm_arm.h"
@@ -26,21 +33,314 @@
 #include hw/arm/arm.h
 
 static bool have_guest_debug;
+/* Max and current break/watch point counts */
+int max_hw_bp, max_hw_wp;
+int cur_hw_bp, cur_hw_wp;
+struct kvm_guest_debug_arch guest_debug_registers;
 
 /**
- * kvm_arm_init_debug()
+ * kvm_arm_init_debug() - check for guest debug capabilities
  * @cs: CPUState
  *
- * Check for guest debug capabilities.
+ * kvm_check_extension returns 0 if we have no debug registers or the
+ * number we have.
  *
  */
 static void kvm_arm_init_debug(CPUState *cs)
 {
    have_guest_debug = kvm_check_extension(cs->kvm_state,
                                           KVM_CAP_SET_GUEST_DEBUG);
+max_hw_wp = kvm_check_extension(cs->kvm_state, KVM_CAP_GUEST_DEBUG_HW_WPS);
+max_hw_bp = kvm_check_extension(cs->kvm_state, KVM_CAP_GUEST_DEBUG_HW_BPS);
 return;
 }
 
+/**
+ * insert_hw_breakpoint()
+ * @addr: address of breakpoint
+ *
+ * See ARM ARM D2.9.1 for details but here we are only going to create
+ * simple un-linked breakpoints (i.e. we don't chain breakpoints
+ * together to match address and context or vmid). The hardware is
+ * capable of fancier matching but that will require exposing that
+ * fanciness to GDB's interface
+ *
+ * D7.3.2 DBGBCRn_EL1, Debug Breakpoint Control Registers
+ *
+ *  31  24 23  

[PATCH v5 3/6] target-arm: kvm - implement software breakpoints

2015-05-29 Thread Alex Bennée
These don't involve messing around with debug registers, just setting
the breakpoint instruction in memory. GDB will not use this mechanism if
it can't access the memory to write the breakpoint.

All the kernel has to do is ensure the hypervisor traps the breakpoint
exceptions and returns to userspace.

Signed-off-by: Alex Bennée alex.ben...@linaro.org

--
v2
  - handle debug exit with new hsr exception info
  - add verbosity to UNIMP message
v3
  - sync with kvm_cpu_synchronize_state() before checking PC.
  - use internals.h defines
  - use env-pc
  - use proper format types
---
 target-arm/kvm.c | 88 
 1 file changed, 76 insertions(+), 12 deletions(-)

diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index fdd9ba3..c3bad6f 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -510,9 +510,60 @@ void kvm_arch_post_run(CPUState *cs, struct kvm_run *run)
 {
 }
 
+/* See v8 ARM ARM D7.2.27 ESR_ELx, Exception Syndrome Register
+ *
+ * To minimise translating between kernel and user-space the kernel
+ * ABI just provides user-space with the full exception syndrome
+ * register value to be decoded in QEMU.
+ */
+
+static int kvm_handle_debug(CPUState *cs, struct kvm_run *run)
+{
+struct kvm_debug_exit_arch *arch_info = run-debug.arch;
+int hsr_ec = arch_info->hsr >> ARM_EL_EC_SHIFT;
+ARMCPU *cpu = ARM_CPU(cs);
+CPUARMState *env = cpu-env;
+
+/* Ensure PC is synchronised */
+kvm_cpu_synchronize_state(cs);
+
+switch (hsr_ec) {
+case EC_AA64_BKPT:
+if (kvm_find_sw_breakpoint(cs, env->pc)) {
+return true;
+}
+break;
+default:
+error_report("%s: unhandled debug exit (0x%"PRIx32", %"PRIx64")\n",
+ __func__, arch_info->hsr, env->pc);
+}
+
+/* If we don't handle this it could be it really is for the
+   guest to handle */
+qemu_log_mask(LOG_UNIMP,
+  "%s: re-injecting exception not yet implemented"
+   " (0x%"PRIx32", %"PRIx64")\n",
+  __func__, hsr_ec, env->pc);
+
+return false;
+}
+
 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
 {
-return 0;
+int ret = 0;
+
+switch (run-exit_reason) {
+case KVM_EXIT_DEBUG:
+if (kvm_handle_debug(cs, run)) {
+ret = EXCP_DEBUG;
+} /* otherwise return to guest */
+break;
+default:
+qemu_log_mask(LOG_UNIMP, %s: un-handled exit reason %d\n,
+  __func__, run-exit_reason);
+break;
+}
+return ret;
 }
 
 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
@@ -537,14 +588,33 @@ int kvm_arch_on_sigbus(int code, void *addr)
 
 void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
 {
-qemu_log_mask(LOG_UNIMP, %s: not implemented\n, __func__);
+if (kvm_sw_breakpoints_active(cs)) {
+dbg-control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
+}
 }
 
-int kvm_arch_insert_sw_breakpoint(CPUState *cs,
-  struct kvm_sw_breakpoint *bp)
+/* C6.6.29 BRK instruction */
+int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
 {
-qemu_log_mask(LOG_UNIMP, %s: not implemented\n, __func__);
-return -EINVAL;
+static const uint32_t brk = 0xd420;
+
+if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 0) ||
+cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk, 4, 1)) {
+return -EINVAL;
+}
+return 0;
+}
+
+int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
+{
+static uint32_t brk;
+
+if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk, 4, 0) ||
+brk != 0xd420 ||
+cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 1)) {
+return -EINVAL;
+}
+return 0;
 }
 
 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
@@ -561,12 +631,6 @@ int kvm_arch_remove_hw_breakpoint(target_ulong addr,
 return -EINVAL;
 }
 
-int kvm_arch_remove_sw_breakpoint(CPUState *cs,
-  struct kvm_sw_breakpoint *bp)
-{
-qemu_log_mask(LOG_UNIMP, %s: not implemented\n, __func__);
-return -EINVAL;
-}
 
 void kvm_arch_remove_all_hw_breakpoints(void)
 {
-- 
2.4.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 04/13] KVM: x86: API changes for SMM support

2015-05-29 Thread Radim Krčmář
2015-05-27 19:05+0200, Paolo Bonzini:
 This patch includes changes to the external API for SMM support.
 All the changes are predicated by the availability of a new
 capability, KVM_CAP_X86_SMM, which is added at the end of the
 patch series.
 
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 @@ -309,7 +312,13 @@ struct kvm_vcpu_events {
 - __u32 reserved[10];
 + struct {
 + __u8 smm;
 + __u8 pending;
 + __u8 smm_inside_nmi;

(The 'u8 flags' discussion made me see that we just lost 21 bits here,
 but it seems worth the sacrifice.)

 + __u8 pad;
 + } smi;
 + __u32 reserved[9];
 @@ -3237,6 +3250,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct 
 kvm_vcpu *vcpu,
 + if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
 + if (events->smi.smm)
 + vcpu->arch.hflags |= HF_SMM_MASK;
 + else
 + vcpu->arch.hflags &= ~HF_SMM_MASK;
 + vcpu->arch.smi_pending = events->smi.pending;
 + if (events->smi.smm_inside_nmi)
 + vcpu->arch.hflags |= ~HF_SMM_INSIDE_NMI_MASK;

~~~ ~~ ~ HF_SMM_INSIDE_NMI_MASK.

 + else
 + vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;

(Looks like a use case for a new helper,
   __assign_bit(HF_SMM_INSIDE_NMI_MASK,
vcpu-arch.hflags,
events-smi.smm_inside_nmi);)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] arm/arm64: KVM: Properly account for guest CPU time

2015-05-29 Thread Mario Smarduch
On 05/28/2015 11:49 AM, Christoffer Dall wrote:
 Until now we have been calling kvm_guest_exit after re-enabling
 interrupts when we come back from the guest, but this has the
 unfortunate effect that CPU time accounting done in the context of timer
 interrupts occurring while the guest is running doesn't properly notice
 that the time since the last tick was spent in the guest.
 
 Inspired by the comment in the x86 code, move the kvm_guest_exit() call
 below the local_irq_enable() call and change __kvm_guest_exit() to
 kvm_guest_exit(), because we are now calling this function with
 interrupts enabled.  We have to now explicitly disable preemption and
 not enable preemption before we've called kvm_guest_exit(), since
 otherwise we could be preempted and everything happening before we
 eventually get scheduled again would be accounted for as guest time.
 
 At the same time, move the trace_kvm_exit() call outside of the atomic
 section, since there is no reason for us to do that with interrupts
 disabled.
 
 Signed-off-by: Christoffer Dall christoffer.d...@linaro.org
 ---
 This patch is based on kvm/queue, because it has the kvm_guest_enter/exit
 rework recently posted by Christian Borntraeger.  I hope I got the logic
 of this right, there were 2 slightly worrying facts about this:
 
 First, we now enable and disable and enable interrupts on each exit
 path, but I couldn't see any performance overhead on hackbench - yes the
 only benchmark we care about.
 
 Second, looking at the ppc and mips code, they seem to also call
 kvm_guest_exit() before enabling interrupts, so I don't understand how
 guest CPU time accounting works on those architectures.
 
 Changes since v1:
  - Tweak comment and commit text based on Marc's feedback.
  - Explicitly disable preemption and enable it only after kvm_guest_exit().
 
  arch/arm/kvm/arm.c | 21 +
  1 file changed, 17 insertions(+), 4 deletions(-)
 
 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
 index e41cb11..fe8028d 100644
 --- a/arch/arm/kvm/arm.c
 +++ b/arch/arm/kvm/arm.c
 @@ -532,6 +532,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
 kvm_run *run)
   kvm_vgic_flush_hwstate(vcpu);
   kvm_timer_flush_hwstate(vcpu);
  
 + preempt_disable();
   local_irq_disable();
  
   /*
 @@ -544,6 +545,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
 kvm_run *run)
  
   if (ret = 0 || need_new_vmid_gen(vcpu-kvm)) {
   local_irq_enable();
 + preempt_enable();
   kvm_timer_sync_hwstate(vcpu);
   kvm_vgic_sync_hwstate(vcpu);
   continue;
 @@ -559,8 +561,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, 
 struct kvm_run *run)
   ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
  
   vcpu-mode = OUTSIDE_GUEST_MODE;
 - __kvm_guest_exit();
 - trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 + /*
 +  * Back from guest
 +  */
 +
   /*
* We may have taken a host interrupt in HYP mode (ie
* while executing the guest). This interrupt is still
 @@ -574,8 +578,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, 
 struct kvm_run *run)
   local_irq_enable();
  
   /*
 -  * Back from guest
 -  */
 +  * We do local_irq_enable() before calling kvm_guest_exit() so
 +  * that if a timer interrupt hits while running the guest we
 +  * account that tick as being spent in the guest.  We enable
 +  * preemption after calling kvm_guest_exit() so that if we get
 +  * preempted we make sure ticks after that is not counted as
 +  * guest time.
 +  */
 + kvm_guest_exit();
 + trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 + preempt_enable();
 +
  
   kvm_timer_sync_hwstate(vcpu);
   kvm_vgic_sync_hwstate(vcpu);
 

Hi Christoffer,
 so currently we take a snap shot when we enter the guest
(tsk-vtime_snap) and upon exit add the time we spent in
the guest and update accrued time, which appears correct.

With this patch it appears that interrupts running
in host mode are accrued to Guest time, and additional preemption
latency is added.

Thanks,
- Mario

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/13] SMM implementation for KVM

2015-05-29 Thread Paolo Bonzini


On 29/05/2015 21:03, Radim Krčmář wrote:
 I found a corner case that doesn't fit any specific patch:
 
 We allow INIT while in SMM.  This brings some security complications as
 we also don't reset hflags (another long standing bug?), but we don't
 really need to because INIT in SMM is against the spec anyway;
 APM May 2013 2:10.3.3 Exceptions and Interrupts,
   • INIT—The processor does not recognize INIT while in SMM.
 
 SDM April 2015: 34.5.1  Initial SMM Execution Environment,
   Maskable hardware interrupts, exceptions, NMI interrupts, SMI
   interrupts, A20M interrupts, single-step traps, breakpoint traps, and
   INIT operations are inhibited when the processor enters SMM.
 
 And there is no mention of an exception for INIT.
 (Some extra old experiments say that INIT could be enabled, but that is
  not applicable now,  http://www.rcollins.org/ddj/Mar97/Mar97.html)
 
 I think that INIT received before RSM should be delivered after.

Yes.  I'll look at it next week and post a 14th patch.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 07/13] KVM: add vcpu-specific functions to read/write/translate GFNs

2015-05-29 Thread Radim Krčmář
2015-05-27 19:05+0200, Paolo Bonzini:
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 @@ -1616,6 +1727,27 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const 
 void *data,
|  int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
|   unsigned long len)
|  {
|   gfn_t gfn = gpa >> PAGE_SHIFT;
|   int seg;
|   int offset = offset_in_page(gpa);
|   int ret;
|  
|   while ((seg = next_segment(len, offset)) != 0) {
|   ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
|   if (ret < 0)
|   return ret;
|   offset = 0;
|   len -= seg;
|   data += seg;
|   ++gfn;
|   }
|   return 0;
|  }
  
 +int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 +  unsigned long len)
 +{
 + gfn_t gfn = gpa >> PAGE_SHIFT;
 + int seg;
 + int offset = offset_in_page(gpa);
 + int ret;
 +
 + while ((seg = next_segment(len, offset)) != 0) {
 + ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
 + if (ret < 0)
 + return ret;
 + offset = 0;
 + len -= seg;
 + data += seg;
 + ++gfn;
 + }
 + return 0;
 +}

(There is no need to pass vcpu, and kvm, in this API.
 Extracting memslots early will help to keep more code common.

 I have patches that did a superset of this for the old API, so posting
 them after this series is finalized will be simple.)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/13] SMM implementation for KVM

2015-05-29 Thread Radim Krčmář
I found a corner case that doesn't fit any specific patch:

We allow INIT while in SMM.  This brings some security complications as
we also don't reset hflags (another long standing bug?), but we don't
really need to because INIT in SMM is against the spec anyway;
APM May 2013 2:10.3.3 Exceptions and Interrupts,
  • INIT—The processor does not recognize INIT while in SMM.

SDM April 2015: 34.5.1  Initial SMM Execution Environment,
  Maskable hardware interrupts, exceptions, NMI interrupts, SMI
  interrupts, A20M interrupts, single-step traps, breakpoint traps, and
  INIT operations are inhibited when the processor enters SMM.

And there is no mention of an exception for INIT.
(Some extra old experiments say that INIT could be enabled, but that is
 not applicable now,  http://www.rcollins.org/ddj/Mar97/Mar97.html)

I think that INIT received before RSM should be delivered after.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Remove visible dependency files

2015-05-29 Thread Russell King
After building, there is a lot of clutter from the dependency system.
Let's clean this up by using dir/.file.d style dependencies, similar
to those used in the Linux kernel.

In order to support this, rearrange the dependency generation to
create the dependency files as we build rather than as a separate
step, and have make clean remove them.

Signed-off-by: Russell King r...@arm.linux.org.uk
---
 Makefile | 32 +---
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/Makefile b/Makefile
index 4bd86ee..c07fd8e 100644
--- a/Makefile
+++ b/Makefile
@@ -288,7 +288,13 @@ LIBS   += -lpthread
 LIBS   += -lutil
 
 
-DEPS   := $(patsubst %.o,%.d,$(OBJS))
+comma = ,
+
+# The dependency file for the current target
+depfile = $(subst $(comma),_,$(dir $@).$(notdir $@).d)
+
+DEPS   := $(foreach obj,$(OBJS),\
+   $(subst $(comma),_,$(dir $(obj)).$(notdir $(obj)).d))
 
 DEFINES+= -D_FILE_OFFSET_BITS=64
 DEFINES+= -D_GNU_SOURCE
@@ -327,6 +333,10 @@ all: arch_support_check $(PROGRAM) $(PROGRAM_ALIAS) 
$(GUEST_INIT)
 arch_support_check:
$(UNSUPP_ERR)
 
+# CFLAGS used when building objects
+# This is intentionally not assigned using :=
+c_flags= -Wp,-MD,$(depfile) $(CFLAGS)
+
 # When building -static all objects are built with appropriate flags, which
 # may differ between static  dynamic .o.  The objects are separated into
 # .o and .static.o.  See the %.o: %.c rules below.
@@ -336,11 +346,11 @@ arch_support_check:
 STATIC_OBJS = $(patsubst %.o,%.static.o,$(OBJS) $(OBJS_STATOPT))
 GUEST_OBJS = guest/guest_init.o
 
-$(PROGRAM)-static:  $(DEPS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_INIT)
+$(PROGRAM)-static:  $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_INIT)
$(E)   LINK $@
$(Q) $(CC) -static $(CFLAGS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_OBJS) 
$(LIBS) $(LIBS_STATOPT) -o $@
 
-$(PROGRAM): $(DEPS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_INIT)
+$(PROGRAM): $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_INIT)
$(E)   LINK $@
$(Q) $(CC) $(CFLAGS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_OBJS) 
$(LIBS) $(LIBS_DYNOPT) -o $@
 
@@ -353,19 +363,11 @@ $(GUEST_INIT): guest/init.c
$(Q) $(CC) -static guest/init.c -o $@
$(Q) $(LD) $(LDFLAGS) -r -b binary -o guest/guest_init.o $(GUEST_INIT)
 
-$(DEPS):
-
-util/rbtree.d: util/rbtree.c
-   $(Q) $(CC) -M -MT util/rbtree.o $(CFLAGS) $ -o $@
-
-%.d: %.c
-   $(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $ -o $@
-
 %.s: %.c
$(Q) $(CC) -o $@ -S $(CFLAGS) -fverbose-asm $
 
 # The header file common-cmds.h is needed for compilation of builtin-help.c.
-builtin-help.d: $(KVM_INCLUDE)/common-cmds.h
+builtin-help.o: $(KVM_INCLUDE)/common-cmds.h
 
 $(OBJS):
 
@@ -375,7 +377,7 @@ ifeq ($(C),1)
$(Q) $(CHECK) -c $(CFLAGS) $ -o $@
 endif
$(E)   CC   $@
-   $(Q) $(CC) -c $(CFLAGS) $< -o $@
+   $(Q) $(CC) -c $(c_flags) $< -o $@
 
 %.static.o: %.c
 ifeq ($(C),1)
@@ -383,7 +385,7 @@ ifeq ($(C),1)
$(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_STATOPT) $ -o $@
 endif
$(E)   CC   $@
-   $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_STATOPT)  $ -o $@
+   $(Q) $(CC) -c $(c_flags) $(CFLAGS_STATOPT)  $ -o $@
 
 %.o: %.c
 ifeq ($(C),1)
@@ -391,7 +393,7 @@ ifeq ($(C),1)
$(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_DYNOPT) $ -o $@
 endif
$(E)   CC   $@
-   $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_DYNOPT) $ -o $@
+   $(Q) $(CC) -c $(c_flags) $(CFLAGS_DYNOPT) $ -o $@
 
 
 $(KVM_INCLUDE)/common-cmds.h: util/generate-cmdlist.sh command-list.txt
-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V1 4/5] kvm: arm64: Implement ACPI probing code for GICv2

2015-05-29 Thread Wei Huang


On 05/29/2015 09:06 AM, Andrew Jones wrote:
 On Thu, May 28, 2015 at 01:34:33AM -0400, Wei Huang wrote:
 This patches enables ACPI support for KVM virtual GICv2. KVM parses
 ACPI table for virt GIC related information and initializes resources.

 Signed-off-by: Alexander Spyridaki a.spyrida...@virtualopensystems.com
 Signed-off-by: Wei Huang w...@redhat.com
 ---
  virt/kvm/arm/vgic-v2.c | 49 
 -
  1 file changed, 48 insertions(+), 1 deletion(-)

 diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
 index 711de82..01ce8a3 100644
 --- a/virt/kvm/arm/vgic-v2.c
 +++ b/virt/kvm/arm/vgic-v2.c
 @@ -264,6 +264,53 @@ int vgic_v2_acpi_probe(struct 
 acpi_madt_generic_interrupt *vgic_acpi,
 const struct vgic_ops **ops,
 const struct vgic_params **params)
  {
 -return -EINVAL;
 +struct vgic_params *vgic = vgic_v2_params;
 +int irq_mode, ret;
 +
 +/* IRQ trigger mode */
 +irq_mode = (vgic_acpi->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
 +ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
 +vgic-maint_irq = acpi_register_gsi(NULL, vgic_acpi-vgic_interrupt,
 +irq_mode, ACPI_ACTIVE_HIGH);
 +if (!vgic-maint_irq) {
 +kvm_err(Cannot register VGIC ACPI maintenance irq\n);
 +ret = -ENXIO;
 +goto out;
 +}
 +
 +/* GICH resource */
 +vgic-vctrl_base = ioremap(vgic_acpi-gich_base_address, SZ_8K);
 +if (!vgic-vctrl_base) {
 +kvm_err(cannot ioremap GICH memory\n);
 +ret = -ENOMEM;
 +goto out;
 +}
 +
 +vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR);
 +vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1;
 +
 +ret = create_hyp_io_mappings(vgic-vctrl_base,
 + vgic-vctrl_base + SZ_8K,
 + vgic_acpi-gich_base_address);
 +if (ret) {
 +kvm_err(Cannot map GICH into hyp\n);
 +goto out;
 +}
 +
 +vgic-vcpu_base = vgic_acpi-gicv_base_address;
 +vgic-can_emulate_gicv2 = true;
 +kvm_register_device_ops(kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
 +
 +kvm_info(GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n,
 + (unsigned long long)vgic_acpi-gich_base_address,
 + (unsigned long long)vgic_acpi-gicv_base_address,
 + vgic-maint_irq);
 +
 +vgic-type = VGIC_V2;
 
 we're missing max_gic_vcpus here
 
   vgic-max_gic_vcpus = VGIC_V2_MAX_CPUS;

Yes. Will fix in the next spin.

-Wei

 
 +*ops = vgic_v2_ops;
 +*params = vgic;
 +
 +out:
 +return ret;
  }
  #endif /* CONFIG_ACPI */
 -- 
 1.8.3.1

 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html