Re: [kernel, v12, 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-06-10 Thread Michael Ellerman
On Fri, 2015-05-06 at 06:35:09 UTC, Alexey Kardashevskiy wrote:
 So far one TCE table could only be used by one IOMMU group. However
 IODA2 hardware allows programming the same TCE table address to
 multiple PE allowing sharing tables.

...

 + pnv_pci_link_table_and_group(phb-hose-node, 0, tbl, pe-table_group);
 + pnv_pci_link_table_and_group(phb-hose-node, 0, tbl, pe-table_group);
 + pnv_pci_link_table_and_group(phb-hose-node, 0,
 + tbl, phb-p5ioc2.table_group);

 +long pnv_pci_link_table_and_group(int node, int num,
 + struct iommu_table *tbl,
 + struct iommu_table_group *table_group)
 +{
 + struct iommu_table_group_link *tgl = NULL;
 +
 + BUG_ON(!tbl);
 + BUG_ON(!table_group);
 + BUG_ON(!table_group-group);
 +
 + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
 + node);
 + if (!tgl)
 + return -ENOMEM;
 +
 + tgl-table_group = table_group;
 + list_add_rcu(tgl-next, tbl-it_group_list);
 +
 + table_group-tables[num] = tbl;
 +
 + return 0;

I'm not a fan of the BUG_ONs here.

This routine is so important that you have to BUG_ON three times at the start,
yet you never check the return code if it fails? That doesn't make sense to me.

If anything this should be sufficient:

if (WARN_ON(!tbl || !table_group))
return -EINVAL;

cheers
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v12 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-06-09 Thread Michael Ellerman
On Fri, 2015-06-05 at 16:35 +1000, Alexey Kardashevskiy wrote:
 So far one TCE table could only be used by one IOMMU group. However
 IODA2 hardware allows programming the same TCE table address to
 multiple PE allowing sharing tables.

...

 diff --git a/arch/powerpc/platforms/powernv/pci.c 
 b/arch/powerpc/platforms/powernv/pci.c
 index 84b4ea4..4b4c583 100644
 --- a/arch/powerpc/platforms/powernv/pci.c
 +++ b/arch/powerpc/platforms/powernv/pci.c
 @@ -606,6 +606,82 @@ unsigned long pnv_tce_get(struct iommu_table *tbl, long 
 index)
   return ((u64 *)tbl-it_base)[index - tbl-it_offset];
  }
  
 +struct iommu_table *pnv_pci_table_alloc(int nid)
 +{
 + struct iommu_table *tbl;
 +
 + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
 + INIT_LIST_HEAD_RCU(tbl-it_group_list);
 +
 + return tbl;
 +}
 +
 +long pnv_pci_link_table_and_group(int node, int num,
 + struct iommu_table *tbl,
 + struct iommu_table_group *table_group)
 +{
 + struct iommu_table_group_link *tgl = NULL;
 +
 + BUG_ON(!tbl);
 + BUG_ON(!table_group);
 + BUG_ON(!table_group-group);


On p84 (Tuleta), my next + this series, with pseries_le_defconfig:

pci 0001:08 : [PE# 002] Assign DMA32 space
pci 0001:08 : [PE# 002] Setting up 32-bit TCE table at 0..8000
IOMMU table initialized, virtual merging enabled
pci 0001:08 : [PE# 002] Setting up window#0 0..7fff pg=1000
[ cut here ]
kernel BUG at arch/powerpc/platforms/powernv/pci.c:666!
Oops: Exception in kernel mode, sig: 5 [#1]
SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in:
CPU: 3 PID: 1 Comm: swapper/0 Not tainted 4.1.0-rc3-13721-g4c61caf #83
task: c01ff430 ti: c02ff6084000 task.ti: c02ff6084000
NIP: c0067a04 LR: c006b49c CTR: 3003e060
REGS: c02ff6087690 TRAP: 0700   Not tainted  (4.1.0-rc3-13721-g4c61caf)
MSR: 900100029033 SF,HV,EE,ME,IR,DR,RI,LE  CR: 2822  XER: 2000
CFAR: c006b498 SOFTE: 1 
GPR00: c006b49c c02ff6087910 c0d7cea0  
GPR04:  c00fef7a c03fffb2c6d8  
GPR08:  0001  90011003 
GPR12: c005d428 c1dc0d80 c0ca40f8 c03fffb48580 
GPR16: c0adb4c0 c0adb308 c038ca80 c03fffb2c6a0 
GPR20: 0007 c0ae31b8 c09136f8 0008 
GPR24: 0001 c03fffb48850  c00fef7a 
GPR28: c03fffb38580 c00fef7a c03fffb2c6d8  
NIP [c0067a04] pnv_pci_link_table_and_group+0x54/0xe0
LR [c006b49c] pnv_pci_ioda_fixup+0x6bc/0xe30
Call Trace:
[c02ff6087910] [c02ff6087988] 0xc02ff6087988 (unreliable)
[c02ff6087950] [c006b49c] pnv_pci_ioda_fixup+0x6bc/0xe30
[c02ff6087ae0] [c0bef224] pcibios_resource_survey+0x2b4/0x300
[c02ff6087bb0] [c0beeb6c] pcibios_init+0xa8/0xdc
[c02ff6087c30] [c000b3b0] do_one_initcall+0xd0/0x250
[c02ff6087d00] [c0be422c] kernel_init_freeable+0x25c/0x33c
[c02ff6087dc0] [c000bcf4] kernel_init+0x24/0x130
[c02ff6087e30] [c000956c] ret_from_kernel_thread+0x5c/0x70
Instruction dump:
7c9f2378 7cde3378 7cbd2b78 f8010010 f821ffc1 0b09 7cc90074 7929d182 
0b09 e9260018 7d290074 7929d182 0b09 6000 3880 e92294d0 
---[ end trace bfd126f01f6f6bfe ]---



Full log below:

opal: OPAL V3 detected !
Crash kernel location must be 0x200
Reserving 1024MB of memory at 32MB for crashkernel (System RAM: 262144MB)
Allocated 2359296 bytes for 2048 pacas at c1dc
Using PowerNV machine description
Page sizes from device-tree:
base_shift=12: shift=12, sllp=0x, avpnm=0x, tlbiel=1, penc=0
base_shift=12: shift=16, sllp=0x, avpnm=0x, tlbiel=1, penc=7
base_shift=12: shift=24, sllp=0x, avpnm=0x, tlbiel=1, penc=56
base_shift=16: shift=16, sllp=0x0110, avpnm=0x, tlbiel=1, penc=1
base_shift=16: shift=24, sllp=0x0110, avpnm=0x, tlbiel=1, penc=8
base_shift=24: shift=24, sllp=0x0100, avpnm=0x0001, tlbiel=0, penc=0
base_shift=34: shift=34, sllp=0x0120, avpnm=0x07ff, tlbiel=0, penc=3
Page orders: linear mapping = 24, virtual = 16, io = 16, vmemmap = 24
Using 1TB segments
cma: Reserved 13120 MiB at 0x003cac00
bootconsole [udbg0] enabled
CPU maps initialized for 8 threads per core
 (thread shift is 3)
Freed 2162688 bytes for unused pacas
 - smp_release_cpus()
spinning_secondaries = 127
 - smp_release_cpus()
Starting Linux ppc64le #83 SMP Tue Jun 9 15:52:08 AEST 2015
-
ppc64_pft_size= 0x0
phys_mem_size = 0x40
cpu_features  = 0x17fc7aed18500249
  possible= 0x1fef18500649
  always  = 0x18100040
cpu_user_features = 0xdc0065c7 0xee00
mmu_features  = 0x7c03

Re: [PATCH kernel v12 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-06-08 Thread David Gibson
On Fri, Jun 05, 2015 at 04:35:09PM +1000, Alexey Kardashevskiy wrote:
 So far one TCE table could only be used by one IOMMU group. However
 IODA2 hardware allows programming the same TCE table address to
 multiple PE allowing sharing tables.
 
 This replaces a single pointer to a group in a iommu_table struct
 with a linked list of groups which provides the way of invalidating
 TCE cache for every PE when an actual TCE table is updated. This adds 
 pnv_pci_link_table_and_group() and pnv_pci_unlink_table_and_group() helpers 
 to manage the list. However without VFIO, it is still going
 to be a single IOMMU group per iommu_table.
 
 This changes iommu_add_device() to add a device to a first group
 from the group list of a table as it is only called from the platform
 init code or PCI bus notifier and at these moments there is only
 one group per table.
 
 This does not change TCE invalidation code to loop through all
 attached groups in order to simplify this patch and because
 it is not really needed in most cases. IODA2 is fixed in a later
 patch.
 
 This should cause no behavioural change.
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 [aw: for the vfio related changes]
 Acked-by: Alex Williamson alex.william...@redhat.com
 Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com

Reviewed-by: David Gibson da...@gibson.dropbear.id.au

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgpjvfoOCwTjT.pgp
Description: PGP signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH kernel v12 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-06-05 Thread Alexey Kardashevskiy
So far one TCE table could only be used by one IOMMU group. However
IODA2 hardware allows programming the same TCE table address to
multiple PE allowing sharing tables.

This replaces a single pointer to a group in a iommu_table struct
with a linked list of groups which provides the way of invalidating
TCE cache for every PE when an actual TCE table is updated. This adds 
pnv_pci_link_table_and_group() and pnv_pci_unlink_table_and_group() helpers to 
manage the list. However without VFIO, it is still going
to be a single IOMMU group per iommu_table.

This changes iommu_add_device() to add a device to a first group
from the group list of a table as it is only called from the platform
init code or PCI bus notifier and at these moments there is only
one group per table.

This does not change TCE invalidation code to loop through all
attached groups in order to simplify this patch and because
it is not really needed in most cases. IODA2 is fixed in a later
patch.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v12:
* fixed iommu_add_device() to check what list_first_entry_or_null()
returned
* changed commit log
* removed loops from iommu_pseries_free_group as it does not support
tables sharing anyway

v10:
* iommu_table is not embedded into iommu_table_group but allocated
dynamically
* iommu_table allocation is moved to a single place for IODA2's
pnv_pci_ioda_setup_dma_pe where it belongs to
* added list of groups into iommu_table; most of the code just looks at
the first item to keep the patch simpler

v9:
* s/it_group/it_table_group/
* added and used iommu_table_group_free(), from now iommu_free_table()
is only used for VIO
* added iommu_pseries_group_alloc()
* squashed powerpc/iommu: Introduce iommu_table_alloc() helper into this
---
 arch/powerpc/include/asm/iommu.h|   8 +-
 arch/powerpc/kernel/iommu.c |  14 +++-
 arch/powerpc/platforms/powernv/pci-ioda.c   |  45 ++
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |   3 +
 arch/powerpc/platforms/powernv/pci.c|  76 +
 arch/powerpc/platforms/powernv/pci.h|   7 ++
 arch/powerpc/platforms/pseries/iommu.c  |  25 +-
 drivers/vfio/vfio_iommu_spapr_tce.c | 122 
 8 files changed, 240 insertions(+), 60 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5a7267f..44a20cc 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -91,7 +91,7 @@ struct iommu_table {
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
unsigned long  it_page_shift;/* table iommu page size */
-   struct iommu_table_group *it_table_group;
+   struct list_head it_group_list;/* List of iommu_table_group_link */
struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
@@ -126,6 +126,12 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
int nid);
 #define IOMMU_TABLE_GROUP_MAX_TABLES   1
 
+struct iommu_table_group_link {
+   struct list_head next;
+   struct rcu_head rcu;
+   struct iommu_table_group *table_group;
+};
+
 struct iommu_table_group {
struct iommu_group *group;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 719f048..be258b2 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1078,6 +1078,7 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership);
 int iommu_add_device(struct device *dev)
 {
struct iommu_table *tbl;
+   struct iommu_table_group_link *tgl;
 
/*
 * The sysfs entries should be populated before
@@ -1095,15 +1096,22 @@ int iommu_add_device(struct device *dev)
}
 
tbl = get_iommu_table_base(dev);
-   if (!tbl || !tbl-it_table_group || !tbl-it_table_group-group) {
+   if (!tbl) {
pr_debug(%s: Skipping device %s with no tbl\n,
 __func__, dev_name(dev));
return 0;
}
 
+   tgl = list_first_entry_or_null(tbl-it_group_list,
+   struct iommu_table_group_link, next);
+   if (!tgl) {
+   pr_debug(%s: Skipping device %s with no group\n,
+__func__, dev_name(dev));
+   return 0;
+   }
pr_debug(%s: Adding %s to iommu group %d\n,
 __func__, dev_name(dev),
-iommu_group_id(tbl-it_table_group-group));
+iommu_group_id(tgl-table_group-group));
 
if (PAGE_SIZE  IOMMU_PAGE_SIZE(tbl)) {
pr_err(%s: Invalid