Re: [kernel, v12, 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
On Fri, 2015-05-06 at 06:35:09 UTC, Alexey Kardashevskiy wrote: So far one TCE table could only be used by one IOMMU group. However IODA2 hardware allows programming the same TCE table address to multiple PE allowing sharing tables. ... + pnv_pci_link_table_and_group(phb-hose-node, 0, tbl, pe-table_group); + pnv_pci_link_table_and_group(phb-hose-node, 0, tbl, pe-table_group); + pnv_pci_link_table_and_group(phb-hose-node, 0, + tbl, phb-p5ioc2.table_group); +long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group) +{ + struct iommu_table_group_link *tgl = NULL; + + BUG_ON(!tbl); + BUG_ON(!table_group); + BUG_ON(!table_group-group); + + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + return -ENOMEM; + + tgl-table_group = table_group; + list_add_rcu(tgl-next, tbl-it_group_list); + + table_group-tables[num] = tbl; + + return 0; I'm not a fan of the BUG_ONs here. This routine is so important that you have to BUG_ON three times at the start, yet you never check the return code if it fails? That doesn't make sense to me. If anything this should be sufficient: if (WARN_ON(!tbl || !table_group)) return -EINVAL; cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH kernel v12 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
On Fri, 2015-06-05 at 16:35 +1000, Alexey Kardashevskiy wrote: So far one TCE table could only be used by one IOMMU group. However IODA2 hardware allows programming the same TCE table address to multiple PE allowing sharing tables. ... diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 84b4ea4..4b4c583 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -606,6 +606,82 @@ unsigned long pnv_tce_get(struct iommu_table *tbl, long index) return ((u64 *)tbl-it_base)[index - tbl-it_offset]; } +struct iommu_table *pnv_pci_table_alloc(int nid) +{ + struct iommu_table *tbl; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid); + INIT_LIST_HEAD_RCU(tbl-it_group_list); + + return tbl; +} + +long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group) +{ + struct iommu_table_group_link *tgl = NULL; + + BUG_ON(!tbl); + BUG_ON(!table_group); + BUG_ON(!table_group-group); On p84 (Tuleta), my next + this series, with pseries_le_defconfig: pci 0001:08 : [PE# 002] Assign DMA32 space pci 0001:08 : [PE# 002] Setting up 32-bit TCE table at 0..8000 IOMMU table initialized, virtual merging enabled pci 0001:08 : [PE# 002] Setting up window#0 0..7fff pg=1000 [ cut here ] kernel BUG at arch/powerpc/platforms/powernv/pci.c:666! Oops: Exception in kernel mode, sig: 5 [#1] SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: CPU: 3 PID: 1 Comm: swapper/0 Not tainted 4.1.0-rc3-13721-g4c61caf #83 task: c01ff430 ti: c02ff6084000 task.ti: c02ff6084000 NIP: c0067a04 LR: c006b49c CTR: 3003e060 REGS: c02ff6087690 TRAP: 0700 Not tainted (4.1.0-rc3-13721-g4c61caf) MSR: 900100029033 SF,HV,EE,ME,IR,DR,RI,LE CR: 2822 XER: 2000 CFAR: c006b498 SOFTE: 1 GPR00: c006b49c c02ff6087910 c0d7cea0 GPR04: c00fef7a c03fffb2c6d8 GPR08: 0001 90011003 GPR12: c005d428 c1dc0d80 c0ca40f8 c03fffb48580 GPR16: c0adb4c0 c0adb308 c038ca80 c03fffb2c6a0 GPR20: 0007 c0ae31b8 c09136f8 0008 GPR24: 0001 c03fffb48850 c00fef7a GPR28: c03fffb38580 c00fef7a c03fffb2c6d8 NIP [c0067a04] pnv_pci_link_table_and_group+0x54/0xe0 LR [c006b49c] pnv_pci_ioda_fixup+0x6bc/0xe30 Call Trace: [c02ff6087910] [c02ff6087988] 0xc02ff6087988 (unreliable) [c02ff6087950] [c006b49c] pnv_pci_ioda_fixup+0x6bc/0xe30 [c02ff6087ae0] [c0bef224] pcibios_resource_survey+0x2b4/0x300 [c02ff6087bb0] [c0beeb6c] pcibios_init+0xa8/0xdc [c02ff6087c30] [c000b3b0] do_one_initcall+0xd0/0x250 [c02ff6087d00] [c0be422c] kernel_init_freeable+0x25c/0x33c [c02ff6087dc0] [c000bcf4] kernel_init+0x24/0x130 [c02ff6087e30] [c000956c] ret_from_kernel_thread+0x5c/0x70 Instruction dump: 7c9f2378 7cde3378 7cbd2b78 f8010010 f821ffc1 0b09 7cc90074 7929d182 0b09 e9260018 7d290074 7929d182 0b09 6000 3880 e92294d0 ---[ end trace bfd126f01f6f6bfe ]--- Full log below: opal: OPAL V3 detected ! Crash kernel location must be 0x200 Reserving 1024MB of memory at 32MB for crashkernel (System RAM: 262144MB) Allocated 2359296 bytes for 2048 pacas at c1dc Using PowerNV machine description Page sizes from device-tree: base_shift=12: shift=12, sllp=0x, avpnm=0x, tlbiel=1, penc=0 base_shift=12: shift=16, sllp=0x, avpnm=0x, tlbiel=1, penc=7 base_shift=12: shift=24, sllp=0x, avpnm=0x, tlbiel=1, penc=56 base_shift=16: shift=16, sllp=0x0110, avpnm=0x, tlbiel=1, penc=1 base_shift=16: shift=24, sllp=0x0110, avpnm=0x, tlbiel=1, penc=8 base_shift=24: shift=24, sllp=0x0100, avpnm=0x0001, tlbiel=0, penc=0 base_shift=34: shift=34, sllp=0x0120, avpnm=0x07ff, tlbiel=0, penc=3 Page orders: linear mapping = 24, virtual = 16, io = 16, vmemmap = 24 Using 1TB segments cma: Reserved 13120 MiB at 0x003cac00 bootconsole [udbg0] enabled CPU maps initialized for 8 threads per core (thread shift is 3) Freed 2162688 bytes for unused pacas - smp_release_cpus() spinning_secondaries = 127 - smp_release_cpus() Starting Linux ppc64le #83 SMP Tue Jun 9 15:52:08 AEST 2015 - ppc64_pft_size= 0x0 phys_mem_size = 0x40 cpu_features = 0x17fc7aed18500249 possible= 0x1fef18500649 always = 0x18100040 cpu_user_features = 0xdc0065c7 0xee00 mmu_features = 0x7c03
Re: [PATCH kernel v12 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
On Fri, Jun 05, 2015 at 04:35:09PM +1000, Alexey Kardashevskiy wrote: So far one TCE table could only be used by one IOMMU group. However IODA2 hardware allows programming the same TCE table address to multiple PE allowing sharing tables. This replaces a single pointer to a group in a iommu_table struct with a linked list of groups which provides the way of invalidating TCE cache for every PE when an actual TCE table is updated. This adds pnv_pci_link_table_and_group() and pnv_pci_unlink_table_and_group() helpers to manage the list. However without VFIO, it is still going to be a single IOMMU group per iommu_table. This changes iommu_add_device() to add a device to a first group from the group list of a table as it is only called from the platform init code or PCI bus notifier and at these moments there is only one group per table. This does not change TCE invalidation code to loop through all attached groups in order to simplify this patch and because it is not really needed in most cases. IODA2 is fixed in a later patch. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru [aw: for the vfio related changes] Acked-by: Alex Williamson alex.william...@redhat.com Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com Reviewed-by: David Gibson da...@gibson.dropbear.id.au -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson pgpjvfoOCwTjT.pgp Description: PGP signature ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH kernel v12 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
So far one TCE table could only be used by one IOMMU group. However IODA2 hardware allows programming the same TCE table address to multiple PE allowing sharing tables. This replaces a single pointer to a group in a iommu_table struct with a linked list of groups which provides the way of invalidating TCE cache for every PE when an actual TCE table is updated. This adds pnv_pci_link_table_and_group() and pnv_pci_unlink_table_and_group() helpers to manage the list. However without VFIO, it is still going to be a single IOMMU group per iommu_table. This changes iommu_add_device() to add a device to a first group from the group list of a table as it is only called from the platform init code or PCI bus notifier and at these moments there is only one group per table. This does not change TCE invalidation code to loop through all attached groups in order to simplify this patch and because it is not really needed in most cases. IODA2 is fixed in a later patch. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru [aw: for the vfio related changes] Acked-by: Alex Williamson alex.william...@redhat.com Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com --- Changes: v12: * fixed iommu_add_device() to check what list_first_entry_or_null() returned * changed commit log * removed loops from iommu_pseries_free_group as it does not support tables sharing anyway v10: * iommu_table is not embedded into iommu_table_group but allocated dynamically * iommu_table allocation is moved to a single place for IODA2's pnv_pci_ioda_setup_dma_pe where it belongs to * added list of groups into iommu_table; most of the code just looks at the first item to keep the patch simpler v9: * s/it_group/it_table_group/ * added and used iommu_table_group_free(), from now iommu_free_table() is only used for VIO * added iommu_pseries_group_alloc() * squashed powerpc/iommu: Introduce iommu_table_alloc() helper into this --- arch/powerpc/include/asm/iommu.h| 8 +- arch/powerpc/kernel/iommu.c | 14 +++- arch/powerpc/platforms/powernv/pci-ioda.c | 45 ++ arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 + arch/powerpc/platforms/powernv/pci.c| 76 + arch/powerpc/platforms/powernv/pci.h| 7 ++ arch/powerpc/platforms/pseries/iommu.c | 25 +- drivers/vfio/vfio_iommu_spapr_tce.c | 122 8 files changed, 240 insertions(+), 60 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 5a7267f..44a20cc 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -91,7 +91,7 @@ struct iommu_table { struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ - struct iommu_table_group *it_table_group; + struct list_head it_group_list;/* List of iommu_table_group_link */ struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); }; @@ -126,6 +126,12 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); #define IOMMU_TABLE_GROUP_MAX_TABLES 1 +struct iommu_table_group_link { + struct list_head next; + struct rcu_head rcu; + struct iommu_table_group *table_group; +}; + struct iommu_table_group { struct iommu_group *group; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 719f048..be258b2 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1078,6 +1078,7 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership); int iommu_add_device(struct device *dev) { struct iommu_table *tbl; + struct iommu_table_group_link *tgl; /* * The sysfs entries should be populated before @@ -1095,15 +1096,22 @@ int iommu_add_device(struct device *dev) } tbl = get_iommu_table_base(dev); - if (!tbl || !tbl-it_table_group || !tbl-it_table_group-group) { + if (!tbl) { pr_debug(%s: Skipping device %s with no tbl\n, __func__, dev_name(dev)); return 0; } + tgl = list_first_entry_or_null(tbl-it_group_list, + struct iommu_table_group_link, next); + if (!tgl) { + pr_debug(%s: Skipping device %s with no group\n, +__func__, dev_name(dev)); + return 0; + } pr_debug(%s: Adding %s to iommu group %d\n, __func__, dev_name(dev), -iommu_group_id(tbl-it_table_group-group)); +iommu_group_id(tgl-table_group-group)); if (PAGE_SIZE IOMMU_PAGE_SIZE(tbl)) { pr_err(%s: Invalid