Re: [RFC PATCH 7/7 v2] ppc: add dynamic dma window support
On 09.12.2010 [15:17:06 +1100], Benjamin Herrenschmidt wrote: On Tue, 2010-10-26 at 20:35 -0700, Nishanth Aravamudan wrote: No much comments... I'm amazed how complex he firmware folks managed to make this ... static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) { int err = NOTIFY_OK; struct device_node *np = node; struct pci_dn *pci = PCI_DN(np); + struct direct_window *window; switch (action) { case PSERIES_RECONFIG_REMOVE: if (pci pci-iommu_table) iommu_free_table(pci-iommu_table, np-full_name); + + spin_lock(direct_window_list_lock); + list_for_each_entry(window, direct_window_list, list) { + if (window-device == np) { + list_del(window-list); + break; + } + } + spin_unlock(direct_window_list_lock); Should you also kfree the window ? Yeah, looks like I should. I have a few other questions due to testing, but I'll reply to my original e-mail with those. Thanks for the review! Nish -- Nishanth Aravamudan n...@us.ibm.com IBM Linux Technology Center ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC PATCH 7/7 v2] ppc: add dynamic dma window support
On 26.10.2010 [20:35:17 -0700], Nishanth Aravamudan wrote: If firmware allows us to map all of a partition's memory for DMA on a particular bridge, create a 1:1 mapping of that memory. Add hooks for dealing with hotplug events. Dyanmic DMA windows can use larger than the default page size, and we use the largest one possible. Not-yet-signed-off-by: Nishanth Aravamudan n...@us.ibm.com --- I've tested this briefly on a machine with suitable firmware/hardware. Things seem to work well, but I want to do more exhaustive I/O testing before asking for upstream merging. I would really appreciate any feedback on the updated approach. Specific questions: Ben, did I hook into the dma_set_mask() platform callback as you expected? Anything I can do better or which perhaps might lead to gotchas later? I've added a disable_ddw option, but perhaps it would be better to just disable the feature if iommu=force? So for the final version, I probably should document this option in kernel-parameters.txt w/ the patch, right? snip +static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + int rc; + u64 tce_size, num_tce, dma_offset, next; + u32 tce_shift; + long limit; + + tce_shift = be32_to_cpu(maprange-tce_shift); + tce_size = 1ULL tce_shift; + next = start_pfn PAGE_SHIFT; + num_tce = num_pfn PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next (tce_size - 1); + next = ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce = tce_shift; + + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 512); + dma_offset = next + be64_to_cpu(maprange-dma_base); + + rc = plpar_tce_stuff(be64_to_cpu(maprange-liobn), + (u64)dma_offset, + 0, limit); + num_tce -= limit; + } while (num_tce 0 !rc); + + return rc; +} There is a bit of a typo here, the liobn is a 32-bit value. I've fixed this is up locally and will update it when I send out the final version of this patch. I'm finding that on dlpar remove of adapters running in slots supporting 64-bit DMA, that the plpar_tce_stuff is failing. Can you think of a reason why? It looks basically the same as the put_indirect below... +static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn; + u32 tce_shift; + u64 rc = 0; + long l, limit; + + local_irq_disable();/* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); + + if (!tcep) { + tcep = (u64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { + local_irq_enable(); + return -ENOMEM; + } + __get_cpu_var(tce_page) = tcep; + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + + liobn = (u64)be32_to_cpu(maprange-liobn); + tce_shift = be32_to_cpu(maprange-tce_shift); + tce_size = 1ULL tce_shift; + next = start_pfn PAGE_SHIFT; + num_tce = num_pfn PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next (tce_size - 1); + next = ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce = tce_shift; + + /* We can map max one pageful of TCEs at a time */ + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE); + dma_offset = next + be64_to_cpu(maprange-dma_base); + + for (l = 0; l limit; l++) { + tcep[l] = proto_tce | next; + next += tce_size; + } + + rc = plpar_tce_put_indirect(liobn, + (u64)dma_offset, + (u64)virt_to_abs(tcep), + limit); + + num_tce -= limit; + } while (num_tce 0 !rc); +printk(plpar_tce_put_indirect for offset 0x%llx and tcep[0] 0x%llx returned %llu\n, +(u64)dma_offset, tcep[0], rc); + I'll cleanup the debugging on the final version too. snip +static void remove_ddw(struct
Re: [RFC PATCH 7/7 v2] ppc: add dynamic dma window support
On Tue, 2010-10-26 at 20:35 -0700, Nishanth Aravamudan wrote: No much comments... I'm amazed how complex he firmware folks managed to make this ... static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) { int err = NOTIFY_OK; struct device_node *np = node; struct pci_dn *pci = PCI_DN(np); + struct direct_window *window; switch (action) { case PSERIES_RECONFIG_REMOVE: if (pci pci-iommu_table) iommu_free_table(pci-iommu_table, np-full_name); + + spin_lock(direct_window_list_lock); + list_for_each_entry(window, direct_window_list, list) { + if (window-device == np) { + list_del(window-list); + break; + } + } + spin_unlock(direct_window_list_lock); Should you also kfree the window ? Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 7/7 v2] ppc: add dynamic dma window support
If firmware allows us to map all of a partition's memory for DMA on a particular bridge, create a 1:1 mapping of that memory. Add hooks for dealing with hotplug events. Dyanmic DMA windows can use larger than the default page size, and we use the largest one possible. Not-yet-signed-off-by: Nishanth Aravamudan n...@us.ibm.com --- I've tested this briefly on a machine with suitable firmware/hardware. Things seem to work well, but I want to do more exhaustive I/O testing before asking for upstream merging. I would really appreciate any feedback on the updated approach. Specific questions: Ben, did I hook into the dma_set_mask() platform callback as you expected? Anything I can do better or which perhaps might lead to gotchas later? I've added a disable_ddw option, but perhaps it would be better to just disable the feature if iommu=force? --- arch/powerpc/platforms/pseries/iommu.c | 577 +++- 1 files changed, 575 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 45c6865..8090b6b 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -33,6 +33,7 @@ #include linux/pci.h #include linux/dma-mapping.h #include linux/crash_dump.h +#include linux/memory.h #include asm/io.h #include asm/prom.h #include asm/rtas.h @@ -45,6 +46,7 @@ #include asm/tce.h #include asm/ppc-pci.h #include asm/udbg.h +#include asm/mmzone.h #include plpar_wrappers.h @@ -270,6 +272,139 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) return tce_ret; } +/* this is compatable with cells for the device tree property */ +struct dynamic_dma_window_prop { + __be32 liobn; /* tce table number */ + __be64 dma_base; /* address hi,lo */ + __be32 tce_shift; /* ilog2(tce_page_size) */ + __be32 window_shift; /* ilog2(tce_window_size) */ +}; + +struct direct_window { + struct device_node *device; + const struct dynamic_dma_window_prop *prop; + struct list_head list; +}; +static LIST_HEAD(direct_window_list); +/* prevents races between memory on/offline and window creation */ +static DEFINE_SPINLOCK(direct_window_list_lock); +/* protects initializing window twice for same device */ +static DEFINE_MUTEX(direct_window_init_mutex); +#define DIRECT64_PROPNAME linux,direct64-ddr-window-info + +static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + int rc; + u64 tce_size, num_tce, dma_offset, next; + u32 tce_shift; + long limit; + + tce_shift = be32_to_cpu(maprange-tce_shift); + tce_size = 1ULL tce_shift; + next = start_pfn PAGE_SHIFT; + num_tce = num_pfn PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next (tce_size - 1); + next = ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce = tce_shift; + + do { + /* +* Set up the page with TCE data, looping through and setting +* the values. +*/ + limit = min_t(long, num_tce, 512); + dma_offset = next + be64_to_cpu(maprange-dma_base); + + rc = plpar_tce_stuff(be64_to_cpu(maprange-liobn), + (u64)dma_offset, +0, limit); + num_tce -= limit; + } while (num_tce 0 !rc); + + return rc; +} + +static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn; + u32 tce_shift; + u64 rc = 0; + long l, limit; + + local_irq_disable();/* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); + + if (!tcep) { + tcep = (u64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { + local_irq_enable(); + return -ENOMEM; + } + __get_cpu_var(tce_page) = tcep; + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + + liobn = (u64)be32_to_cpu(maprange-liobn); + tce_shift = be32_to_cpu(maprange-tce_shift); + tce_size = 1ULL tce_shift; + next = start_pfn PAGE_SHIFT; + num_tce = num_pfn PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next (tce_size - 1); + next = ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce =