Re: [PATCH v4 2/3] mm: Add support for kmem caches in DMA32 zone
On Wed, Dec 5, 2018 at 3:25 PM Wei Yang wrote: > > On Wed, Dec 05, 2018 at 01:48:27PM +0800, Nicolas Boichat wrote: > >In some cases (e.g. IOMMU ARMv7s page allocator), we need to allocate > >data structures smaller than a page with GFP_DMA32 flag. > > > >This change makes it possible to create a custom cache in DMA32 zone > >using kmem_cache_create, then allocate memory using kmem_cache_alloc. > > > >We do not create a DMA32 kmalloc cache array, as there are currently > >no users of kmalloc(..., GFP_DMA32). The new test in check_slab_flags > >ensures that such calls still fail (as they do before this change). > > > >Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32") > >Signed-off-by: Nicolas Boichat > >--- > > > >Changes since v2: > > - Clarified commit message > > - Add entry in sysfs-kernel-slab to document the new sysfs file > > > >(v3 used the page_frag approach) > > > >Documentation/ABI/testing/sysfs-kernel-slab | 9 + > > include/linux/slab.h| 2 ++ > > mm/internal.h | 8 ++-- > > mm/slab.c | 4 +++- > > mm/slab.h | 3 ++- > > mm/slab_common.c| 2 +- > > mm/slub.c | 18 +- > > 7 files changed, 40 insertions(+), 6 deletions(-) > > > >diff --git a/Documentation/ABI/testing/sysfs-kernel-slab > >b/Documentation/ABI/testing/sysfs-kernel-slab > >index 29601d93a1c2ea..d742c6cfdffbe9 100644 > >--- a/Documentation/ABI/testing/sysfs-kernel-slab > >+++ b/Documentation/ABI/testing/sysfs-kernel-slab > >@@ -106,6 +106,15 @@ Description: > > are from ZONE_DMA. > > Available when CONFIG_ZONE_DMA is enabled. > > > >+What: /sys/kernel/slab/cache/cache_dma32 > >+Date: December 2018 > >+KernelVersion:4.21 > >+Contact: Nicolas Boichat > >+Description: > >+ The cache_dma32 file is read-only and specifies whether > >objects > >+ are from ZONE_DMA32. > >+ Available when CONFIG_ZONE_DMA32 is enabled. 
> >+ > > What: /sys/kernel/slab/cache/cpu_slabs > > Date: May 2007 > > KernelVersion:2.6.22 > >diff --git a/include/linux/slab.h b/include/linux/slab.h > >index 11b45f7ae4057c..9449b19c5f107a 100644 > >--- a/include/linux/slab.h > >+++ b/include/linux/slab.h > >@@ -32,6 +32,8 @@ > > #define SLAB_HWCACHE_ALIGN((slab_flags_t __force)0x2000U) > > /* Use GFP_DMA memory */ > > #define SLAB_CACHE_DMA((slab_flags_t __force)0x4000U) > >+/* Use GFP_DMA32 memory */ > >+#define SLAB_CACHE_DMA32 ((slab_flags_t __force)0x8000U) > > /* DEBUG: Store the last owner for bug hunting */ > > #define SLAB_STORE_USER ((slab_flags_t __force)0x0001U) > > /* Panic if kmem_cache_create() fails */ > >diff --git a/mm/internal.h b/mm/internal.h > >index a2ee82a0cd44ae..fd244ad716eaf8 100644 > >--- a/mm/internal.h > >+++ b/mm/internal.h > >@@ -14,6 +14,7 @@ > > #include > > #include > > #include > >+#include > > #include > > > > /* > >@@ -34,9 +35,12 @@ > > #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) > > > > /* Check for flags that must not be used with a slab allocator */ > >-static inline gfp_t check_slab_flags(gfp_t flags) > >+static inline gfp_t check_slab_flags(gfp_t flags, slab_flags_t slab_flags) > > { > >- gfp_t bug_mask = __GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK; > >+ gfp_t bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK; > >+ > >+ if (!IS_ENABLED(CONFIG_ZONE_DMA32) || !(slab_flags & > >SLAB_CACHE_DMA32)) > >+ bug_mask |= __GFP_DMA32; > > The original version doesn't check CONFIG_ZONE_DMA32. > > Do we need to add this condition here? > Could we just decide the bug_mask based on slab_flags? We can. The reason I did it this way is that when we don't have CONFIG_ZONE_DMA32, the compiler should be able to simplify to: bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK; if (true || ..) => if (true) bug_mask |= __GFP_DMA32; Then just bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK | __GFP_DMA32; And since the function is inline, slab_flags would not even need to be accessed at all. 
> > > > if (unlikely(flags & bug_mask)) { > > gfp_t invalid_mask = flags & bug_mask; > >diff --git a/mm/slab.c b/mm/slab.c > >index 65a774f05e7836..2fd3b9a996cbe6 100644 > >--- a/mm/slab.c > >+++ b/mm/slab.c > >@@ -2109,6 +2109,8 @@ int __kmem_cache_create(struct kmem_cache *cachep, > >slab_flags_t flags) > > cachep->allocflags = __GFP_COMP; > > if (flags & SLAB_CACHE_DMA) > > cachep->allocflags |= GFP_DMA; > >+ if (flags & SLAB_CACHE_DMA32) > >+ cachep->allocflags |= GFP_DMA32; > > if (flags & SLAB_RECLAIM_ACCOUNT) > > cachep->allocflags |= __GFP_RECLAIMABLE; > > cachep->size = size; > >@@ -2643,7 +2645,7 @@ static struct page *cache_grow_beg
Re: [PATCH v4 2/3] mm: Add support for kmem caches in DMA32 zone
On Wed, Dec 05, 2018 at 01:48:27PM +0800, Nicolas Boichat wrote: >In some cases (e.g. IOMMU ARMv7s page allocator), we need to allocate >data structures smaller than a page with GFP_DMA32 flag. > >This change makes it possible to create a custom cache in DMA32 zone >using kmem_cache_create, then allocate memory using kmem_cache_alloc. > >We do not create a DMA32 kmalloc cache array, as there are currently >no users of kmalloc(..., GFP_DMA32). The new test in check_slab_flags >ensures that such calls still fail (as they do before this change). > >Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32") >Signed-off-by: Nicolas Boichat >--- > >Changes since v2: > - Clarified commit message > - Add entry in sysfs-kernel-slab to document the new sysfs file > >(v3 used the page_frag approach) > >Documentation/ABI/testing/sysfs-kernel-slab | 9 + > include/linux/slab.h| 2 ++ > mm/internal.h | 8 ++-- > mm/slab.c | 4 +++- > mm/slab.h | 3 ++- > mm/slab_common.c| 2 +- > mm/slub.c | 18 +- > 7 files changed, 40 insertions(+), 6 deletions(-) > >diff --git a/Documentation/ABI/testing/sysfs-kernel-slab >b/Documentation/ABI/testing/sysfs-kernel-slab >index 29601d93a1c2ea..d742c6cfdffbe9 100644 >--- a/Documentation/ABI/testing/sysfs-kernel-slab >+++ b/Documentation/ABI/testing/sysfs-kernel-slab >@@ -106,6 +106,15 @@ Description: > are from ZONE_DMA. > Available when CONFIG_ZONE_DMA is enabled. > >+What: /sys/kernel/slab/cache/cache_dma32 >+Date: December 2018 >+KernelVersion:4.21 >+Contact: Nicolas Boichat >+Description: >+ The cache_dma32 file is read-only and specifies whether objects >+ are from ZONE_DMA32. >+ Available when CONFIG_ZONE_DMA32 is enabled. 
>+ > What: /sys/kernel/slab/cache/cpu_slabs > Date: May 2007 > KernelVersion:2.6.22 >diff --git a/include/linux/slab.h b/include/linux/slab.h >index 11b45f7ae4057c..9449b19c5f107a 100644 >--- a/include/linux/slab.h >+++ b/include/linux/slab.h >@@ -32,6 +32,8 @@ > #define SLAB_HWCACHE_ALIGN((slab_flags_t __force)0x2000U) > /* Use GFP_DMA memory */ > #define SLAB_CACHE_DMA((slab_flags_t __force)0x4000U) >+/* Use GFP_DMA32 memory */ >+#define SLAB_CACHE_DMA32 ((slab_flags_t __force)0x8000U) > /* DEBUG: Store the last owner for bug hunting */ > #define SLAB_STORE_USER ((slab_flags_t __force)0x0001U) > /* Panic if kmem_cache_create() fails */ >diff --git a/mm/internal.h b/mm/internal.h >index a2ee82a0cd44ae..fd244ad716eaf8 100644 >--- a/mm/internal.h >+++ b/mm/internal.h >@@ -14,6 +14,7 @@ > #include > #include > #include >+#include > #include > > /* >@@ -34,9 +35,12 @@ > #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) > > /* Check for flags that must not be used with a slab allocator */ >-static inline gfp_t check_slab_flags(gfp_t flags) >+static inline gfp_t check_slab_flags(gfp_t flags, slab_flags_t slab_flags) > { >- gfp_t bug_mask = __GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK; >+ gfp_t bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK; >+ >+ if (!IS_ENABLED(CONFIG_ZONE_DMA32) || !(slab_flags & SLAB_CACHE_DMA32)) >+ bug_mask |= __GFP_DMA32; The original version doesn't check CONFIG_ZONE_DMA32. Do we need to add this condition here? Could we just decide the bug_mask based on slab_flags? 
> > if (unlikely(flags & bug_mask)) { > gfp_t invalid_mask = flags & bug_mask; >diff --git a/mm/slab.c b/mm/slab.c >index 65a774f05e7836..2fd3b9a996cbe6 100644 >--- a/mm/slab.c >+++ b/mm/slab.c >@@ -2109,6 +2109,8 @@ int __kmem_cache_create(struct kmem_cache *cachep, >slab_flags_t flags) > cachep->allocflags = __GFP_COMP; > if (flags & SLAB_CACHE_DMA) > cachep->allocflags |= GFP_DMA; >+ if (flags & SLAB_CACHE_DMA32) >+ cachep->allocflags |= GFP_DMA32; > if (flags & SLAB_RECLAIM_ACCOUNT) > cachep->allocflags |= __GFP_RECLAIMABLE; > cachep->size = size; >@@ -2643,7 +2645,7 @@ static struct page *cache_grow_begin(struct kmem_cache >*cachep, >* Be lazy and only check for valid flags here, keeping it out of the >* critical path in kmem_cache_alloc(). >*/ >- flags = check_slab_flags(flags); >+ flags = check_slab_flags(flags, cachep->flags); > WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); > local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); > >diff --git a/mm/slab.h b/mm/slab.h >index 4190c24ef0e9df..fcf717e12f0a86 100644 >--- a/mm/slab.h >+++ b/mm/slab.h >@@ -127,7 +127,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int >object_size, > > > /* Legal flag mask for kmem_cache_create(), for
Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables
On Wed, Dec 5, 2018 at 10:04 AM Nicolas Boichat wrote: > > On Tue, Dec 4, 2018 at 10:35 PM Vlastimil Babka wrote: > > > > On 12/4/18 10:37 AM, Nicolas Boichat wrote: > > > On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat > > > wrote: > > >> > > >> This is a follow-up to the discussion in [1], to make sure that the page > > >> tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit > > >> physical address space. > > >> > > >> [1] > > >> https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html > > > > > > Hi everyone, > > > > > > Let's try to summarize here. > > > > > > First, we confirmed that this is a regression, and IOMMU errors happen > > > on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13). > > > The issue most likely starts from ad67f5a6545f ("arm64: replace > > > ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number > > > of Mediatek platforms (and maybe others?). > > > > > > We have a few options here: > > > 1. This series [2], that adds support for GFP_DMA32 slab caches, > > > _without_ adding kmalloc caches (since there are no users of > > > kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on > > > the 3 patches, and AFAICT this solution works fine. > > > 2. genalloc. That works, but unless we preallocate 4MB for L2 tables > > > (which is wasteful as we usually only need a handful of L2 tables), > > > we'll need changes in the core (use GFP_ATOMIC) to allow allocating on > > > demand, and as it stands we'd have no way to shrink the allocation. > > > 3. page_frag [3]. That works fine, and the code is quite simple. One > > > drawback is that fragments in partially freed pages cannot be reused > > > (from limited experiments, I see that IOMMU L2 tables are rarely > > > freed, so it's unlikely a whole page would get freed). But given the > > > low number of L2 tables, maybe we can live with that. > > > > > > I think 2 is out. Any preference between 1 and 3? 
I think 1 makes > > > better use of the memory, so that'd be my preference. But I'm probably > > > missing something. > > > > I would prefer 1 as well. IIRC you already confirmed that alignment > > requirements are not broken for custom kmem caches even in presence of > > SLUB debug options (and I would say it's a bug to be fixed if they > > weren't). > > > I just asked (and didn't get a reply I think) about your > > ability to handle the GFP_ATOMIC allocation failures. They should be > > rare when only single page allocations are needed for the kmem cache. > > But in case they are not an option, then preallocating would be needed, > > thus probably option 2. > > Oh, sorry, I missed your question. > > I don't have a full answer, but: > - The allocations themselves are rare (I count a few 10s of L2 tables > at most on my system, I assume we rarely have >100), and yes, we only > need a single page, so the failures should be exceptional. > - My change is probably not making anything worse: I assume that even > with the current approach using GFP_DMA slab caches on older kernels, > failures could potentially happen. I don't think we've seen those. If > we are really concerned about this, maybe we'd need to modify > mtk_iommu_map to not hold a spinlock (if that's possible), so we don't > need to use GFP_ATOMIC. I suggest we just keep an eye on such issues, > and address them if they show up (we can even revisit genalloc at that > stage). > > Anyway, I'll clean up patches for 1 (mostly commit message changes > based on the comments in the threads) and resend. Done here: https://patchwork.kernel.org/cover/10713019/ . > Thanks, > > > > [2] https://patchwork.kernel.org/cover/10677529/, 3 patches > > > [3] https://patchwork.codeaurora.org/patch/671639/ > > > > > > Thanks, > > > > > > Nicolas > > > > > ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 3/3] iommu/io-pgtable-arm-v7s: Request DMA32 memory, and improve debugging
IOMMUs using ARMv7 short-descriptor format require page tables (level 1 and 2) to be allocated within the first 4GB of RAM, even on 64-bit systems. For level 1/2 pages, ensure GFP_DMA32 is used if CONFIG_ZONE_DMA32 is defined (e.g. on arm64 platforms). For level 2 pages, allocate a slab cache in SLAB_CACHE_DMA32. Also, print an error when the physical address does not fit in 32-bit, to make debugging easier in the future. Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32") Signed-off-by: Nicolas Boichat --- Changes since v2: - Commit message (v3 used the page_frag approach) drivers/iommu/io-pgtable-arm-v7s.c | 20 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 445c3bde04800c..996f7b6d00b44a 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -161,6 +161,14 @@ #define ARM_V7S_TCR_PD1BIT(5) +#ifdef CONFIG_ZONE_DMA32 +#define ARM_V7S_TABLE_GFP_DMA GFP_DMA32 +#define ARM_V7S_TABLE_SLAB_CACHE SLAB_CACHE_DMA32 +#else +#define ARM_V7S_TABLE_GFP_DMA GFP_DMA +#define ARM_V7S_TABLE_SLAB_CACHE SLAB_CACHE_DMA +#endif + typedef u32 arm_v7s_iopte; static bool selftest_running; @@ -198,13 +206,17 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp, void *table = NULL; if (lvl == 1) - table = (void *)__get_dma_pages(__GFP_ZERO, get_order(size)); + table = (void *)__get_free_pages( + __GFP_ZERO | ARM_V7S_TABLE_GFP_DMA, get_order(size)); else if (lvl == 2) - table = kmem_cache_zalloc(data->l2_tables, gfp | GFP_DMA); + table = kmem_cache_zalloc(data->l2_tables, + gfp | ARM_V7S_TABLE_GFP_DMA); phys = virt_to_phys(table); - if (phys != (arm_v7s_iopte)phys) + if (phys != (arm_v7s_iopte)phys) { /* Doesn't fit in PTE */ + dev_err(dev, "Page table does not fit in PTE: %pa", &phys); goto out_free; + } if (table && !(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) { dma = dma_map_single(dev, table, size, DMA_TO_DEVICE); if (dma_mapping_error(dev, dma)) 
@@ -737,7 +749,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2", ARM_V7S_TABLE_SIZE(2), ARM_V7S_TABLE_SIZE(2), - SLAB_CACHE_DMA, NULL); + ARM_V7S_TABLE_SLAB_CACHE, NULL); if (!data->l2_tables) goto out_free_data; -- 2.20.0.rc1.387.gf8505762e3-goog ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 1/3] mm: slab/slub: Add check_slab_flags function to check for valid flags
Remove duplicated code between slab and slub, and will make it easier to make the test more complicated in the next commits. Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32") Signed-off-by: Nicolas Boichat --- mm/internal.h | 18 -- mm/slab.c | 8 +--- mm/slub.c | 8 +--- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f0c9ccde3bdb9e..a2ee82a0cd44ae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -33,8 +33,22 @@ /* Control allocation cpuset and node placement constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) -/* Do not use these with a slab allocator */ -#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) +/* Check for flags that must not be used with a slab allocator */ +static inline gfp_t check_slab_flags(gfp_t flags) +{ + gfp_t bug_mask = __GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK; + + if (unlikely(flags & bug_mask)) { + gfp_t invalid_mask = flags & bug_mask; + + flags &= ~bug_mask; + pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", + invalid_mask, &invalid_mask, flags, &flags); + dump_stack(); + } + + return flags; +} void page_writeback_init(void); diff --git a/mm/slab.c b/mm/slab.c index 73fe23e649c91a..65a774f05e7836 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2643,13 +2643,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; - flags &= ~GFP_SLAB_BUG_MASK; - pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). 
Fix your code!\n", - invalid_mask, &invalid_mask, flags, &flags); - dump_stack(); - } + flags = check_slab_flags(flags); WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); diff --git a/mm/slub.c b/mm/slub.c index c229a9b7dd5448..21a3f6866da472 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1685,13 +1685,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { - if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; - flags &= ~GFP_SLAB_BUG_MASK; - pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", - invalid_mask, &invalid_mask, flags, &flags); - dump_stack(); - } + flags = check_slab_flags(flags); return allocate_slab(s, flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); -- 2.20.0.rc1.387.gf8505762e3-goog ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 2/3] mm: Add support for kmem caches in DMA32 zone
In some cases (e.g. IOMMU ARMv7s page allocator), we need to allocate data structures smaller than a page with GFP_DMA32 flag. This change makes it possible to create a custom cache in DMA32 zone using kmem_cache_create, then allocate memory using kmem_cache_alloc. We do not create a DMA32 kmalloc cache array, as there are currently no users of kmalloc(..., GFP_DMA32). The new test in check_slab_flags ensures that such calls still fail (as they do before this change). Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32") Signed-off-by: Nicolas Boichat --- Changes since v2: - Clarified commit message - Add entry in sysfs-kernel-slab to document the new sysfs file (v3 used the page_frag approach) Documentation/ABI/testing/sysfs-kernel-slab | 9 + include/linux/slab.h| 2 ++ mm/internal.h | 8 ++-- mm/slab.c | 4 +++- mm/slab.h | 3 ++- mm/slab_common.c| 2 +- mm/slub.c | 18 +- 7 files changed, 40 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 29601d93a1c2ea..d742c6cfdffbe9 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -106,6 +106,15 @@ Description: are from ZONE_DMA. Available when CONFIG_ZONE_DMA is enabled. +What: /sys/kernel/slab/cache/cache_dma32 +Date: December 2018 +KernelVersion: 4.21 +Contact: Nicolas Boichat +Description: + The cache_dma32 file is read-only and specifies whether objects + are from ZONE_DMA32. + Available when CONFIG_ZONE_DMA32 is enabled. 
+ What: /sys/kernel/slab/cache/cpu_slabs Date: May 2007 KernelVersion: 2.6.22 diff --git a/include/linux/slab.h b/include/linux/slab.h index 11b45f7ae4057c..9449b19c5f107a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -32,6 +32,8 @@ #define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x2000U) /* Use GFP_DMA memory */ #define SLAB_CACHE_DMA ((slab_flags_t __force)0x4000U) +/* Use GFP_DMA32 memory */ +#define SLAB_CACHE_DMA32 ((slab_flags_t __force)0x8000U) /* DEBUG: Store the last owner for bug hunting */ #define SLAB_STORE_USER((slab_flags_t __force)0x0001U) /* Panic if kmem_cache_create() fails */ diff --git a/mm/internal.h b/mm/internal.h index a2ee82a0cd44ae..fd244ad716eaf8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -14,6 +14,7 @@ #include #include #include +#include #include /* @@ -34,9 +35,12 @@ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) /* Check for flags that must not be used with a slab allocator */ -static inline gfp_t check_slab_flags(gfp_t flags) +static inline gfp_t check_slab_flags(gfp_t flags, slab_flags_t slab_flags) { - gfp_t bug_mask = __GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK; + gfp_t bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK; + + if (!IS_ENABLED(CONFIG_ZONE_DMA32) || !(slab_flags & SLAB_CACHE_DMA32)) + bug_mask |= __GFP_DMA32; if (unlikely(flags & bug_mask)) { gfp_t invalid_mask = flags & bug_mask; diff --git a/mm/slab.c b/mm/slab.c index 65a774f05e7836..2fd3b9a996cbe6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2109,6 +2109,8 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) cachep->allocflags = __GFP_COMP; if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; + if (flags & SLAB_CACHE_DMA32) + cachep->allocflags |= GFP_DMA32; if (flags & SLAB_RECLAIM_ACCOUNT) cachep->allocflags |= __GFP_RECLAIMABLE; cachep->size = size; @@ -2643,7 +2645,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the 
* critical path in kmem_cache_alloc(). */ - flags = check_slab_flags(flags); + flags = check_slab_flags(flags, cachep->flags); WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); diff --git a/mm/slab.h b/mm/slab.h index 4190c24ef0e9df..fcf717e12f0a86 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -127,7 +127,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, /* Legal flag mask for kmem_cache_create(), for various configurations */ -#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ +SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #if defined(CONFIG_DEBU
[PATCH v4 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables
This is a follow-up to the discussion in [1], [2]. IOMMUs using ARMv7 short-descriptor format require page tables (level 1 and 2) to be allocated within the first 4GB of RAM, even on 64-bit systems. For L1 tables that are bigger than a page, we can just use __get_free_pages with GFP_DMA32 (on arm64 systems only, arm would still use GFP_DMA). For L2 tables that only take 1KB, it would be a waste to allocate a full page, so we considered 3 approaches: 1. This series, adding support for GFP_DMA32 slab caches. 2. genalloc, which requires pre-allocating the maximum number of L2 page tables (4096, so 4MB of memory). 3. page_frag, which is not very memory-efficient as it is unable to reuse freed fragments until the whole page is freed. [3] This series is the most memory-efficient approach. [1] https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html [2] https://lists.linuxfoundation.org/pipermail/iommu/2018-December/031696.html [3] https://patchwork.codeaurora.org/patch/671639/ Changes since v1: - Add support for SLAB_CACHE_DMA32 in slab and slub (patches 1/2) - iommu/io-pgtable-arm-v7s (patch 3): - Changed approach to use SLAB_CACHE_DMA32 added by the previous commit. - Use DMA or DMA32 depending on the architecture (DMA for arm, DMA32 for arm64). Changes since v2: - Reworded and expanded commit messages - Added cache_dma32 documentation in PATCH 2/3. v3 used the page_frag approach, see [3]. 
Nicolas Boichat (3): mm: slab/slub: Add check_slab_flags function to check for valid flags mm: Add support for kmem caches in DMA32 zone iommu/io-pgtable-arm-v7s: Request DMA32 memory, and improve debugging Documentation/ABI/testing/sysfs-kernel-slab | 9 drivers/iommu/io-pgtable-arm-v7s.c | 20 + include/linux/slab.h| 2 ++ mm/internal.h | 22 +-- mm/slab.c | 10 +++-- mm/slab.h | 3 ++- mm/slab_common.c| 2 +- mm/slub.c | 24 +++-- 8 files changed, 70 insertions(+), 22 deletions(-) -- 2.20.0.rc1.387.gf8505762e3-goog ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/4] dma-debug: Use pr_fmt()
On Mon, 2018-12-03 at 17:28 +, Robin Murphy wrote: > Use pr_fmt() to generate the "DMA-API: " prefix consistently. This > results in it being added to a couple of pr_*() messages which were > missing it before, and for the err_printk() calls moves it to the actual > start of the message instead of somewhere in the middle. > > Signed-off-by: Robin Murphy > --- > > I chose not to refactor the existing split strings for minimal churn here. > > kernel/dma/debug.c | 74 -- > 1 file changed, 38 insertions(+), 36 deletions(-) > > diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c > index 231ca4628062..91b84140e4a5 100644 > --- a/kernel/dma/debug.c > +++ b/kernel/dma/debug.c > @@ -17,6 +17,8 @@ > * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > */ > > +#define pr_fmt(fmt) "DMA-API: " fmt > + > #include > #include > #include > @@ -234,7 +236,7 @@ static bool driver_filter(struct device *dev) > error_count += 1; \ > if (driver_filter(dev) && \ > (show_all_errors || show_num_errors > 0)) { \ > - WARN(1, "%s %s: " format, \ > + WARN(1, pr_fmt("%s %s: ") format, \ >dev ? dev_driver_string(dev) : "NULL", \ >dev ? dev_name(dev) : "NULL", ## arg); \ > dump_entry_trace(entry);\ I think converting this WARN to dev_err(dev, format, ##__VA_ARGS__); dump_stack(); would look better and be more intelligible. Perhaps add a #define for dev_fmt if really necessary. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables
On Tue, Dec 4, 2018 at 10:35 PM Vlastimil Babka wrote: > > On 12/4/18 10:37 AM, Nicolas Boichat wrote: > > On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat > > wrote: > >> > >> This is a follow-up to the discussion in [1], to make sure that the page > >> tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit > >> physical address space. > >> > >> [1] > >> https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html > > > > Hi everyone, > > > > Let's try to summarize here. > > > > First, we confirmed that this is a regression, and IOMMU errors happen > > on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13). > > The issue most likely starts from ad67f5a6545f ("arm64: replace > > ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number > > of Mediatek platforms (and maybe others?). > > > > We have a few options here: > > 1. This series [2], that adds support for GFP_DMA32 slab caches, > > _without_ adding kmalloc caches (since there are no users of > > kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on > > the 3 patches, and AFAICT this solution works fine. > > 2. genalloc. That works, but unless we preallocate 4MB for L2 tables > > (which is wasteful as we usually only need a handful of L2 tables), > > we'll need changes in the core (use GFP_ATOMIC) to allow allocating on > > demand, and as it stands we'd have no way to shrink the allocation. > > 3. page_frag [3]. That works fine, and the code is quite simple. One > > drawback is that fragments in partially freed pages cannot be reused > > (from limited experiments, I see that IOMMU L2 tables are rarely > > freed, so it's unlikely a whole page would get freed). But given the > > low number of L2 tables, maybe we can live with that. > > > > I think 2 is out. Any preference between 1 and 3? I think 1 makes > > better use of the memory, so that'd be my preference. But I'm probably > > missing something. > > I would prefer 1 as well. 
IIRC you already confirmed that alignment > requirements are not broken for custom kmem caches even in presence of > SLUB debug options (and I would say it's a bug to be fixed if they > weren't). > I just asked (and didn't get a reply I think) about your > ability to handle the GFP_ATOMIC allocation failures. They should be > rare when only single page allocations are needed for the kmem cache. > But in case they are not an option, then preallocating would be needed, > thus probably option 2. Oh, sorry, I missed your question. I don't have a full answer, but: - The allocations themselves are rare (I count a few 10s of L2 tables at most on my system, I assume we rarely have >100), and yes, we only need a single page, so the failures should be exceptional. - My change is probably not making anything worse: I assume that even with the current approach using GFP_DMA slab caches on older kernels, failures could potentially happen. I don't think we've seen those. If we are really concerned about this, maybe we'd need to modify mtk_iommu_map to not hold a spinlock (if that's possible), so we don't need to use GFP_ATOMIC. I suggest we just keep an eye on such issues, and address them if they show up (we can even revisit genalloc at that stage). Anyway, I'll clean up patches for 1 (mostly commit message changes based on the comments in the threads) and resend. Thanks, > > [2] https://patchwork.kernel.org/cover/10677529/, 3 patches > > [3] https://patchwork.codeaurora.org/patch/671639/ > > > > Thanks, > > > > Nicolas > > > ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu: fix amd_iommu=force_isolation
The parameter is still there but it's ignored. We need to check its value before deciding to go into passthrough mode for AMD IOMMU. Fixes: aafd8ba0ca74 ("iommu/amd: Implement add_device and remove_device") Signed-off-by: Yu Zhao --- drivers/iommu/amd_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1167ff0416cf..3e4219e6cff0 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2195,7 +2195,8 @@ static int amd_iommu_add_device(struct device *dev) BUG_ON(!dev_data); - if (iommu_pass_through || dev_data->iommu_v2) + if (iommu_pass_through || + (!amd_iommu_force_isolation && dev_data->iommu_v2)) iommu_request_dm_for_dev(dev); /* Domains are initialized for this device - have a look what we ended up with */ -- 2.20.0.rc1.387.gf8505762e3-goog ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 01/23] dma-mapping: provide a generic DMA_MAPPING_ERROR
On Tue, Dec 04, 2018 at 04:41:34PM +, Robin Murphy wrote: > I'd have been inclined to put the default check here, i.e. > > - return 0 > + return dma_addr == DMA_MAPPING_ERROR > > such that the callback retains full precedence and we don't have to deal > with the non-trivial removals immediately if it comes to it. Not that it > makes a vast difference though, so either way, Ok, I've switched it around. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] of/device: add blacklist for iommu dma_ops
On Sat, Dec 1, 2018 at 10:54 AM Rob Clark wrote: > > This solves a problem we see with drm/msm, caused by getting > iommu_dma_ops while we attach our own domain and manage it directly at > the iommu API level: > > [0038] user address but active_mm is swapper > Internal error: Oops: 9605 [#1] PREEMPT SMP > Modules linked in: > CPU: 7 PID: 70 Comm: kworker/7:1 Tainted: GW 4.19.3 #90 > Hardware name: xxx (DT) > Workqueue: events deferred_probe_work_func > pstate: 80c9 (Nzcv daif +PAN +UAO) > pc : iommu_dma_map_sg+0x7c/0x2c8 > lr : iommu_dma_map_sg+0x40/0x2c8 > sp : ff80095eb4f0 > x29: ff80095eb4f0 x28: > x27: ffc0f9431578 x26: > x25: x24: 0003 > x23: 0001 x22: ffc0fa9ac010 > x21: x20: ffc0fab40980 > x19: ffc0fab40980 x18: 0003 > x17: 01c4 x16: 0007 > x15: 000e x14: > x13: x12: 0028 > x11: 0101010101010101 x10: 7f7f7f7f7f7f7f7f > x9 : x8 : ffc0fab409a0 > x7 : x6 : 0002 > x5 : 0001 x4 : > x3 : 0001 x2 : 0002 > x1 : ffc0f9431578 x0 : > Process kworker/7:1 (pid: 70, stack limit = 0x17d08ffb) > Call trace: >iommu_dma_map_sg+0x7c/0x2c8 >__iommu_map_sg_attrs+0x70/0x84 >get_pages+0x170/0x1e8 >msm_gem_get_iova+0x8c/0x128 >_msm_gem_kernel_new+0x6c/0xc8 >msm_gem_kernel_new+0x4c/0x58 >dsi_tx_buf_alloc_6g+0x4c/0x8c >msm_dsi_host_modeset_init+0xc8/0x108 >msm_dsi_modeset_init+0x54/0x18c >_dpu_kms_drm_obj_init+0x430/0x474 >dpu_kms_hw_init+0x5f8/0x6b4 >msm_drm_bind+0x360/0x6c8 >try_to_bring_up_master.part.7+0x28/0x70 >component_master_add_with_match+0xe8/0x124 >msm_pdev_probe+0x294/0x2b4 >platform_drv_probe+0x58/0xa4 >really_probe+0x150/0x294 >driver_probe_device+0xac/0xe8 >__device_attach_driver+0xa4/0xb4 >bus_for_each_drv+0x98/0xc8 >__device_attach+0xac/0x12c >device_initial_probe+0x24/0x30 >bus_probe_device+0x38/0x98 >deferred_probe_work_func+0x78/0xa4 >process_one_work+0x24c/0x3dc >worker_thread+0x280/0x360 >kthread+0x134/0x13c >ret_from_fork+0x10/0x18 > Code: d284 91000725 6b17039f 5400048a (f9401f40) > ---[ end trace f22dda57f3648e2c ]--- > Kernel panic - not syncing: Fatal 
exception > SMP: stopping secondary CPUs > Kernel Offset: disabled > CPU features: 0x0,22802a18 > Memory Limit: none > > The problem is that when drm/msm does it's own iommu_attach_device(), > now the domain returned by iommu_get_domain_for_dev() is drm/msm's > domain, and it doesn't have domain->iova_cookie. > > We kind of avoided this problem prior to sdm845/dpu because the iommu > was attached to the mdp node in dt, which is a child of the toplevel > mdss node (which corresponds to the dev passed in dma_map_sg()). But > with sdm845, now the iommu is attached at the mdss level so we hit the > iommu_dma_ops in dma_map_sg(). > > But auto allocating/attaching a domain before the driver is probed was > already a blocking problem for enabling per-context pagetables for the > GPU. This problem is also now solved with this patch. > > Fixes: 97890ba9289c dma-mapping: detect and configure IOMMU in > of_dma_configure > Tested-by: Douglas Anderson > Signed-off-by: Rob Clark > --- > This is an alternative/replacement for [1]. What it lacks in elegance > it makes up for in practicality ;-) > > [1] https://patchwork.freedesktop.org/patch/264930/ > > drivers/of/device.c | 22 ++ > 1 file changed, 22 insertions(+) > > diff --git a/drivers/of/device.c b/drivers/of/device.c > index 5957cd4fa262..15ffee00fb22 100644 > --- a/drivers/of/device.c > +++ b/drivers/of/device.c > @@ -72,6 +72,14 @@ int of_device_add(struct platform_device *ofdev) > return device_add(&ofdev->dev); > } > > +static const struct of_device_id iommu_blacklist[] = { > + { .compatible = "qcom,mdp4" }, > + { .compatible = "qcom,mdss" }, > + { .compatible = "qcom,sdm845-mdss" }, > + { .compatible = "qcom,adreno" }, > + {} > +}; Not completely clear to whether this is still needed or not, but this really won't scale. Why can't the driver for these devices override whatever has been setup by default? Rob ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption
On Tue, Dec 4, 2018 at 11:26 PM Tony Battersby wrote: > > On 12/4/18 3:30 PM, Andy Shevchenko wrote: > > On Tue, Dec 4, 2018 at 10:18 PM Matthew Wilcox wrote: > >> On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote: > >>> Also, Andy had issues with the v2 series so it would be good to hear an > >>> update from him? > >> Certainly. > > Hmm... I certainly forgot what was long time ago. > > If I _was_ in Cc list and didn't comment, I'm fine with it. > > > v4 of the patchset is the same as v3 but with the last patch dropped. > Andy had only one minor comment on v3 about the use of division in patch > #8, to which I replied. That was back on August 8. Seems I'm fine with the last version then. -- With Best Regards, Andy Shevchenko ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH V2] mm: Replace all open encodings for NUMA_NO_NODE
On Mon, 2018-11-26 at 17:56 +0530, Anshuman Khandual wrote: > At present there are multiple places where invalid node number is encoded > as -1. Even though implicitly understood it is always better to have macros > in there. Replace these open encodings for an invalid node number with the > global macro NUMA_NO_NODE. This helps remove NUMA related assumptions like > 'invalid node' from various places redirecting them to a common definition. > > Signed-off-by: Anshuman Khandual > --- > Changes in V2: > > - Added inclusion of 'numa.h' header at various places per Andrew > - Updated 'dev_to_node' to use NUMA_NO_NODE instead per Vinod > > Changes in V1: (https://lkml.org/lkml/2018/11/23/485) > > - Dropped OCFS2 changes per Joseph > - Dropped media/video drivers changes per Hans > > RFC - https://patchwork.kernel.org/patch/10678035/ > > Build tested this with multiple cross compiler options like alpha, sparc, > arm64, x86, powerpc, powerpc64le etc with their default config which might > not have compiled tested all driver related changes. I will appreciate > folks giving this a test in their respective build environment. > > All these places for replacement were found by running the following grep > patterns on the entire kernel code. Please let me know if this might have > missed some instances. This might also have replaced some false positives. > I will appreciate suggestions, inputs and review. > > 1. git grep "nid == -1" > 2. git grep "node == -1" > 3. git grep "nid = -1" > 4. 
git grep "node = -1" > > arch/alpha/include/asm/topology.h | 3 ++- > arch/ia64/kernel/numa.c | 2 +- > arch/ia64/mm/discontig.c | 6 +++--- > arch/ia64/sn/kernel/io_common.c | 3 ++- > arch/powerpc/include/asm/pci-bridge.h | 3 ++- > arch/powerpc/kernel/paca.c| 3 ++- > arch/powerpc/kernel/pci-common.c | 3 ++- > arch/powerpc/mm/numa.c| 14 +++--- > arch/powerpc/platforms/powernv/memtrace.c | 5 +++-- > arch/sparc/kernel/auxio_32.c | 3 ++- > arch/sparc/kernel/pci_fire.c | 3 ++- > arch/sparc/kernel/pci_schizo.c| 3 ++- > arch/sparc/kernel/pcic.c | 7 --- > arch/sparc/kernel/psycho_common.c | 3 ++- > arch/sparc/kernel/sbus.c | 3 ++- > arch/sparc/mm/init_64.c | 6 +++--- > arch/sparc/prom/init_32.c | 3 ++- > arch/sparc/prom/init_64.c | 5 +++-- > arch/sparc/prom/tree_32.c | 13 +++-- > arch/sparc/prom/tree_64.c | 19 ++- > arch/x86/include/asm/pci.h| 3 ++- > arch/x86/kernel/apic/x2apic_uv_x.c| 7 --- > arch/x86/kernel/smpboot.c | 3 ++- > arch/x86/platform/olpc/olpc_dt.c | 17 + > drivers/block/mtip32xx/mtip32xx.c | 5 +++-- > drivers/dma/dmaengine.c | 4 +++- > drivers/infiniband/hw/hfi1/affinity.c | 3 ++- > drivers/infiniband/hw/hfi1/init.c | 3 ++- > drivers/iommu/dmar.c | 5 +++-- > drivers/iommu/intel-iommu.c | 3 ++- > drivers/misc/sgi-xp/xpc_uv.c | 3 ++- > drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 +++-- > include/linux/device.h| 2 +- > init/init_task.c | 3 ++- > kernel/kthread.c | 3 ++- > kernel/sched/fair.c | 15 --- > lib/cpumask.c | 3 ++- > mm/huge_memory.c | 13 +++-- > mm/hugetlb.c | 3 ++- > mm/ksm.c | 2 +- > mm/memory.c | 7 --- > mm/memory_hotplug.c | 12 ++-- > mm/mempolicy.c| 2 +- > mm/page_alloc.c | 4 ++-- > mm/page_ext.c | 2 +- > net/core/pktgen.c | 3 ++- > net/qrtr/qrtr.c | 3 ++- > tools/perf/bench/numa.c | 6 +++--- > 48 files changed, 146 insertions(+), 108 deletions(-) Thanks for the patch. It seems to me that you've got a fairly large amount of it wrong though -- perhaps relying just on "git grep" alone is not the best idea. 
The diffstat is not all that big, it is entirely plausible to just review each hunk manually: just do a "git show -U20" to get some context. You get a NAK from me for the OLPC DT part, but I think at least the sparc/prom part also deals with device tree nodes and not NUMA nodes. More inline. Lubo > > diff --git a/arch/alpha/include/asm/topolog
Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption
On 12/4/18 3:30 PM, Andy Shevchenko wrote: > On Tue, Dec 4, 2018 at 10:18 PM Matthew Wilcox wrote: >> On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote: >>> Also, Andy had issues with the v2 series so it would be good to hear an >>> update from him? >> Certainly. > Hmm... I certainly forgot what was long time ago. > If I _was_ in Cc list and didn't comment, I'm fine with it. > v4 of the patchset is the same as v3 but with the last patch dropped. Andy had only one minor comment on v3 about the use of division in patch #8, to which I replied. That was back on August 8. Tony ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption
On Tue, Dec 04, 2018 at 12:28:54PM -0800, Andrew Morton wrote: > On Tue, 4 Dec 2018 12:18:01 -0800 Matthew Wilcox wrote: > > I only had a review comment on 8/9, which I then withdrew during my review > > of patch 9/9. Unless I missed something during my re-review of my > > responses? > > And in 0/9, that 1.3MB allocation. > > Maybe it's using kvmalloc, I didn't look. Oh! That's the mptsas driver doing something utterly awful. Not the fault of this patchset, in any way. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption
On Tue, Dec 4, 2018 at 10:18 PM Matthew Wilcox wrote: > On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote: > > Also, Andy had issues with the v2 series so it would be good to hear an > > update from him? > > Certainly. Hmm... I certainly forgot what was long time ago. If I _was_ in Cc list and didn't comment, I'm fine with it. -- With Best Regards, Andy Shevchenko ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption
On Tue, 4 Dec 2018 12:18:01 -0800 Matthew Wilcox wrote: > On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote: > > On Tue, 4 Dec 2018 11:22:34 -0500 Tony Battersby > > wrote: > > > > > On 11/13/18 1:36 AM, Matthew Wilcox wrote: > > > > On Mon, Nov 12, 2018 at 10:46:35AM -0500, Tony Battersby wrote: > > > >> Prevent a possible endless loop with DMAPOOL_DEBUG enabled if a buggy > > > >> driver corrupts DMA pool memory. > > > >> > > > >> Signed-off-by: Tony Battersby > > > > I like it! Also, here you're using blks_per_alloc in a way which isn't > > > > normally in the performance path, but might be with the right config > > > > options. With that, I withdraw my objection to the previous patch and > > > > > > > > Acked-by: Matthew Wilcox > > > > > > > > Andrew, can you funnel these in through your tree? If you'd rather not, > > > > I don't mind stuffing them into a git tree and asking Linus to pull > > > > for 4.21. > > > > > > > No reply for 3 weeks, so adding Andrew Morton to recipient list. > > > > > > Andrew, I have 9 dmapool patches ready for merging in 4.21. See Matthew > > > Wilcox's request above. > > > > > > > I'll take a look, but I see that this v4 series has several review > > comments from Matthew which remain unresponded to. Please attend to > > that. > > I only had a review comment on 8/9, which I then withdrew during my review > of patch 9/9. Unless I missed something during my re-review of my responses? And in 0/9, that 1.3MB allocation. Maybe it's using kvmalloc, I didn't look. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption
On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote: > On Tue, 4 Dec 2018 11:22:34 -0500 Tony Battersby > wrote: > > > On 11/13/18 1:36 AM, Matthew Wilcox wrote: > > > On Mon, Nov 12, 2018 at 10:46:35AM -0500, Tony Battersby wrote: > > >> Prevent a possible endless loop with DMAPOOL_DEBUG enabled if a buggy > > >> driver corrupts DMA pool memory. > > >> > > >> Signed-off-by: Tony Battersby > > > I like it! Also, here you're using blks_per_alloc in a way which isn't > > > normally in the performance path, but might be with the right config > > > options. With that, I withdraw my objection to the previous patch and > > > > > > Acked-by: Matthew Wilcox > > > > > > Andrew, can you funnel these in through your tree? If you'd rather not, > > > I don't mind stuffing them into a git tree and asking Linus to pull > > > for 4.21. > > > > > No reply for 3 weeks, so adding Andrew Morton to recipient list. > > > > Andrew, I have 9 dmapool patches ready for merging in 4.21. See Matthew > > Wilcox's request above. > > > > I'll take a look, but I see that this v4 series has several review > comments from Matthew which remain unresponded to. Please attend to > that. I only had a review comment on 8/9, which I then withdrew during my review of patch 9/9. Unless I missed something during my re-review of my responses? > Also, Andy had issues with the v2 series so it would be good to hear an > update from him? Certainly. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption
On Tue, 4 Dec 2018 11:22:34 -0500 Tony Battersby wrote: > On 11/13/18 1:36 AM, Matthew Wilcox wrote: > > On Mon, Nov 12, 2018 at 10:46:35AM -0500, Tony Battersby wrote: > >> Prevent a possible endless loop with DMAPOOL_DEBUG enabled if a buggy > >> driver corrupts DMA pool memory. > >> > >> Signed-off-by: Tony Battersby > > I like it! Also, here you're using blks_per_alloc in a way which isn't > > normally in the performance path, but might be with the right config > > options. With that, I withdraw my objection to the previous patch and > > > > Acked-by: Matthew Wilcox > > > > Andrew, can you funnel these in through your tree? If you'd rather not, > > I don't mind stuffing them into a git tree and asking Linus to pull > > for 4.21. > > > No reply for 3 weeks, so adding Andrew Morton to recipient list. > > Andrew, I have 9 dmapool patches ready for merging in 4.21. See Matthew > Wilcox's request above. > I'll take a look, but I see that this v4 series has several review comments from Matthew which remain unresponded to. Please attend to that. Also, Andy had issues with the v2 series so it would be good to hear an update from him? ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] Fix typo. Change tlb_range_add to iotlb_range_add and tlb_sync to iotlb_sync
From: tom Someone forgot to update this comment. Signed-off-by: Tom Murphy --- include/linux/iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a1d28f42cb77..11db18b9ffe8 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -168,8 +168,8 @@ struct iommu_resv_region { * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain - * @tlb_range_add: Add a given iova range to the flush queue for this domain - * @tlb_sync: Flush all queued ranges from the hardware TLBs and empty flush + * @iotlb_range_add: Add a given iova range to the flush queue for this domain + * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush *queue * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping -- 2.11.0 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool
In fact, having got this far in, what I'd quite like to do is to get rid of dma_debug_resize_entries() such that we never need to free things at all, since then we could allocate whole pages as blocks of entries to save on masses of individual slab allocations. On a related topic, is it possible for the user to learn the total entries created at a given point in time? If not, could we add a file in the debugfs folder for this? Hi Robin, I did get as far as pondering that you effectively lose track of utilisation once the low-water-mark of min_free_entries hits 0 and stays I did try your patches and I noticed this, i.e I was hitting the point at which we start to alloc more entries. there - AFAICS it should be sufficient to just expose nr_total_entries as-is, since users can then calculate current and maximum occupancy based on *_free_entries. Does that sound reasonable to you? Sounds ok. I am just interested to know roughly how many DMA buffers we're using in our system. That also indirectly reminds me that this lot is documented in DMA_API.txt, so I should be good and update that too... Thanks, John Cheers, Robin. . ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 5/5] xhci: Use device_iommu_mapped()
From: Joerg Roedel Replace the dev->iommu_group check with a proper function call that better represents its purpose. Cc: Mathias Nyman Signed-off-by: Joerg Roedel --- drivers/usb/host/xhci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index c928dbbff881..5ab97e54d070 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -244,7 +244,7 @@ static void xhci_zero_64b_regs(struct xhci_hcd *xhci) * an iommu. Doing anything when there is no iommu is definitely * unsafe... */ - if (!(xhci->quirks & XHCI_ZERO_64B_REGS) || !dev->iommu_group) + if (!(xhci->quirks & XHCI_ZERO_64B_REGS) || !device_iommu_mapped(dev)) return; xhci_info(xhci, "Zeroing 64bit base registers, expecting fault\n"); -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 4/5] powerpc/iommu: Use device_iommu_mapped()
From: Joerg Roedel Use the new function to replace the open-coded iommu check. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Russell Currey Cc: Sam Bobroff Signed-off-by: Joerg Roedel --- arch/powerpc/kernel/eeh.c | 2 +- arch/powerpc/kernel/iommu.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 6cae6b56ffd6..23fe62f11486 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1472,7 +1472,7 @@ static int dev_has_iommu_table(struct device *dev, void *data) if (!dev) return 0; - if (dev->iommu_group) { + if (device_iommu_mapped(dev)) { *ppdev = pdev; return 1; } diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index f0dc680e659a..48d58d1dcac2 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1086,7 +1086,7 @@ int iommu_add_device(struct device *dev) if (!device_is_registered(dev)) return -ENOENT; - if (dev->iommu_group) { + if (device_iommu_mapped(dev)) { pr_debug("%s: Skipping device %s with iommu group %d\n", __func__, dev_name(dev), iommu_group_id(dev->iommu_group)); @@ -1129,7 +1129,7 @@ void iommu_del_device(struct device *dev) * and we needn't detach them from the associated * IOMMU groups */ - if (!dev->iommu_group) { + if (!device_iommu_mapped(dev)) { pr_debug("iommu_tce: skipping device %s with no tbl\n", dev_name(dev)); return; @@ -1148,7 +1148,7 @@ static int tce_iommu_bus_notifier(struct notifier_block *nb, case BUS_NOTIFY_ADD_DEVICE: return iommu_add_device(dev); case BUS_NOTIFY_DEL_DEVICE: -if (dev->iommu_group) +if (device_iommu_mapped(dev)) iommu_del_device(dev); return 0; default: -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 2/5] iommu/of: Use device_iommu_mapped()
From: Joerg Roedel Use device_iommu_mapped() to check if the device is already mapped by an IOMMU. Signed-off-by: Joerg Roedel --- drivers/iommu/of_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index c5dd63072529..bfcf139503f0 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -220,7 +220,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, * If we have reason to believe the IOMMU driver missed the initial * add_device callback for dev, replay it to get things in order. */ - if (ops && ops->add_device && dev->bus && !dev->iommu_group) + if (ops && ops->add_device && dev->bus && !device_iommu_mapped(dev)) err = ops->add_device(dev); /* Ignore all other errors apart from EPROBE_DEFER */ -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 0/5] Introduce device_iommu_mapped() function
Hi, here is a patch-set to replace the dev->iommu_group checks in the source tree by a proper function call. The pointer checks mostly happen to check whether a device is managed by an IOMMU. For that purpose a pointer check is not very descriptive, so replace it by a function call that makes its purpose readable. This also starts to remove direct access to the dev->iommu_group pointer outside of iommu-code. This is another move towards consolidating the various iommu-related pointers in 'struct device' into one pointer only. Please review. Thanks, Joerg Joerg Roedel (5): driver core: Introduce device_iommu_mapped() function iommu/of: Use device_iommu_mapped() ACPI/IORT: Use device_iommu_mapped() powerpc/iommu: Use device_iommu_mapped() xhci: Use device_iommu_mapped() arch/powerpc/kernel/eeh.c | 2 +- arch/powerpc/kernel/iommu.c | 6 +++--- drivers/acpi/arm64/iort.c | 2 +- drivers/iommu/of_iommu.c| 2 +- drivers/usb/host/xhci.c | 2 +- include/linux/device.h | 10 ++ 6 files changed, 17 insertions(+), 7 deletions(-) -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 3/5] ACPI/IORT: Use device_iommu_mapped()
From: Joerg Roedel Replace the iommu-check with a proper and readable function call. Cc: Lorenzo Pieralisi Signed-off-by: Joerg Roedel --- drivers/acpi/arm64/iort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 70f4e80b9246..0125c8eb9e81 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -805,7 +805,7 @@ static inline int iort_add_device_replay(const struct iommu_ops *ops, { int err = 0; - if (ops->add_device && dev->bus && !dev->iommu_group) + if (ops->add_device && dev->bus && !device_iommu_mapped(dev)) err = ops->add_device(dev); return err; -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 1/5] driver core: Introduce device_iommu_mapped() function
From: Joerg Roedel Some places in the kernel check the iommu_group pointer in 'struct device' in order to find out whether a device is mapped by an IOMMU. This is not a good way to make this check, as the pointer will be moved to 'struct dev_iommu_data'. This way to make the check is also not very readable. Introduce an explicit function to perform this check. Cc: Greg Kroah-Hartman Signed-off-by: Joerg Roedel --- include/linux/device.h | 10 ++ 1 file changed, 10 insertions(+) diff --git a/include/linux/device.h b/include/linux/device.h index 1b25c7a43f4c..6cb4640b6160 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1058,6 +1058,16 @@ static inline struct device *kobj_to_dev(struct kobject *kobj) return container_of(kobj, struct device, kobj); } +/** + * device_iommu_mapped - Returns true when the device DMA is translated + * by an IOMMU + * @dev: Device to perform the check on + */ +static inline bool device_iommu_mapped(struct device *dev) +{ + return (dev->iommu_group != NULL); +} + /* Get the wakeup routines, which depend on struct device */ #include -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool
On 04/12/2018 16:30, John Garry wrote: On 04/12/2018 13:11, Robin Murphy wrote: Hi John, On 03/12/2018 18:23, John Garry wrote: On 03/12/2018 17:28, Robin Murphy wrote: Certain drivers such as large multi-queue network adapters can use pools of mapped DMA buffers larger than the default dma_debug_entry pool of 65536 entries, with the result that merely probing such a device can cause DMA debug to disable itself during boot unless explicitly given an appropriate "dma_debug_entries=..." option. Developers trying to debug some other driver on such a system may not be immediately aware of this, and at worst it can hide bugs if they fail to realise that dma-debug has already disabled itself unexpectedly by the time the code of interest gets to run. Even once they do realise, it can be a bit of a pain to emprirically determine a suitable number of preallocated entries to configure without massively over-allocating. There's really no need for such a static limit, though, since we can quite easily expand the pool at runtime in those rare cases that the preallocated entries are insufficient, which is arguably the least surprising and most useful behaviour. Hi Robin, Do you have an idea on shrinking the pool again when the culprit driver is removed, i.e. we have so many unused debug entries now available? I honestly don't believe it's worth the complication. This is a development feature with significant overheads already, so there's not an awful lot to gain by trying to optimise memory usage. If a system can ever load a driver that makes hundreds of thousands of simultaneous mappings, it can almost certainly spare 20-odd megabytes of RAM for the corresponding debug entries in perpetuity. Sure, it does mean you'd need to reboot to recover memory from a major leak, but that's mostly true of the current behaviour too, and rebooting during driver development is hardly an unacceptable inconvenience. 
ok, I just thought that it would not be too difficult to implement this on the dma entry free path. True, in the current code it wouldn't be all that hard, but it feels more worthwhile to optimise for allocation rather than freeing, and as soon as we start allocating memory for multiple entries at once, trying to free anything becomes extremely challenging. In fact, having got this far in, what I'd quite like to do is to get rid of dma_debug_resize_entries() such that we never need to free things at all, since then we could allocate whole pages as blocks of entries to save on masses of individual slab allocations. On a related topic, is it possible for the user to learn the total entries created at a given point in time? If not, could we add a file in the debugfs folder for this? I did get as far as pondering that you effectively lose track of utilisation once the low-water-mark of min_free_entries hits 0 and stays there - AFAICS it should be sufficient to just expose nr_total_entries as-is, since users can then calculate current and maximum occupancy based on *_free_entries. Does that sound reasonable to you? That also indirectly reminds me that this lot is documented in DMA_API.txt, so I should be good and update that too... Cheers, Robin. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 01/23] dma-mapping: provide a generic DMA_MAPPING_ERROR
On 30/11/2018 13:22, Christoph Hellwig wrote: Error handling of the dma_map_single and dma_map_page APIs is a little problematic at the moment, in that we use different encodings in the returned dma_addr_t to indicate an error. That means we require an additional indirect call to figure out if a dma mapping call returned an error, and a lot of boilerplate code to implement these semantics. Instead return the maximum addressable value as the error. As long as we don't allow mapping single-byte ranges with single-byte alignment this value can never be a valid return. Additionaly if drivers do not check the return value from the dma_map* routines this values means they will generally not be pointed to actual memory. Once the default value is added here we can start removing the various mapping_error methods and just rely on this generic check. Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0f81c713f6e9..46bd612d929e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -133,6 +133,8 @@ struct dma_map_ops { u64 (*get_required_mask)(struct device *dev); }; +#define DMA_MAPPING_ERROR (~(dma_addr_t)0) + extern const struct dma_map_ops dma_direct_ops; extern const struct dma_map_ops dma_virt_ops; @@ -576,6 +578,10 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) const struct dma_map_ops *ops = get_dma_ops(dev); debug_dma_mapping_error(dev, dma_addr); + + if (dma_addr == DMA_MAPPING_ERROR) + return 1; + if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); return 0; I'd have been inclined to put the default check here, i.e. - return 0 + return dma_addr == DMA_MAPPING_ERROR such that the callback retains full precedence and we don't have to deal with the non-trivial removals immediately if it comes to it. 
Not that it makes a vast difference though, so either way, Reviewed-by: Robin Murphy ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3, RFC] iommu/io-pgtable-arm-v7s: Use page_frag to request DMA32 memory
On Tue, Dec 04, 2018 at 04:23:00PM +0800, Nicolas Boichat wrote: > IOMMUs using ARMv7 short-descriptor format require page tables > (level 1 and 2) to be allocated within the first 4GB of RAM, even > on 64-bit systems. > > For level 1/2 tables, ensure GFP_DMA32 is used if CONFIG_ZONE_DMA32 > is defined (e.g. on arm64 platforms). > > For level 2 tables (1 KB), we use page_frag to allocate these pages, > as we cannot directly use kmalloc (no slab cache for GFP_DMA32) or > kmem_cache (mm/ code treats GFP_DMA32 as an invalid flag). > > One downside is that we only free the allocated page if all the > 4 fragments (4 IOMMU L2 tables) are freed, but given that we > usually only allocate limited number of IOMMU L2 tables, this > should not have too much impact on memory usage: In the absolute > worst case (4096 L2 page tables, each on their own 4K page), > we would use 16 MB of memory for 4 MB of L2 tables. > > Also, print an error when the physical address does not fit in > 32-bit, to make debugging easier in the future. > > Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32") > Signed-off-by: Nicolas Boichat > --- > > As an alternative to the series [1], which adds support for GFP_DMA32 > to kmem_cache in mm/. IMHO the solution in [1] is cleaner and more > efficient, as it allows freed fragments (L2 tables) to be reused, but > this approach does not require any core change. 
> > [1] https://patchwork.kernel.org/cover/10677529/, 3 patches > > drivers/iommu/io-pgtable-arm-v7s.c | 32 -- > 1 file changed, 17 insertions(+), 15 deletions(-) > > diff --git a/drivers/iommu/io-pgtable-arm-v7s.c > b/drivers/iommu/io-pgtable-arm-v7s.c > index 445c3bde04800c..0de6a51eb6755f 100644 > --- a/drivers/iommu/io-pgtable-arm-v7s.c > +++ b/drivers/iommu/io-pgtable-arm-v7s.c > @@ -161,6 +161,12 @@ > > #define ARM_V7S_TCR_PD1 BIT(5) > > +#ifdef CONFIG_ZONE_DMA32 > +#define ARM_V7S_TABLE_GFP_DMA GFP_DMA32 > +#else > +#define ARM_V7S_TABLE_GFP_DMA GFP_DMA > +#endif We may as well include __GFP_ZERO in here too. Anyway, this looks alright to me: Acked-by: Will Deacon But it sounds like you're still on the fence about this patch, so I won't pick it up unless you ask explicitly. Will ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool
On 04/12/2018 14:29, Christoph Hellwig wrote: + for (retry_count = 0; ; retry_count++) { + spin_lock_irqsave(&free_entries_lock, flags); + + if (num_free_entries > 0) + break; spin_unlock_irqrestore(&free_entries_lock, flags); Taking a spinlock just to read a single integer value doesn't really help anything. If the freelist is non-empty we break out with the lock still held in order to actually allocate our entry - only if there are no free entries left do we drop the lock in order to handle the failure. This much is just the original logic shuffled around a bit (with the tweak that testing num_free_entries seemed justifiably simpler than the original list_empty() check). + + if (retry_count < DMA_DEBUG_DYNAMIC_RETRIES && + !prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES)) Don't we need GFP_ATOMIC here? Also why do we need the retries? Ah, right, we may be outside our own spinlock, but of course the whole DMA API call which got us here might be under someone else's and/or in a non-sleeping context - I'll fix that. The number of retries is just to bound the loop due to its inherent raciness - since we drop the lock to create more entries, under pathological conditions by the time we get back in to grab one they could have all gone. 2 retries (well, strictly it's 1 try and 1 retry) was an entirely arbitrary choice just to accommodate that happening very occasionally by chance. However, if the dynamic allocations need GFP_ATOMIC for external reasons anyway, then I don't need the lock-juggling that invites that race in the first place, and the whole loop disappears again. Neat! Robin. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool
On 04/12/2018 13:11, Robin Murphy wrote: Hi John, On 03/12/2018 18:23, John Garry wrote: On 03/12/2018 17:28, Robin Murphy wrote: Certain drivers such as large multi-queue network adapters can use pools of mapped DMA buffers larger than the default dma_debug_entry pool of 65536 entries, with the result that merely probing such a device can cause DMA debug to disable itself during boot unless explicitly given an appropriate "dma_debug_entries=..." option. Developers trying to debug some other driver on such a system may not be immediately aware of this, and at worst it can hide bugs if they fail to realise that dma-debug has already disabled itself unexpectedly by the time the code of interest gets to run. Even once they do realise, it can be a bit of a pain to empirically determine a suitable number of preallocated entries to configure without massively over-allocating. There's really no need for such a static limit, though, since we can quite easily expand the pool at runtime in those rare cases that the preallocated entries are insufficient, which is arguably the least surprising and most useful behaviour. Hi Robin, Do you have an idea on shrinking the pool again when the culprit driver is removed, i.e. we have so many unused debug entries now available? I honestly don't believe it's worth the complication. This is a development feature with significant overheads already, so there's not an awful lot to gain by trying to optimise memory usage. If a system can ever load a driver that makes hundreds of thousands of simultaneous mappings, it can almost certainly spare 20-odd megabytes of RAM for the corresponding debug entries in perpetuity. Sure, it does mean you'd need to reboot to recover memory from a major leak, but that's mostly true of the current behaviour too, and rebooting during driver development is hardly an unacceptable inconvenience. ok, I just thought that it would not be too difficult to implement this on the dma entry free path. 
In fact, having got this far in, what I'd quite like to do is to get rid of dma_debug_resize_entries() such that we never need to free things at all, since then we could allocate whole pages as blocks of entries to save on masses of individual slab allocations. On a related topic, is it possible for the user to learn the total entries created at a given point in time? If not, could we add a file in the debugfs folder for this? Thanks, John Robin. Thanks, John Signed-off-by: Robin Murphy --- kernel/dma/debug.c | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index de5db800dbfc..46cc075aec99 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -47,6 +47,9 @@ #ifndef PREALLOC_DMA_DEBUG_ENTRIES #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) #endif +/* If the pool runs out, try this many times to allocate this many new entries */ +#define DMA_DEBUG_DYNAMIC_ENTRIES 256 +#define DMA_DEBUG_DYNAMIC_RETRIES 2 enum { dma_debug_single, @@ -702,12 +705,21 @@ static struct dma_debug_entry *dma_entry_alloc(void) { struct dma_debug_entry *entry; unsigned long flags; +int retry_count; -spin_lock_irqsave(&free_entries_lock, flags); +for (retry_count = 0; ; retry_count++) { +spin_lock_irqsave(&free_entries_lock, flags); + +if (num_free_entries > 0) +break; -if (list_empty(&free_entries)) { -global_disable = true; spin_unlock_irqrestore(&free_entries_lock, flags); + +if (retry_count < DMA_DEBUG_DYNAMIC_RETRIES && +!prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES)) +continue; + +global_disable = true; pr_err("debugging out of memory - disabling\n"); return NULL; } . ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 9/9] iommu/tegra: Use helper functions to access dev->iommu_fwspec
From: Joerg Roedel Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct. Cc: Thierry Reding Signed-off-by: Joerg Roedel --- drivers/iommu/tegra-smmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 0d03341317c4..0d026cb2dfff 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -846,7 +846,7 @@ static struct iommu_group *tegra_smmu_group_get(struct tegra_smmu *smmu, static struct iommu_group *tegra_smmu_device_group(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu *smmu = dev->archdata.iommu; struct iommu_group *group; -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 7/9] iommu/of: Use helper functions to access dev->iommu_fwspec
From: Joerg Roedel Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct. Signed-off-by: Joerg Roedel --- drivers/iommu/of_iommu.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index c5dd63072529..38232250b5f4 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -164,7 +164,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np) { const struct iommu_ops *ops = NULL; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int err = NO_IOMMU; if (!master_np) @@ -208,6 +208,9 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, } } + /* The fwspec pointer changed, read it again */ + fwspec = dev_iommu_fwspec_get(dev); + /* * Two success conditions can be represented by non-negative err here: * >0 : there is no IOMMU, or one was unavailable for non-fatal reasons @@ -215,7 +218,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, * <0 : any actual error */ if (!err) - ops = dev->iommu_fwspec->ops; + ops = fwspec->ops; /* * If we have reason to believe the IOMMU driver missed the initial * add_device callback for dev, replay it to get things in order. -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 2/9] ACPI/IORT: Use helper functions to access dev->iommu_fwspec
From: Joerg Roedel Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct. Cc: Lorenzo Pieralisi Signed-off-by: Joerg Roedel --- drivers/acpi/arm64/iort.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 70f4e80b9246..754a67ba49e5 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -779,7 +779,7 @@ static inline bool iort_iommu_driver_enabled(u8 type) static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev) { struct acpi_iort_node *iommu; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); iommu = iort_get_iort_node(fwspec->iommu_fwnode); @@ -824,6 +824,7 @@ static inline int iort_add_device_replay(const struct iommu_ops *ops, */ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct acpi_iort_its_group *its; struct acpi_iort_node *iommu_node, *its_node = NULL; int i, resv = 0; @@ -841,9 +842,9 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) * a given PCI or named component may map IDs to. 
*/ - for (i = 0; i < dev->iommu_fwspec->num_ids; i++) { + for (i = 0; i < fwspec->num_ids; i++) { its_node = iort_node_map_id(iommu_node, - dev->iommu_fwspec->ids[i], + fwspec->ids[i], NULL, IORT_MSI_TYPE); if (its_node) break; @@ -1036,6 +1037,7 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) */ const struct iommu_ops *iort_iommu_configure(struct device *dev) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct acpi_iort_node *node, *parent; const struct iommu_ops *ops; u32 streamid = 0; @@ -1045,7 +1047,7 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) * If we already translated the fwspec there * is nothing left to do, return the iommu_ops. */ - ops = iort_fwspec_iommu_ops(dev->iommu_fwspec); + ops = iort_fwspec_iommu_ops(fwspec); if (ops) return ops; @@ -1084,7 +1086,7 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) * add_device callback for dev, replay it to get things in order. */ if (!err) { - ops = iort_fwspec_iommu_ops(dev->iommu_fwspec); + ops = iort_fwspec_iommu_ops(fwspec); err = iort_add_device_replay(ops, dev); } -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 8/9] iommu/qcom: Use helper functions to access dev->iommu_fwspec
From: Joerg Roedel Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct. Cc: Rob Clark Signed-off-by: Joerg Roedel --- drivers/iommu/qcom_iommu.c | 18 ++ 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index ee70e9921cf1..f437bf9c5ebd 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -354,7 +354,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); int ret; @@ -365,7 +366,7 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev /* Ensure that the domain is finalized */ pm_runtime_get_sync(qcom_iommu->dev); - ret = qcom_iommu_init_domain(domain, qcom_iommu, dev->iommu_fwspec); + ret = qcom_iommu_init_domain(domain, qcom_iommu, fwspec); pm_runtime_put_sync(qcom_iommu->dev); if (ret < 0) return ret; @@ -387,7 +388,7 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); unsigned i; @@ -500,7 +501,7 @@ static bool qcom_iommu_capable(enum iommu_cap cap) static int qcom_iommu_add_device(struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec); + struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev)); struct 
iommu_group *group; struct device_link *link; @@ -531,7 +532,7 @@ static int qcom_iommu_add_device(struct device *dev) static void qcom_iommu_remove_device(struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec); + struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev)); if (!qcom_iommu) return; @@ -543,6 +544,7 @@ static void qcom_iommu_remove_device(struct device *dev) static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct qcom_iommu_dev *qcom_iommu; struct platform_device *iommu_pdev; unsigned asid = args->args[0]; @@ -568,14 +570,14 @@ static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) WARN_ON(asid > qcom_iommu->num_ctxs)) return -EINVAL; - if (!dev->iommu_fwspec->iommu_priv) { - dev->iommu_fwspec->iommu_priv = qcom_iommu; + if (!fwspec->iommu_priv) { + fwspec->iommu_priv = qcom_iommu; } else { /* make sure devices iommus dt node isn't referring to * multiple different iommu devices. Multiple context * banks are ok, but multiple devices are not: */ - if (WARN_ON(qcom_iommu != dev->iommu_fwspec->iommu_priv)) + if (WARN_ON(qcom_iommu != fwspec->iommu_priv)) return -EINVAL; } -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 0/9] Access dev->iommu_fwspec through functions
Hi, here is a patch-set to wrap accesses to dev->iommu_fwspec into functions. This will make it easier to move the pointer into a separate struct and consolidate the iommu-related pointers in 'struct device'. Regards, Joerg Joerg Roedel (9): iommu: Introduce wrappers around dev->iommu_fwspec ACPI/IORT: Use helper functions to access dev->iommu_fwspec iommu/arm-smmu: Use helper functions to access dev->iommu_fwspec iommu/dma: Use helper functions to access dev->iommu_fwspec iommu/ipmmu-vmsa: Use helper functions to access dev->iommu_fwspec iommu/mediatek: Use helper functions to access dev->iommu_fwspec iommu/of: Use helper functions to access dev->iommu_fwspec iommu/qcom: Use helper functions to access dev->iommu_fwspec iommu/tegra: Use helper functions to access dev->iommu_fwspec drivers/acpi/arm64/iort.c| 12 +++- drivers/iommu/arm-smmu-v3.c | 16 +--- drivers/iommu/arm-smmu.c | 12 ++-- drivers/iommu/dma-iommu.c| 2 +- drivers/iommu/iommu.c| 14 +++--- drivers/iommu/ipmmu-vmsa.c | 12 drivers/iommu/mtk_iommu.c| 21 - drivers/iommu/mtk_iommu_v1.c | 28 drivers/iommu/of_iommu.c | 7 +-- drivers/iommu/qcom_iommu.c | 18 ++ drivers/iommu/tegra-smmu.c | 2 +- include/linux/iommu.h| 11 +++ 12 files changed, 93 insertions(+), 62 deletions(-) -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 5/9] iommu/ipmmu-vmsa: Use helper functions to access dev->iommu_fwspec
From: Joerg Roedel Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct. Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index ddf3a492e1d5..679f18bf0634 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -81,7 +81,9 @@ static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom) static struct ipmmu_vmsa_device *to_ipmmu(struct device *dev) { - return dev->iommu_fwspec ? dev->iommu_fwspec->iommu_priv : NULL; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + return fwspec ? fwspec->iommu_priv : NULL; } #define TLB_LOOP_TIMEOUT 100 /* 100us */ @@ -643,7 +645,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain) static int ipmmu_attach_device(struct iommu_domain *io_domain, struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); unsigned int i; @@ -692,7 +694,7 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, static void ipmmu_detach_device(struct iommu_domain *io_domain, struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); unsigned int i; @@ -744,13 +746,15 @@ static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain, static int ipmmu_init_platform_device(struct device *dev, struct of_phandle_args *args) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct platform_device *ipmmu_pdev; ipmmu_pdev = of_find_device_by_node(args->np); if (!ipmmu_pdev) return -ENODEV; - dev->iommu_fwspec->iommu_priv = 
platform_get_drvdata(ipmmu_pdev); + fwspec->iommu_priv = platform_get_drvdata(ipmmu_pdev); + return 0; } -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 6/9] iommu/mediatek: Use helper functions to access dev->iommu_fwspec
From: Joerg Roedel Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct. Cc: Matthias Brugger Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c| 21 - drivers/iommu/mtk_iommu_v1.c | 28 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 44bd5b9166bb..0783fba05d19 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -244,7 +244,7 @@ static void mtk_iommu_config(struct mtk_iommu_data *data, { struct mtk_smi_larb_iommu*larb_mmu; unsigned int larbid, portid; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int i; for (i = 0; i < fwspec->num_ids; ++i) { @@ -336,7 +336,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); - struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv; + struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv; if (!data) return -ENODEV; @@ -355,7 +355,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, static void mtk_iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv; + struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv; if (!data) return; @@ -417,13 +417,14 @@ static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain, static int mtk_iommu_add_device(struct device *dev) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data; struct iommu_group *group; - if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops) + if (!fwspec || fwspec->ops != &mtk_iommu_ops) return -ENODEV; /* Not a iommu client device */ - data = dev->iommu_fwspec->iommu_priv; + data = fwspec->iommu_priv; iommu_device_link(&data->iommu, 
dev); group = iommu_group_get_for_dev(dev); @@ -436,12 +437,13 @@ static int mtk_iommu_add_device(struct device *dev) static void mtk_iommu_remove_device(struct device *dev) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data; - if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops) + if (!fwspec || fwspec->ops != &mtk_iommu_ops) return; - data = dev->iommu_fwspec->iommu_priv; + data = fwspec->iommu_priv; iommu_device_unlink(&data->iommu, dev); iommu_group_remove_device(dev); @@ -468,6 +470,7 @@ static struct iommu_group *mtk_iommu_device_group(struct device *dev) static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) { + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct platform_device *m4updev; if (args->args_count != 1) { @@ -476,13 +479,13 @@ static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) return -EINVAL; } - if (!dev->iommu_fwspec->iommu_priv) { + if (!fwspec->iommu_priv) { /* Get the m4u device */ m4updev = of_find_device_by_node(args->np); if (WARN_ON(!m4updev)) return -EINVAL; - dev->iommu_fwspec->iommu_priv = platform_get_drvdata(m4updev); + fwspec->iommu_priv = platform_get_drvdata(m4updev); } return iommu_fwspec_add_ids(dev, args->args, 1); diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 0e780848f59b..d22a2a145bd2 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -206,7 +206,7 @@ static void mtk_iommu_config(struct mtk_iommu_data *data, { struct mtk_smi_larb_iommu*larb_mmu; unsigned int larbid, portid; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int i; for (i = 0; i < fwspec->num_ids; ++i) { @@ -271,7 +271,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); - struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv; 
+ struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv; int ret; if (!data) @@ -293,7 +293,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, static void mtk_iommu_detach_device(stru
[PATCH 3/9] iommu/arm-smmu: Use helper functions to access dev->iommu_fwspec
From: Joerg Roedel Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct. Cc: Will Deacon Cc: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/arm-smmu-v3.c | 16 +--- drivers/iommu/arm-smmu.c| 12 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 6947ccf26512..8f2d3a30d090 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1691,24 +1691,26 @@ static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec) static void arm_smmu_detach_dev(struct device *dev) { - struct arm_smmu_master_data *master = dev->iommu_fwspec->iommu_priv; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct arm_smmu_master_data *master = fwspec->iommu_priv; master->ste.assigned = false; - arm_smmu_install_ste_for_dev(dev->iommu_fwspec); + arm_smmu_install_ste_for_dev(fwspec); } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret = 0; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master_data *master; struct arm_smmu_strtab_ent *ste; - if (!dev->iommu_fwspec) + if (!fwspec) return -ENOENT; - master = dev->iommu_fwspec->iommu_priv; + master = fwspec->iommu_priv; smmu = master->smmu; ste = &master->ste; @@ -1748,7 +1750,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) ste->s2_cfg = &smmu_domain->s2_cfg; } - arm_smmu_install_ste_for_dev(dev->iommu_fwspec); + arm_smmu_install_ste_for_dev(fwspec); out_unlock: mutex_unlock(&smmu_domain->init_mutex); return ret; @@ -1839,7 +1841,7 @@ static int arm_smmu_add_device(struct device *dev) int i, ret; struct arm_smmu_device *smmu; struct arm_smmu_master_data *master; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct 
iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct iommu_group *group; if (!fwspec || fwspec->ops != &arm_smmu_ops) @@ -1890,7 +1892,7 @@ static int arm_smmu_add_device(struct device *dev) static void arm_smmu_remove_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_master_data *master; struct arm_smmu_device *smmu; diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 5a28ae892504..988d0362cd03 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1103,7 +1103,7 @@ static bool arm_smmu_free_sme(struct arm_smmu_device *smmu, int idx) static int arm_smmu_master_alloc_smes(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_master_cfg *cfg = fwspec->iommu_priv; struct arm_smmu_device *smmu = cfg->smmu; struct arm_smmu_smr *smrs = smmu->smrs; @@ -1206,7 +1206,7 @@ static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain, static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); @@ -1380,7 +1380,7 @@ static int arm_smmu_add_device(struct device *dev) { struct arm_smmu_device *smmu; struct arm_smmu_master_cfg *cfg; - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int i, ret; if (using_legacy_binding) { @@ -1391,7 +1391,7 @@ static int arm_smmu_add_device(struct device *dev) * will allocate/initialise a new one. Thus we need to update fwspec for * later use. 
*/ - fwspec = dev->iommu_fwspec; + fwspec = dev_iommu_fwspec_get(dev); if (ret) goto out_free; } else if (fwspec && fwspec->ops == &arm_smmu_ops) { @@ -1445,7 +1445,7 @@ static int arm_smmu_add_device(struct device *dev) static void arm_smmu_remove_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_master_cfg *cfg; struct arm_smmu_device *smmu; @@ -1465,7 +1465,7 @@ static void arm_smmu_remove_device(struct device *dev) static struc
[PATCH 4/9] iommu/dma: Use helper functions to access dev->iommu_fwspec
From: Joerg Roedel Use the new helpers dev_iommu_fwspec_get()/set() to access the dev->iommu_fwspec pointer. This makes it easier to move that pointer later into another struct. Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index d1b04753b204..2b1c6912fbcc 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -177,7 +177,7 @@ EXPORT_SYMBOL(iommu_put_dma_cookie); void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { - if (!is_of_node(dev->iommu_fwspec->iommu_fwnode)) + if (!is_of_node(dev_iommu_fwspec_get(dev)->iommu_fwnode)) iort_iommu_msi_get_resv_regions(dev, list); } -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 1/9] iommu: Introduce wrappers around dev->iommu_fwspec
From: Joerg Roedel These wrappers will be used to easily change the location of the field later when all users are converted. Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 14 +++--- include/linux/iommu.h | 11 +++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index edbdf5d6962c..4cfd407b972c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1976,7 +1976,7 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); if (fwspec) return ops == fwspec->ops ? 0 : -EINVAL; @@ -1988,26 +1988,26 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, of_node_get(to_of_node(iommu_fwnode)); fwspec->iommu_fwnode = iommu_fwnode; fwspec->ops = ops; - dev->iommu_fwspec = fwspec; + dev_iommu_fwspec_set(dev, fwspec); return 0; } EXPORT_SYMBOL_GPL(iommu_fwspec_init); void iommu_fwspec_free(struct device *dev) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); if (fwspec) { fwnode_handle_put(fwspec->iommu_fwnode); kfree(fwspec); - dev->iommu_fwspec = NULL; + dev_iommu_fwspec_set(dev, NULL); } } EXPORT_SYMBOL_GPL(iommu_fwspec_free); int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { - struct iommu_fwspec *fwspec = dev->iommu_fwspec; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); size_t size; int i; @@ -2016,11 +2016,11 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) size = offsetof(struct iommu_fwspec, ids[fwspec->num_ids + num_ids]); if (size > sizeof(*fwspec)) { - fwspec = krealloc(dev->iommu_fwspec, size, GFP_KERNEL); + fwspec = krealloc(fwspec, size, GFP_KERNEL); if (!fwspec) return -ENOMEM; - dev->iommu_fwspec = fwspec; + 
dev_iommu_fwspec_set(dev, fwspec); } for (i = 0; i < num_ids; i++) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a1d28f42cb77..f93fdcf7d130 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -398,6 +398,17 @@ void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids); const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode); +static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) +{ + return dev->iommu_fwspec; +} + +static inline void dev_iommu_fwspec_set(struct device *dev, + struct iommu_fwspec *fwspec) +{ + dev->iommu_fwspec = fwspec; +} + #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; -- 2.17.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables
On Tue, Dec 04, 2018 at 05:37:13PM +0800, Nicolas Boichat wrote: > On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat wrote: > > > > This is a follow-up to the discussion in [1], to make sure that the page > > tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit > > physical address space. > > > > [1] > > https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html > > Hi everyone, > > Let's try to summarize here. > > First, we confirmed that this is a regression, and IOMMU errors happen > on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13). > The issue most likely starts from ad67f5a6545f ("arm64: replace > ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number > of Mediatek platforms (and maybe others?). > > We have a few options here: > 1. This series [2], that adds support for GFP_DMA32 slab caches, > _without_ adding kmalloc caches (since there are no users of > kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on > the 3 patches, and AFAICT this solution works fine. > 2. genalloc. That works, but unless we preallocate 4MB for L2 tables > (which is wasteful as we usually only need a handful of L2 tables), > we'll need changes in the core (use GFP_ATOMIC) to allow allocating on > demand, and as it stands we'd have no way to shrink the allocation. > 3. page_frag [3]. That works fine, and the code is quite simple. One > drawback is that fragments in partially freed pages cannot be reused > (from limited experiments, I see that IOMMU L2 tables are rarely > freed, so it's unlikely a whole page would get freed). But given the > low number of L2 tables, maybe we can live with that. > > I think 2 is out. Any preference between 1 and 3? I think 1 makes > better use of the memory, so that'd be my preference. But I'm probably > missing something. FWIW, I'm open to any solution at this point, since I'd like to see this regression fixed. 
(1) does sound better longer-term, but (3) looks pretty much ready to do afaict. Will ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption
On 11/13/18 1:36 AM, Matthew Wilcox wrote: > On Mon, Nov 12, 2018 at 10:46:35AM -0500, Tony Battersby wrote: >> Prevent a possible endless loop with DMAPOOL_DEBUG enabled if a buggy >> driver corrupts DMA pool memory. >> >> Signed-off-by: Tony Battersby > I like it! Also, here you're using blks_per_alloc in a way which isn't > normally in the performance path, but might be with the right config > options. With that, I withdraw my objection to the previous patch and > > Acked-by: Matthew Wilcox > > Andrew, can you funnel these in through your tree? If you'd rather not, > I don't mind stuffing them into a git tree and asking Linus to pull > for 4.21. > No reply for 3 weeks, so adding Andrew Morton to recipient list. Andrew, I have 9 dmapool patches ready for merging in 4.21. See Matthew Wilcox's request above. Tony Battersby ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
merge swiotlb support into dma_direct_ops
Hi Konrad and others, can you review this series? It merges the swiotlb support into the DMA direct ops so that we don't have to duplicate the dma mapping logic in multiple places. Note that this is based on the dma_mapping_error series for which I'd still like to collect a few more reviews, so pulling the git tree might be easiest for testing. The git tree is available here: git://git.infradead.org/users/hch/misc.git swiotlb-merge Gitweb: http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/swiotlb-merge ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 1/5] swiotlb: remove SWIOTLB_MAP_ERROR
We can use DMA_MAPPING_ERROR instead, which already maps to the same value. Signed-off-by: Christoph Hellwig --- drivers/xen/swiotlb-xen.c | 4 ++-- include/linux/swiotlb.h | 3 --- kernel/dma/swiotlb.c | 4 ++-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 6dc969d5ea2f..833e80b46eb2 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -403,7 +403,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) + if (map == DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; dev_addr = xen_phys_to_bus(map); @@ -572,7 +572,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, sg_phys(sg), sg->length, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { + if (map == DMA_MAPPING_ERROR) { dev_warn(hwdev, "swiotlb buffer is full\n"); /* Don't panic here, we expect map_sg users to do proper error handling. 
*/ diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index a387b59640a4..14aec0b70dd9 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -46,9 +46,6 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -/* define the last possible byte of physical address space as a mapping error */ -#define SWIOTLB_MAP_ERROR (~(phys_addr_t)0x0) - extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, phys_addr_t phys, size_t size, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index ff1ce81bb623..19ba8e473d71 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -526,7 +526,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); - return SWIOTLB_MAP_ERROR; + return DMA_MAPPING_ERROR; found: spin_unlock_irqrestore(&io_tlb_lock, flags); @@ -637,7 +637,7 @@ static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys, /* Oh well, have to allocate and map a bounce buffer. */ *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), *phys, size, dir, attrs); - if (*phys == SWIOTLB_MAP_ERROR) + if (*phys == DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ -- 2.19.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 5/5] dma-direct: merge swiotlb_dma_ops into the dma_direct code
While the dma-direct code is (relatively) clean and simple we actually have to use the swiotlb ops for the mapping on many architectures due to devices with addressing limits. Instead of keeping two implementations around this commit allows the dma-direct implementation to call the swiotlb bounce buffering functions and thus share the guts of the mapping implementation. This also simplified the dma-mapping setup on a few architectures where we don't have to differenciate which implementation to use. Signed-off-by: Christoph Hellwig --- arch/arm64/mm/dma-mapping.c | 2 +- arch/ia64/hp/common/hwsw_iommu.c | 2 +- arch/ia64/hp/common/sba_iommu.c | 6 +- arch/ia64/kernel/dma-mapping.c | 2 +- arch/mips/include/asm/dma-mapping.h | 2 - arch/powerpc/kernel/dma-swiotlb.c| 16 +- arch/riscv/include/asm/dma-mapping.h | 15 -- arch/x86/kernel/pci-swiotlb.c| 4 +- arch/x86/mm/mem_encrypt.c| 7 - arch/x86/pci/sta2x11-fixup.c | 1 - include/linux/dma-direct.h | 12 ++ include/linux/swiotlb.h | 68 +++- kernel/dma/direct.c | 113 + kernel/dma/swiotlb.c | 227 ++- 14 files changed, 144 insertions(+), 333 deletions(-) delete mode 100644 arch/riscv/include/asm/dma-mapping.h diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 4c0f498069e8..e4effbb243b1 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -549,7 +549,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { if (!dev->dma_ops) - dev->dma_ops = &swiotlb_dma_ops; + dev->dma_ops = &dma_direct_ops; dev->dma_coherent = coherent; __iommu_setup_dma_ops(dev, dma_base, size, iommu); diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c index 58969039bed2..f40ca499b246 100644 --- a/arch/ia64/hp/common/hwsw_iommu.c +++ b/arch/ia64/hp/common/hwsw_iommu.c @@ -38,7 +38,7 @@ static inline int use_swiotlb(struct device *dev) const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev) { if (use_swiotlb(dev)) 
- return &swiotlb_dma_ops; + return &dma_direct_ops; return &sba_dma_ops; } EXPORT_SYMBOL(hwsw_dma_get_ops); diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 0d21c0b5b23d..5ee74820a0f6 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -2065,8 +2065,6 @@ static int __init acpi_sba_ioc_init_acpi(void) /* This has to run before acpi_scan_init(). */ arch_initcall(acpi_sba_ioc_init_acpi); -extern const struct dma_map_ops swiotlb_dma_ops; - static int __init sba_init(void) { @@ -2080,7 +2078,7 @@ sba_init(void) * a successful kdump kernel boot is to use the swiotlb. */ if (is_kdump_kernel()) { - dma_ops = &swiotlb_dma_ops; + dma_ops = &dma_direct_ops; if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0) panic("Unable to initialize software I/O TLB:" " Try machvec=dig boot option"); @@ -2102,7 +2100,7 @@ sba_init(void) * If we didn't find something sba_iommu can claim, we * need to setup the swiotlb and switch to the dig machvec. 
*/ - dma_ops = &swiotlb_dma_ops; + dma_ops = &dma_direct_ops; if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0) panic("Unable to find SBA IOMMU or initialize " "software I/O TLB: Try machvec=dig boot option"); diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 36dd6aa6d759..80cd3e1ea95a 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -36,7 +36,7 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, void __init swiotlb_dma_init(void) { - dma_ops = &swiotlb_dma_ops; + dma_ops = &dma_direct_ops; swiotlb_init(1); } #endif diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h index b4c477eb46ce..69f914667f3e 100644 --- a/arch/mips/include/asm/dma-mapping.h +++ b/arch/mips/include/asm/dma-mapping.h @@ -10,8 +10,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { #if defined(CONFIG_MACH_JAZZ) return &jazz_dma_ops; -#elif defined(CONFIG_SWIOTLB) - return &swiotlb_dma_ops; #else return &dma_direct_ops; #endif diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 3d8df2cf8be9..21a869fb9734 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -50,15 +50,15 @@ const struct dma_map_ops
[PATCH 4/5] dma-direct: use dma_direct_map_page to implement dma_direct_map_sg
No need to duplicate the mapping logic. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 14 +- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index edb24f94ea1e..d45306473c90 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -217,6 +217,7 @@ static void dma_direct_sync_single_for_device(struct device *dev, arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); } +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) static void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -229,6 +230,7 @@ static void dma_direct_sync_sg_for_device(struct device *dev, for_each_sg(sgl, sg, nents, i) arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); } +#endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) @@ -294,19 +296,13 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, struct scatterlist *sg; for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg_page(sg)); - - sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg)); - if (unlikely(dev && !dma_capable(dev, sg_dma_address(sg), - sg->length))) { - report_addr(dev, sg_dma_address(sg), sg->length); + sg->dma_address = dma_direct_map_page(dev, sg_page(sg), + sg->offset, sg->length, dir, attrs); + if (sg->dma_address == DMA_MAPPING_ERROR) return 0; - } sg_dma_len(sg) = sg->length; } - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_sg_for_device(dev, sgl, nents, dir); return nents; } -- 2.19.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 3/5] dma-direct: improve addressability error reporting
Only report report a DMA addressability report once to avoid spewing the kernel log with repeated message. Also provide a stack trace to make it easy to find the actual caller that caused the problem. Last but not least move the actual check into the fast path and only leave the error reporting in a helper. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 36 +++- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 308f88a750c8..edb24f94ea1e 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -30,27 +30,16 @@ static inline bool force_dma_unencrypted(void) return sev_active(); } -static bool -check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, - const char *caller) +static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { - if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { - if (!dev->dma_mask) { - dev_err(dev, - "%s: call on device without dma_mask\n", - caller); - return false; - } - - if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { - dev_err(dev, - "%s: overflow %pad+%zu of device mask %llx bus mask %llx\n", - caller, &dma_addr, size, - *dev->dma_mask, dev->bus_dma_mask); - } - return false; + if (!dev->dma_mask) { + dev_err_once(dev, "DMA map on device without dma_mask\n"); + } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { + dev_err_once(dev, + "overflow %pad+%zu of DMA mask %llx bus mask %llx\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask); } - return true; + WARN_ON_ONCE(1); } static inline dma_addr_t phys_to_dma_direct(struct device *dev, @@ -288,8 +277,10 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (!check_addr(dev, dma_addr, size, __func__)) + if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { + report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; + } 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_device(dev, dma_addr, size, dir); @@ -306,8 +297,11 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, BUG_ON(!sg_page(sg)); sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg)); - if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__)) + if (unlikely(dev && !dma_capable(dev, sg_dma_address(sg), + sg->length))) { + report_addr(dev, sg_dma_address(sg), sg->length); return 0; + } sg_dma_len(sg) = sg->length; } -- 2.19.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 2/5] swiotlb: remove dma_mark_clean
Instead of providing a special dma_mark_clean hook just for ia64, switch ia64 to use the normal arch_sync_dma_for_cpu hooks instead. This means that we now also set the PG_arch_1 bit for pages in the swiotlb buffer, which isn't stricly needed as we will never execute code out of the swiotlb buffer, but otherwise harmless. Signed-off-by: Christoph Hellwig --- arch/ia64/Kconfig | 3 ++- arch/ia64/kernel/dma-mapping.c | 20 +++- arch/ia64/mm/init.c| 18 +++--- drivers/xen/swiotlb-xen.c | 20 +--- include/linux/dma-direct.h | 8 kernel/dma/swiotlb.c | 18 +- 6 files changed, 30 insertions(+), 57 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 36773def6920..39724df0de29 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -28,8 +28,9 @@ config IA64 select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING - select ARCH_HAS_DMA_MARK_CLEAN + select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_SG_CHAIN + select ARCH_HAS_SYNC_DMA_FOR_CPU select VIRT_TO_BUS select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 7a471d8d67d4..36dd6aa6d759 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include @@ -16,6 +16,24 @@ const struct dma_map_ops *dma_get_ops(struct device *dev) EXPORT_SYMBOL(dma_get_ops); #ifdef CONFIG_SWIOTLB +void *arch_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +{ + return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); +} + +void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_addr, unsigned long attrs) +{ + dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); +} + +long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, + dma_addr_t dma_addr) +{ + return page_to_pfn(virt_to_page(cpu_addr)); +} + void 
__init swiotlb_dma_init(void) { dma_ops = &swiotlb_dma_ops; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d5e12ff1d73c..2c51733f1dfd 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -71,18 +71,14 @@ __ia64_sync_icache_dcache (pte_t pte) * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. */ -void -dma_mark_clean(void *addr, size_t size) +void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir) { - unsigned long pg_addr, end; - - pg_addr = PAGE_ALIGN((unsigned long) addr); - end = (unsigned long) addr + size; - while (pg_addr + PAGE_SIZE <= end) { - struct page *page = virt_to_page(pg_addr); - set_bit(PG_arch_1, &page->flags); - pg_addr += PAGE_SIZE; - } + unsigned long pfn = __phys_to_pfn(paddr); + + do { + set_bit(PG_arch_1, &pfn_to_page(pfn)->flags); + } while (++pfn <= __phys_to_pfn(paddr + size - 1)); } inline void diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 833e80b46eb2..989cf872b98c 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -441,21 +441,8 @@ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, xen_dma_unmap_page(hwdev, dev_addr, size, dir, attrs); /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) { + if (is_xen_swiotlb_buffer(dev_addr)) swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); - return; - } - - if (dir != DMA_FROM_DEVICE) - return; - - /* -* phys_to_virt doesn't work with hihgmem page but we could -* call dma_mark_clean() with hihgmem page here. However, we -* are fine since dma_mark_clean() is null on POWERPC. We can -* make dma_mark_clean() take a physical address if necessary. 
-*/ - dma_mark_clean(phys_to_virt(paddr), size); } static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, @@ -493,11 +480,6 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, if (target == SYNC_FOR_DEVICE) xen_dma_sync_single_for_device(hwdev, dev_addr, size, dir); - - if (dir != DMA_FROM_DEVICE) - return; - - dma_mark_clean(phys_to_virt(paddr), size); } void diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 6e5a47ae7d64..1aa73f4907ae 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h
Re: [PATCH 2/4] dma-debug: Refactor dma_debug_entry allocation
On 04/12/2018 14:27, Christoph Hellwig wrote: On Mon, Dec 03, 2018 at 05:28:07PM +, Robin Murphy wrote: Make prealloc_memory() a little more general and robust so that it serves for runtime reallocations too. The first thing we can do with that is clean up dma_debug_resize_entries() quite a bit. Maybe also rename it to dma_debug_alloc_entries or something like that? Yes, that's definitely nicer. Otherwise this looks fine to me. Thanks (and for the review on #1) Robin. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool
On 04/12/2018 14:17, Christoph Hellwig wrote: On Tue, Dec 04, 2018 at 01:11:37PM +, Robin Murphy wrote: In fact, having got this far in, what I'd quite like to do is to get rid of dma_debug_resize_entries() such that we never need to free things at all, since then we could allocate whole pages as blocks of entries to save on masses of individual slab allocations. Yes, we should definitely kill dma_debug_resize_entries. Allocating page batches might sound nice, but is that going to introduce additional complexity? OK, looking at what the weird AMD GART code does I reckon it should be happy enough with on-demand expansion, and that no tears will be shed if it can no longer actually trim the pool to the size it thinks is necessary. I'll add a patch to clean that up. Page-based allocation, at least the way I'm thinking of it, shouldn't do much more than add an extra loop in one place, which should be more than made up for by removing all the freeing code :) Robin. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] dma-mapping: remove a pointless memset in dma_atomic_pool_init
We already zero the memory after allocating it from the pool that this function fills, and having the memset here in this form means we can't support CMA highmem allocations. Signed-off-by: Christoph Hellwig Reported-by: Russell King - ARM Linux --- kernel/dma/remap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 68a64e3ff6a1..6e784824326f 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -121,7 +121,6 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) if (!page) goto out; - memset(page_address(page), 0, atomic_pool_size); arch_dma_prep_coherent(page, atomic_pool_size); atomic_pool = gen_pool_create(PAGE_SHIFT, -1); -- 2.19.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] dma-mapping: simplify the dma_sync_single_range_for_{cpu, device} implementation
We can just call the regular calls after adding offset the the address instead of reimplementing them. Signed-off-by: Christoph Hellwig --- include/linux/dma-debug.h | 27 include/linux/dma-mapping.h | 34 +- kernel/dma/debug.c | 42 - 3 files changed, 10 insertions(+), 93 deletions(-) diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 30213adbb6b9..c85e097a984c 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -72,17 +72,6 @@ extern void debug_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, int direction); -extern void debug_dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, - int direction); - -extern void debug_dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, int direction); - extern void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, int direction); @@ -174,22 +163,6 @@ static inline void debug_dma_sync_single_for_device(struct device *dev, { } -static inline void debug_dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, - int direction) -{ -} - -static inline void debug_dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, - int direction) -{ -} - static inline void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, int direction) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 634c0ab39074..1176e6adb035 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -360,6 +360,13 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, debug_dma_sync_single_for_cpu(dev, addr, size, dir); } +static inline void dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t addr, unsigned long offset, 
size_t size, + enum dma_data_direction dir) +{ + return dma_sync_single_for_cpu(dev, addr + offset, size, dir); +} + static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) @@ -372,32 +379,11 @@ static inline void dma_sync_single_for_device(struct device *dev, debug_dma_sync_single_for_device(dev, addr, size, dir); } -static inline void dma_sync_single_range_for_cpu(struct device *dev, -dma_addr_t addr, -unsigned long offset, -size_t size, -enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(dev, addr + offset, size, dir); - debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); -} - static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) + dma_addr_t addr, unsigned long offset, size_t size, + enum dma_data_direction dir) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_device) - ops->sync_single_for_device(dev, addr
Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache
On 04/12/2018 11:01, Vivek Gautam wrote: Qualcomm SoCs have an additional level of cache called as System cache, aka. Last level cache (LLC). This cache sits right before the DDR, and is tightly coupled with the memory controller. The cache is available to all the clients present in the SoC system. The clients request their slices from this system cache, make it active, and can then start using it. For these clients with smmu, to start using the system cache for buffers and, related page tables [1], memory attributes need to be set accordingly. This change updates the MAIR and TCR configurations with correct attributes to use this system cache. To explain a little about memory attribute requirements here: Non-coherent I/O devices can't look-up into inner caches. However, coherent I/O devices can. But both can allocate in the system cache based on system policy and configured memory attributes in page tables. CPUs can access both inner and outer caches (including system cache, aka. Last level cache), and can allocate into system cache too based on memory attributes, and system policy. Further looking at memory types, we have following - a) Normal uncached :- MAIR 0x44, inner non-cacheable, outer non-cacheable; b) Normal cached :- MAIR 0xff, inner read write-back non-transient, outer read write-back non-transient; attribute setting for coherenet I/O devices. and, for non-coherent i/o devices that can allocate in system cache another type gets added - c) Normal sys-cached/non-inner-cached :- MAIR 0xf4, inner non-cacheable, outer read write-back non-transient So, CPU will automatically use the system cache for memory marked as normal cached. The normal sys-cached is downgraded to normal non-cached memory for CPUs. Coherent I/O devices can use system cache by marking the memory as normal cached. Non-coherent I/O devices, to use system cache, should mark the memory as normal sys-cached in page tables. 
This change is a realisation of following changes from downstream msm-4.9: iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2] iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3] [1] https://patchwork.kernel.org/patch/10302791/ [2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51 [3] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602 Signed-off-by: Vivek Gautam --- Changes since v1: - Addressed Tomasz's comments for basing the change on "NO_INNER_CACHE" concept for non-coherent I/O devices rather than capturing "SYS_CACHE". This is to indicate clearly the intent of non-coherent I/O devices that can't access inner caches. That seems backwards to me - there is already a fundamental assumption that non-coherent devices can't access caches. What we're adding here is a weird exception where they *can* use some level of cache despite still being non-coherent overall. In other words, it's not a case of downgrading coherent devices' accesses to bypass inner caches, it's upgrading non-coherent devices' accesses to hit the outer cache. That's certainly the understanding I got from talking with Pratik at Plumbers, and it does appear to fit with your explanation above despite the final conclusion you draw being different. I do see what Tomasz meant in terms of the TCR attributes, but what we currently do there is a little unintuitive and not at all representative of actual mapping attributes - I'll come back to that inline. 
drivers/iommu/arm-smmu.c | 15 +++ drivers/iommu/dma-iommu.c | 3 +++ drivers/iommu/io-pgtable-arm.c | 22 +- drivers/iommu/io-pgtable.h | 5 + include/linux/iommu.h | 3 +++ 5 files changed, 43 insertions(+), 5 deletions(-) As a minor nit, I'd prefer this as at least two patches to separate the io-pgtable changes and arm-smmu changes - basically I'd expect it to look much the same as the non-strict mode support did. diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index ba18d89d4732..047f7ff95b0d 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -255,6 +255,7 @@ struct arm_smmu_domain { struct mutexinit_mutex; /* Protects smmu pointer */ spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ struct iommu_domain domain; + boolno_inner_cache; Can we keep all the domain flags together please? In fact, I'd be inclined to implement an options bitmap as we do elsewhere rather than proliferate multiple bools. }; struct arm_smmu_option_prop { @@ -897,6 +898,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, if (smmu_domain
Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables
On 12/4/18 10:37 AM, Nicolas Boichat wrote: > On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat wrote: >> >> This is a follow-up to the discussion in [1], to make sure that the page >> tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit >> physical address space. >> >> [1] >> https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html > > Hi everyone, > > Let's try to summarize here. > > First, we confirmed that this is a regression, and IOMMU errors happen > on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13). > The issue most likely starts from ad67f5a6545f ("arm64: replace > ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number > of Mediatek platforms (and maybe others?). > > We have a few options here: > 1. This series [2], that adds support for GFP_DMA32 slab caches, > _without_ adding kmalloc caches (since there are no users of > kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on > the 3 patches, and AFAICT this solution works fine. > 2. genalloc. That works, but unless we preallocate 4MB for L2 tables > (which is wasteful as we usually only need a handful of L2 tables), > we'll need changes in the core (use GFP_ATOMIC) to allow allocating on > demand, and as it stands we'd have no way to shrink the allocation. > 3. page_frag [3]. That works fine, and the code is quite simple. One > drawback is that fragments in partially freed pages cannot be reused > (from limited experiments, I see that IOMMU L2 tables are rarely > freed, so it's unlikely a whole page would get freed). But given the > low number of L2 tables, maybe we can live with that. > > I think 2 is out. Any preference between 1 and 3? I think 1 makes > better use of the memory, so that'd be my preference. But I'm probably > missing something. I would prefer 1 as well. 
IIRC you already confirmed that alignment requirements are not broken for custom kmem caches even in presence of SLUB debug options (and I would say it's a bug to be fixed if they weren't). I just asked (and didn't get a reply I think) about your ability to handle the GFP_ATOMIC allocation failures. They should be rare when only single page allocations are needed for the kmem cache. But in case they are not an option, then preallocating would be needed, thus probably option 2. > [2] https://patchwork.kernel.org/cover/10677529/, 3 patches > [3] https://patchwork.codeaurora.org/patch/671639/ > > Thanks, > > Nicolas > ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [RFC 4/4] dma-debug: Make leak-like behaviour apparent
On Mon, Dec 03, 2018 at 05:28:09PM +, Robin Murphy wrote: > Now that we can dynamically allocate DMA debug entries to cope with > drivers maintaining excessively large numbers of live mappings, a driver > which *does* actually have a bug leaking mappings (and is not unloaded) > will no longer trigger the "DMA-API: debugging out of memory - disabling" > message until it gets to actual kernel OOM conditions, which means it > could go unnoticed for a while. To that end, let's inform the user each > time the pool has grown to a multiple of its initial size, which should > make it apparent that they either have a leak or might want to increase > the preallocation size. > > Signed-off-by: Robin Murphy > --- > > Tagging this one as RFC since people might think it's silly. I think finding out the numbers is useful, but I'm a little worried about claiming a possible leak. Maybe we just need to print a log message for each new power of 2 of entries reached? ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool
> + for (retry_count = 0; ; retry_count++) { > + spin_lock_irqsave(&free_entries_lock, flags); > + > + if (num_free_entries > 0) > + break; > > spin_unlock_irqrestore(&free_entries_lock, flags); Taking a spinlock just to read a single integer value doesn't really help anything. > + > + if (retry_count < DMA_DEBUG_DYNAMIC_RETRIES && > + !prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES)) Don't we need GFP_ATOMIC here? Also why do we need the retries? ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/4] dma-debug: Refactor dma_debug_entry allocation
On Mon, Dec 03, 2018 at 05:28:07PM +, Robin Murphy wrote: > Make prealloc_memory() a little more general and robust so that it > serves for runtime reallocations too. The first thing we can do with > that is clean up dma_debug_resize_entries() quite a bit. Maybe also rename it to dma_debug_alloc_entries or something like that? Otherwise this looks fine to me. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/4] dma-debug: Use pr_fmt()
Looks good, Reviewed-by: Christoph Hellwig ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3, RFC] iommu/io-pgtable-arm-v7s: Use page_frag to request DMA32 memory
On Tue, Dec 04, 2018 at 04:23:00PM +0800, Nicolas Boichat wrote: > IOMMUs using ARMv7 short-descriptor format require page tables > (level 1 and 2) to be allocated within the first 4GB of RAM, even > on 64-bit systems. > > For level 1/2 tables, ensure GFP_DMA32 is used if CONFIG_ZONE_DMA32 > is defined (e.g. on arm64 platforms). > > For level 2 tables (1 KB), we use page_frag to allocate these pages, > as we cannot directly use kmalloc (no slab cache for GFP_DMA32) or > kmem_cache (mm/ code treats GFP_DMA32 as an invalid flag). > > One downside is that we only free the allocated page if all the > 4 fragments (4 IOMMU L2 tables) are freed, but given that we > usually only allocate limited number of IOMMU L2 tables, this > should not have too much impact on memory usage: In the absolute > worst case (4096 L2 page tables, each on their own 4K page), > we would use 16 MB of memory for 4 MB of L2 tables. I think this needs to be documented in the code. That is, move the explanation above into a comment in the code. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: use generic DMA mapping code in powerpc V4
On Tue, Dec 04, 2018 at 10:53:39AM +0100, Christian Zigotzky wrote: > I don't know why this kernel doesn't recognize the hard disks connected to > my physical P5020 board and why the onboard ethernet on my PASEMI board > doesn't work. (dma_direct_map_page: overflow) Do you know if this actually works for the baseline before my patches? E.g. with commit 721c01ba8b46ddb5355bd6e6b3bbfdabfdf01e97 ? ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 5/9] dma-mapping: support highmem in the generic remap allocator
On Tue, Dec 04, 2018 at 09:38:02AM +0100, Marek Szyprowski wrote: > Hi All, > > On 2018-11-30 20:05, Robin Murphy wrote: > > On 05/11/2018 12:19, Christoph Hellwig wrote: > >> By using __dma_direct_alloc_pages we can deal entirely with struct page > >> instead of having to derive a kernel virtual address. > > > > Simple enough :) > > > > Reviewed-by: Robin Murphy > > This patch has landed linux-next yesterday and I've noticed that it > breaks operation of many drivers. The change looked simple, but a stupid > bug managed to slip into the code. After a short investigation I've > noticed that __dma_direct_alloc_pages() doesn't set dma_handle and zero > allocated memory, while dma_direct_alloc_pages() did. The other > difference is the lack of set_memory_decrypted() handling. > > Following patch fixes the issue, but maybe it would be better to fix it > in kernel/dma/direct.c: Thanks for spotting this Marek. Can you send the patch below with a signoff and a changelog so that I can queue it up? ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 4/9] dma-mapping: move the arm64 ncoherent alloc/free support to common code
> > +int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) > > +{ > > + unsigned int pool_size_order = get_order(atomic_pool_size); > > + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; > > + struct page *page; > > + void *addr; > > + int ret; > > + > > + if (dev_get_cma_area(NULL)) > > + page = dma_alloc_from_contiguous(NULL, nr_pages, > > +pool_size_order, false); > > + else > > + page = alloc_pages(gfp, pool_size_order); > > + if (!page) > > + goto out; > > + > > + memset(page_address(page), 0, atomic_pool_size); > > Note that this won't work if 'page' is a highmem page - should there > be a check for that, or a check for the gfp flags? > > Also, is this memset() actually useful, or a waste of cycles - when we > allocate from this pool (see dma_alloc_from_pool()), we always memset() > the buffer. Currently there is no user that supports highmem, but yes, the memset should probably simply be removed. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool
On Tue, Dec 04, 2018 at 01:11:37PM +, Robin Murphy wrote: > In fact, having got this far in, what I'd quite like to do is to get rid of > dma_debug_resize_entries() such that we never need to free things at all, > since then we could allocate whole pages as blocks of entries to save on > masses of individual slab allocations. Yes, we should definitely kill dma_debug_resize_entries. Allocating page batches might sound nice, but is that going to introduce additional complexity? ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
RE: [PATCH v2] dma-debug: Kconfig for PREALLOC_DMA_DEBUG_ENTRIES
From: Qian Cai > Sent: 30 November 2018 21:48 > To: h...@lst.de; m.szyprow...@samsung.com; robin.mur...@arm.com > Cc: yisen.zhu...@huawei.com; salil.me...@huawei.com; john.ga...@huawei.com; > linux...@huawei.com; > iommu@lists.linux-foundation.org; net...@vger.kernel.org; > linux-ker...@vger.kernel.org; Qian Cai > Subject: [PATCH v2] dma-debug: Kconfig for PREALLOC_DMA_DEBUG_ENTRIES > > The amount of DMA mappings from Hisilicon HNS ethernet devices is huge, > so it could trigger "DMA-API: debugging out of memory - disabling". > > hnae_get_handle [1] > hnae_init_queue > hnae_init_ring > hnae_alloc_buffers [2] > debug_dma_map_page > dma_entry_alloc > > [1] for (i = 0; i < handle->q_num; i++) > [2] for (i = 0; i < ring->desc_num; i++) > > Also, "#define HNS_DSAF_MAX_DESC_CNT 1024" > > On this Huawei TaiShan 2280 aarch64 server, it has reached the limit > already, > > 4 (NICs) x 16 (queues) x 1024 (port description numbers) = 65536 > > Added a Kconfig entry for PREALLOC_DMA_DEBUG_ENTRIES, so make it easier > for users to deal with special cases like this. Ugg. That is worse than a module parameter. The driver needs to automatically reduce the number of mapping requests. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales) ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool
Hi John, On 03/12/2018 18:23, John Garry wrote: On 03/12/2018 17:28, Robin Murphy wrote: Certain drivers such as large multi-queue network adapters can use pools of mapped DMA buffers larger than the default dma_debug_entry pool of 65536 entries, with the result that merely probing such a device can cause DMA debug to disable itself during boot unless explicitly given an appropriate "dma_debug_entries=..." option. Developers trying to debug some other driver on such a system may not be immediately aware of this, and at worst it can hide bugs if they fail to realise that dma-debug has already disabled itself unexpectedly by the time the code of interest gets to run. Even once they do realise, it can be a bit of a pain to empirically determine a suitable number of preallocated entries to configure without massively over-allocating. There's really no need for such a static limit, though, since we can quite easily expand the pool at runtime in those rare cases that the preallocated entries are insufficient, which is arguably the least surprising and most useful behaviour. Hi Robin, Do you have an idea on shrinking the pool again when the culprit driver is removed, i.e. we have so many unused debug entries now available? I honestly don't believe it's worth the complication. This is a development feature with significant overheads already, so there's not an awful lot to gain by trying to optimise memory usage. If a system can ever load a driver that makes hundreds of thousands of simultaneous mappings, it can almost certainly spare 20-odd megabytes of RAM for the corresponding debug entries in perpetuity. Sure, it does mean you'd need to reboot to recover memory from a major leak, but that's mostly true of the current behaviour too, and rebooting during driver development is hardly an unacceptable inconvenience. 
In fact, having got this far in, what I'd quite like to do is to get rid of dma_debug_resize_entries() such that we never need to free things at all, since then we could allocate whole pages as blocks of entries to save on masses of individual slab allocations. Robin. Thanks, John Signed-off-by: Robin Murphy --- kernel/dma/debug.c | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index de5db800dbfc..46cc075aec99 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -47,6 +47,9 @@ #ifndef PREALLOC_DMA_DEBUG_ENTRIES #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) #endif +/* If the pool runs out, try this many times to allocate this many new entries */ +#define DMA_DEBUG_DYNAMIC_ENTRIES 256 +#define DMA_DEBUG_DYNAMIC_RETRIES 2 enum { dma_debug_single, @@ -702,12 +705,21 @@ static struct dma_debug_entry *dma_entry_alloc(void) { struct dma_debug_entry *entry; unsigned long flags; + int retry_count; - spin_lock_irqsave(&free_entries_lock, flags); + for (retry_count = 0; ; retry_count++) { + spin_lock_irqsave(&free_entries_lock, flags); + + if (num_free_entries > 0) + break; - if (list_empty(&free_entries)) { - global_disable = true; spin_unlock_irqrestore(&free_entries_lock, flags); + + if (retry_count < DMA_DEBUG_DYNAMIC_RETRIES && + !prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES)) + continue; + + global_disable = true; pr_err("debugging out of memory - disabling\n"); return NULL; } ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache
Qualcomm SoCs have an additional level of cache called as System cache, aka. Last level cache (LLC). This cache sits right before the DDR, and is tightly coupled with the memory controller. The cache is available to all the clients present in the SoC system. The clients request their slices from this system cache, make it active, and can then start using it. For these clients with smmu, to start using the system cache for buffers and, related page tables [1], memory attributes need to be set accordingly. This change updates the MAIR and TCR configurations with correct attributes to use this system cache. To explain a little about memory attribute requirements here: Non-coherent I/O devices can't look-up into inner caches. However, coherent I/O devices can. But both can allocate in the system cache based on system policy and configured memory attributes in page tables. CPUs can access both inner and outer caches (including system cache, aka. Last level cache), and can allocate into system cache too based on memory attributes, and system policy. Further looking at memory types, we have following - a) Normal uncached :- MAIR 0x44, inner non-cacheable, outer non-cacheable; b) Normal cached :- MAIR 0xff, inner read write-back non-transient, outer read write-back non-transient; attribute setting for coherenet I/O devices. and, for non-coherent i/o devices that can allocate in system cache another type gets added - c) Normal sys-cached/non-inner-cached :- MAIR 0xf4, inner non-cacheable, outer read write-back non-transient So, CPU will automatically use the system cache for memory marked as normal cached. The normal sys-cached is downgraded to normal non-cached memory for CPUs. Coherent I/O devices can use system cache by marking the memory as normal cached. Non-coherent I/O devices, to use system cache, should mark the memory as normal sys-cached in page tables. 
This change is a realisation of following changes from downstream msm-4.9: iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2] iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3] [1] https://patchwork.kernel.org/patch/10302791/ [2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51 [3] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602 Signed-off-by: Vivek Gautam --- Changes since v1: - Addressed Tomasz's comments for basing the change on "NO_INNER_CACHE" concept for non-coherent I/O devices rather than capturing "SYS_CACHE". This is to indicate clearly the intent of non-coherent I/O devices that can't access inner caches. drivers/iommu/arm-smmu.c | 15 +++ drivers/iommu/dma-iommu.c | 3 +++ drivers/iommu/io-pgtable-arm.c | 22 +- drivers/iommu/io-pgtable.h | 5 + include/linux/iommu.h | 3 +++ 5 files changed, 43 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index ba18d89d4732..047f7ff95b0d 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -255,6 +255,7 @@ struct arm_smmu_domain { struct mutexinit_mutex; /* Protects smmu pointer */ spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ struct iommu_domain domain; + boolno_inner_cache; }; struct arm_smmu_option_prop { @@ -897,6 +898,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, if (smmu_domain->non_strict) pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + if (smmu_domain->no_inner_cache) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NO_IC; + smmu_domain->smmu = smmu; pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) { @@ -1579,6 +1583,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, case DOMAIN_ATTR_NESTING: *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); return 0; + case DOMAIN_ATTR_NO_IC: + *((int *)data) 
= smmu_domain->no_inner_cache; + return 0; default: return -ENODEV; } @@ -1619,6 +1626,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, else smmu_domain->stage = ARM_SMMU_DOMAIN_S1; break; + case DOMAIN_ATTR_NO_IC: + if (smmu_domain->smmu) { + ret = -EPERM; + goto out_unlock; + } + if (*((int
Re: [PATCH 4/9] dma-mapping: move the arm64 ncoherent alloc/free support to common code
On Mon, Nov 05, 2018 at 01:19:26PM +0100, Christoph Hellwig wrote: > The arm64 codebase to implement coherent dma allocation for architectures > with non-coherent DMA is a good start for a generic implementation, given > that is uses the generic remap helpers, provides the atomic pool for > allocations that can't sleep and still is realtively simple and well > tested. Move it to kernel/dma and allow architectures to opt into it > using a config symbol. Architectures just need to provide a new > arch_dma_prep_coherent helper to writeback an invalidate the caches > for any memory that gets remapped for uncached access. > > Signed-off-by: Christoph Hellwig > --- > arch/arm64/Kconfig | 2 +- > arch/arm64/mm/dma-mapping.c | 184 ++-- > include/linux/dma-mapping.h | 5 + > include/linux/dma-noncoherent.h | 2 + > kernel/dma/Kconfig | 6 ++ > kernel/dma/remap.c | 158 ++- > 6 files changed, 181 insertions(+), 176 deletions(-) > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index 5d065acb6d10..2e645ea693ea 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -82,7 +82,7 @@ config ARM64 > select CRC32 > select DCACHE_WORD_ACCESS > select DMA_DIRECT_OPS > - select DMA_REMAP > + select DMA_DIRECT_REMAP > select EDAC_SUPPORT > select FRAME_POINTER > select GENERIC_ALLOCATOR > diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c > index a3ac26284845..e2e7e5d0f94e 100644 > --- a/arch/arm64/mm/dma-mapping.c > +++ b/arch/arm64/mm/dma-mapping.c > @@ -33,113 +33,6 @@ > > #include > > -static struct gen_pool *atomic_pool __ro_after_init; > - > -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K > -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; > - > -static int __init early_coherent_pool(char *p) > -{ > - atomic_pool_size = memparse(p, &p); > - return 0; > -} > -early_param("coherent_pool", early_coherent_pool); > - > -static void *__alloc_from_pool(size_t size, struct page **ret_page, gfp_t > flags) > -{ > - unsigned long 
val; > - void *ptr = NULL; > - > - if (!atomic_pool) { > - WARN(1, "coherent pool not initialised!\n"); > - return NULL; > - } > - > - val = gen_pool_alloc(atomic_pool, size); > - if (val) { > - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); > - > - *ret_page = phys_to_page(phys); > - ptr = (void *)val; > - memset(ptr, 0, size); > - } > - > - return ptr; > -} > - > -static bool __in_atomic_pool(void *start, size_t size) > -{ > - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); > -} > - > -static int __free_from_pool(void *start, size_t size) > -{ > - if (!__in_atomic_pool(start, size)) > - return 0; > - > - gen_pool_free(atomic_pool, (unsigned long)start, size); > - > - return 1; > -} > - > -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, > - gfp_t flags, unsigned long attrs) > -{ > - struct page *page; > - void *ptr, *coherent_ptr; > - pgprot_t prot = pgprot_writecombine(PAGE_KERNEL); > - > - size = PAGE_ALIGN(size); > - > - if (!gfpflags_allow_blocking(flags)) { > - struct page *page = NULL; > - void *addr = __alloc_from_pool(size, &page, flags); > - > - if (addr) > - *dma_handle = phys_to_dma(dev, page_to_phys(page)); > - > - return addr; > - } > - > - ptr = dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); > - if (!ptr) > - goto no_mem; > - > - /* remove any dirty cache lines on the kernel alias */ > - __dma_flush_area(ptr, size); > - > - /* create a coherent mapping */ > - page = virt_to_page(ptr); > - coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP, > -prot, > __builtin_return_address(0)); > - if (!coherent_ptr) > - goto no_map; > - > - return coherent_ptr; > - > -no_map: > - dma_direct_free_pages(dev, size, ptr, *dma_handle, attrs); > -no_mem: > - return NULL; > -} > - > -void arch_dma_free(struct device *dev, size_t size, void *vaddr, > - dma_addr_t dma_handle, unsigned long attrs) > -{ > - if (!__free_from_pool(vaddr, PAGE_ALIGN(size))) { > - void *kaddr = 
phys_to_virt(dma_to_phys(dev, dma_handle)); > - > - vunmap(vaddr); > - dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs); > - } > -} > - > -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, > - dma_addr_t dma_addr) > -{ > - return __phys_to_pfn(dma_to_phys(dev, dma_addr)); > -} > - > pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, > unsigned long att
Re: use generic DMA mapping code in powerpc V4
On 04 December 2018 at 08:31AM, Christian Zigotzky wrote: Hi All, Could you please test Christoph's kernel on your PASEMI and NXP boards? Download: 'git clone git://git.infradead.org/users/hch/misc.git -b powerpc-dma.5 a' Thanks, Christian I successfully tested this kernel on a virtual e5500 QEMU machine today. Command: ./qemu-system-ppc64 -M ppce500 -cpu e5500 -m 2048 -kernel uImage-dma -drive format=raw,file=MATE_PowerPC_Remix_2017_0.9.img,index=0,if=virtio -nic user,model=e1000 -append "rw root=/dev/vda" -device virtio-vga -device virtio-mouse-pci -device virtio-keyboard-pci -usb -soundhw es1370 -smp 4 QEMU version 3.1.0. I don't know why this kernel doesn't recognize the hard disks connected to my physical P5020 board and why the onboard ethernet on my PASEMI board doesn't work. (dma_direct_map_page: overflow) -- Christian ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables
On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat wrote: > > This is a follow-up to the discussion in [1], to make sure that the page > tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit > physical address space. > > [1] > https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html Hi everyone, Let's try to summarize here. First, we confirmed that this is a regression, and IOMMU errors happen on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13). The issue most likely starts from ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number of Mediatek platforms (and maybe others?). We have a few options here: 1. This series [2], that adds support for GFP_DMA32 slab caches, _without_ adding kmalloc caches (since there are no users of kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on the 3 patches, and AFAICT this solution works fine. 2. genalloc. That works, but unless we preallocate 4MB for L2 tables (which is wasteful as we usually only need a handful of L2 tables), we'll need changes in the core (use GFP_ATOMIC) to allow allocating on demand, and as it stands we'd have no way to shrink the allocation. 3. page_frag [3]. That works fine, and the code is quite simple. One drawback is that fragments in partially freed pages cannot be reused (from limited experiments, I see that IOMMU L2 tables are rarely freed, so it's unlikely a whole page would get freed). But given the low number of L2 tables, maybe we can live with that. I think 2 is out. Any preference between 1 and 3? I think 1 makes better use of the memory, so that'd be my preference. But I'm probably missing something. [2] https://patchwork.kernel.org/cover/10677529/, 3 patches [3] https://patchwork.codeaurora.org/patch/671639/ Thanks, Nicolas ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3 1/4] PCI / ACPI: Identify untrusted PCI devices
On Mon, Dec 03, 2018 at 06:28:00PM -0600, Bjorn Helgaas wrote: > On Thu, Nov 29, 2018 at 06:51:50PM +0300, Mika Westerberg wrote: > > A malicious PCI device may use DMA to attack the system. An external > > Thunderbolt port is a convenient point to attach such a device. The OS > > may use IOMMU to defend against DMA attacks. > > > > Recent BIOSes with Thunderbolt ports mark these externally facing root > > ports with this ACPI _DSD [1]: > > I'm not 100% comfortable with the "Recent BIOSes" wording because that > suggests that we can rely on the fact that *all* BIOSes newer than > some date X mark these ports. > > Since this _DSD usage is Microsoft-specific and not required by either > PCIe or ACPI specs, we can't rely on it. A BIOS that doesn't > implement it may not be Windows-certified, but it's perfectly > spec-compliant otherwise and we have to keep in mind the possibility > that ports without this _DSD may still be externally visible and may > still be attack vectors. OK. I will change it to "Some BIOSes .." following what you suggested earlier. That should make it clear not all BIOSes are required to implement this. > > Name (_DSD, Package () { > > ToUUID ("efcc06cc-73ac-4bc3-bff0-76143807c389"), > > Package () { > > Package () {"ExternalFacingPort", 1}, > > Package () {"UID", 0 } > > } > > }) > > > > If we find such a root port, mark it and all its children as untrusted. > > The rest of the OS may use this information to enable DMA protection > > against malicious devices. For instance the device may be put behind an > > IOMMU to keep it from accessing memory outside of what the driver has > > allocated for it. > > > > While at it, add a comment on top of prp_guids array explaining the > > possible caveat resulting when these GUIDs are treated equivalent. 
> > > > [1] > > https://docs.microsoft.com/en-us/windows-hardware/drivers/pci/dsd-for-pcie-root-ports#identifying-externally-exposed-pcie-root-ports > > > > Signed-off-by: Mika Westerberg > > Acked-by: Bjorn Helgaas Thanks! ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 5/9] dma-mapping: support highmem in the generic remap allocator
Hi All, On 2018-11-30 20:05, Robin Murphy wrote: > On 05/11/2018 12:19, Christoph Hellwig wrote: >> By using __dma_direct_alloc_pages we can deal entirely with struct page >> instead of having to derive a kernel virtual address. > > Simple enough :) > > Reviewed-by: Robin Murphy This patch has landed linux-next yesterday and I've noticed that it breaks operation of many drivers. The change looked simple, but a stupid bug managed to slip into the code. After a short investigation I've noticed that __dma_direct_alloc_pages() doesn't set dma_handle and zero allocated memory, while dma_direct_alloc_pages() did. The other difference is the lack of set_memory_decrypted() handling. Following patch fixes the issue, but maybe it would be better to fix it in kernel/dma/direct.c: diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index dcc82dd668f8..7765ddc56e4e 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -219,8 +219,14 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, ret = dma_common_contiguous_remap(page, size, VM_USERMAP, arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); - if (!ret) + if (!ret) { __dma_direct_free_pages(dev, size, page); + return ret; + } + + *dma_handle = phys_to_dma(dev, page_to_phys(page)); + memset(ret, 0, size); + return ret; } > >> Signed-off-by: Christoph Hellwig >> --- >> kernel/dma/remap.c | 14 +++--- >> 1 file changed, 7 insertions(+), 7 deletions(-) >> >> diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c >> index bc42766f52df..8f1fca34b894 100644 >> --- a/kernel/dma/remap.c >> +++ b/kernel/dma/remap.c >> @@ -196,7 +196,7 @@ void *arch_dma_alloc(struct device *dev, size_t >> size, dma_addr_t *dma_handle, >> gfp_t flags, unsigned long attrs) >> { >> struct page *page = NULL; >> - void *ret, *kaddr; >> + void *ret; >> size = PAGE_ALIGN(size); >> @@ -208,10 +208,9 @@ void *arch_dma_alloc(struct device *dev, >> size_t size, dma_addr_t *dma_handle, >> return ret; >> } >> - 
kaddr = dma_direct_alloc_pages(dev, size, dma_handle, flags, >> attrs); >> - if (!kaddr) >> + page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, >> attrs); >> + if (!page) >> return NULL; >> - page = virt_to_page(kaddr); >> /* remove any dirty cache lines on the kernel alias */ >> arch_dma_prep_coherent(page, size); >> @@ -221,7 +220,7 @@ void *arch_dma_alloc(struct device *dev, size_t >> size, dma_addr_t *dma_handle, >> arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), >> __builtin_return_address(0)); >> if (!ret) >> - dma_direct_free_pages(dev, size, kaddr, *dma_handle, attrs); >> + __dma_direct_free_pages(dev, size, page); >> return ret; >> } >> @@ -229,10 +228,11 @@ void arch_dma_free(struct device *dev, size_t >> size, void *vaddr, >> dma_addr_t dma_handle, unsigned long attrs) >> { >> if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { >> - void *kaddr = phys_to_virt(dma_to_phys(dev, dma_handle)); >> + phys_addr_t phys = dma_to_phys(dev, dma_handle); >> + struct page *page = pfn_to_page(__phys_to_pfn(phys)); >> vunmap(vaddr); >> - dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs); >> + __dma_direct_free_pages(dev, size, page); >> } >> } >> > Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v3, RFC] iommu/io-pgtable-arm-v7s: Use page_frag to request DMA32 memory
IOMMUs using ARMv7 short-descriptor format require page tables (level 1 and 2) to be allocated within the first 4GB of RAM, even on 64-bit systems. For level 1/2 tables, ensure GFP_DMA32 is used if CONFIG_ZONE_DMA32 is defined (e.g. on arm64 platforms). For level 2 tables (1 KB), we use page_frag to allocate these pages, as we cannot directly use kmalloc (no slab cache for GFP_DMA32) or kmem_cache (mm/ code treats GFP_DMA32 as an invalid flag). One downside is that we only free the allocated page if all the 4 fragments (4 IOMMU L2 tables) are freed, but given that we usually only allocate limited number of IOMMU L2 tables, this should not have too much impact on memory usage: In the absolute worst case (4096 L2 page tables, each on their own 4K page), we would use 16 MB of memory for 4 MB of L2 tables. Also, print an error when the physical address does not fit in 32-bit, to make debugging easier in the future. Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32") Signed-off-by: Nicolas Boichat --- As an alternative to the series [1], which adds support for GFP_DMA32 to kmem_cache in mm/. IMHO the solution in [1] is cleaner and more efficient, as it allows freed fragments (L2 tables) to be reused, but this approach does not require any core change. 
[1] https://patchwork.kernel.org/cover/10677529/, 3 patches drivers/iommu/io-pgtable-arm-v7s.c | 32 -- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 445c3bde04800c..0de6a51eb6755f 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -161,6 +161,12 @@ #define ARM_V7S_TCR_PD1BIT(5) +#ifdef CONFIG_ZONE_DMA32 +#define ARM_V7S_TABLE_GFP_DMA GFP_DMA32 +#else +#define ARM_V7S_TABLE_GFP_DMA GFP_DMA +#endif + typedef u32 arm_v7s_iopte; static bool selftest_running; @@ -169,7 +175,7 @@ struct arm_v7s_io_pgtable { struct io_pgtable iop; arm_v7s_iopte *pgd; - struct kmem_cache *l2_tables; + struct page_frag_cache l2_tables; spinlock_t split_lock; }; @@ -198,13 +204,17 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp, void *table = NULL; if (lvl == 1) - table = (void *)__get_dma_pages(__GFP_ZERO, get_order(size)); + table = (void *)__get_free_pages( + __GFP_ZERO | ARM_V7S_TABLE_GFP_DMA, get_order(size)); else if (lvl == 2) - table = kmem_cache_zalloc(data->l2_tables, gfp | GFP_DMA); + table = page_frag_alloc(&data->l2_tables, size, + gfp | __GFP_ZERO | ARM_V7S_TABLE_GFP_DMA); phys = virt_to_phys(table); - if (phys != (arm_v7s_iopte)phys) + if (phys != (arm_v7s_iopte)phys) { /* Doesn't fit in PTE */ + dev_err(dev, "Page table does not fit in PTE: %pa", &phys); goto out_free; + } if (table && !(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) { dma = dma_map_single(dev, table, size, DMA_TO_DEVICE); if (dma_mapping_error(dev, dma)) @@ -227,7 +237,7 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp, if (lvl == 1) free_pages((unsigned long)table, get_order(size)); else - kmem_cache_free(data->l2_tables, table); + page_frag_free(table); return NULL; } @@ -244,7 +254,7 @@ static void __arm_v7s_free_table(void *table, int lvl, if (lvl == 1) free_pages((unsigned long)table, get_order(size)); else - kmem_cache_free(data->l2_tables, table); + 
page_frag_free(table); } static void __arm_v7s_pte_sync(arm_v7s_iopte *ptep, int num_entries, @@ -515,7 +525,6 @@ static void arm_v7s_free_pgtable(struct io_pgtable *iop) __arm_v7s_free_table(iopte_deref(pte, 1), 2, data); } __arm_v7s_free_table(data->pgd, 1, data); - kmem_cache_destroy(data->l2_tables); kfree(data); } @@ -729,17 +738,11 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, !(cfg->quirks & IO_PGTABLE_QUIRK_NO_PERMS)) return NULL; - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return NULL; spin_lock_init(&data->split_lock); - data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2", - ARM_V7S_TABLE_SIZE(2), - ARM_V7S_TABLE_SIZE(2), - SLAB_CACHE_DMA, NULL); - if (!data->l2_tables) - goto out_free_data; data->iop.ops = (struct io_pgtable_ops) { .map= arm_v7s_m