Re: [PATCH v3 6/8] mm: parallelize deferred_init_memmap()

2020-05-27 Thread Alexander Duyck
> +   deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
> +
> +   /*
> +* Initialize and free pages in MAX_ORDER sized increments so that we
> +* can avoid introducing any issues with the buddy allocator.
> +*/
> +   while (spfn < end_pfn) {
> +   deferred_init_maxorder(&i, zone, &spfn, &epfn);
> +   cond_resched();
> +   }
> +}
> +
>  /* Initialise remaining memory on a node */
>  static int __init deferred_init_memmap(void *data)
>  {
> @@ -1823,7 +1844,7 @@ static int __init deferred_init_memmap(void *data)
> unsigned long first_init_pfn, flags;
> unsigned long start = jiffies;
> struct zone *zone;
> -   int zid;
> +   int zid, max_threads;
> u64 i;
>
> /* Bind memory initialisation thread to a local node if possible */
> @@ -1863,13 +1884,26 @@ static int __init deferred_init_memmap(void *data)
> goto zone_empty;
>
> /*
> -* Initialize and free pages in MAX_ORDER sized increments so
> -* that we can avoid introducing any issues with the buddy
> -* allocator.
> +* More CPUs always led to greater speedups on tested systems, up to
> +* all the nodes' CPUs.  Use all since the system is otherwise idle now.
>  */
> +   max_threads = max(cpumask_weight(cpumask), 1u);
> +
> while (spfn < epfn) {
> -   deferred_init_maxorder(&i, zone, &spfn, &epfn);
> -   cond_resched();
> +   unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
> +   struct padata_mt_job job = {
> +   .thread_fn   = deferred_init_memmap_chunk,
> +   .fn_arg  = zone,
> +   .start   = spfn,
> +   .size= epfn_align - spfn,
> +   .align   = PAGES_PER_SECTION,
> +   .min_chunk   = PAGES_PER_SECTION,
> +   .max_threads = max_threads,
> +   };
> +
> +   padata_do_multithreaded(&job);
> +   deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
> +   epfn_align);
> }
>  zone_empty:
> /* Sanity check that the next zone really is unpopulated */

So I am not a huge fan of using deferred_init_mem_pfn_range_in_zone
simply for the fact that we end up essentially discarding the i value
and will have to walk the list repeatedly. However, I don't think the
overhead will be that great, as I suspect there aren't going to be
systems with that many ranges. So this is probably fine.
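
For reference, here is a rough sketch of the re-walk being discussed,
paraphrased for illustration rather than copied from the upstream helper:
each thread's chunk callback restarts the memblock range iterator from the
beginning of the zone and scans forward to the range containing its
start_pfn, instead of resuming from a cached iterator position.

	/*
	 * Illustrative only: roughly what deferred_init_mem_pfn_range_in_zone()
	 * has to do for every chunk -- restart the iterator and scan forward.
	 */
	static void __init
	find_chunk_start(struct zone *zone, unsigned long start_pfn,
			 u64 *i, unsigned long *spfn, unsigned long *epfn)
	{
		for_each_free_mem_pfn_range_in_zone(*i, zone, spfn, epfn) {
			if (*epfn <= start_pfn)
				continue;		/* range ends before our chunk */
			if (*spfn < start_pfn)
				*spfn = start_pfn;	/* clamp to our chunk start */
			return;
		}
	}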

Reviewed-by: Alexander Duyck 


Re: [PATCH v3 5/8] mm: don't track number of pages during deferred initialization

2020-05-27 Thread Alexander Duyck
On Wed, May 27, 2020 at 10:37 AM Daniel Jordan
 wrote:
>
> Deferred page init used to report the number of pages initialized:
>
>   node 0 initialised, 32439114 pages in 97ms
>
> Tracking this makes the code more complicated when using multiple
> threads.  Given that the statistic probably has limited value,
> especially since a zone grows on demand so that the page count can vary,
> just remove it.
>
> The boot message now looks like
>
>   node 0 deferred pages initialised in 97ms
>
> Signed-off-by: Daniel Jordan 
> Suggested-by: Alexander Duyck 

This looks good to me.

Reviewed-by: Alexander Duyck 

> ---
>  mm/page_alloc.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d0c0d9364aa6d..d64f3027fdfa6 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1819,7 +1819,7 @@ static int __init deferred_init_memmap(void *data)
>  {
> pg_data_t *pgdat = data;
> const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> -   unsigned long spfn = 0, epfn = 0, nr_pages = 0;
> +   unsigned long spfn = 0, epfn = 0;
> unsigned long first_init_pfn, flags;
> unsigned long start = jiffies;
> struct zone *zone;
> @@ -1868,15 +1868,15 @@ static int __init deferred_init_memmap(void *data)
>  * allocator.
>  */
> while (spfn < epfn) {
> -   nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> +   deferred_init_maxorder(&i, zone, &spfn, &epfn);
> cond_resched();
> }
>  zone_empty:
> /* Sanity check that the next zone really is unpopulated */
> WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
>
> -   pr_info("node %d initialised, %lu pages in %ums\n",
> -   pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
> +   pr_info("node %d deferred pages initialised in %ums\n",
> +   pgdat->node_id, jiffies_to_msecs(jiffies - start));
>
> pgdat_init_report_one_done();
> return 0;
> --
> 2.26.2
>
>


Re: [PATCH v2 5/7] mm: parallelize deferred_init_memmap()

2020-05-21 Thread Alexander Duyck
On Thu, May 21, 2020 at 8:37 AM Daniel Jordan
 wrote:
>
> On Wed, May 20, 2020 at 06:29:32PM -0700, Alexander Duyck wrote:
> > On Wed, May 20, 2020 at 11:27 AM Daniel Jordan
> > > @@ -1814,16 +1815,44 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
> > > return nr_pages;
> > >  }
> > >
> > > +struct definit_args {
> > > +   struct zone *zone;
> > > +   atomic_long_t nr_pages;
> > > +};
> > > +
> > > +static void __init
> > > +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
> > > +  void *arg)
> > > +{
> > > +   unsigned long spfn, epfn, nr_pages = 0;
> > > +   struct definit_args *args = arg;
> > > +   struct zone *zone = args->zone;
> > > +   u64 i;
> > > +
> > > +   deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
> > > +
> > > +   /*
> > > +* Initialize and free pages in MAX_ORDER sized increments so that we
> > > +* can avoid introducing any issues with the buddy allocator.
> > > +*/
> > > +   while (spfn < end_pfn) {
> > > +   nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> > > +   cond_resched();
> > > +   }
> > > +
> > > +   atomic_long_add(nr_pages, &args->nr_pages);
> > > +}
> > > +
> >
> > Personally I would get rid of nr_pages entirely. It isn't worth the
> > cache thrash to have this atomic variable bouncing around.
>
> One of the things I tried to optimize was the managed_pages atomic adds in
> __free_pages_core, but performance stayed the same on the biggest machine I
> tested when it was done once at the end of page init instead of in every
> thread for every pageblock.
>
> I'm not sure this atomic would matter either, given it's less frequent.

It is more about not bothering with the extra tracking. We don't
really need it and having it doesn't really add much in the way of
value.

> > You could
> > probably just have this function return void since all nr_pages is
> > used for is a pr_info statement at the end of the initialization,
> > which will be completely useless now since we really have the
> > threads running in parallel anyway.
>
> The timestamp is still useful for observability, page init is a significant
> part of kernel boot on big machines, over 10% sometimes with these patches.

Agreed.

> It's mostly the time that matters though, I agree the number of pages is less
> important and is probably worth removing just to simplify the code.  I'll do
> it if no one sees a reason to keep it.

Sounds good.

> > We only really need the nr_pages logic in deferred_grow_zone in order
> > to track if we have freed enough pages to allow us to go back to what
> > we were doing.
> >
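
(For readers following along: a trimmed, paraphrased sketch of the quota
loop in deferred_grow_zone() where that page count is genuinely needed --
names and details simplified, see mm/page_alloc.c for the real function.)

	static bool __init grow_zone_sketch(struct zone *zone, unsigned int order,
					    u64 i, unsigned long spfn,
					    unsigned long epfn)
	{
		unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
		unsigned long sect_start, nr_pages = 0;

		while (spfn < epfn) {
			sect_start = spfn;
			nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);

			/* only stop along section boundaries */
			if ((sect_start ^ spfn) < PAGES_PER_SECTION)
				continue;

			/* stop once enough pages have been freed for the caller */
			if (nr_pages >= nr_pages_needed)
				break;
		}
		return nr_pages > 0;
	}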
> > > @@ -1863,11 +1892,32 @@ static int __init deferred_init_memmap(void *data)
> > > goto zone_empty;
> > >
> > > /*
> > > -* Initialize and free pages in MAX_ORDER sized increments so
> > > -* that we can avoid introducing any issues with the buddy
> > > -* allocator.
> > > +* More CPUs always led to greater speedups on tested systems, up to
> > > +* all the nodes' CPUs.  Use all since the system is otherwise idle now.
> > >  */
> > > +   max_threads = max(cpumask_weight(cpumask), 1u);
> > > +
> > > while (spfn < epfn) {
> > > +   epfn_align = ALIGN_DOWN(epfn, PAGES_PER_SECTION);
> > > +
> > > +   if (IS_ALIGNED(spfn, PAGES_PER_SECTION) &&
> > > +   epfn_align - spfn >= PAGES_PER_SECTION) {
> > > +   struct definit_args arg = { zone, ATOMIC_LONG_INIT(0) };
> > > +   struct padata_mt_job job = {
> > > +   .thread_fn   = deferred_init_memmap_chunk,
> > > +   .fn_arg  = &arg,
> > > +   .start   = spfn,
> > > +   .size= epfn_align - spfn,
> > > +   .align   = PAGES_PER_SECTION,
> > > +   

Re: [PATCH v2 5/7] mm: parallelize deferred_init_memmap()

2020-05-21 Thread Alexander Duyck
On Wed, May 20, 2020 at 6:29 PM Alexander Duyck
 wrote:
>
> On Wed, May 20, 2020 at 11:27 AM Daniel Jordan
>  wrote:
> >
> > Deferred struct page init is a significant bottleneck in kernel boot.
> > Optimizing it maximizes availability for large-memory systems and allows
> > spinning up short-lived VMs as needed without having to leave them
> > running.  It also benefits bare metal machines hosting VMs that are
> > sensitive to downtime.  In projects such as VMM Fast Restart[1], where
> > guest state is preserved across kexec reboot, it helps prevent
> > application and network timeouts in the guests.
> >
> > Multithread to take full advantage of system memory bandwidth.
> >
> > The maximum number of threads is capped at the number of CPUs on the
> > node because speedups always improve with additional threads on every
> > system tested, and at this phase of boot, the system is otherwise idle
> > and waiting on page init to finish.
> >
> > Helper threads operate on section-aligned ranges to both avoid false
> > sharing when setting the pageblock's migrate type and to avoid accessing
> > uninitialized buddy pages, though max order alignment is enough for the
> > latter.
> >
> > The minimum chunk size is also a section.  There was benefit to using
> > multiple threads even on relatively small memory (1G) systems, and this
> > is the smallest size that the alignment allows.
> >
> > The time (milliseconds) is the slowest node to initialize since boot
> > blocks until all nodes finish.  intel_pstate is loaded in active mode
> > without hwp and with turbo enabled, and intel_idle is active as well.
> >
> > Intel(R) Xeon(R) Platinum 8167M CPU @ 2.00GHz (Skylake, bare metal)
> >   2 nodes * 26 cores * 2 threads = 104 CPUs
> >   384G/node = 768G memory
> >
> >                kernel boot                 deferred init
> >
> > node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
> >       (  0)         --   4078.0 (  9.0)         --   1779.0 (  8.7)
> >    2% (  1)       1.4%   4021.3 (  2.9)       3.4%   1717.7 (  7.8)
> >   12% (  6)      35.1%   2644.7 ( 35.3)      80.8%    341.0 ( 35.5)
> >   25% ( 13)      38.7%   2498.0 ( 34.2)      89.1%    193.3 ( 32.3)
> >   37% ( 19)      39.1%   2482.0 ( 25.2)      90.1%    175.3 ( 31.7)
> >   50% ( 26)      38.8%   2495.0 (  8.7)      89.1%    193.7 (  3.5)
> >   75% ( 39)      39.2%   2478.0 ( 21.0)      90.3%    172.7 ( 26.7)
> >  100% ( 52)      40.0%   2448.0 (  2.0)      91.9%    143.3 (  1.5)
> >
> > Intel(R) Xeon(R) CPU E5-2699C v4 @ 2.20GHz (Broadwell, bare metal)
> >   1 node * 16 cores * 2 threads = 32 CPUs
> >   192G/node = 192G memory
> >
> >                kernel boot                 deferred init
> >
> > node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
> >       (  0)         --   1996.0 ( 18.0)         --   1104.3 (  6.7)
> >    3% (  1)       1.4%   1968.0 (  3.0)       2.7%   1074.7 (  9.0)
> >   12% (  4)      40.1%   1196.0 ( 22.7)      72.4%    305.3 ( 16.8)
> >   25% (  8)      47.4%   1049.3 ( 17.2)      84.2%    174.0 ( 10.6)
> >   37% ( 12)      48.3%   1032.0 ( 14.9)      86.8%    145.3 (  2.5)
> >   50% ( 16)      48.9%   1020.3 (  2.5)      88.0%    133.0 (  1.7)
> >   75% ( 24)      49.1%   1016.3 (  8.1)      88.4%    128.0 (  1.7)
> >  100% ( 32)      49.4%   1009.0 (  8.5)      88.6%    126.3 (  0.6)
> >
> > Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, bare metal)
> >   2 nodes * 18 cores * 2 threads = 72 CPUs
> >   128G/node = 256G memory
> >
> >                kernel boot                 deferred init
> >
> > node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
> >       (  0)         --   1682.7 (  6.7)         --    630.0 (  4.6)
> >    3% (  1)       0.4%   1676.0 (  2.0)       0.7%    625.3 (  3.2)
> >   12% (  4)      25.8%   1249.0 (  1.0)      68.2%    200.3 (  1.2)
> >   25% (  9)      30.0%   1178.0 (  5.2)      79.7%    128.0 (  3.5)
> >   37% ( 13)      30.6%   1167.7 (  3.1)      81.3%    117.7 (  1.2)
> >   50% ( 18)      30.6%   1167.3 (  2.3)      81.4%    117.0 (  1.0)
> >   75% ( 27)      31.0%   1161.3 (  4.6)      82.5%    110.0 (  6.9)
> > 

Re: [PATCH v2 5/7] mm: parallelize deferred_init_memmap()

2020-05-20 Thread Alexander Duyck
On Wed, May 20, 2020 at 11:27 AM Daniel Jordan
 wrote:
>
> Deferred struct page init is a significant bottleneck in kernel boot.
> Optimizing it maximizes availability for large-memory systems and allows
> spinning up short-lived VMs as needed without having to leave them
> running.  It also benefits bare metal machines hosting VMs that are
> sensitive to downtime.  In projects such as VMM Fast Restart[1], where
> guest state is preserved across kexec reboot, it helps prevent
> application and network timeouts in the guests.
>
> Multithread to take full advantage of system memory bandwidth.
>
> The maximum number of threads is capped at the number of CPUs on the
> node because speedups always improve with additional threads on every
> system tested, and at this phase of boot, the system is otherwise idle
> and waiting on page init to finish.
>
> Helper threads operate on section-aligned ranges to both avoid false
> sharing when setting the pageblock's migrate type and to avoid accessing
> uninitialized buddy pages, though max order alignment is enough for the
> latter.
>
> The minimum chunk size is also a section.  There was benefit to using
> multiple threads even on relatively small memory (1G) systems, and this
> is the smallest size that the alignment allows.
>
> The time (milliseconds) is the slowest node to initialize since boot
> blocks until all nodes finish.  intel_pstate is loaded in active mode
> without hwp and with turbo enabled, and intel_idle is active as well.
>
> Intel(R) Xeon(R) Platinum 8167M CPU @ 2.00GHz (Skylake, bare metal)
>   2 nodes * 26 cores * 2 threads = 104 CPUs
>   384G/node = 768G memory
>
>                kernel boot                 deferred init
>
> node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>       (  0)         --   4078.0 (  9.0)         --   1779.0 (  8.7)
>    2% (  1)       1.4%   4021.3 (  2.9)       3.4%   1717.7 (  7.8)
>   12% (  6)      35.1%   2644.7 ( 35.3)      80.8%    341.0 ( 35.5)
>   25% ( 13)      38.7%   2498.0 ( 34.2)      89.1%    193.3 ( 32.3)
>   37% ( 19)      39.1%   2482.0 ( 25.2)      90.1%    175.3 ( 31.7)
>   50% ( 26)      38.8%   2495.0 (  8.7)      89.1%    193.7 (  3.5)
>   75% ( 39)      39.2%   2478.0 ( 21.0)      90.3%    172.7 ( 26.7)
>  100% ( 52)      40.0%   2448.0 (  2.0)      91.9%    143.3 (  1.5)
>
> Intel(R) Xeon(R) CPU E5-2699C v4 @ 2.20GHz (Broadwell, bare metal)
>   1 node * 16 cores * 2 threads = 32 CPUs
>   192G/node = 192G memory
>
>                kernel boot                 deferred init
>
> node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>       (  0)         --   1996.0 ( 18.0)         --   1104.3 (  6.7)
>    3% (  1)       1.4%   1968.0 (  3.0)       2.7%   1074.7 (  9.0)
>   12% (  4)      40.1%   1196.0 ( 22.7)      72.4%    305.3 ( 16.8)
>   25% (  8)      47.4%   1049.3 ( 17.2)      84.2%    174.0 ( 10.6)
>   37% ( 12)      48.3%   1032.0 ( 14.9)      86.8%    145.3 (  2.5)
>   50% ( 16)      48.9%   1020.3 (  2.5)      88.0%    133.0 (  1.7)
>   75% ( 24)      49.1%   1016.3 (  8.1)      88.4%    128.0 (  1.7)
>  100% ( 32)      49.4%   1009.0 (  8.5)      88.6%    126.3 (  0.6)
>
> Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, bare metal)
>   2 nodes * 18 cores * 2 threads = 72 CPUs
>   128G/node = 256G memory
>
>                kernel boot                 deferred init
>
> node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>       (  0)         --   1682.7 (  6.7)         --    630.0 (  4.6)
>    3% (  1)       0.4%   1676.0 (  2.0)       0.7%    625.3 (  3.2)
>   12% (  4)      25.8%   1249.0 (  1.0)      68.2%    200.3 (  1.2)
>   25% (  9)      30.0%   1178.0 (  5.2)      79.7%    128.0 (  3.5)
>   37% ( 13)      30.6%   1167.7 (  3.1)      81.3%    117.7 (  1.2)
>   50% ( 18)      30.6%   1167.3 (  2.3)      81.4%    117.0 (  1.0)
>   75% ( 27)      31.0%   1161.3 (  4.6)      82.5%    110.0 (  6.9)
>  100% ( 36)      32.1%   1142.0 (  3.6)      85.7%     90.0 (  1.0)
>
> AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
>   1 node * 8 cores * 2 threads = 16 CPUs
>   64G/node = 64G memory
>
>                kernel boot                 deferred init
>
> node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>       (  0)         --   1003.7 ( 16.6)         --    243.3 (  8.1)
>    6% (  1)       1.4%    990.0 (  4.6)       1.2%    240.3 (  1.5)
>   12% (  2)      11.4%    889.3 ( 16.7)      44.5%    135.0 (  3.0)
>   25% (  4)      16.8%    835.3 (  9.0)

Re: linux-next: Tree for Oct 4

2018-10-05 Thread Alexander Duyck

On 10/4/2018 10:39 PM, Stephen Rothwell wrote:

Hi Guenter,

On Thu, 4 Oct 2018 18:33:02 -0700 Guenter Roeck  wrote:


Most of the boot failures are hopefully fixed with
https://lore.kernel.org/patchwork/patch/995254/


I have added that commit to linux-next today.



After getting over that I ended up running into an issue with a NULL
pointer dereference when updating static keys/jump labels that was fixed by:

https://lore.kernel.org/patchwork/patch/993864/

Thanks.

- Alex


Re: [PATCH] dma-direct: Fix return value of dma_direct_supported

2018-10-04 Thread Alexander Duyck
On Thu, Oct 4, 2018 at 4:25 AM Robin Murphy  wrote:
>
> On 04/10/18 00:48, Alexander Duyck wrote:
> > It appears that in commit 9d7a224b463e ("dma-direct: always allow dma mask
> > <= physiscal memory size") the logic of the test was changed from a "<" to
> > a ">=" however I don't see any reason for that change. I am assuming that
> > there was some additional change planned, specifically I suspect the logic
> > was intended to be reversed and possibly used for a return. Since that is
> > the case I have gone ahead and done that.
>
> Bah, seems I got hung up on the min_mask code above it and totally
> overlooked that the condition itself got flipped. It probably also can't
> help that it's an int return type, but treated as a bool by callers
> rather than "0 for success" as int tends to imply in isolation.
>
> Anyway, paying a bit more attention this time, I think this looks like
> the right fix - cheers Alex.
>
> Robin.

Thanks for the review.

- Alex

P.S. It looks like I forgot to add Christoph to the original mail
since I had just copied the To and Cc from the original submission, so
I added him to the Cc for this.

> > This addresses issues I had on my system that prevented me from booting
> > with the above mentioned commit applied on an x86_64 system w/ Intel IOMMU.
> >
> > Fixes: 9d7a224b463e ("dma-direct: always allow dma mask <= physiscal memory size")
> > Signed-off-by: Alexander Duyck 
> > ---
> >   kernel/dma/direct.c |4 +---
> >   1 file changed, 1 insertion(+), 3 deletions(-)
> >
> > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> > index 5a0806b5351b..65872f6c2e93 100644
> > --- a/kernel/dma/direct.c
> > +++ b/kernel/dma/direct.c
> > @@ -301,9 +301,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
> >
> >   min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
> >
> > - if (mask >= phys_to_dma(dev, min_mask))
> > - return 0;
> > - return 1;
> > + return mask >= phys_to_dma(dev, min_mask);
> >   }
> >
> >   int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
> >


[PATCH] dma-direct: Fix return value of dma_direct_supported

2018-10-03 Thread Alexander Duyck
It appears that in commit 9d7a224b463e ("dma-direct: always allow dma mask
<= physiscal memory size") the logic of the test was changed from a "<" to
a ">="; however, I don't see any reason for that change. I am assuming that
there was some additional change planned; specifically, I suspect the logic
was intended to be reversed and possibly used for a return. Since that is
the case, I have gone ahead and done that.

This addresses issues I had on my system that prevented me from booting
with the above mentioned commit applied on an x86_64 system w/ Intel IOMMU.

Fixes: 9d7a224b463e ("dma-direct: always allow dma mask <= physiscal memory size")
Signed-off-by: Alexander Duyck 
---
 kernel/dma/direct.c |4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 5a0806b5351b..65872f6c2e93 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -301,9 +301,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
 
min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
 
-   if (mask >= phys_to_dma(dev, min_mask))
-   return 0;
-   return 1;
+   return mask >= phys_to_dma(dev, min_mask);
 }
 
 int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)



Re: [PATCH 5/5] dma-direct: always allow dma mask <= physiscal memory size

2018-10-03 Thread Alexander Duyck
On Thu, Sep 27, 2018 at 3:38 PM Christoph Hellwig  wrote:
>
> This way an architecture with less than 4G of RAM can support dma_mask
> smaller than 32-bit without a ZONE_DMA.  Apparently that is a common
> case on powerpc.
>
> Signed-off-by: Christoph Hellwig 
> Reviewed-by: Robin Murphy 
> ---
>  kernel/dma/direct.c | 28 
>  1 file changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 60c433b880e0..170bd322a94a 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -284,21 +284,25 @@ int dma_direct_map_sg(struct device *dev, struct 
> scatterlist *sgl, int nents,
> return nents;
>  }
>
> +/*
> + * Because 32-bit DMA masks are so common we expect every architecture to be
> > + * able to satisfy them - either by not supporting more physical memory, or by
> + * providing a ZONE_DMA32.  If neither is the case, the architecture needs to
> + * use an IOMMU instead of the direct mapping.
> + */
>  int dma_direct_supported(struct device *dev, u64 mask)
>  {
> -#ifdef CONFIG_ZONE_DMA
> -   if (mask < phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)))
> -   return 0;
> -#else
> -   /*
> -* Because 32-bit DMA masks are so common we expect every architecture
> -* to be able to satisfy them - either by not supporting more physical
> -* memory, or by providing a ZONE_DMA32.  If neither is the case, the
> -* architecture needs to use an IOMMU instead of the direct mapping.
> -*/
> -   if (mask < phys_to_dma(dev, DMA_BIT_MASK(32)))
> +   u64 min_mask;
> +
> +   if (IS_ENABLED(CONFIG_ZONE_DMA))
> +   min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
> +   else
> +   min_mask = DMA_BIT_MASK(32);
> +
> +   min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
> +
> +   if (mask >= phys_to_dma(dev, min_mask))
> return 0;
> -#endif
> return 1;
>  }

So I believe I have run into the same issue that Guenter reported, on
an x86_64 system w/ Intel IOMMU. I wasn't able to complete boot, and
all probe attempts for various devices were failing with -EIO errors.

I believe the last mask check should be "if (mask < phys_to_dma(dev,
min_mask))" not a ">=" check.

- Alex


Re: [PATCH 03/30] mm: remove CONFIG_HAVE_MEMBLOCK

2018-09-26 Thread Alexander Duyck
On Wed, Sep 26, 2018 at 11:32 AM Mike Rapoport  wrote:
>
> On Wed, Sep 26, 2018 at 09:58:41AM -0700, Alexander Duyck wrote:
> > On Fri, Sep 14, 2018 at 5:11 AM Mike Rapoport  
> > wrote:
> > >
> > > All architecures use memblock for early memory management. There is no 
> > > need
> > > for the CONFIG_HAVE_MEMBLOCK configuration option.
> > >
> > > Signed-off-by: Mike Rapoport 
> >
> > 
> >
> > > diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> > > index 5169205..4ae91fc 100644
> > > --- a/include/linux/memblock.h
> > > +++ b/include/linux/memblock.h
> > > @@ -2,7 +2,6 @@
> > >  #define _LINUX_MEMBLOCK_H
> > >  #ifdef __KERNEL__
> > >
> > > -#ifdef CONFIG_HAVE_MEMBLOCK
> > >  /*
> > >   * Logical memory blocks.
> > >   *
> > > @@ -460,7 +459,6 @@ static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
> > >  {
> > > return 0;
> > >  }
> > > -#endif /* CONFIG_HAVE_MEMBLOCK */
> > >
> > >  #endif /* __KERNEL__ */
> >
> > There was an #else above this section and I believe it and the code
> > after it needs to be stripped as well.
>
> Right, I've already sent the fix [1] and it's in mmots.
>
> [1] https://lkml.org/lkml/2018/9/19/416
>

Are you sure? The patch you reference appears to be for
drivers/of/fdt.c, and the bit I pointed out here is in
include/linux/memblock.h.

- Alex


Re: [PATCH 03/30] mm: remove CONFIG_HAVE_MEMBLOCK

2018-09-26 Thread Alexander Duyck
On Fri, Sep 14, 2018 at 5:11 AM Mike Rapoport  wrote:
>
> All architecures use memblock for early memory management. There is no need
> for the CONFIG_HAVE_MEMBLOCK configuration option.
>
> Signed-off-by: Mike Rapoport 



> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 5169205..4ae91fc 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -2,7 +2,6 @@
>  #define _LINUX_MEMBLOCK_H
>  #ifdef __KERNEL__
>
> -#ifdef CONFIG_HAVE_MEMBLOCK
>  /*
>   * Logical memory blocks.
>   *
> @@ -460,7 +459,6 @@ static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
>  {
> return 0;
>  }
> -#endif /* CONFIG_HAVE_MEMBLOCK */
>
>  #endif /* __KERNEL__ */

There was an #else above this section and I believe it and the code
after it needs to be stripped as well.
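
For context, the surrounding structure in include/linux/memblock.h looks
roughly like this (an abridged sketch, not the verbatim header): the patch
drops the #ifdef/#endif pair, but the #else branch with its inline stubs is
what also needs to go.

	#ifdef CONFIG_HAVE_MEMBLOCK
	/* ... the real memblock declarations ... */
	#else
	/* stubs for !CONFIG_HAVE_MEMBLOCK, e.g.: */
	static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
	{
		return 0;
	}
	#endif /* CONFIG_HAVE_MEMBLOCK */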


Re: RFC on writel and writel_relaxed

2018-03-27 Thread Alexander Duyck
On Tue, Mar 27, 2018 at 2:35 PM, Benjamin Herrenschmidt
 wrote:
> On Tue, 2018-03-27 at 10:46 -0400, Sinan Kaya wrote:
>>  combined buffers.
>>
>> Alex:
>> "Don't bother. I can tell you right now that for x86 you have to have a
>> wmb() before the writel().
>
> No, this isn't the semantics of writel. You shouldn't need it unless
> something changed and we need to revisit our complete understanding of
> *all* MMIO accessor semantics.

The issue seems to be that there have been two different ways of
dealing with this. There have historically been a number of different
drivers that have been carrying this wmb() workaround since something
like 2002. I get that the semantics for writel might have changed
since then, but those of us who already have the wmb() in our drivers
will be very wary of anyone wanting to go through and remove them
since writel is supposed to be "good enough". I would much rather err
on the side of caution here.

I view the wmb() + writel_relaxed() approach as more of a driver owning and
handling this ordering itself. Besides, in the Intel Ethernet driver case it
gives better performance, as our wmb() placement also provides a secondary
barrier, so we don't need to add a separate smp_wmb() to deal with a
potential race we have with the Tx cleanup.
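
For concreteness, here is a minimal sketch of the pattern being described,
with a made-up ring/descriptor layout (not taken from any real driver --
only the barrier placement is the point):

	/* hypothetical Tx descriptor ring */
	struct tx_ring {
		struct tx_desc {
			__le64 addr;
			__le32 cmd;
			__le32 len;
		} *desc;			/* coherent DMA memory */
		void __iomem *tail_reg;		/* device doorbell */
	};

	static void tx_ring_kick(struct tx_ring *ring, u32 tail,
				 dma_addr_t buf, u32 len)
	{
		ring->desc[tail].addr = cpu_to_le64(buf);
		ring->desc[tail].len  = cpu_to_le32(len);
		ring->desc[tail].cmd  = cpu_to_le32(1);	/* "ready" bit, made up */

		/* make the descriptor visible to the device before the doorbell */
		wmb();

		/* ordering already provided above, so the MMIO itself can be relaxed */
		writel_relaxed(tail, ring->tail_reg);
	}

The plain writel() form simply drops the explicit wmb() and relies on the
accessor's built-in ordering, which is the guarantee being debated in this
thread.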

> At least for UC space, it has always been accepted (and enforced) that
> writel would not require any other barrier to order vs. previous stores
> to memory.

So the one thing I would question here is if this is UC vs UC or if
this extends to other types as well? So for x86 we could find
references to Write Combining being flushed by a write to UC memory,
however I have yet to find a clear explanation of what a write to UC
does to WB. My personal inclination would be to err on the side of
caution. I just don't want us going through and removing the wmb()
calls because it "should" work. I would want to know for certain it
will work.

- Alex


Re: RFC on writel and writel_relaxed

2018-03-27 Thread Alexander Duyck
On Tue, Mar 27, 2018 at 8:10 AM, Will Deacon  wrote:
> Hi Alex,
>
> On Tue, Mar 27, 2018 at 10:46:58AM -0400, Sinan Kaya wrote:
>> +netdev, +Alex
>>
>> On 3/26/2018 6:00 PM, Benjamin Herrenschmidt wrote:
>> > On Mon, 2018-03-26 at 23:30 +0200, Arnd Bergmann wrote:
>> >>  Most of the drivers have a unwound loop with writeq() or something to
>> >>> do it.
>> >>
>> >> But isn't the writeq() barrier much more expensive than anything you'd
>> >> do in function calls?
>> >
>> > It is for us, and will break any write combining.
>> >
>> >>>>> The same document says that _relaxed() does not give that guarentee.
>> >>>>>
>> >>>>> The lwn articule on this went into some depth on the interaction with
>> >>>>> spinlocks.
>> >>>>>
>> >>>>> As far as I can see, containment in a spinlock seems to be the only
>> >>>>> different between writel and writel_relaxed..
>> >>>>
>> >>>> I was always puzzled by this: The intention of _relaxed() on ARM
>> >>>> (where it originates) was to skip the barrier that serializes DMA
>> >>>> with MMIO, not to skip the serialization between MMIO and locks.
>> >>>
>> >>> But that was never a requirement of writel(),
>> >>> Documentation/memory-barriers.txt gives an explicit example demanding
>> >>> the wmb() before writel() for ordering system memory against writel.
>> >
>> > This is a bug in the documentation.
>> >
>> >> Indeed, but it's in an example for when to use dma_wmb(), not wmb().
>> >> Adding Alexander Duyck to Cc, he added that section as part of
>> >> 1077fa36f23e ("arch: Add lightweight memory barriers dma_rmb() and
>> >> dma_wmb()"). Also adding the other people that were involved with that.
>> >
>> > Linus himself made it very clear years ago. readl and writel have to
>> > order vs memory accesses.
>> >
>> >>> I actually have no idea why ARM had that barrier, I always assumed it
>> >>> was to give program ordering to the accesses and that _relaxed allowed
>> >>> re-ordering (the usual meaning of relaxed)..
>> >>>
>> >>> But the barrier document makes it pretty clear that the only
>> >>> difference between the two is spinlock containment, and WillD wrote
>> >>> this text, so I belive it is accurate for ARM.
>> >>>
>> >>> Very confusing.
>> >>
>> >> It does mention serialization with both DMA and locks in the
>> >> section about  readX_relaxed()/writeX_relaxed(). The part
>> >> about DMA is very clear here, and I must have just forgotten
>> >> the exact semantics with regards to spinlocks. I'm still not
>> >> sure what prevents a writel() from leaking out the end of a
>> >> spinlock section that doesn't happen with writel_relaxed(), since
>> >> the barrier in writel() comes before the access, and the
>> >> spin_unlock() shouldn't affect the external buses.
>> >
>> > So...
>> >
>> > Historically, what happened is that we (we means whoever participated
>> > in the discussion on the list with Linus calling the shots really)
>> > decided that there was no sane way for drivers to understand a world
>> > where readl/writel didn't fully order things vs. memory accesses (ie,
>> > DMA).
>> >
>> > So it should always be correct to do:
>> >
>> > - Write to some in-memory buffer
>> > - writel() to kick the DMA read of that buffer
>> >
>> > without any extra barrier.
>> >
>> > The spinlock situation however got murky. Mostly that came up because
>> > on architecture (I forgot who, might have been ia64) has a hard time
>> > providing that consistency without making writel insanely expensive.
>> >
>> > Thus they created mmiowb whose main purpose was precisely to order
>> > writel with a following spin_unlock.
>> >
>> > I decided not to go down that path on power because getting all drivers
>> > "fixed" to do the right thing was going to be a losing battle, and
>> > instead added per-cpu tracking of writel in order to "escalate" to a
>> > heavier barrier in spin_unlock itself when necessary.
>> >
>> > Now, all this happen

Re: [PATCH 03/44] dmaengine: ioat: don't use DMA_ERROR_CODE

2017-06-16 Thread Alexander Duyck
On Fri, Jun 16, 2017 at 11:10 AM, Christoph Hellwig  wrote:
> DMA_ERROR_CODE is not a public API and will go away.  Instead properly
> unwind based on the loop counter.
>
> Signed-off-by: Christoph Hellwig 
> Acked-by: Dave Jiang 
> Acked-By: Vinod Koul 
> ---
>  drivers/dma/ioat/init.c | 24 +++-
>  1 file changed, 7 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c
> index 6ad4384b3fa8..ed8ed1192775 100644
> --- a/drivers/dma/ioat/init.c
> +++ b/drivers/dma/ioat/init.c
> @@ -839,8 +839,6 @@ static int ioat_xor_val_self_test(struct ioatdma_device *ioat_dma)
> goto free_resources;
> }
>
> -   for (i = 0; i < IOAT_NUM_SRC_TEST; i++)
> -   dma_srcs[i] = DMA_ERROR_CODE;
> for (i = 0; i < IOAT_NUM_SRC_TEST; i++) {
> dma_srcs[i] = dma_map_page(dev, xor_srcs[i], 0, PAGE_SIZE,
>DMA_TO_DEVICE);
> @@ -910,8 +908,6 @@ static int ioat_xor_val_self_test(struct ioatdma_device *ioat_dma)
>
> xor_val_result = 1;
>
> -   for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
> -   dma_srcs[i] = DMA_ERROR_CODE;
> for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++) {
> dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE,
>DMA_TO_DEVICE);
> @@ -965,8 +961,6 @@ static int ioat_xor_val_self_test(struct ioatdma_device *ioat_dma)
> op = IOAT_OP_XOR_VAL;
>
> xor_val_result = 0;
> -   for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
> -   dma_srcs[i] = DMA_ERROR_CODE;
> for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++) {
> dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE,
>DMA_TO_DEVICE);
> @@ -1017,18 +1011,14 @@ static int ioat_xor_val_self_test(struct ioatdma_device *ioat_dma)
> goto free_resources;
>  dma_unmap:
> if (op == IOAT_OP_XOR) {
> -   if (dest_dma != DMA_ERROR_CODE)
> -   dma_unmap_page(dev, dest_dma, PAGE_SIZE,
> -  DMA_FROM_DEVICE);
> -   for (i = 0; i < IOAT_NUM_SRC_TEST; i++)
> -   if (dma_srcs[i] != DMA_ERROR_CODE)
> -   dma_unmap_page(dev, dma_srcs[i], PAGE_SIZE,
> -  DMA_TO_DEVICE);
> +   while (--i >= 0)
> +   dma_unmap_page(dev, dma_srcs[i], PAGE_SIZE,
> +  DMA_TO_DEVICE);
> +   dma_unmap_page(dev, dest_dma, PAGE_SIZE, DMA_FROM_DEVICE);
> } else if (op == IOAT_OP_XOR_VAL) {
> -   for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
> -   if (dma_srcs[i] != DMA_ERROR_CODE)
> -   dma_unmap_page(dev, dma_srcs[i], PAGE_SIZE,
> -  DMA_TO_DEVICE);
> +   while (--i >= 0)
> +   dma_unmap_page(dev, dma_srcs[i], PAGE_SIZE,
> +  DMA_TO_DEVICE);

Wouldn't it make more sense to pull out the while loop and just call
dma_unmap_page on dest_dma if "op == IOAT_OP_XOR"? Odds are that is what
the compiler is already generating anyway, and it will save a few lines of
code, so what you end up with is something like:

	while (--i >= 0)
		dma_unmap_page(dev, dma_srcs[i], PAGE_SIZE, DMA_TO_DEVICE);
	if (op == IOAT_OP_XOR)
		dma_unmap_page(dev, dest_dma, PAGE_SIZE, DMA_FROM_DEVICE);

> }
>  free_resources:
> dma->device_free_chan_resources(dma_chan);
> --
> 2.11.0
>


[mm PATCH v2 18/26] arch/powerpc: Add option to skip DMA sync as a part of mapping

2016-11-02 Thread Alexander Duyck
This change allows us to pass DMA_ATTR_SKIP_CPU_SYNC which allows us to
avoid invoking cache line invalidation if the driver will just handle it
via a sync_for_cpu or sync_for_device call.
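
Illustrative usage from a hypothetical driver receive path (not part of this
patch; buf, bufsz and pkt_len are made-up names), showing the intent: map
with the CPU sync skipped, then sync only the region the device actually
wrote:

	dma_addr_t dma = dma_map_single_attrs(dev, buf, bufsz, DMA_FROM_DEVICE,
					      DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(dev, dma))
		return -ENOMEM;

	/* ... hardware DMAs pkt_len bytes into the buffer ... */

	dma_sync_single_range_for_cpu(dev, dma, 0, pkt_len, DMA_FROM_DEVICE);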

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Alexander Duyck 
---
 arch/powerpc/kernel/dma.c |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index e64a601..6877e3f 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -203,6 +203,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
for_each_sg(sgl, sg, nents, i) {
sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
sg->dma_length = sg->length;
+
+   if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+   continue;
+
__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
}
 
@@ -235,7 +239,10 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 unsigned long attrs)
 {
BUG_ON(dir == DMA_NONE);
-   __dma_sync_page(page, offset, size, dir);
+
+   if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+   __dma_sync_page(page, offset, size, dir);
+
return page_to_phys(page) + offset + get_dma_offset(dev);
 }
 



[net-next PATCH 18/27] arch/powerpc: Add option to skip DMA sync as a part of mapping

2016-10-25 Thread Alexander Duyck
This change allows us to pass DMA_ATTR_SKIP_CPU_SYNC which allows us to
avoid invoking cache line invalidation if the driver will just handle it
via a sync_for_cpu or sync_for_device call.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Alexander Duyck 
---
 arch/powerpc/kernel/dma.c |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index e64a601..6877e3f 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -203,6 +203,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
for_each_sg(sgl, sg, nents, i) {
sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
sg->dma_length = sg->length;
+
+   if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+   continue;
+
__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
}
 
@@ -235,7 +239,10 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 unsigned long attrs)
 {
BUG_ON(dir == DMA_NONE);
-   __dma_sync_page(page, offset, size, dir);
+
+   if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+   __dma_sync_page(page, offset, size, dir);
+
return page_to_phys(page) + offset + get_dma_offset(dev);
 }
 



[net-next PATCH RFC 17/26] arch/powerpc: Add option to skip DMA sync as a part of mapping

2016-10-24 Thread Alexander Duyck
This change allows us to pass DMA_ATTR_SKIP_CPU_SYNC which allows us to
avoid invoking cache line invalidation if the driver will just handle it
via a sync_for_cpu or sync_for_device call.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Alexander Duyck 
---
 arch/powerpc/kernel/dma.c |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index e64a601..6877e3f 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -203,6 +203,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
for_each_sg(sgl, sg, nents, i) {
sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
sg->dma_length = sg->length;
+
+   if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+   continue;
+
__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
}
 
@@ -235,7 +239,10 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 unsigned long attrs)
 {
BUG_ON(dir == DMA_NONE);
-   __dma_sync_page(page, offset, size, dir);
+
+   if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+   __dma_sync_page(page, offset, size, dir);
+
return page_to_phys(page) + offset + get_dma_offset(dev);
 }