Re: [PATCH v2] lib: optimize cpumask_next_and()

2017-10-26 Thread kbuild test robot
Hi Clement,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.14-rc6 next-20171018]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Clement-Courbet/lib-optimize-cpumask_next_and/20171026-184850
config: m68k-allyesconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 4.9.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   lib/find_bit.c: In function 'find_next_and_bit':
>> lib/find_bit.c:92:2: error: implicit declaration of function 
>> '_find_next_bit' [-Werror=implicit-function-declaration]
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
 ^
   lib/find_bit.c:92:38: error: 'size' undeclared (first use in this function)
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
 ^
   lib/find_bit.c:92:38: note: each undeclared identifier is reported only once 
for each function it appears in
   lib/find_bit.c:92:44: error: 'offset' undeclared (first use in this function)
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
   ^
   lib/find_bit.c:93:1: warning: control reaches end of non-void function 
[-Wreturn-type]
}
^
   cc1: some warnings being treated as errors

vim +/_find_next_bit +92 lib/find_bit.c

86  
87  #if !defined(find_next_and_bit)
88  unsigned long find_next_and_bit(const unsigned long *addr1,
89  const unsigned long *addr2, unsigned long nbits,
90  unsigned long start)
91  {
  > 92  return _find_next_bit(addr1, addr2, size, offset, ~0UL);
93  }
94  EXPORT_SYMBOL(find_next_and_bit);
95  #endif
96  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH v2] lib: optimize cpumask_next_and()

2017-10-26 Thread kbuild test robot
Hi Clement,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.14-rc6 next-20171018]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Clement-Courbet/lib-optimize-cpumask_next_and/20171026-184850
config: m68k-allyesconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 4.9.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   lib/find_bit.c: In function 'find_next_and_bit':
>> lib/find_bit.c:92:2: error: implicit declaration of function 
>> '_find_next_bit' [-Werror=implicit-function-declaration]
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
 ^
   lib/find_bit.c:92:38: error: 'size' undeclared (first use in this function)
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
 ^
   lib/find_bit.c:92:38: note: each undeclared identifier is reported only once 
for each function it appears in
   lib/find_bit.c:92:44: error: 'offset' undeclared (first use in this function)
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
   ^
   lib/find_bit.c:93:1: warning: control reaches end of non-void function 
[-Wreturn-type]
}
^
   cc1: some warnings being treated as errors

vim +/_find_next_bit +92 lib/find_bit.c

86  
87  #if !defined(find_next_and_bit)
88  unsigned long find_next_and_bit(const unsigned long *addr1,
89  const unsigned long *addr2, unsigned long nbits,
90  unsigned long start)
91  {
  > 92  return _find_next_bit(addr1, addr2, size, offset, ~0UL);
93  }
94  EXPORT_SYMBOL(find_next_and_bit);
95  #endif
96  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH v2] lib: optimize cpumask_next_and()

2017-10-26 Thread kbuild test robot
Hi Clement,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.14-rc6 next-20171018]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Clement-Courbet/lib-optimize-cpumask_next_and/20171026-184850
config: i386-randconfig-x001-201743 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   lib/find_bit.c: In function 'find_next_and_bit':
>> lib/find_bit.c:92:38: error: 'size' undeclared (first use in this function)
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
 ^~~~
   lib/find_bit.c:92:38: note: each undeclared identifier is reported only once 
for each function it appears in
>> lib/find_bit.c:92:44: error: 'offset' undeclared (first use in this function)
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
   ^~
>> lib/find_bit.c:93:1: warning: control reaches end of non-void function 
>> [-Wreturn-type]
}
^

vim +/size +92 lib/find_bit.c

86  
87  #if !defined(find_next_and_bit)
88  unsigned long find_next_and_bit(const unsigned long *addr1,
89  const unsigned long *addr2, unsigned long nbits,
90  unsigned long start)
91  {
  > 92  return _find_next_bit(addr1, addr2, size, offset, ~0UL);
  > 93  }
94  EXPORT_SYMBOL(find_next_and_bit);
95  #endif
96  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH v2] lib: optimize cpumask_next_and()

2017-10-26 Thread kbuild test robot
Hi Clement,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.14-rc6 next-20171018]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Clement-Courbet/lib-optimize-cpumask_next_and/20171026-184850
config: i386-randconfig-x001-201743 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   lib/find_bit.c: In function 'find_next_and_bit':
>> lib/find_bit.c:92:38: error: 'size' undeclared (first use in this function)
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
 ^~~~
   lib/find_bit.c:92:38: note: each undeclared identifier is reported only once 
for each function it appears in
>> lib/find_bit.c:92:44: error: 'offset' undeclared (first use in this function)
 return _find_next_bit(addr1, addr2, size, offset, ~0UL);
   ^~
>> lib/find_bit.c:93:1: warning: control reaches end of non-void function 
>> [-Wreturn-type]
}
^

vim +/size +92 lib/find_bit.c

86  
87  #if !defined(find_next_and_bit)
88  unsigned long find_next_and_bit(const unsigned long *addr1,
89  const unsigned long *addr2, unsigned long nbits,
90  unsigned long start)
91  {
  > 92  return _find_next_bit(addr1, addr2, size, offset, ~0UL);
  > 93  }
94  EXPORT_SYMBOL(find_next_and_bit);
95  #endif
96  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: Re [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Yury Norov
On Wed, Oct 25, 2017 at 06:50:14PM +0300, Yury Norov wrote:
> On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote:
> > Thanks for the comments Yury.
> > 
> > > But I'd like also to keep _find_next_bit() consistent with
> > > _find_next_bit_le()
> > 
> > Not sure I understand what you're suggesting here: Do you want a
> > find_next_and_bit_le() or do you want to make _find_next_bit_le() more
> > like _find_next_bit() ? In the latter case we might just want to merge
> > it with _find_next_bit() and end up with an extra is_le parameter :)
> 
> Both ways will work, but I think that extra is_le is too much.
> _find_next_bit_le() should be the copy of _find_next_bit() with the
> addition of swapping code.
> 
> If you don't need find_next_and_bit_le(), don't add it.
> find_{first,last}_bit() doesn't have LE version, for example.

Few comments more.

_find_next_bit is now referenced also by find_next_and_bit(), and
so it should be added to guard code:

#if !defined(find_next_bit) || !defined(find_next_zero_bit)
|| !defined(find_next_and_bit)
static unsigned long _find_next_bit( ... )
{
...
}
#endif

It may be essential at least for arm.

Don't forget to synchronize your changes with
tools/lib/find_bit.c

Thanks,
Yury


Re: Re [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Yury Norov
On Wed, Oct 25, 2017 at 06:50:14PM +0300, Yury Norov wrote:
> On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote:
> > Thanks for the comments Yury.
> > 
> > > But I'd like also to keep _find_next_bit() consistent with
> > > _find_next_bit_le()
> > 
> > Not sure I understand what you're suggesting here: Do you want a
> > find_next_and_bit_le() or do you want to make _find_next_bit_le() more
> > like _find_next_bit() ? In the latter case we might just want to merge
> > it with _find_next_bit() and end up with an extra is_le parameter :)
> 
> Both ways will work, but I think that extra is_le is too much.
> _find_next_bit_le() should be the copy of _find_next_bit() with the
> addition of swapping code.
> 
> If you don't need find_next_and_bit_le(), don't add it.
> find_{first,last}_bit() doesn't have LE version, for example.

Few comments more.

_find_next_bit is now referenced also by find_next_and_bit(), and
so it should be added to guard code:

#if !defined(find_next_bit) || !defined(find_next_zero_bit)
|| !defined(find_next_and_bit)
static unsigned long _find_next_bit( ... )
{
...
}
#endif

It may be essential at least for arm.

Don't forget to synchronize your changes with
tools/lib/find_bit.c

Thanks,
Yury


Re: Re [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Yury Norov
On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote:
> Thanks for the comments Yury.
> 
> > But I'd like also to keep _find_next_bit() consistent with
> > _find_next_bit_le()
> 
> Not sure I understand what you're suggesting here: Do you want a
> find_next_and_bit_le() or do you want to make _find_next_bit_le() more
> like _find_next_bit() ? In the latter case we might just want to merge
> it with _find_next_bit() and end up with an extra is_le parameter :)

Both ways will work, but I think that extra is_le is too much.
_find_next_bit_le() should be the copy of _find_next_bit() with the
addition of swapping code.

If you don't need find_next_and_bit_le(), don't add it.
find_{first,last}_bit() doesn't have LE version, for example.

Yury


Re: Re [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Yury Norov
On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote:
> Thanks for the comments Yury.
> 
> > But I'd like also to keep _find_next_bit() consistent with
> > _find_next_bit_le()
> 
> Not sure I understand what you're suggesting here: Do you want a
> find_next_and_bit_le() or do you want to make _find_next_bit_le() more
> like _find_next_bit() ? In the latter case we might just want to merge
> it with _find_next_bit() and end up with an extra is_le parameter :)

Both ways will work, but I think that extra is_le is too much.
_find_next_bit_le() should be the copy of _find_next_bit() with the
addition of swapping code.

If you don't need find_next_and_bit_le(), don't add it.
find_{first,last}_bit() doesn't have LE version, for example.

Yury


Re [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Clement Courbet
Thanks for the comments Yury.

> But I'd like also to keep _find_next_bit() consistent with
> _find_next_bit_le()

Not sure I understand what you're suggesting here: Do you want a
find_next_and_bit_le() or do you want to make _find_next_bit_le() more
like _find_next_bit() ? In the latter case we might just want to merge
it with _find_next_bit() and end up with an extra is_le parameter :)



Re [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Clement Courbet
Thanks for the comments Yury.

> But I'd like also to keep _find_next_bit() consistent with
> _find_next_bit_le()

Not sure I understand what you're suggesting here: Do you want a
find_next_and_bit_le() or do you want to make _find_next_bit_le() more
like _find_next_bit() ? In the latter case we might just want to merge
it with _find_next_bit() and end up with an extra is_le parameter :)



Re: [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Yury Norov
On Tue, Oct 24, 2017 at 12:51:59PM +0200, Clement Courbet wrote:
> We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
> It's essentially a joined iteration in search for a non-zero bit, which
> is currently implemented as a lookup join (find a nonzero bit on the
> lhs, lookup the rhs to see if it's set there).
> 
> Implement a direct join (find a nonzero bit on the incrementally built
> join). Direct benchmarking shows that it's 1.17x to 14x faster with a
> geometric mean of 2.1 on 32 CPUs. No impact on memory usage.
> 
> Approximate benchmark code:
> 
> ```
>   unsigned long src1p[nr_cpumask_longs] = {pattern1};
>   unsigned long src2p[nr_cpumask_longs] = {pattern2};
>   for (/*a bunch of repetitions*/) {
> for (int n = -1; n <= nr_cpu_ids; ++n) {
>   asm volatile("" : "+rm"(src1p)); // prevent any optimization
>   asm volatile("" : "+rm"(src2p));
>   unsigned long result = cpumask_next_and(n, src1p, src2p);
>   asm volatile("" : "+rm"(result));
> }
>   }
> ```
> Signed-off-by: Clement Courbet 
> ---
> Changes in v2:
>  - Refactored _find_next_common_bit into _find_next_bit., as suggested
>by Yury Norov.

What I actually suggested is make _find_next_and_bit() similar to
_find_next_bit(), not to extend _find_next_bit(). But what you did
looks OK.

> This has no adverse effects on the performance side,
>as the compiler successfully inlines the code.

I think it's not about inlining, compiler just optimizes out branches known
as false at compile-time.

>  include/asm-generic/bitops/find.h   | 16 ++
>  include/linux/bitmap.h  |  2 ++
>  lib/cpumask.c   |  9 
>  lib/find_bit.c  | 37 
> +
>  tools/include/asm-generic/bitops/find.h | 16 ++
>  5 files changed, 67 insertions(+), 13 deletions(-)
> 
> diff --git a/include/asm-generic/bitops/find.h 
> b/include/asm-generic/bitops/find.h
> index 998d4d544f18..130962f3a264 100644
> --- a/include/asm-generic/bitops/find.h
> +++ b/include/asm-generic/bitops/find.h
> @@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long 
> *addr, unsigned long
>   size, unsigned long offset);
>  #endif
>  
> +#ifndef find_next_and_bit
> +/**
> + * find_next_and_bit - find the next set bit in both memory regions
> + * @addr1: The first address to base the search on
> + * @addr2: The second address to base the search on
> + * @offset: The bitnumber to start searching at
> + * @size: The bitmap size in bits
> + *
> + * Returns the bit number for the next set bit
> + * If no bits are set, returns @size.
> + */
> +extern unsigned long find_next_and_bit(const unsigned long *addr1,
> + const unsigned long *addr2, unsigned long size,
> + unsigned long offset);
> +#endif
> +
>  #ifndef find_next_zero_bit
>  /**
>   * find_next_zero_bit - find the next cleared bit in a memory region
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 700cf5f67118..b4606bfda52f 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -77,6 +77,8 @@
>   * find_first_bit(addr, nbits)   Position first set bit in *addr
>   * find_next_zero_bit(addr, nbits, bit)  Position next zero bit in *addr 
> >= bit
>   * find_next_bit(addr, nbits, bit)   Position next set bit in *addr >= bit
> + * find_next_and_bit(addr1, addr2, nbits, bit)   Same as find_first_bit, 
> but in
> + *   (*addr1 & *addr2)
>   */
>  
>  /*
> diff --git a/lib/cpumask.c b/lib/cpumask.c
> index 8b1a1bd77539..5602223837fa 100644
> --- a/lib/cpumask.c
> +++ b/lib/cpumask.c
> @@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next);
>  int cpumask_next_and(int n, const struct cpumask *src1p,
>const struct cpumask *src2p)
>  {
> - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
> - if (cpumask_test_cpu(n, src2p))
> - break;
> - return n;
> + /* -1 is a legal arg here. */
> + if (n != -1)
> + cpumask_check(n);
> + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
> + nr_cpumask_bits, n + 1);
>  }
>  EXPORT_SYMBOL(cpumask_next_and);
>  
> diff --git a/lib/find_bit.c b/lib/find_bit.c
> index 6ed74f78380c..ebc08fd9fdf8 100644
> --- a/lib/find_bit.c
> +++ b/lib/find_bit.c
> @@ -24,19 +24,25 @@
>  #if !defined(find_next_bit) || !defined(find_next_zero_bit)
>  
>  /*
> - * This is a common helper function for find_next_bit and
> - * find_next_zero_bit.  The difference is the "invert" argument, which
> - * is XORed with each fetched word before searching it for one bits.
> + * This is a common helper function for find_next_bit, find_next_zero_bit, 
> and
> + * find_next_and_bit. The differences are:
> + *  - The "invert" argument, which is XORed with each fetched word before
> + *searching it for one bits.

Re: [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Yury Norov
On Tue, Oct 24, 2017 at 12:51:59PM +0200, Clement Courbet wrote:
> We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
> It's essentially a joined iteration in search for a non-zero bit, which
> is currently implemented as a lookup join (find a nonzero bit on the
> lhs, lookup the rhs to see if it's set there).
> 
> Implement a direct join (find a nonzero bit on the incrementally built
> join). Direct benchmarking shows that it's 1.17x to 14x faster with a
> geometric mean of 2.1 on 32 CPUs. No impact on memory usage.
> 
> Approximate benchmark code:
> 
> ```
>   unsigned long src1p[nr_cpumask_longs] = {pattern1};
>   unsigned long src2p[nr_cpumask_longs] = {pattern2};
>   for (/*a bunch of repetitions*/) {
> for (int n = -1; n <= nr_cpu_ids; ++n) {
>   asm volatile("" : "+rm"(src1p)); // prevent any optimization
>   asm volatile("" : "+rm"(src2p));
>   unsigned long result = cpumask_next_and(n, src1p, src2p);
>   asm volatile("" : "+rm"(result));
> }
>   }
> ```
> Signed-off-by: Clement Courbet 
> ---
> Changes in v2:
>  - Refactored _find_next_common_bit into _find_next_bit., as suggested
>by Yury Norov.

What I actually suggested is make _find_next_and_bit() similar to
_find_next_bit(), not to extend _find_next_bit(). But what you did
looks OK.

> This has no adverse effects on the performance side,
>as the compiler successfully inlines the code.

I think it's not about inlining, compiler just optimizes out branches known
as false at compile-time.

>  include/asm-generic/bitops/find.h   | 16 ++
>  include/linux/bitmap.h  |  2 ++
>  lib/cpumask.c   |  9 
>  lib/find_bit.c  | 37 
> +
>  tools/include/asm-generic/bitops/find.h | 16 ++
>  5 files changed, 67 insertions(+), 13 deletions(-)
> 
> diff --git a/include/asm-generic/bitops/find.h 
> b/include/asm-generic/bitops/find.h
> index 998d4d544f18..130962f3a264 100644
> --- a/include/asm-generic/bitops/find.h
> +++ b/include/asm-generic/bitops/find.h
> @@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long 
> *addr, unsigned long
>   size, unsigned long offset);
>  #endif
>  
> +#ifndef find_next_and_bit
> +/**
> + * find_next_and_bit - find the next set bit in both memory regions
> + * @addr1: The first address to base the search on
> + * @addr2: The second address to base the search on
> + * @offset: The bitnumber to start searching at
> + * @size: The bitmap size in bits
> + *
> + * Returns the bit number for the next set bit
> + * If no bits are set, returns @size.
> + */
> +extern unsigned long find_next_and_bit(const unsigned long *addr1,
> + const unsigned long *addr2, unsigned long size,
> + unsigned long offset);
> +#endif
> +
>  #ifndef find_next_zero_bit
>  /**
>   * find_next_zero_bit - find the next cleared bit in a memory region
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 700cf5f67118..b4606bfda52f 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -77,6 +77,8 @@
>   * find_first_bit(addr, nbits)   Position first set bit in *addr
>   * find_next_zero_bit(addr, nbits, bit)  Position next zero bit in *addr 
> >= bit
>   * find_next_bit(addr, nbits, bit)   Position next set bit in *addr >= bit
> + * find_next_and_bit(addr1, addr2, nbits, bit)   Same as find_first_bit, 
> but in
> + *   (*addr1 & *addr2)
>   */
>  
>  /*
> diff --git a/lib/cpumask.c b/lib/cpumask.c
> index 8b1a1bd77539..5602223837fa 100644
> --- a/lib/cpumask.c
> +++ b/lib/cpumask.c
> @@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next);
>  int cpumask_next_and(int n, const struct cpumask *src1p,
>const struct cpumask *src2p)
>  {
> - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
> - if (cpumask_test_cpu(n, src2p))
> - break;
> - return n;
> + /* -1 is a legal arg here. */
> + if (n != -1)
> + cpumask_check(n);
> + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
> + nr_cpumask_bits, n + 1);
>  }
>  EXPORT_SYMBOL(cpumask_next_and);
>  
> diff --git a/lib/find_bit.c b/lib/find_bit.c
> index 6ed74f78380c..ebc08fd9fdf8 100644
> --- a/lib/find_bit.c
> +++ b/lib/find_bit.c
> @@ -24,19 +24,25 @@
>  #if !defined(find_next_bit) || !defined(find_next_zero_bit)
>  
>  /*
> - * This is a common helper function for find_next_bit and
> - * find_next_zero_bit.  The difference is the "invert" argument, which
> - * is XORed with each fetched word before searching it for one bits.
> + * This is a common helper function for find_next_bit, find_next_zero_bit, 
> and
> + * find_next_and_bit. The differences are:
> + *  - The "invert" argument, which is XORed with each fetched word before
> + *searching it for one bits.
> + *  - The 

RE: [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Matthew Wilcox
Reviewed-by: Matthew Wilcox 

> -Original Message-
> From: Clement Courbet [mailto:cour...@google.com]
> Sent: Tuesday, October 24, 2017 6:52 AM
> To: Arnd Bergmann ; Rasmus Villemoes
> ; Andrew Morton ;
> Matthew Wilcox ; Yury Norov
> 
> Cc: Clement Courbet ; Ingo Molnar
> ; linux-a...@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: [PATCH v2] lib: optimize cpumask_next_and()
> 
> We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
> It's essentially a joined iteration in search for a non-zero bit, which
> is currently implemented as a lookup join (find a nonzero bit on the
> lhs, lookup the rhs to see if it's set there).
> 
> Implement a direct join (find a nonzero bit on the incrementally built
> join). Direct benchmarking shows that it's 1.17x to 14x faster with a
> geometric mean of 2.1 on 32 CPUs. No impact on memory usage.
> 
> Approximate benchmark code:
> 
> ```
>   unsigned long src1p[nr_cpumask_longs] = {pattern1};
>   unsigned long src2p[nr_cpumask_longs] = {pattern2};
>   for (/*a bunch of repetitions*/) {
> for (int n = -1; n <= nr_cpu_ids; ++n) {
>   asm volatile("" : "+rm"(src1p)); // prevent any optimization
>   asm volatile("" : "+rm"(src2p));
>   unsigned long result = cpumask_next_and(n, src1p, src2p);
>   asm volatile("" : "+rm"(result));
> }
>   }
> ```
> 
> Results:
> pattern1pattern2 time_before/time_after
> 0x  0x   1.65
> 0x  0x   2.24
> 0x  0x   2.94
> 0x  0x   14.0
> 0x  0x   1.67
> 0x  0x   1.71
> 0x  0x   1.90
> 0x  0x   6.58
> 0x  0x   1.46
> 0x  0x   1.49
> 0x  0x   1.45
> 0x  0x   3.10
> 0x  0x   1.18
> 0x  0x   1.18
> 0x  0x   1.17
> 0x  0x   1.25
> -
>geo.mean  2.06
> 
> Signed-off-by: Clement Courbet 
> ---
> Changes in v2:
>  - Refactored _find_next_common_bit into _find_next_bit., as suggested
>by Yury Norov. This has no adverse effects on the performance side,
>as the compiler successfully inlines the code.
> 
>  include/asm-generic/bitops/find.h   | 16 ++
>  include/linux/bitmap.h  |  2 ++
>  lib/cpumask.c   |  9 
>  lib/find_bit.c  | 37 
> +
>  tools/include/asm-generic/bitops/find.h | 16 ++
>  5 files changed, 67 insertions(+), 13 deletions(-)
> 
> diff --git a/include/asm-generic/bitops/find.h b/include/asm-
> generic/bitops/find.h
> index 998d4d544f18..130962f3a264 100644
> --- a/include/asm-generic/bitops/find.h
> +++ b/include/asm-generic/bitops/find.h
> @@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long
> *addr, unsigned long
>   size, unsigned long offset);
>  #endif
> 
> +#ifndef find_next_and_bit
> +/**
> + * find_next_and_bit - find the next set bit in both memory regions
> + * @addr1: The first address to base the search on
> + * @addr2: The second address to base the search on
> + * @offset: The bitnumber to start searching at
> + * @size: The bitmap size in bits
> + *
> + * Returns the bit number for the next set bit
> + * If no bits are set, returns @size.
> + */
> +extern unsigned long find_next_and_bit(const unsigned long *addr1,
> + const unsigned long *addr2, unsigned long size,
> + unsigned long offset);
> +#endif
> +
>  #ifndef find_next_zero_bit
>  /**
>   * find_next_zero_bit - find the next cleared bit in a memory region
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 700cf5f67118..b4606bfda52f 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -77,6 +77,8 @@
>   * find_first_bit(addr, nbits)   Position first set bit in *addr
>   * find_next_zero_bit(addr, nbits, bit)  Position next zero bit in *addr 
> >= bit
>   * find_next_bit(addr, nbits, bit)   Position next set bit in *addr >= bit
> + * find_next_and_bit(addr1, addr2, nbits, bit)   Same as find_first_bit, 
> but in
> + *   (*addr1 & *addr2)
>   */
> 
>  /*
> diff --git a/lib/cpumask.c b/lib/cpumask.c
> index 8b1a1bd77539..5602223837fa 100644
> --- a/lib/cpumask.c
> +++ b/lib/cpumask.c
> @@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next);
>  int cpumask_next_and(int n, const struct cpumask *src1p,
>const struct cpumask *src2p)
>  {
> - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
> - if (cpumask_test_cpu(n, src2p))
> - break;
> - return n;
> + /* -1 is a legal 

RE: [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Matthew Wilcox
Reviewed-by: Matthew Wilcox 

> -Original Message-
> From: Clement Courbet [mailto:cour...@google.com]
> Sent: Tuesday, October 24, 2017 6:52 AM
> To: Arnd Bergmann ; Rasmus Villemoes
> ; Andrew Morton ;
> Matthew Wilcox ; Yury Norov
> 
> Cc: Clement Courbet ; Ingo Molnar
> ; linux-a...@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: [PATCH v2] lib: optimize cpumask_next_and()
> 
> We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
> It's essentially a joined iteration in search for a non-zero bit, which
> is currently implemented as a lookup join (find a nonzero bit on the
> lhs, lookup the rhs to see if it's set there).
> 
> Implement a direct join (find a nonzero bit on the incrementally built
> join). Direct benchmarking shows that it's 1.17x to 14x faster with a
> geometric mean of 2.1 on 32 CPUs. No impact on memory usage.
> 
> Approximate benchmark code:
> 
> ```
>   unsigned long src1p[nr_cpumask_longs] = {pattern1};
>   unsigned long src2p[nr_cpumask_longs] = {pattern2};
>   for (/*a bunch of repetitions*/) {
> for (int n = -1; n <= nr_cpu_ids; ++n) {
>   asm volatile("" : "+rm"(src1p)); // prevent any optimization
>   asm volatile("" : "+rm"(src2p));
>   unsigned long result = cpumask_next_and(n, src1p, src2p);
>   asm volatile("" : "+rm"(result));
> }
>   }
> ```
> 
> Results:
> pattern1pattern2 time_before/time_after
> 0x  0x   1.65
> 0x  0x   2.24
> 0x  0x   2.94
> 0x  0x   14.0
> 0x  0x   1.67
> 0x  0x   1.71
> 0x  0x   1.90
> 0x  0x   6.58
> 0x  0x   1.46
> 0x  0x   1.49
> 0x  0x   1.45
> 0x  0x   3.10
> 0x  0x   1.18
> 0x  0x   1.18
> 0x  0x   1.17
> 0x  0x   1.25
> -
>geo.mean  2.06
> 
> Signed-off-by: Clement Courbet 
> ---
> Changes in v2:
>  - Refactored _find_next_common_bit into _find_next_bit., as suggested
>by Yury Norov. This has no adverse effects on the performance side,
>as the compiler successfully inlines the code.
> 
>  include/asm-generic/bitops/find.h   | 16 ++
>  include/linux/bitmap.h  |  2 ++
>  lib/cpumask.c   |  9 
>  lib/find_bit.c  | 37 
> +
>  tools/include/asm-generic/bitops/find.h | 16 ++
>  5 files changed, 67 insertions(+), 13 deletions(-)
> 
> diff --git a/include/asm-generic/bitops/find.h b/include/asm-
> generic/bitops/find.h
> index 998d4d544f18..130962f3a264 100644
> --- a/include/asm-generic/bitops/find.h
> +++ b/include/asm-generic/bitops/find.h
> @@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long
> *addr, unsigned long
>   size, unsigned long offset);
>  #endif
> 
> +#ifndef find_next_and_bit
> +/**
> + * find_next_and_bit - find the next set bit in both memory regions
> + * @addr1: The first address to base the search on
> + * @addr2: The second address to base the search on
> + * @offset: The bitnumber to start searching at
> + * @size: The bitmap size in bits
> + *
> + * Returns the bit number for the next set bit
> + * If no bits are set, returns @size.
> + */
> +extern unsigned long find_next_and_bit(const unsigned long *addr1,
> + const unsigned long *addr2, unsigned long size,
> + unsigned long offset);
> +#endif
> +
>  #ifndef find_next_zero_bit
>  /**
>   * find_next_zero_bit - find the next cleared bit in a memory region
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 700cf5f67118..b4606bfda52f 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -77,6 +77,8 @@
>   * find_first_bit(addr, nbits)   Position first set bit in *addr
>   * find_next_zero_bit(addr, nbits, bit)  Position next zero bit in *addr 
> >= bit
>   * find_next_bit(addr, nbits, bit)   Position next set bit in *addr >= bit
> + * find_next_and_bit(addr1, addr2, nbits, bit)   Same as find_first_bit, 
> but in
> + *   (*addr1 & *addr2)
>   */
> 
>  /*
> diff --git a/lib/cpumask.c b/lib/cpumask.c
> index 8b1a1bd77539..5602223837fa 100644
> --- a/lib/cpumask.c
> +++ b/lib/cpumask.c
> @@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next);
>  int cpumask_next_and(int n, const struct cpumask *src1p,
>const struct cpumask *src2p)
>  {
> - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
> - if (cpumask_test_cpu(n, src2p))
> - break;
> - return n;
> + /* -1 is a legal arg here. */
> + if (n != -1)
> + cpumask_check(n);
> + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
> + nr_cpumask_bits, n + 1);
>  }
>