Re: [PATCH v2] lib: optimize cpumask_next_and()
Hi Clement, [auto build test ERROR on linus/master] [also build test ERROR on v4.14-rc6 next-20171018] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Clement-Courbet/lib-optimize-cpumask_next_and/20171026-184850 config: m68k-allyesconfig (attached as .config) compiler: m68k-linux-gcc (GCC) 4.9.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree make.cross ARCH=m68k All errors (new ones prefixed by >>): lib/find_bit.c: In function 'find_next_and_bit': >> lib/find_bit.c:92:2: error: implicit declaration of function >> '_find_next_bit' [-Werror=implicit-function-declaration] return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^ lib/find_bit.c:92:38: error: 'size' undeclared (first use in this function) return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^ lib/find_bit.c:92:38: note: each undeclared identifier is reported only once for each function it appears in lib/find_bit.c:92:44: error: 'offset' undeclared (first use in this function) return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^ lib/find_bit.c:93:1: warning: control reaches end of non-void function [-Wreturn-type] } ^ cc1: some warnings being treated as errors vim +/_find_next_bit +92 lib/find_bit.c 86 87 #if !defined(find_next_and_bit) 88 unsigned long find_next_and_bit(const unsigned long *addr1, 89 const unsigned long *addr2, unsigned long nbits, 90 unsigned long start) 91 { > 92 return _find_next_bit(addr1, addr2, size, offset, ~0UL); 93 } 94 EXPORT_SYMBOL(find_next_and_bit); 95 #endif 96 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: [PATCH v2] lib: optimize cpumask_next_and()
Hi Clement, [auto build test ERROR on linus/master] [also build test ERROR on v4.14-rc6 next-20171018] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Clement-Courbet/lib-optimize-cpumask_next_and/20171026-184850 config: m68k-allyesconfig (attached as .config) compiler: m68k-linux-gcc (GCC) 4.9.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree make.cross ARCH=m68k All errors (new ones prefixed by >>): lib/find_bit.c: In function 'find_next_and_bit': >> lib/find_bit.c:92:2: error: implicit declaration of function >> '_find_next_bit' [-Werror=implicit-function-declaration] return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^ lib/find_bit.c:92:38: error: 'size' undeclared (first use in this function) return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^ lib/find_bit.c:92:38: note: each undeclared identifier is reported only once for each function it appears in lib/find_bit.c:92:44: error: 'offset' undeclared (first use in this function) return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^ lib/find_bit.c:93:1: warning: control reaches end of non-void function [-Wreturn-type] } ^ cc1: some warnings being treated as errors vim +/_find_next_bit +92 lib/find_bit.c 86 87 #if !defined(find_next_and_bit) 88 unsigned long find_next_and_bit(const unsigned long *addr1, 89 const unsigned long *addr2, unsigned long nbits, 90 unsigned long start) 91 { > 92 return _find_next_bit(addr1, addr2, size, offset, ~0UL); 93 } 94 EXPORT_SYMBOL(find_next_and_bit); 95 #endif 96 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: [PATCH v2] lib: optimize cpumask_next_and()
Hi Clement, [auto build test ERROR on linus/master] [also build test ERROR on v4.14-rc6 next-20171018] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Clement-Courbet/lib-optimize-cpumask_next_and/20171026-184850 config: i386-randconfig-x001-201743 (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: # save the attached .config to linux build tree make ARCH=i386 All error/warnings (new ones prefixed by >>): lib/find_bit.c: In function 'find_next_and_bit': >> lib/find_bit.c:92:38: error: 'size' undeclared (first use in this function) return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^~~~ lib/find_bit.c:92:38: note: each undeclared identifier is reported only once for each function it appears in >> lib/find_bit.c:92:44: error: 'offset' undeclared (first use in this function) return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^~ >> lib/find_bit.c:93:1: warning: control reaches end of non-void function >> [-Wreturn-type] } ^ vim +/size +92 lib/find_bit.c 86 87 #if !defined(find_next_and_bit) 88 unsigned long find_next_and_bit(const unsigned long *addr1, 89 const unsigned long *addr2, unsigned long nbits, 90 unsigned long start) 91 { > 92 return _find_next_bit(addr1, addr2, size, offset, ~0UL); > 93 } 94 EXPORT_SYMBOL(find_next_and_bit); 95 #endif 96 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: [PATCH v2] lib: optimize cpumask_next_and()
Hi Clement, [auto build test ERROR on linus/master] [also build test ERROR on v4.14-rc6 next-20171018] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Clement-Courbet/lib-optimize-cpumask_next_and/20171026-184850 config: i386-randconfig-x001-201743 (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: # save the attached .config to linux build tree make ARCH=i386 All error/warnings (new ones prefixed by >>): lib/find_bit.c: In function 'find_next_and_bit': >> lib/find_bit.c:92:38: error: 'size' undeclared (first use in this function) return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^~~~ lib/find_bit.c:92:38: note: each undeclared identifier is reported only once for each function it appears in >> lib/find_bit.c:92:44: error: 'offset' undeclared (first use in this function) return _find_next_bit(addr1, addr2, size, offset, ~0UL); ^~ >> lib/find_bit.c:93:1: warning: control reaches end of non-void function >> [-Wreturn-type] } ^ vim +/size +92 lib/find_bit.c 86 87 #if !defined(find_next_and_bit) 88 unsigned long find_next_and_bit(const unsigned long *addr1, 89 const unsigned long *addr2, unsigned long nbits, 90 unsigned long start) 91 { > 92 return _find_next_bit(addr1, addr2, size, offset, ~0UL); > 93 } 94 EXPORT_SYMBOL(find_next_and_bit); 95 #endif 96 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: Re [PATCH v2] lib: optimize cpumask_next_and()
On Wed, Oct 25, 2017 at 06:50:14PM +0300, Yury Norov wrote: > On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote: > > Thanks for the comments Yury. > > > > > But I'd like also to keep _find_next_bit() consistent with > > > _find_next_bit_le() > > > > Not sure I understand what you're suggesting here: Do you want a > > find_next_and_bit_le() or do you want to make _find_next_bit_le() more > > like _find_next_bit() ? In the latter case we might just want to merge > > it with _find_next_bit() and end up with an extra is_le parameter :) > > Both ways will work, but I think that extra is_le is too much. > _find_next_bit_le() should be the copy of _find_next_bit() with the > addition of swapping code. > > If you don't need find_next_and_bit_le(), don't add it. > find_{first,last}_bit() doesn't have LE version, for example. Few comments more. _find_next_bit is now referenced also by find_next_and_bit(), and so it should be added to guard code: #if !defined(find_next_bit) || !defined(find_next_zero_bit) || !defined(find_next_and_bit) static unsigned long _find_next_bit( ... ) { ... } #endif It may be essential at least for arm. Don't forget to synchronize your changes with tools/lib/find_bit.c Thanks, Yury
Re: Re [PATCH v2] lib: optimize cpumask_next_and()
On Wed, Oct 25, 2017 at 06:50:14PM +0300, Yury Norov wrote: > On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote: > > Thanks for the comments Yury. > > > > > But I'd like also to keep _find_next_bit() consistent with > > > _find_next_bit_le() > > > > Not sure I understand what you're suggesting here: Do you want a > > find_next_and_bit_le() or do you want to make _find_next_bit_le() more > > like _find_next_bit() ? In the latter case we might just want to merge > > it with _find_next_bit() and end up with an extra is_le parameter :) > > Both ways will work, but I think that extra is_le is too much. > _find_next_bit_le() should be the copy of _find_next_bit() with the > addition of swapping code. > > If you don't need find_next_and_bit_le(), don't add it. > find_{first,last}_bit() doesn't have LE version, for example. Few comments more. _find_next_bit is now referenced also by find_next_and_bit(), and so it should be added to guard code: #if !defined(find_next_bit) || !defined(find_next_zero_bit) || !defined(find_next_and_bit) static unsigned long _find_next_bit( ... ) { ... } #endif It may be essential at least for arm. Don't forget to synchronize your changes with tools/lib/find_bit.c Thanks, Yury
Re: Re [PATCH v2] lib: optimize cpumask_next_and()
On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote: > Thanks for the comments Yury. > > > But I'd like also to keep _find_next_bit() consistent with > > _find_next_bit_le() > > Not sure I understand what you're suggesting here: Do you want a > find_next_and_bit_le() or do you want to make _find_next_bit_le() more > like _find_next_bit() ? In the latter case we might just want to merge > it with _find_next_bit() and end up with an extra is_le parameter :) Both ways will work, but I think that extra is_le is too much. _find_next_bit_le() should be the copy of _find_next_bit() with the addition of swapping code. If you don't need find_next_and_bit_le(), don't add it. find_{first,last}_bit() doesn't have LE version, for example. Yury
Re: Re [PATCH v2] lib: optimize cpumask_next_and()
On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote: > Thanks for the comments Yury. > > > But I'd like also to keep _find_next_bit() consistent with > > _find_next_bit_le() > > Not sure I understand what you're suggesting here: Do you want a > find_next_and_bit_le() or do you want to make _find_next_bit_le() more > like _find_next_bit() ? In the latter case we might just want to merge > it with _find_next_bit() and end up with an extra is_le parameter :) Both ways will work, but I think that extra is_le is too much. _find_next_bit_le() should be the copy of _find_next_bit() with the addition of swapping code. If you don't need find_next_and_bit_le(), don't add it. find_{first,last}_bit() doesn't have LE version, for example. Yury
Re [PATCH v2] lib: optimize cpumask_next_and()
Thanks for the comments Yury. > But I'd like also to keep _find_next_bit() consistent with > _find_next_bit_le() Not sure I understand what you're suggesting here: Do you want a find_next_and_bit_le() or do you want to make _find_next_bit_le() more like _find_next_bit() ? In the latter case we might just want to merge it with _find_next_bit() and end up with an extra is_le parameter :)
Re [PATCH v2] lib: optimize cpumask_next_and()
Thanks for the comments Yury. > But I'd like also to keep _find_next_bit() consistent with > _find_next_bit_le() Not sure I understand what you're suggesting here: Do you want a find_next_and_bit_le() or do you want to make _find_next_bit_le() more like _find_next_bit() ? In the latter case we might just want to merge it with _find_next_bit() and end up with an extra is_le parameter :)
Re: [PATCH v2] lib: optimize cpumask_next_and()
On Tue, Oct 24, 2017 at 12:51:59PM +0200, Clement Courbet wrote: > We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and(). > It's essentially a joined iteration in search for a non-zero bit, which > is currently implemented as a lookup join (find a nonzero bit on the > lhs, lookup the rhs to see if it's set there). > > Implement a direct join (find a nonzero bit on the incrementally built > join). Direct benchmarking shows that it's 1.17x to 14x faster with a > geometric mean of 2.1 on 32 CPUs. No impact on memory usage. > > Approximate benchmark code: > > ``` > unsigned long src1p[nr_cpumask_longs] = {pattern1}; > unsigned long src2p[nr_cpumask_longs] = {pattern2}; > for (/*a bunch of repetitions*/) { > for (int n = -1; n <= nr_cpu_ids; ++n) { > asm volatile("" : "+rm"(src1p)); // prevent any optimization > asm volatile("" : "+rm"(src2p)); > unsigned long result = cpumask_next_and(n, src1p, src2p); > asm volatile("" : "+rm"(result)); > } > } > ``` > Signed-off-by: Clement Courbet> --- > Changes in v2: > - Refactored _find_next_common_bit into _find_next_bit., as suggested >by Yury Norov. What I actually suggested is make _find_next_and_bit() similar to _find_next_bit(), not to extend _find_next_bit(). But what you did looks OK. > This has no adverse effects on the performance side, >as the compiler successfully inlines the code. I think it's not about inlining, compiler just optimizes out branches known as false at compile-time. > include/asm-generic/bitops/find.h | 16 ++ > include/linux/bitmap.h | 2 ++ > lib/cpumask.c | 9 > lib/find_bit.c | 37 > + > tools/include/asm-generic/bitops/find.h | 16 ++ > 5 files changed, 67 insertions(+), 13 deletions(-) > > diff --git a/include/asm-generic/bitops/find.h > b/include/asm-generic/bitops/find.h > index 998d4d544f18..130962f3a264 100644 > --- a/include/asm-generic/bitops/find.h > +++ b/include/asm-generic/bitops/find.h > @@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long > *addr, unsigned long > size, unsigned long offset); > #endif > > +#ifndef find_next_and_bit > +/** > + * find_next_and_bit - find the next set bit in both memory regions > + * @addr1: The first address to base the search on > + * @addr2: The second address to base the search on > + * @offset: The bitnumber to start searching at > + * @size: The bitmap size in bits > + * > + * Returns the bit number for the next set bit > + * If no bits are set, returns @size. > + */ > +extern unsigned long find_next_and_bit(const unsigned long *addr1, > + const unsigned long *addr2, unsigned long size, > + unsigned long offset); > +#endif > + > #ifndef find_next_zero_bit > /** > * find_next_zero_bit - find the next cleared bit in a memory region > diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h > index 700cf5f67118..b4606bfda52f 100644 > --- a/include/linux/bitmap.h > +++ b/include/linux/bitmap.h > @@ -77,6 +77,8 @@ > * find_first_bit(addr, nbits) Position first set bit in *addr > * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr > >= bit > * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit > + * find_next_and_bit(addr1, addr2, nbits, bit) Same as find_first_bit, > but in > + * (*addr1 & *addr2) > */ > > /* > diff --git a/lib/cpumask.c b/lib/cpumask.c > index 8b1a1bd77539..5602223837fa 100644 > --- a/lib/cpumask.c > +++ b/lib/cpumask.c > @@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next); > int cpumask_next_and(int n, const struct cpumask *src1p, >const struct cpumask *src2p) > { > - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) > - if (cpumask_test_cpu(n, src2p)) > - break; > - return n; > + /* -1 is a legal arg here. */ > + if (n != -1) > + cpumask_check(n); > + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), > + nr_cpumask_bits, n + 1); > } > EXPORT_SYMBOL(cpumask_next_and); > > diff --git a/lib/find_bit.c b/lib/find_bit.c > index 6ed74f78380c..ebc08fd9fdf8 100644 > --- a/lib/find_bit.c > +++ b/lib/find_bit.c > @@ -24,19 +24,25 @@ > #if !defined(find_next_bit) || !defined(find_next_zero_bit) > > /* > - * This is a common helper function for find_next_bit and > - * find_next_zero_bit. The difference is the "invert" argument, which > - * is XORed with each fetched word before searching it for one bits. > + * This is a common helper function for find_next_bit, find_next_zero_bit, > and > + * find_next_and_bit. The differences are: > + * - The "invert" argument, which is XORed with each fetched word before > + *searching it for one bits.
Re: [PATCH v2] lib: optimize cpumask_next_and()
On Tue, Oct 24, 2017 at 12:51:59PM +0200, Clement Courbet wrote: > We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and(). > It's essentially a joined iteration in search for a non-zero bit, which > is currently implemented as a lookup join (find a nonzero bit on the > lhs, lookup the rhs to see if it's set there). > > Implement a direct join (find a nonzero bit on the incrementally built > join). Direct benchmarking shows that it's 1.17x to 14x faster with a > geometric mean of 2.1 on 32 CPUs. No impact on memory usage. > > Approximate benchmark code: > > ``` > unsigned long src1p[nr_cpumask_longs] = {pattern1}; > unsigned long src2p[nr_cpumask_longs] = {pattern2}; > for (/*a bunch of repetitions*/) { > for (int n = -1; n <= nr_cpu_ids; ++n) { > asm volatile("" : "+rm"(src1p)); // prevent any optimization > asm volatile("" : "+rm"(src2p)); > unsigned long result = cpumask_next_and(n, src1p, src2p); > asm volatile("" : "+rm"(result)); > } > } > ``` > Signed-off-by: Clement Courbet > --- > Changes in v2: > - Refactored _find_next_common_bit into _find_next_bit., as suggested >by Yury Norov. What I actually suggested is make _find_next_and_bit() similar to _find_next_bit(), not to extend _find_next_bit(). But what you did looks OK. > This has no adverse effects on the performance side, >as the compiler successfully inlines the code. I think it's not about inlining, compiler just optimizes out branches known as false at compile-time. > include/asm-generic/bitops/find.h | 16 ++ > include/linux/bitmap.h | 2 ++ > lib/cpumask.c | 9 > lib/find_bit.c | 37 > + > tools/include/asm-generic/bitops/find.h | 16 ++ > 5 files changed, 67 insertions(+), 13 deletions(-) > > diff --git a/include/asm-generic/bitops/find.h > b/include/asm-generic/bitops/find.h > index 998d4d544f18..130962f3a264 100644 > --- a/include/asm-generic/bitops/find.h > +++ b/include/asm-generic/bitops/find.h > @@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long > *addr, unsigned long > size, unsigned long offset); > #endif > > +#ifndef find_next_and_bit > +/** > + * find_next_and_bit - find the next set bit in both memory regions > + * @addr1: The first address to base the search on > + * @addr2: The second address to base the search on > + * @offset: The bitnumber to start searching at > + * @size: The bitmap size in bits > + * > + * Returns the bit number for the next set bit > + * If no bits are set, returns @size. > + */ > +extern unsigned long find_next_and_bit(const unsigned long *addr1, > + const unsigned long *addr2, unsigned long size, > + unsigned long offset); > +#endif > + > #ifndef find_next_zero_bit > /** > * find_next_zero_bit - find the next cleared bit in a memory region > diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h > index 700cf5f67118..b4606bfda52f 100644 > --- a/include/linux/bitmap.h > +++ b/include/linux/bitmap.h > @@ -77,6 +77,8 @@ > * find_first_bit(addr, nbits) Position first set bit in *addr > * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr > >= bit > * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit > + * find_next_and_bit(addr1, addr2, nbits, bit) Same as find_first_bit, > but in > + * (*addr1 & *addr2) > */ > > /* > diff --git a/lib/cpumask.c b/lib/cpumask.c > index 8b1a1bd77539..5602223837fa 100644 > --- a/lib/cpumask.c > +++ b/lib/cpumask.c > @@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next); > int cpumask_next_and(int n, const struct cpumask *src1p, >const struct cpumask *src2p) > { > - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) > - if (cpumask_test_cpu(n, src2p)) > - break; > - return n; > + /* -1 is a legal arg here. */ > + if (n != -1) > + cpumask_check(n); > + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), > + nr_cpumask_bits, n + 1); > } > EXPORT_SYMBOL(cpumask_next_and); > > diff --git a/lib/find_bit.c b/lib/find_bit.c > index 6ed74f78380c..ebc08fd9fdf8 100644 > --- a/lib/find_bit.c > +++ b/lib/find_bit.c > @@ -24,19 +24,25 @@ > #if !defined(find_next_bit) || !defined(find_next_zero_bit) > > /* > - * This is a common helper function for find_next_bit and > - * find_next_zero_bit. The difference is the "invert" argument, which > - * is XORed with each fetched word before searching it for one bits. > + * This is a common helper function for find_next_bit, find_next_zero_bit, > and > + * find_next_and_bit. The differences are: > + * - The "invert" argument, which is XORed with each fetched word before > + *searching it for one bits. > + * - The
RE: [PATCH v2] lib: optimize cpumask_next_and()
Reviewed-by: Matthew Wilcox> -Original Message- > From: Clement Courbet [mailto:cour...@google.com] > Sent: Tuesday, October 24, 2017 6:52 AM > To: Arnd Bergmann ; Rasmus Villemoes > ; Andrew Morton ; > Matthew Wilcox ; Yury Norov > > Cc: Clement Courbet ; Ingo Molnar > ; linux-a...@vger.kernel.org; linux-kernel@vger.kernel.org > Subject: [PATCH v2] lib: optimize cpumask_next_and() > > We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and(). > It's essentially a joined iteration in search for a non-zero bit, which > is currently implemented as a lookup join (find a nonzero bit on the > lhs, lookup the rhs to see if it's set there). > > Implement a direct join (find a nonzero bit on the incrementally built > join). Direct benchmarking shows that it's 1.17x to 14x faster with a > geometric mean of 2.1 on 32 CPUs. No impact on memory usage. > > Approximate benchmark code: > > ``` > unsigned long src1p[nr_cpumask_longs] = {pattern1}; > unsigned long src2p[nr_cpumask_longs] = {pattern2}; > for (/*a bunch of repetitions*/) { > for (int n = -1; n <= nr_cpu_ids; ++n) { > asm volatile("" : "+rm"(src1p)); // prevent any optimization > asm volatile("" : "+rm"(src2p)); > unsigned long result = cpumask_next_and(n, src1p, src2p); > asm volatile("" : "+rm"(result)); > } > } > ``` > > Results: > pattern1pattern2 time_before/time_after > 0x 0x 1.65 > 0x 0x 2.24 > 0x 0x 2.94 > 0x 0x 14.0 > 0x 0x 1.67 > 0x 0x 1.71 > 0x 0x 1.90 > 0x 0x 6.58 > 0x 0x 1.46 > 0x 0x 1.49 > 0x 0x 1.45 > 0x 0x 3.10 > 0x 0x 1.18 > 0x 0x 1.18 > 0x 0x 1.17 > 0x 0x 1.25 > - >geo.mean 2.06 > > Signed-off-by: Clement Courbet > --- > Changes in v2: > - Refactored _find_next_common_bit into _find_next_bit., as suggested >by Yury Norov. This has no adverse effects on the performance side, >as the compiler successfully inlines the code. > > include/asm-generic/bitops/find.h | 16 ++ > include/linux/bitmap.h | 2 ++ > lib/cpumask.c | 9 > lib/find_bit.c | 37 > + > tools/include/asm-generic/bitops/find.h | 16 ++ > 5 files changed, 67 insertions(+), 13 deletions(-) > > diff --git a/include/asm-generic/bitops/find.h b/include/asm- > generic/bitops/find.h > index 998d4d544f18..130962f3a264 100644 > --- a/include/asm-generic/bitops/find.h > +++ b/include/asm-generic/bitops/find.h > @@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long > *addr, unsigned long > size, unsigned long offset); > #endif > > +#ifndef find_next_and_bit > +/** > + * find_next_and_bit - find the next set bit in both memory regions > + * @addr1: The first address to base the search on > + * @addr2: The second address to base the search on > + * @offset: The bitnumber to start searching at > + * @size: The bitmap size in bits > + * > + * Returns the bit number for the next set bit > + * If no bits are set, returns @size. > + */ > +extern unsigned long find_next_and_bit(const unsigned long *addr1, > + const unsigned long *addr2, unsigned long size, > + unsigned long offset); > +#endif > + > #ifndef find_next_zero_bit > /** > * find_next_zero_bit - find the next cleared bit in a memory region > diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h > index 700cf5f67118..b4606bfda52f 100644 > --- a/include/linux/bitmap.h > +++ b/include/linux/bitmap.h > @@ -77,6 +77,8 @@ > * find_first_bit(addr, nbits) Position first set bit in *addr > * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr > >= bit > * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit > + * find_next_and_bit(addr1, addr2, nbits, bit) Same as find_first_bit, > but in > + * (*addr1 & *addr2) > */ > > /* > diff --git a/lib/cpumask.c b/lib/cpumask.c > index 8b1a1bd77539..5602223837fa 100644 > --- a/lib/cpumask.c > +++ b/lib/cpumask.c > @@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next); > int cpumask_next_and(int n, const struct cpumask *src1p, >const struct cpumask *src2p) > { > - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) > - if (cpumask_test_cpu(n, src2p)) > - break; > - return n; > + /* -1 is a legal
RE: [PATCH v2] lib: optimize cpumask_next_and()
Reviewed-by: Matthew Wilcox > -Original Message- > From: Clement Courbet [mailto:cour...@google.com] > Sent: Tuesday, October 24, 2017 6:52 AM > To: Arnd Bergmann ; Rasmus Villemoes > ; Andrew Morton ; > Matthew Wilcox ; Yury Norov > > Cc: Clement Courbet ; Ingo Molnar > ; linux-a...@vger.kernel.org; linux-kernel@vger.kernel.org > Subject: [PATCH v2] lib: optimize cpumask_next_and() > > We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and(). > It's essentially a joined iteration in search for a non-zero bit, which > is currently implemented as a lookup join (find a nonzero bit on the > lhs, lookup the rhs to see if it's set there). > > Implement a direct join (find a nonzero bit on the incrementally built > join). Direct benchmarking shows that it's 1.17x to 14x faster with a > geometric mean of 2.1 on 32 CPUs. No impact on memory usage. > > Approximate benchmark code: > > ``` > unsigned long src1p[nr_cpumask_longs] = {pattern1}; > unsigned long src2p[nr_cpumask_longs] = {pattern2}; > for (/*a bunch of repetitions*/) { > for (int n = -1; n <= nr_cpu_ids; ++n) { > asm volatile("" : "+rm"(src1p)); // prevent any optimization > asm volatile("" : "+rm"(src2p)); > unsigned long result = cpumask_next_and(n, src1p, src2p); > asm volatile("" : "+rm"(result)); > } > } > ``` > > Results: > pattern1pattern2 time_before/time_after > 0x 0x 1.65 > 0x 0x 2.24 > 0x 0x 2.94 > 0x 0x 14.0 > 0x 0x 1.67 > 0x 0x 1.71 > 0x 0x 1.90 > 0x 0x 6.58 > 0x 0x 1.46 > 0x 0x 1.49 > 0x 0x 1.45 > 0x 0x 3.10 > 0x 0x 1.18 > 0x 0x 1.18 > 0x 0x 1.17 > 0x 0x 1.25 > - >geo.mean 2.06 > > Signed-off-by: Clement Courbet > --- > Changes in v2: > - Refactored _find_next_common_bit into _find_next_bit., as suggested >by Yury Norov. This has no adverse effects on the performance side, >as the compiler successfully inlines the code. > > include/asm-generic/bitops/find.h | 16 ++ > include/linux/bitmap.h | 2 ++ > lib/cpumask.c | 9 > lib/find_bit.c | 37 > + > tools/include/asm-generic/bitops/find.h | 16 ++ > 5 files changed, 67 insertions(+), 13 deletions(-) > > diff --git a/include/asm-generic/bitops/find.h b/include/asm- > generic/bitops/find.h > index 998d4d544f18..130962f3a264 100644 > --- a/include/asm-generic/bitops/find.h > +++ b/include/asm-generic/bitops/find.h > @@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long > *addr, unsigned long > size, unsigned long offset); > #endif > > +#ifndef find_next_and_bit > +/** > + * find_next_and_bit - find the next set bit in both memory regions > + * @addr1: The first address to base the search on > + * @addr2: The second address to base the search on > + * @offset: The bitnumber to start searching at > + * @size: The bitmap size in bits > + * > + * Returns the bit number for the next set bit > + * If no bits are set, returns @size. > + */ > +extern unsigned long find_next_and_bit(const unsigned long *addr1, > + const unsigned long *addr2, unsigned long size, > + unsigned long offset); > +#endif > + > #ifndef find_next_zero_bit > /** > * find_next_zero_bit - find the next cleared bit in a memory region > diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h > index 700cf5f67118..b4606bfda52f 100644 > --- a/include/linux/bitmap.h > +++ b/include/linux/bitmap.h > @@ -77,6 +77,8 @@ > * find_first_bit(addr, nbits) Position first set bit in *addr > * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr > >= bit > * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit > + * find_next_and_bit(addr1, addr2, nbits, bit) Same as find_first_bit, > but in > + * (*addr1 & *addr2) > */ > > /* > diff --git a/lib/cpumask.c b/lib/cpumask.c > index 8b1a1bd77539..5602223837fa 100644 > --- a/lib/cpumask.c > +++ b/lib/cpumask.c > @@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next); > int cpumask_next_and(int n, const struct cpumask *src1p, >const struct cpumask *src2p) > { > - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) > - if (cpumask_test_cpu(n, src2p)) > - break; > - return n; > + /* -1 is a legal arg here. */ > + if (n != -1) > + cpumask_check(n); > + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), > + nr_cpumask_bits, n + 1); > } >