On Tue, Feb 17 2015, Yury Norov <[email protected]> wrote:

> The new implementation takes less space in the sources
> (see diffstat) and in the object. For me it's 710 vs 453
> bytes of text. It also shows a better performance.
>
> find_last_bit description fixed due to obvious typo.
>
> In this patch 2 macros were introduced: {LOW,HIGH}_BITS_MASK,
> that are doing almost the same as BITMAP_{FIRST,LAST}_WORD_MASK
> in include/linux/bitmap.h. But the 'LAST' macro is potentially less
> efficient, because it issues a conditional branch that can be
> omitted.  If it is replaced one day by a more efficient
> implementation, {LOW,HIGH}_BITS_MASK can be removed.
>

I think it's better to use the existing macros and then improve them
instead of duplicating the functionality. I'll submit a patch for that
later tonight (that should then make it to 3.21 [or whatever 3.19+2 will
be called] together with this series). More on this issue below.

>
> Signed-off-by: Yury Norov <[email protected]>
> ---
>  include/linux/bitops.h |   4 +-
>  lib/find_last_bit.c    |  37 +++----
>  lib/find_next_bit.c    | 269 ++++++++++++++-----------------------------
>  3 files changed, 94 insertions(+), 216 deletions(-)
>
> diff --git a/include/linux/bitops.h b/include/linux/bitops.h
> index 5d858e0..297f5bd 100644
> --- a/include/linux/bitops.h
> +++ b/include/linux/bitops.h
> @@ -218,9 +218,9 @@ static inline unsigned long __ffs64(u64 word)
>  /**
>   * find_last_bit - find the last set bit in a memory region
>   * @addr: The address to start the search at
> - * @size: The maximum size to search
> + * @size: The number of bits to search
>   *
> - * Returns the bit number of the first set bit, or size.
> + * Returns the bit number of the last set bit, or size.
>   */
>  extern unsigned long find_last_bit(const unsigned long *addr,
>                                  unsigned long size);
> diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
> index 91ca09f..edbb281 100644
> --- a/lib/find_last_bit.c
> +++ b/lib/find_last_bit.c
> @@ -4,6 +4,9 @@
>   * Written by Rusty Russell <[email protected]>
>   * (Inspired by David Howell's find_next_bit implementation)
>   *
> + * Rewritten by Yury Norov <[email protected]> to decrease
> + * size and improve performance, 2015.
> + *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
>   * as published by the Free Software Foundation; either version
> @@ -12,36 +15,26 @@
>  
>  #include <linux/bitops.h>
>  #include <linux/export.h>
> -#include <asm/types.h>
> -#include <asm/byteorder.h>
> +#include <linux/kernel.h>
> +
> +#define LOW_BITS_MASK(nr) (~0UL >> -(nr))

This is technically wrong, and may not even work on architectures that
are not as forgiving as x86: Whatever type and value nr has, -(nr) is
almost guaranteed not to be a number between 0 and BITS_PER_LONG-1. And
even on x86, gcc doesn't generate as good code as it could:

 163:   49 c7 c0 ff ff ff ff    mov    $0xffffffffffffffff,%r8
 16a:   83 e1 3f                and    $0x3f,%ecx
 16d:   f7 d9                   neg    %ecx
 16f:   48 c1 ea 06             shr    $0x6,%rdx
 173:   49 d3 e8                shr    %cl,%r8

It doesn't realize that pre-masking %ecx with 0x3f is redundant when we
then negate it and use it as a shift amount.

So a better definition of the macro is

#define BITMAP_LAST_WORD_MASK(nr) (~0UL >> (-(nr) & (BITS_PER_LONG-1)))

and then callers shouldn't do the modulo. On x86, gcc knows that the &
is redundant. I use & instead of % so that nr may also have signed type
(otherwise we're again in UB land, since -(nr) % BITS_PER_LONG is then,
by the broken C standard, a negative number).


>  #include <linux/bitops.h>
>  #include <linux/export.h>
> -#include <asm/types.h>
> -#include <asm/byteorder.h>
> +#include <linux/kernel.h>
>  
> -#define BITOP_WORD(nr)               ((nr) / BITS_PER_LONG)
> +#define HIGH_BITS_MASK(nr)           (~0UL << (nr))
> +
> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
>  
> -#ifndef find_next_bit
>  /*
> - * Find the next set bit in a memory region.
> + * This is a common helper function for find_next_bit and
> + * find_next_zero_bit.  The difference is the "invert" argument, which
> + * is XORed with each fetched word before searching it for one bits.
>   */
> -unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
> -                         unsigned long offset)
> +static unsigned long _find_next_bit(const unsigned long *addr,
> +             unsigned long nbits, unsigned long start, unsigned long invert)
>  {
> -     const unsigned long *p = addr + BITOP_WORD(offset);
> -     unsigned long result = offset & ~(BITS_PER_LONG-1);
>       unsigned long tmp;
>  
> -     if (offset >= size)
> -             return size;
> -     size -= result;
> -     offset %= BITS_PER_LONG;
> -     if (offset) {
> -             tmp = *(p++);
> -             tmp &= (~0UL << offset);
> -             if (size < BITS_PER_LONG)
> -                     goto found_first;
> -             if (tmp)
> -                     goto found_middle;
> -             size -= BITS_PER_LONG;
> -             result += BITS_PER_LONG;
> -     }
> -     while (size & ~(BITS_PER_LONG-1)) {
> -             if ((tmp = *(p++)))
> -                     goto found_middle;
> -             result += BITS_PER_LONG;
> -             size -= BITS_PER_LONG;
> +     if (!nbits || start >= nbits)
> +             return nbits;
> +
> +     tmp = addr[start / BITS_PER_LONG] ^ invert;
> +
> +     /* Handle 1st word. */
> +     tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);

And of course here, I'd then suggest using BITMAP_FIRST_WORD_MASK(start)
(that even matches the comment :-)), omitting the definition of
HIGH_BITS_MASK.

> @@ -113,24 +78,14 @@ EXPORT_SYMBOL(find_next_zero_bit);
>   */
>  unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
>  {
> -     const unsigned long *p = addr;
> -     unsigned long result = 0;
> -     unsigned long tmp;
> +     unsigned long idx;
>  
> -     while (size & ~(BITS_PER_LONG-1)) {
> -             if ((tmp = *(p++)))
> -                     goto found;
> -             result += BITS_PER_LONG;
> -             size -= BITS_PER_LONG;
> +     for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
> +             if (addr[idx])
> +             return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
>       }
> -     if (!size)
> -             return result;
>  
> -     tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
> -     if (tmp == 0UL)         /* Are any bits set? */
> -             return result + size;   /* Nope. */
> -found:
> -     return result + __ffs(tmp);
> +     return size;
>  }
>  EXPORT_SYMBOL(find_first_bit);
>  #endif
> @@ -141,24 +96,14 @@ EXPORT_SYMBOL(find_first_bit);
>   */
>  unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
>  {
> -     const unsigned long *p = addr;
> -     unsigned long result = 0;
> -     unsigned long tmp;
> +     unsigned long idx;
>  
> -     while (size & ~(BITS_PER_LONG-1)) {
> -             if (~(tmp = *(p++)))
> -                     goto found;
> -             result += BITS_PER_LONG;
> -             size -= BITS_PER_LONG;
> +     for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
> +             if (addr[idx] != ~0UL)
> +                     return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
>       }

Since I'm afraid the above means I have to ask you to send a v5, I might
as well also comment on this: I think it would make the code much more
obviously parallel to find_first_bit if the test was "if (~addr[idx])"
and the ffz is then replaced by __ffs(~addr[idx]). Many architectures
implement ffz(x) as __ffs(~x) anyway, so it shouldn't be any less
efficient. But it's no big deal, so if you feel this is better, just
leave it.

Rasmus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to