From: Yury Norov <yury.no...@gmail.com>
Date: Sat, 5 Dec 2020 08:54:06 -0800

Hi,

> ARM64 doesn't implement find_first_{zero}_bit in arch code and doesn't
> enable it in config. It leads to using find_next_bit() which is less
> efficient:
>
> 0000000000000000 <find_first_bit>:
>    0: aa0003e4        mov     x4, x0
>    4: aa0103e0        mov     x0, x1
>    8: b4000181        cbz     x1, 38 <find_first_bit+0x38>
>    c: f9400083        ldr     x3, [x4]
>   10: d2800802        mov     x2, #0x40                       // #64
>   14: 91002084        add     x4, x4, #0x8
>   18: b40000c3        cbz     x3, 30 <find_first_bit+0x30>
>   1c: 14000008        b       3c <find_first_bit+0x3c>
>   20: f8408483        ldr     x3, [x4], #8
>   24: 91010045        add     x5, x2, #0x40
>   28: b50000c3        cbnz    x3, 40 <find_first_bit+0x40>
>   2c: aa0503e2        mov     x2, x5
>   30: eb02001f        cmp     x0, x2
>   34: 54ffff68        b.hi    20 <find_first_bit+0x20>  // b.pmore
>   38: d65f03c0        ret
>   3c: d2800002        mov     x2, #0x0                        // #0
>   40: dac00063        rbit    x3, x3
>   44: dac01063        clz     x3, x3
>   48: 8b020062        add     x2, x3, x2
>   4c: eb02001f        cmp     x0, x2
>   50: 9a829000        csel    x0, x0, x2, ls  // ls = plast
>   54: d65f03c0        ret
>
>   ...
>
> 0000000000000118 <_find_next_bit.constprop.1>:
>  118: eb02007f        cmp     x3, x2
>  11c: 540002e2        b.cs    178 <_find_next_bit.constprop.1+0x60>  // b.hs, 
> b.nlast
>  120: d346fc66        lsr     x6, x3, #6
>  124: f8667805        ldr     x5, [x0, x6, lsl #3]
>  128: b4000061        cbz     x1, 134 <_find_next_bit.constprop.1+0x1c>
>  12c: f8667826        ldr     x6, [x1, x6, lsl #3]
>  130: 8a0600a5        and     x5, x5, x6
>  134: ca0400a6        eor     x6, x5, x4
>  138: 92800005        mov     x5, #0xffffffffffffffff         // #-1
>  13c: 9ac320a5        lsl     x5, x5, x3
>  140: 927ae463        and     x3, x3, #0xffffffffffffffc0
>  144: ea0600a5        ands    x5, x5, x6
>  148: 54000120        b.eq    16c <_find_next_bit.constprop.1+0x54>  // b.none
>  14c: 1400000e        b       184 <_find_next_bit.constprop.1+0x6c>
>  150: d346fc66        lsr     x6, x3, #6
>  154: f8667805        ldr     x5, [x0, x6, lsl #3]
>  158: b4000061        cbz     x1, 164 <_find_next_bit.constprop.1+0x4c>
>  15c: f8667826        ldr     x6, [x1, x6, lsl #3]
>  160: 8a0600a5        and     x5, x5, x6
>  164: eb05009f        cmp     x4, x5
>  168: 540000c1        b.ne    180 <_find_next_bit.constprop.1+0x68>  // b.any
>  16c: 91010063        add     x3, x3, #0x40
>  170: eb03005f        cmp     x2, x3
>  174: 54fffee8        b.hi    150 <_find_next_bit.constprop.1+0x38>  // 
> b.pmore
>  178: aa0203e0        mov     x0, x2
>  17c: d65f03c0        ret
>  180: ca050085        eor     x5, x4, x5
>  184: dac000a5        rbit    x5, x5
>  188: dac010a5        clz     x5, x5
>  18c: 8b0300a3        add     x3, x5, x3
>  190: eb03005f        cmp     x2, x3
>  194: 9a839042        csel    x2, x2, x3, ls  // ls = plast
>  198: aa0203e0        mov     x0, x2
>  19c: d65f03c0        ret
>
>  ...
>
> 0000000000000238 <find_next_bit>:
>  238: a9bf7bfd        stp     x29, x30, [sp, #-16]!
>  23c: aa0203e3        mov     x3, x2
>  240: d2800004        mov     x4, #0x0                        // #0
>  244: aa0103e2        mov     x2, x1
>  248: 910003fd        mov     x29, sp
>  24c: d2800001        mov     x1, #0x0                        // #0
>  250: 97ffffb2        bl      118 <_find_next_bit.constprop.1>
>  254: a8c17bfd        ldp     x29, x30, [sp], #16
>  258: d65f03c0        ret
>
> Enabling this functions would also benefit for_each_{set,clear}_bit().
> Would it make sense to enable this config for all such architectures by
> default?

I confirm that GENERIC_FIND_FIRST_BIT also produces more optimized and
fast code on MIPS (32 R2) where there is also no architecture-specific
bitsearching routines.
So, if it's okay for other folks, I'd suggest to go for it and enable
for all similar arches.

(otherwise, I'll publish a separate entry for mips-next after 5.12-rc1
 release and mention you in "Suggested-by:")

> Signed-off-by: Yury Norov <yury.no...@gmail.com>
>
> ---
>  arch/arm64/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 1515f6f153a0..2b90ef1f548e 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -106,6 +106,7 @@ config ARM64
>       select GENERIC_CPU_AUTOPROBE
>       select GENERIC_CPU_VULNERABILITIES
>       select GENERIC_EARLY_IOREMAP
> +     select GENERIC_FIND_FIRST_BIT
>       select GENERIC_IDLE_POLL_SETUP
>       select GENERIC_IRQ_IPI
>       select GENERIC_IRQ_MULTI_HANDLER
> --
> 2.25.1

Thanks,
Al

Reply via email to