Re: [PATCH] arm: Use builtins for ffs/fls
Hi Sean, On Mon, Jul 31, 2023 at 2:28 PM Sean Anderson wrote: > > Since ARMv5, the clz instruction allows for efficient implementation of > ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is > only available in ARM mode. LTO makes it difficult to force specific > functions to be in ARM mode, as it is effectively a form of very > aggressive inlining. To work around this, fls/ffs are implemented in > assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. > Overall, this saves around 75 bytes per call. > > Signed-off-by: Sean Anderson > --- > I only tested this on ARMv8. If someone has an ARMv5 or ARMv6 board, > please test this. Thanks for the patch! I've tested it with the Pogo V4 board (Kirkwood 88F6192, ARMv5) with LTO and SYS_THUMB_BUILD. Also nice size reduction: Before: 502K After: 499K So for ARMv5, Tested-by: Tony Dinh All the best, Tony > > arch/arm/include/asm/bitops.h | 27 - > arch/arm/lib/Makefile | 5 +++ > arch/arm/lib/bitops.S | 45 ++ > include/asm-generic/bitops/builtin-__ffs.h | 16 > include/asm-generic/bitops/builtin-__fls.h | 16 > include/asm-generic/bitops/builtin-ffs.h | 15 > include/asm-generic/bitops/builtin-fls.h | 17 > 7 files changed, 140 insertions(+), 1 deletion(-) > create mode 100644 arch/arm/lib/bitops.S > create mode 100644 include/asm-generic/bitops/builtin-__ffs.h > create mode 100644 include/asm-generic/bitops/builtin-__fls.h > create mode 100644 include/asm-generic/bitops/builtin-ffs.h > create mode 100644 include/asm-generic/bitops/builtin-fls.h > > diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h > index fa8548624a..8e897833bb 100644 > --- a/arch/arm/include/asm/bitops.h > +++ b/arch/arm/include/asm/bitops.h > @@ -15,9 +15,34 @@ > #ifndef __ASM_ARM_BITOPS_H > #define __ASM_ARM_BITOPS_H > > +#if __LINUX_ARM_ARCH__ < 5 > + > #include > #include > #include > + > +#else > + > +#define PLATFORM_FFS > +#define PLATFORM_FLS > + > +#if !IS_ENABLED(CONFIG_HAS_THUMB2) && CONFIG_IS_ENABLED(SYS_THUMB_BUILD) > + > +unsigned long __fls(unsigned long word); > +unsigned long __ffs(unsigned long word); > +int fls(unsigned int x); > +int ffs(int x); > + > +#else > + > +#include > +#include > +#include > +#include > + > +#endif > +#endif > + > #include > > #ifdef __KERNEL__ > @@ -113,7 +138,7 @@ static inline int test_bit(int nr, const void * addr) > > static inline int __ilog2(unsigned int x) > { > - return generic_fls(x) - 1; > + return fls(x) - 1; > } > > #define ffz(x) __ffs(~(x)) > diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile > index 62cf80f373..b1bcd37466 100644 > --- a/arch/arm/lib/Makefile > +++ b/arch/arm/lib/Makefile > @@ -113,6 +113,11 @@ AFLAGS_REMOVE_memset.o := -mthumb -mthumb-interwork > AFLAGS_REMOVE_memcpy.o := -mthumb -mthumb-interwork > AFLAGS_memset.o := -DMEMSET_NO_THUMB_BUILD > AFLAGS_memcpy.o := -DMEMCPY_NO_THUMB_BUILD > + > +# This is only necessary to force ARM mode on THUMB1 targets. > +ifneq ($(CONFIG_SYS_ARM_ARCH),4) > +obj-y += bitops.o > +endif > endif > endif > > diff --git a/arch/arm/lib/bitops.S b/arch/arm/lib/bitops.S > new file mode 100644 > index 00..29d1524634 > --- /dev/null > +++ b/arch/arm/lib/bitops.S > @@ -0,0 +1,45 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2023 Sean Anderson > + * > + * ARM bitops to call when using THUMB1, which doesn't have these > instructions. > + */ > +#include > +#include > + > +.pushsection .text.__fls > +ENTRY(__fls) > + clz r0, r0 > + rsb r0, r0, #31 > + ret lr > +ENDPROC(__fls) > +.popsection > + > +.pushsection .text.__ffs > +ENTRY(__ffs) > + rsb r3, r0, #0 > + and r0, r0, r3 > + clz r0, r0 > + rsb r0, r0, #31 > + ret lr > +ENDPROC(__ffs) > +.popsection > + > +.pushsection .text.fls > +ENTRY(fls) > + cmp r0, #0 > + clzne r0, r0 > + rsbne r0, r0, #32 > + ret lr > +ENDPROC(fls) > +.popsection > + > +.pushsection .text.ffs > +ENTRY(ffs) > + rsb r3, r0, #0 > + and r0, r0, r3 > + clz r0, r0 > + rsb r0, r0, #32 > + ret lr > +ENDPROC(ffs) > +.popsection > diff --git a/include/asm-generic/bitops/builtin-__ffs.h > b/include/asm-generic/bitops/builtin-__ffs.h > new file mode 100644 > index 00..87024da44d > --- /dev/null > +++ b/include/asm-generic/bitops/builtin-__ffs.h > @@ -0,0 +1,16 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ > +#define _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ > + > +/** > + * __ffs - find first bit in word. > + * @word: The word to search > + * > + * Undefined if no bit exists, so code should check against 0 first. > + */ > +static __always_inline unsigned long __ffs(unsigned long word) > +{ > +
Re: [PATCH] arm: Use builtins for ffs/fls
On Mon, Jul 31, 2023 at 05:27:33PM -0400, Sean Anderson wrote: > Since ARMv5, the clz instruction allows for efficient implementation of > ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is > only available in ARM mode. LTO makes it difficult to force specific > functions to be in ARM mode, as it is effectively a form of very > aggressive inlining. To work around this, fls/ffs are implemented in > assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. > Overall, this saves around 75 bytes per call. > > This code is synced with v5.15 of the Linux kernel. > > Signed-off-by: Sean Anderson > Reviewed-by: Tom Rini Applied to u-boot/next, thanks! -- Tom signature.asc Description: PGP signature
Re: [PATCH] arm: Use builtins for ffs/fls
On Mon, Jul 31, 2023 at 05:27:33PM -0400, Sean Anderson wrote: > Since ARMv5, the clz instruction allows for efficient implementation of > ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is > only available in ARM mode. LTO makes it difficult to force specific > functions to be in ARM mode, as it is effectively a form of very > aggressive inlining. To work around this, fls/ffs are implemented in > assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. > Overall, this saves around 75 bytes per call. > > Signed-off-by: Sean Anderson I'll add some wording about when this was synced from the kernel when applying. Reviewed-by: Tom Rini -- Tom signature.asc Description: PGP signature
Re: [PATCH] arm: Use builtins for ffs/fls
On 7/31/23 17:27, Sean Anderson wrote: Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call. Signed-off-by: Sean Anderson --- I only tested this on ARMv8. If someone has an ARMv5 or ARMv6 board, please test this. +CC Linus since he may care about integrator. --Sean
Re: [PATCH] arm: Use builtins for ffs/fls
On 7/31/23 17:36, Tom Rini wrote: On Mon, Jul 31, 2023 at 05:27:33PM -0400, Sean Anderson wrote: Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call. Signed-off-by: Sean Anderson This looks like it's synced from the kernel, what tag? The builtins are synced from v5.15 and they haven't changed since. --Sean
Re: [PATCH] arm: Use builtins for ffs/fls
On Mon, Jul 31, 2023 at 05:27:33PM -0400, Sean Anderson wrote: > Since ARMv5, the clz instruction allows for efficient implementation of > ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is > only available in ARM mode. LTO makes it difficult to force specific > functions to be in ARM mode, as it is effectively a form of very > aggressive inlining. To work around this, fls/ffs are implemented in > assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. > Overall, this saves around 75 bytes per call. > > Signed-off-by: Sean Anderson This looks like it's synced from the kernel, what tag? -- Tom signature.asc Description: PGP signature
[PATCH] arm: Use builtins for ffs/fls
Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call. Signed-off-by: Sean Anderson --- I only tested this on ARMv8. If someone has an ARMv5 or ARMv6 board, please test this. arch/arm/include/asm/bitops.h | 27 - arch/arm/lib/Makefile | 5 +++ arch/arm/lib/bitops.S | 45 ++ include/asm-generic/bitops/builtin-__ffs.h | 16 include/asm-generic/bitops/builtin-__fls.h | 16 include/asm-generic/bitops/builtin-ffs.h | 15 include/asm-generic/bitops/builtin-fls.h | 17 7 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 arch/arm/lib/bitops.S create mode 100644 include/asm-generic/bitops/builtin-__ffs.h create mode 100644 include/asm-generic/bitops/builtin-__fls.h create mode 100644 include/asm-generic/bitops/builtin-ffs.h create mode 100644 include/asm-generic/bitops/builtin-fls.h diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h index fa8548624a..8e897833bb 100644 --- a/arch/arm/include/asm/bitops.h +++ b/arch/arm/include/asm/bitops.h @@ -15,9 +15,34 @@ #ifndef __ASM_ARM_BITOPS_H #define __ASM_ARM_BITOPS_H +#if __LINUX_ARM_ARCH__ < 5 + #include #include #include + +#else + +#define PLATFORM_FFS +#define PLATFORM_FLS + +#if !IS_ENABLED(CONFIG_HAS_THUMB2) && CONFIG_IS_ENABLED(SYS_THUMB_BUILD) + +unsigned long __fls(unsigned long word); +unsigned long __ffs(unsigned long word); +int fls(unsigned int x); +int ffs(int x); + +#else + +#include +#include +#include +#include + +#endif +#endif + #include #ifdef __KERNEL__ @@ -113,7 +138,7 @@ static inline int test_bit(int nr, const void * addr) static inline int __ilog2(unsigned int x) { - return generic_fls(x) - 1; + return fls(x) - 1; } #define ffz(x) __ffs(~(x)) diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 62cf80f373..b1bcd37466 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -113,6 +113,11 @@ AFLAGS_REMOVE_memset.o := -mthumb -mthumb-interwork AFLAGS_REMOVE_memcpy.o := -mthumb -mthumb-interwork AFLAGS_memset.o := -DMEMSET_NO_THUMB_BUILD AFLAGS_memcpy.o := -DMEMCPY_NO_THUMB_BUILD + +# This is only necessary to force ARM mode on THUMB1 targets. +ifneq ($(CONFIG_SYS_ARM_ARCH),4) +obj-y += bitops.o +endif endif endif diff --git a/arch/arm/lib/bitops.S b/arch/arm/lib/bitops.S new file mode 100644 index 00..29d1524634 --- /dev/null +++ b/arch/arm/lib/bitops.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Sean Anderson + * + * ARM bitops to call when using THUMB1, which doesn't have these instructions. + */ +#include +#include + +.pushsection .text.__fls +ENTRY(__fls) + clz r0, r0 + rsb r0, r0, #31 + ret lr +ENDPROC(__fls) +.popsection + +.pushsection .text.__ffs +ENTRY(__ffs) + rsb r3, r0, #0 + and r0, r0, r3 + clz r0, r0 + rsb r0, r0, #31 + ret lr +ENDPROC(__ffs) +.popsection + +.pushsection .text.fls +ENTRY(fls) + cmp r0, #0 + clzne r0, r0 + rsbne r0, r0, #32 + ret lr +ENDPROC(fls) +.popsection + +.pushsection .text.ffs +ENTRY(ffs) + rsb r3, r0, #0 + and r0, r0, r3 + clz r0, r0 + rsb r0, r0, #32 + ret lr +ENDPROC(ffs) +.popsection diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h new file mode 100644 index 00..87024da44d --- /dev/null +++ b/include/asm-generic/bitops/builtin-__ffs.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ + +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __always_inline unsigned long __ffs(unsigned long word) +{ + return __builtin_ctzl(word); +} + +#endif diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h new file mode 100644 index 00..43a5aa9afb --- /dev/null +++ b/include/asm-generic/bitops/builtin-__fls.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN___FLS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN___FLS_H_ + +/** + * __fls - find last (most-significant) set bit in a long word + * @word: the word to search + * + * Undefined if no set bit exists, so code should check against 0 first. + */ +static __alwa