Sometimes gcc mysteriously decides not to inline very small functions
that we expect to be inlined. See
    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122
Arguably gcc should do better here, but the gcc developers are not
willing to invest time into it and suggest using __always_inline instead.
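
For reference, a minimal standalone sketch (not kernel code) of what the
workaround amounts to. The kernel's compiler headers define
__always_inline roughly as the macro below; the helper functions here are
made up purely for illustration:

    /* Roughly what the kernel's compiler headers provide: */
    #define __always_inline inline __attribute__((__always_inline__))

    /* "inline" is only a hint: at -Os / CONFIG_OPTIMIZE_INLINING=y gcc may
     * ignore it and emit an out-of-line copy in every object file that
     * uses the function. */
    static inline int add_hint(int a, int b) { return a + b; }

    /* always_inline forces gcc to expand the body at each call site, so no
     * stand-alone copy with push/mov/.../pop/ret framing is generated. */
    static __always_inline int add_forced(int a, int b) { return a + b; }

    int sum(int a, int b)
    {
            return add_hint(a, b) + add_forced(a, b);
    }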

With this .config:
http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os,
the following functions get deinlined many times.
Examples of disassembly:

test_and_set_bit (166 copies, ~1260 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f ab 3e          lock bts %rdi,(%rsi)
       72 04                   jb     <test_and_set_bit+0xf>
       31 c0                   xor    %eax,%eax
       eb 05                   jmp    <test_and_set_bit+0x14>
       b8 01 00 00 00          mov    $0x1,%eax
       5d                      pop    %rbp
       c3                      retq

test_and_clear_bit (124 copies, ~1000 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f b3 3e          lock btr %rdi,(%rsi)
       72 04                   jb     <test_and_clear_bit+0xf>
       31 c0                   xor    %eax,%eax
       eb 05                   jmp    <test_and_clear_bit+0x14>
       b8 01 00 00 00          mov    $0x1,%eax
       5d                      pop    %rbp
       c3                      retq

change_bit (3 copies, 8 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f bb 3e          lock btc %rdi,(%rsi)
       5d                      pop    %rbp
       c3                      retq

clear_bit_unlock (2 copies, 11 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f b3 3e          lock btr %rdi,(%rsi)
       5d                      pop    %rbp
       c3                      retq
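
Each copy of test_and_set_bit()/test_and_clear_bit() above is 22 bytes
(1+3+5+2+2+2+5+1+1), so the 166 + 124 = 290 copies of just those two
functions account for roughly 290 * 22 = 6380 bytes of duplicated bodies,
not counting the call instructions at their ~2260 call sites.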

This patch fixes this via s/inline/__always_inline/.

Code size decrease after the patch is ~13.5k:

    text     data      bss       dec     hex filename
92110727 20826144 36417536 149354407 8e6f7a7 vmlinux
92097234 20826176 36417536 149340946 8e6c312 vmlinux9_bitops_after
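
(The "text" delta is 92110727 - 92097234 = 13493 bytes; "data" grows by
20826176 - 20826144 = 32 bytes.)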

Signed-off-by: Denys Vlasenko <dvlas...@redhat.com>
Cc: Ingo Molnar <mi...@kernel.org>
Cc: Thomas Graf <tg...@suug.ch>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: David Rientjes <rient...@google.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: "H. Peter Anvin" <h...@zytor.com>
Cc: Andi Kleen <a...@firstfloor.org>
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/bitops.h | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index cfe3b95..7766d1c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __set_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
 {
        asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
 }
@@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr)
  * clear_bit() is atomic and implies release semantics before the memory
  * operation. It can be used for an unlock.
  */
-static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
        barrier();
        clear_bit(nr, addr);
 }
 
-static inline void __clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
 {
        asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
 }
@@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr)
  * No memory barrier is required here, because x86 cannot reorder stores past
  * older loads. Same principle as spin_unlock.
  */
-static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
        barrier();
        __clear_bit(nr, addr);
@@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
 {
        asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
 }
@@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr)
  * Note that @nr may be almost arbitrarily large; this function is not
  * restricted to acting on a single-word quantity.
  */
-static inline void change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void change_bit(long nr, volatile unsigned long *addr)
 {
        if (IS_IMMEDIATE(nr)) {
                asm volatile(LOCK_PREFIX "xorb %1,%0"
@@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr)
 {
        GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c");
 }
@@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
 {
        int oldbit;
 
@@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
        GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c");
 }
@@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
  * accessed from a hypervisor on the same CPU if running in a VM: don't change
  * this without also updating arch/x86/kernel/kvm.c
  */
-static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
        int oldbit;
 
@@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
 {
        int oldbit;
 
@@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr)
 {
        GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c");
 }
@@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo
                (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
 }
 
-static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
+static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr)
 {
        int oldbit;
 
@@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr);
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned long __ffs(unsigned long word)
 {
        asm("rep; bsf %1,%0"
                : "=r" (word)
@@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word)
  *
  * Undefined if no zero exists, so code should check against ~0UL first.
  */
-static inline unsigned long ffz(unsigned long word)
+static __always_inline unsigned long ffz(unsigned long word)
 {
        asm("rep; bsf %1,%0"
                : "=r" (word)
@@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word)
  *
  * Undefined if no set bit exists, so code should check against 0 first.
  */
-static inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned long __fls(unsigned long word)
 {
        asm("bsr %1,%0"
            : "=r" (word)
@@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word)
  * set bit if value is nonzero. The first (least significant) bit
  * is at position 1.
  */
-static inline int ffs(int x)
+static __always_inline int ffs(int x)
 {
        int r;
 
@@ -434,7 +434,7 @@ static inline int ffs(int x)
  * set bit if value is nonzero. The last (most significant) bit is
  * at position 32.
  */
-static inline int fls(int x)
+static __always_inline int fls(int x)
 {
        int r;
 
-- 
1.8.1.4
