Re: [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible
On Tue, 21 Sep 2021 17:09:47 +0200, Christophe Leroy wrote:
> Today we get the following code generation for bitops like
> set or clear bit:
>
>     c0009fe0:   39 40 08 00     li      r10,2048
>     c0009fe4:   7c e0 40 28     lwarx   r7,0,r8
>     c0009fe8:   7c e7 53 78     or      r7,r7,r10
>     c0009fec:   7c e0 41 2d     stwcx.  r7,0,r8
>
> [...]

Applied to powerpc/next.

[1/3] powerpc/bitops: Use immediate operand when possible
      https://git.kernel.org/powerpc/c/fb350784d8d17952afa93383bb47aaa6b715c459
[2/3] powerpc/atomics: Use immediate operand when possible
      https://git.kernel.org/powerpc/c/41d65207de9fbff58acd8937a7c3f8940c186a87
[3/3] powerpc/atomics: Remove atomic_inc()/atomic_dec() and friends
      https://git.kernel.org/powerpc/c/f05cab0034babaa9b3dfaf6003ee6493496a8180

cheers
Re: [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible
LEROY Christophe writes:
> Hi Michael,
>
> Any chance to get this series merged this cycle ?

Yeah. I hesitated a bit at the changes which make the atomic asm even
more complicated, but I guess it's worth it.

cheers

> On 21/09/2021 at 17:09, Christophe Leroy wrote:
>> Today we get the following code generation for bitops like
>> set or clear bit:
>>
>>     c0009fe0:   39 40 08 00     li      r10,2048
>>     c0009fe4:   7c e0 40 28     lwarx   r7,0,r8
>>     c0009fe8:   7c e7 53 78     or      r7,r7,r10
>>     c0009fec:   7c e0 41 2d     stwcx.  r7,0,r8
>>
>> [...]
Re: [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible
Hi Michael,

Any chance to get this series merged this cycle ?

Thanks
Christophe

On 21/09/2021 at 17:09, Christophe Leroy wrote:
> Today we get the following code generation for bitops like
> set or clear bit:
>
>     c0009fe0:   39 40 08 00     li      r10,2048
>     c0009fe4:   7c e0 40 28     lwarx   r7,0,r8
>     c0009fe8:   7c e7 53 78     or      r7,r7,r10
>     c0009fec:   7c e0 41 2d     stwcx.  r7,0,r8
>
> [...]
[PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible
Today we get the following code generation for bitops like
set or clear bit:

    c0009fe0:   39 40 08 00     li      r10,2048
    c0009fe4:   7c e0 40 28     lwarx   r7,0,r8
    c0009fe8:   7c e7 53 78     or      r7,r7,r10
    c0009fec:   7c e0 41 2d     stwcx.  r7,0,r8

    c000d568:   39 00 18 00     li      r8,6144
    c000d56c:   7c c0 38 28     lwarx   r6,0,r7
    c000d570:   7c c6 40 78     andc    r6,r6,r8
    c000d574:   7c c0 39 2d     stwcx.  r6,0,r7

Most set-bit masks are constants that fit in the lower 16 bits, so the
operation can easily be replaced by the "immediate" form. Allow GCC to
choose between the normal and immediate forms.

For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' when
all the bits to be cleared are consecutive.

On 64 bits we don't have any equivalent single operation for clearing
single bits or a few bits; we'd need two 'rldicl', so it is not worth
it, the li/andc sequence does the same.

With this patch we get:

    c0009fe0:   7d 00 50 28     lwarx   r8,0,r10
    c0009fe4:   61 08 08 00     ori     r8,r8,2048
    c0009fe8:   7d 00 51 2d     stwcx.  r8,0,r10

    c000d558:   7c e0 40 28     lwarx   r7,0,r8
    c000d55c:   54 e7 05 64     rlwinm  r7,r7,0,21,18
    c000d560:   7c e0 41 2d     stwcx.  r7,0,r8

On pmac32_defconfig, it reduces the text by approx 10 kbytes.

Signed-off-by: Christophe Leroy
Reviewed-by: Segher Boessenkool
---
v5: Fixed the argument of is_rlwinm_mask_valid() in test_and_clear_bits()

v4: Rebased

v3:
- Using the mask validation proposed by Segher

v2:
- Use "n" instead of "i" as constraint for the rlwinm mask
- Improve mask verification to handle more than single bit masks

Signed-off-by: Christophe Leroy
---
 arch/powerpc/include/asm/bitops.h | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 81 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 11847b6a244e..a05d8c62cbea 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -71,19 +71,61 @@ static inline void fn(unsigned long mask,		\
 	__asm__ __volatile__ (					\
 	prefix							\
 "1:"	PPC_LLARX "%0,0,%3,0\n"					\
-	stringify_in_c(op) "%0,%0,%2\n"				\
+	#op "%I2 %0,%0,%2\n"					\
 	PPC_STLCX "%0,0,%3\n"					\
 	"bne- 1b\n"						\
 	: "=&r" (old), "+m" (*p)				\
-	: "r" (mask), "r" (p)					\
+	: "rK" (mask), "r" (p)					\
 	: "cc", "memory");					\
 }
 
 DEFINE_BITOP(set_bits, or, "")
-DEFINE_BITOP(clear_bits, andc, "")
-DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER)
 DEFINE_BITOP(change_bits, xor, "")
 
+static __always_inline bool is_rlwinm_mask_valid(unsigned long x)
+{
+	if (!x)
+		return false;
+	if (x & 1)
+		x = ~x;	// make the mask non-wrapping
+	x += x & -x;	// adding the low set bit results in at most one bit set
+
+	return !(x & (x - 1));
+}
+
+#define DEFINE_CLROP(fn, prefix)					\
+static inline void fn(unsigned long mask, volatile unsigned long *_p)	\
+{									\
+	unsigned long old;						\
+	unsigned long *p = (unsigned long *)_p;				\
+									\
+	if (IS_ENABLED(CONFIG_PPC32) &&					\
+	    __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {\
+		asm volatile (						\
+			prefix						\
+		"1:"	"lwarx	%0,0,%3\n"				\
+			"rlwinm	%0,%0,0,%2\n"				\
+			"stwcx.	%0,0,%3\n"				\
+			"bne- 1b\n"					\
+			: "=&r" (old), "+m" (*p)			\
+			: "n" (~mask), "r" (p)				\
+			: "cc", "memory");				\
+	} else {							\
+		asm volatile (						\
+			prefix						\
+		"1:"	PPC_LLARX "%0,0,%3,0\n"				\
+			"andc %0,%0,%2\n"				\
+			PPC_STLCX "%0,0,%3\n"				\
+			"bne- 1b\n"
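
An aside on the "rK" constraint and the "%I2" operand modifier used in
DEFINE_BITOP above: "rK" lets GCC choose between a register and an
unsigned 16-bit constant, and "%In" expands to the letter 'i' when
operand n is an immediate, so 'or' becomes 'ori'. A minimal non-atomic
sketch, assuming a powerpc target and GCC (the function name is made up
for illustration):

/*
 * Non-atomic sketch of the "rK" + %I trick (assumes powerpc + GCC).
 * When 'val' is a constant satisfying the K constraint (unsigned
 * 16-bit), %I1 prints "i" and the compiler emits "ori rX,rX,val";
 * otherwise 'val' goes in a register and "or rX,rX,rY" is emitted.
 */
static inline unsigned long or_reg_or_imm(unsigned long word, unsigned long val)
{
	asm("or%I1 %0,%0,%1" : "+r" (word) : "rK" (val));
	return word;
}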
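
To see which clear-masks the new is_rlwinm_mask_valid() accepts, here is
a standalone host-side sketch (not part of the patch; the sample masks
are made up, and uint32_t stands in for PPC32's 32-bit unsigned long):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same bit trick as the patch: valid iff the set bits of x form a
 * single run, possibly wrapping around the word boundary. */
static bool is_rlwinm_mask_valid(uint32_t x)
{
	if (!x)
		return false;
	if (x & 1)
		x = ~x;		/* make the mask non-wrapping */
	x += x & -x;		/* adding the low set bit leaves at most one bit set */

	return !(x & (x - 1));
}

int main(void)
{
	/* Bits to clear; the kernel calls the check on ~mask (the kept bits). */
	uint32_t clear[] = {
		0x1800,		/* consecutive bits 11-12 -> rlwinm */
		0x80000001,	/* clears bits 31 and 0; kept bits are one run -> rlwinm */
		0x1801,		/* two separate runs -> fall back to li/andc */
	};

	for (unsigned int i = 0; i < sizeof(clear) / sizeof(clear[0]); i++)
		printf("clear 0x%08x: %s\n", (unsigned int)clear[i],
		       is_rlwinm_mask_valid(~clear[i]) ? "rlwinm" : "andc");

	return 0;
}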