https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114252
Bug ID: 114252
Summary: Introducing bswapsi reduces code performance
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: gjl at gcc dot gnu.org
Target Milestone: ---
Created attachment 57628
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57628&action=edit
GNU-C test case
/* Fixed-width integer types defined from the compiler's predefined type
   macros, so the test case needs no #include. */
typedef __UINT8_TYPE__ uint8_t;
typedef __UINT32_TYPE__ uint32_t;
/* GNU-C vector extension: a vector of uint8_t elements, 4 bytes in total. */
typedef uint8_t __attribute__((vector_size(4))) v4u8_t;
/* Build a 4-byte vector from buf[0..3] with the two bytes of each adjacent
   pair exchanged, and return that vector cast to a 32-bit integer. */
uint32_t func1 (const uint8_t *buf) {
/* Element order is buf[1], buf[0], buf[3], buf[2]: a swap within each
   byte pair, not a full 4-byte reversal. */
v4u8_t v4 = { buf[1], buf[0], buf[3], buf[2] };
/* Vector and integer are both 4 bytes wide; per the GNU vector extension
   this cast reinterprets the bits rather than converting a value. */
return (uint32_t) v4;
}
Compile the code with
$ avr-gcc code.c -S -Os -dp
With v13 the result is:
func1:
mov r30,r24 ; 37 [c=4 l=1] movqi_insn/0
mov r31,r25 ; 38 [c=4 l=1] movqi_insn/0
ldd r22,Z+1 ; 39 [c=4 l=1] movqi_insn/3
ld r23,Z ; 40 [c=4 l=1] movqi_insn/3
ldd r24,Z+3 ; 41 [c=4 l=1] movqi_insn/3
ldd r25,Z+2 ; 42 [c=4 l=1] movqi_insn/3
/* epilogue start */
ret ; 45 [c=0 l=1] return
This is good code: insns 37 and 38 move the address to pointer register Z,
followed by 4 loads, one for each byte.
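When compiled with v14, however, the 4 bytes are loaded in memory order,
byte-swapped by a libgcc call to __bswapsi2, and then rearranged again by
6 extra register moves: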
func1:
mov r30,r24 ; 23 [c=4 l=2] *movhi/0
mov r31,r25
ld r22,Z ; 24 [c=16 l=4] *movsi/2
ldd r23,Z+1
ldd r24,Z+2
ldd r25,Z+3
rcall __bswapsi2 ; 25 [c=16 l=1] *bswapsi2.libgcc
mov r31,r23 ; 32 [c=4 l=1] movqi_insn/0
mov r23,r25 ; 33 [c=4 l=1] movqi_insn/0
mov r25,r31 ; 34 [c=4 l=1] movqi_insn/0
mov r31,r22 ; 35 [c=4 l=1] movqi_insn/0
mov r22,r24 ; 36 [c=4 l=1] movqi_insn/0
mov r24,r31 ; 37 [c=4 l=1] movqi_insn/0
/* epilogue start */
ret ; 40 [c=0 l=1] return
Target: avr
Configured with: ../../source/gcc-master/configure --target=avr --disable-nls
--with-dwarf2 --with-gnu-as --with-gnu-ld --disable-shared
--enable-languages=c,c++
Thread model: single
Supported LTO compression algorithms: zlib
gcc version 14.0.1 20240303 (experimental) (GCC)