https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106705
Bug ID: 106705
Summary: Expensive constant load replicated throughout switch
statement
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: macro at orcam dot me.uk
Target Milestone: ---
Given code like:
#define do_sd(a, b, c) do { b[c] = a; } while (0)
void sd(unsigned len, unsigned long *mem)
{
const unsigned long val = 2415920512;
switch (len) {
case 7:
do_sd(val, mem, -7);
case 6:
do_sd(val, mem, -6);
case 5:
do_sd(val, mem, -5);
case 4:
do_sd(val, mem, -4);
case 3:
do_sd(val, mem, -3);
case 2:
do_sd(val, mem, -2);
case 1:
do_sd(val, mem, -1);
}
}
(reduced from code using an out-of-tree intrinsic for a vendor-specific
machine instruction) we get 64-bit assembly at -O2 with a constant load
replicated across all the switch cases, which is terribly suboptimal.
E.g. for RISC-V this gets compiled to:
.text
.align 1
.globl sd
.type sd, @function
sd:
li a5,7
bgtu a0,a5,.L1
lui a5,%hi(.L4)
addi a5,a5,%lo(.L4)
slli a0,a0,2
add a0,a0,a5
lw a5,0(a0)
jr a5
.section .rodata
.align 2
.align 2
.L4:
.word .L1
.word .L10
.word .L9
.word .L8
.word .L7
.word .L6
.word .L5
.word .L3
.text
.L3:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-56(a1)
.L5:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-48(a1)
.L6:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-40(a1)
.L7:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-32(a1)
.L8:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-24(a1)
.L9:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-16(a1)
.L10:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-8(a1)
.L1:
ret
.size sd, .-sd
and similarly e.g. for Alpha:
[...]
$L3:
lda $1,9($31)
sll $1,28,$1
lda $1,1408($1)
stq $1,-56($17)
$L5:
lda $1,9($31)
sll $1,28,$1
lda $1,1408($1)
stq $1,-48($17)
[...]
MIPS:
[...]
.L3:
li $2,18874368 # 0x1200000
daddiu $2,$2,11
dsll $2,$2,7
sd $2,-56($5)
.L5:
li $2,18874368 # 0x1200000
daddiu $2,$2,11
dsll $2,$2,7
sd $2,-48($5)
[...]
POWER:
[...]
.L3:
lis 9,0x9000
ori 9,9,0x580
rldicl 9,9,0,32
std 9,-56(4)
.L5:
lis 9,0x9000
ori 9,9,0x580
rldicl 9,9,0,32
std 9,-48(4)
[...]
or x86 even:
[...]
.L3:
movl $2415920512, %eax
movq %rax, -56(%rsi)
.L5:
movl $2415920512, %eax
movq %rax, -48(%rsi)
[...]
I understand this is due to forward constant propagation, and it used to
be possible to circumvent it at least at -Os by disabling several of
the tree passes (with GCC 9, which this was originally discovered with),
but apparently not anymore. Not that it would be useful for regular
compilations. In that case good RISC-V code was produced (except for
some needless sign-extensions, etc. coming from -Os inefficiency):
.text
.align 1
.globl sd
.type sd, @function
sd:
addiw a0,a0,-1
sext.w a4,a0
li a5,6
bgtu a4,a5,.L1
slli a0,a0,32
lui a5,%hi(.L4)
addi a5,a5,%lo(.L4)
srli a0,a0,30
add a0,a0,a5
lw a4,0(a0)
li a5,9
slli a5,a5,28
addi a5,a5,1408
jr a4
.section .rodata
.align 2
.align 2
.L4:
.word .L10
.word .L9
.word .L8
.word .L7
.word .L6
.word .L5
.word .L3
.text
.align 2
.L3:
sd a5,-56(a1)
.align 2
.L5:
sd a5,-48(a1)
.align 2
.L6:
sd a5,-40(a1)
.align 2
.L7:
sd a5,-32(a1)
.align 2
.L8:
sd a5,-24(a1)
.align 2
.L9:
sd a5,-16(a1)
.align 2
.L10:
sd a5,-8(a1)
.align 2
.L1:
ret
.size sd, .-sd