https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124808
--- Comment #7 from Daniel Henrique Barboza ---
I implemented Andrew's suggestions in comment 5 and the situation improved.
Using the shorter version from the description, the RISC-V asm being generated
after changes is:
---
SetupPrecalculatedData:
.LFB0:
.cfi_startproc
mv a3,a0 # 7 [c=4 l=4] *movdi_64bit/0
li a5,0# 4 [c=4 l=4] *movdi_64bit/1
li a1,55 # 9 [c=4 l=4] *movdi_64bit/1
li a6,64 # 31[c=4 l=4] *movdi_64bit/1
j .L5 # 73[c=4 l=4] jump
.L8:
ld a2,0(a3)# 17[c=28 l=4] *movdi_64bit/2
addiw a5,a5,1 # 20[c=8 l=4] addsi3_extended/1
addi a3,a3,8 # 23[c=4 l=4] *adddi3/1
add a4,a2,a4# 18[c=4 l=4] *adddi3/0
sd a4,-8(a3) # 19[c=4 l=4] *movdi_64bit/3
.L5:
addiw a4,a5,8 # 13[c=8 l=4] addsi3_extended/1
bset a4,x0,a4# 16[c=4 l=4] *bsetdi_1
ble a5,a1,.L8 # 10[c=16 l=4] *branchdi
addiw a5,a5,1 # 28[c=8 l=4] addsi3_extended/1
beq a5,a6,.L6 # 32[c=16 l=4] *branchdi
addi a3,a3,8 # 34[c=4 l=4] *adddi3/1
j .L5 # 76[c=4 l=4] jump
.L6:
ret # 69[c=0 l=4] simple_return
.cfi_endproc
---
18 insns, 4 fewer than what we have in trunk (the LLVM version is 16 insns).
For aarch64, in trunk:
---
SetupPrecalculatedData:
.LFB0:
.cfi_startproc
mov x1, 0 // 4[c=4 l=4] *movdi_aarch64/4
mov x4, 1 // 13 [c=4 l=4] *movdi_aarch64/4
.p2align 5,,15
.L5:
asr w2, w1, 3 // 8[c=4 l=4] *aarch64_ashr_sisd_or_int_si3/0
cmp w2, 7 // 9[c=4 l=4] cmpsi/1
beq .L2 // 10 [c=12 l=4] aarch64_bcond
.L7:
ldr x2, [x0, x1, lsl 3] // 15 [c=16 l=4] *movdi_aarch64/9
add w3, w1, 8 // 12 [c=4 l=4] *addsi3_aarch64/0
lsl x3, x4, x3 // 14 [c=4 l=4] *aarch64_ashl_sisd_or_int_di3/1
add x2, x2, x3 // 16 [c=4 l=4] *adddi3_aarch64/1
str x2, [x0, x1, lsl 3] // 17 [c=0 l=4] *movdi_aarch64/11
add x1, x1, 1 // 18 [c=4 l=4] *adddi3_aarch64/0
asr w2, w1, 3 // 66 [c=4 l=4] *aarch64_ashr_sisd_or_int_si3/0
cmp w2, 7 // 67 [c=4 l=4] cmpsi/1
bne .L7 // 68 [c=12 l=4] aarch64_bcond
.L2:
add x1, x1, 1 // 23 [c=4 l=4] *adddi3_aarch64/0
cmp x1, 64 // 24 [c=4 l=4] cmpdi/1
bne .L5 // 25 [c=12 l=4] aarch64_bcond
ret // 60 [c=0 l=4] *do_return
.cfi_endproc
---
With the changes:
---
SetupPrecalculatedData:
.LFB0:
.cfi_startproc
mov x1, 0 // 4[c=4 l=4] *movdi_aarch64/4
mov x4, 1 // 12 [c=4 l=4] *movdi_aarch64/4
b .L5 // 64 [c=12 l=4] jump
.p2align 2,,3
.L7:
ldr x2, [x0, x1, lsl 3] // 14 [c=16 l=4] *movdi_aarch64/9
add w3, w1, 8 // 11 [c=4 l=4] *addsi3_aarch64/0
lsl x3, x4, x3 // 13 [c=4 l=4] *aarch64_ashl_sisd_or_int_di3/1
add x2, x2, x3 // 15 [c=4 l=4] *adddi3_aarch64/1
str x2, [x0, x1, lsl 3] // 16 [c=0 l=4] *movdi_aarch64/11
add x1, x1, 1 // 17 [c=4 l=4] *adddi3_aarch64/0
.L5:
cmp x1, 55 // 8[c=4 l=4] cmpdi/1
bls .L7 // 9[c=12 l=4] aarch64_bcond
add x1, x1, 1 // 22 [c=4 l=4] *adddi3_aarch64/0
cmp x1, 64 // 23 [c=4 l=4] cmpdi/1
bne .L5 // 24 [c=12 l=4] aarch64_bcond
ret // 59 [c=0 l=4] *do_return
.cfi_endproc
---
With trunk we have 18 insns; after the changes, 15 insns. For reference, LLVM
generates 14 insns:
SetupPrecalculatedData:
mov x8, xzr
mov w9, #256
b .LBB0_2
.LBB0_1:
add x8, x8, #1
cmp x8, #64
b.eq .LBB0_4
.LBB0_2:
cmp x8, #55
b.hi .LBB0_1
lsl x10, x9, x8
ldr x11, [x0, x8, lsl #3]
add x10, x11, x10
str x10, [x0, x8, lsl #3]
b .LBB0_1
.LBB0_4:
ret
We're definitely closer to LLVM (maybe equal/better? I didn't do a cost
analysis to see if GCC happens to generate better code even with a couple more
insns).
As for the full testcase, we're now converting SetupPrecalculatedData1 to a call
to SetupPrecalculatedData (i.e. the functions are now the same):
SetupPrecalculatedData:
.LFB0:
.cfi_startproc
adrp x6, .LANCHOR0 // 7[c=4 l=4] *movdi_aarch64/15
add x6, x6, :lo12:.LANCHOR0 /