https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124808
--- Comment #7 from Daniel Henrique Barboza <daniel.barboza at oss dot
qualcomm.com> ---
I implemented Andrew's suggestions in comment 5 and the situation improved.
Using the shorter version from the description, the RISC-V asm being generated
after changes is:
---
SetupPrecalculatedData:
.LFB0:
.cfi_startproc
mv a3,a0 # 7 [c=4 l=4] *movdi_64bit/0
li a5,0 # 4 [c=4 l=4] *movdi_64bit/1
li a1,55 # 9 [c=4 l=4] *movdi_64bit/1
li a6,64 # 31 [c=4 l=4] *movdi_64bit/1
j .L5 # 73 [c=4 l=4] jump
.L8:
ld a2,0(a3) # 17 [c=28 l=4] *movdi_64bit/2
addiw a5,a5,1 # 20 [c=8 l=4] addsi3_extended/1
addi a3,a3,8 # 23 [c=4 l=4] *adddi3/1
add a4,a2,a4 # 18 [c=4 l=4] *adddi3/0
sd a4,-8(a3) # 19 [c=4 l=4] *movdi_64bit/3
.L5:
addiw a4,a5,8 # 13 [c=8 l=4] addsi3_extended/1
bset a4,x0,a4 # 16 [c=4 l=4] *bsetdi_1
ble a5,a1,.L8 # 10 [c=16 l=4] *branchdi
addiw a5,a5,1 # 28 [c=8 l=4] addsi3_extended/1
beq a5,a6,.L6 # 32 [c=16 l=4] *branchdi
addi a3,a3,8 # 34 [c=4 l=4] *adddi3/1
j .L5 # 76 [c=4 l=4] jump
.L6:
ret # 69 [c=0 l=4] simple_return
.cfi_endproc
---
18 insns, 4 fewer than what we have in trunk (the LLVM version is 16 insns).
For aarch64, in trunk:
---
SetupPrecalculatedData:
.LFB0:
.cfi_startproc
mov x1, 0 // 4 [c=4 l=4] *movdi_aarch64/4
mov x4, 1 // 13 [c=4 l=4] *movdi_aarch64/4
.p2align 5,,15
.L5:
asr w2, w1, 3 // 8 [c=4 l=4] *aarch64_ashr_sisd_or_int_si3/0
cmp w2, 7 // 9 [c=4 l=4] cmpsi/1
beq .L2 // 10 [c=12 l=4] aarch64_bcond
.L7:
ldr x2, [x0, x1, lsl 3] // 15 [c=16 l=4] *movdi_aarch64/9
add w3, w1, 8 // 12 [c=4 l=4] *addsi3_aarch64/0
lsl x3, x4, x3 // 14 [c=4 l=4] *aarch64_ashl_sisd_or_int_di3/1
add x2, x2, x3 // 16 [c=4 l=4] *adddi3_aarch64/1
str x2, [x0, x1, lsl 3] // 17 [c=0 l=4] *movdi_aarch64/11
add x1, x1, 1 // 18 [c=4 l=4] *adddi3_aarch64/0
asr w2, w1, 3 // 66 [c=4 l=4] *aarch64_ashr_sisd_or_int_si3/0
cmp w2, 7 // 67 [c=4 l=4] cmpsi/1
bne .L7 // 68 [c=12 l=4] aarch64_bcond
.L2:
add x1, x1, 1 // 23 [c=4 l=4] *adddi3_aarch64/0
cmp x1, 64 // 24 [c=4 l=4] cmpdi/1
bne .L5 // 25 [c=12 l=4] aarch64_bcond
ret // 60 [c=0 l=4] *do_return
.cfi_endproc
---
With the changes:
---
SetupPrecalculatedData:
.LFB0:
.cfi_startproc
mov x1, 0 // 4 [c=4 l=4] *movdi_aarch64/4
mov x4, 1 // 12 [c=4 l=4] *movdi_aarch64/4
b .L5 // 64 [c=12 l=4] jump
.p2align 2,,3
.L7:
ldr x2, [x0, x1, lsl 3] // 14 [c=16 l=4] *movdi_aarch64/9
add w3, w1, 8 // 11 [c=4 l=4] *addsi3_aarch64/0
lsl x3, x4, x3 // 13 [c=4 l=4] *aarch64_ashl_sisd_or_int_di3/1
add x2, x2, x3 // 15 [c=4 l=4] *adddi3_aarch64/1
str x2, [x0, x1, lsl 3] // 16 [c=0 l=4] *movdi_aarch64/11
add x1, x1, 1 // 17 [c=4 l=4] *adddi3_aarch64/0
.L5:
cmp x1, 55 // 8 [c=4 l=4] cmpdi/1
bls .L7 // 9 [c=12 l=4] aarch64_bcond
add x1, x1, 1 // 22 [c=4 l=4] *adddi3_aarch64/0
cmp x1, 64 // 23 [c=4 l=4] cmpdi/1
bne .L5 // 24 [c=12 l=4] aarch64_bcond
ret // 59 [c=0 l=4] *do_return
.cfi_endproc
---
Trunk generates 18 insns; with the changes we get 15 insns. For reference, LLVM
generates 14 insns:
SetupPrecalculatedData:
mov x8, xzr
mov w9, #256
b .LBB0_2
.LBB0_1:
add x8, x8, #1
cmp x8, #64
b.eq .LBB0_4
.LBB0_2:
cmp x8, #55
b.hi .LBB0_1
lsl x10, x9, x8
ldr x11, [x0, x8, lsl #3]
add x10, x11, x10
str x10, [x0, x8, lsl #3]
b .LBB0_1
.LBB0_4:
ret
We're definitely closer to LLVM (maybe equal or better? I didn't do a cost
analysis to see whether GCC happens to generate better code even with a couple
more insns).
As for the full testcase, we're now converting SetupPrecalculatedData1 into a
tail call to SetupPrecalculatedData (i.e. the functions are now the same):
SetupPrecalculatedData:
.LFB0:
.cfi_startproc
adrp x6, .LANCHOR0 // 7 [c=4 l=4] *movdi_aarch64/15
add x6, x6, :lo12:.LANCHOR0 // 8 [c=4 l=4] add_losym_di
(...)
SetupPrecalculatedData1:
.LFB3:
.cfi_startproc
b SetupPrecalculatedData // 5 [c=0 l=4] *sibcall_insn/1
.cfi_endproc
I'll clean stuff up and send it to the ML. Not sure if this is all that we can
do w.r.t. this PR, but at least it's a step in the right direction.