https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124808

--- Comment #7 from Daniel Henrique Barboza <daniel.barboza at oss dot 
qualcomm.com> ---
I implemented Andrew's suggestions in comment 5 and the situation improved.
Using the shorter version from the description, the RISC-V asm being generated
after the changes is:

---
SetupPrecalculatedData:
.LFB0:
        .cfi_startproc
        mv      a3,a0   # 7     [c=4 l=4]  *movdi_64bit/0
        li      a5,0            # 4     [c=4 l=4]  *movdi_64bit/1
        li      a1,55           # 9     [c=4 l=4]  *movdi_64bit/1
        li      a6,64           # 31    [c=4 l=4]  *movdi_64bit/1
        j       .L5             # 73    [c=4 l=4]  jump
.L8:
        ld      a2,0(a3)                # 17    [c=28 l=4]  *movdi_64bit/2
        addiw   a5,a5,1 # 20    [c=8 l=4]  addsi3_extended/1
        addi    a3,a3,8 # 23    [c=4 l=4]  *adddi3/1
        add     a4,a2,a4        # 18    [c=4 l=4]  *adddi3/0
        sd      a4,-8(a3)       # 19    [c=4 l=4]  *movdi_64bit/3
.L5:
        addiw   a4,a5,8 # 13    [c=8 l=4]  addsi3_extended/1
        bset    a4,x0,a4        # 16    [c=4 l=4]  *bsetdi_1
        ble     a5,a1,.L8       # 10    [c=16 l=4]  *branchdi
        addiw   a5,a5,1 # 28    [c=8 l=4]  addsi3_extended/1
        beq     a5,a6,.L6       # 32    [c=16 l=4]  *branchdi
        addi    a3,a3,8 # 34    [c=4 l=4]  *adddi3/1
        j       .L5             # 76    [c=4 l=4]  jump
.L6:
        ret             # 69    [c=0 l=4]  simple_return
        .cfi_endproc
---

18 insns, 4 fewer than what we have in trunk (the LLVM version is 16 insns).


For aarch64, in trunk:

---
SetupPrecalculatedData:
.LFB0:
        .cfi_startproc
        mov     x1, 0   // 4    [c=4 l=4]  *movdi_aarch64/4
        mov     x4, 1   // 13   [c=4 l=4]  *movdi_aarch64/4
        .p2align 5,,15
.L5:
        asr     w2, w1, 3       // 8    [c=4 l=4]  *aarch64_ashr_sisd_or_int_si3/0
        cmp     w2, 7   // 9    [c=4 l=4]  cmpsi/1
        beq     .L2             // 10   [c=12 l=4]  aarch64_bcond
.L7:
        ldr     x2, [x0, x1, lsl 3]     // 15   [c=16 l=4]  *movdi_aarch64/9
        add     w3, w1, 8       // 12   [c=4 l=4]  *addsi3_aarch64/0
        lsl     x3, x4, x3      // 14   [c=4 l=4]  *aarch64_ashl_sisd_or_int_di3/1
        add     x2, x2, x3      // 16   [c=4 l=4]  *adddi3_aarch64/1
        str     x2, [x0, x1, lsl 3]     // 17   [c=0 l=4]  *movdi_aarch64/11
        add     x1, x1, 1       // 18   [c=4 l=4]  *adddi3_aarch64/0
        asr     w2, w1, 3       // 66   [c=4 l=4]  *aarch64_ashr_sisd_or_int_si3/0
        cmp     w2, 7   // 67   [c=4 l=4]  cmpsi/1
        bne     .L7             // 68   [c=12 l=4]  aarch64_bcond
.L2:
        add     x1, x1, 1       // 23   [c=4 l=4]  *adddi3_aarch64/0
        cmp     x1, 64  // 24   [c=4 l=4]  cmpdi/1
        bne     .L5             // 25   [c=12 l=4]  aarch64_bcond
        ret             // 60   [c=0 l=4]  *do_return
        .cfi_endproc
---


With the changes:

---
SetupPrecalculatedData:
.LFB0:
        .cfi_startproc
        mov     x1, 0   // 4    [c=4 l=4]  *movdi_aarch64/4
        mov     x4, 1   // 12   [c=4 l=4]  *movdi_aarch64/4
        b       .L5             // 64   [c=12 l=4]  jump
        .p2align 2,,3
.L7:
        ldr     x2, [x0, x1, lsl 3]     // 14   [c=16 l=4]  *movdi_aarch64/9
        add     w3, w1, 8       // 11   [c=4 l=4]  *addsi3_aarch64/0
        lsl     x3, x4, x3      // 13   [c=4 l=4]  *aarch64_ashl_sisd_or_int_di3/1
        add     x2, x2, x3      // 15   [c=4 l=4]  *adddi3_aarch64/1
        str     x2, [x0, x1, lsl 3]     // 16   [c=0 l=4]  *movdi_aarch64/11
        add     x1, x1, 1       // 17   [c=4 l=4]  *adddi3_aarch64/0
.L5:
        cmp     x1, 55  // 8    [c=4 l=4]  cmpdi/1
        bls     .L7             // 9    [c=12 l=4]  aarch64_bcond
        add     x1, x1, 1       // 22   [c=4 l=4]  *adddi3_aarch64/0
        cmp     x1, 64  // 23   [c=4 l=4]  cmpdi/1
        bne     .L5             // 24   [c=12 l=4]  aarch64_bcond
        ret             // 59   [c=0 l=4]  *do_return
        .cfi_endproc
---

With trunk we have 18 insns; after the changes, 15 insns.  For reference, LLVM
generates 14 insns:

SetupPrecalculatedData:
        mov     x8, xzr
        mov     w9, #256
        b       .LBB0_2
.LBB0_1:
        add     x8, x8, #1
        cmp     x8, #64
        b.eq    .LBB0_4
.LBB0_2:
        cmp     x8, #55
        b.hi    .LBB0_1
        lsl     x10, x9, x8
        ldr     x11, [x0, x8, lsl #3]
        add     x10, x11, x10
        str     x10, [x0, x8, lsl #3]
        b       .LBB0_1
.LBB0_4:
        ret


We're definitely closer to LLVM (maybe equal/better?  I didn't do a cost
analysis to see if GCC happens to generate better code even with a couple more
insns).

As for the full testcase, we're now converting SetupPrecalculatedData1 to a call
to SetupPrecalculatedData (i.e. the functions are now the same):


SetupPrecalculatedData:
.LFB0:
        .cfi_startproc
        adrp    x6, .LANCHOR0   // 7    [c=4 l=4]  *movdi_aarch64/15
        add     x6, x6, :lo12:.LANCHOR0 // 8    [c=4 l=4]  add_losym_di
(...)
SetupPrecalculatedData1:
.LFB3:
        .cfi_startproc
        b       SetupPrecalculatedData          // 5    [c=0 l=4]  *sibcall_insn/1
        .cfi_endproc


I'll clean stuff up and send it to the ML.  Not sure if this is all that we can
do w.r.t. this PR, but at least it's a step in the right direction.

Reply via email to