https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106705
Bug ID: 106705
Summary: Expensive constant load replicated throughout switch
statement
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: macro at orcam dot me.uk
Target Milestone: ---
Given code like:
#define do_sd(a, b, c) do { b[c] = a; } while (0)
void sd(unsigned len, unsigned long *mem)
{
const unsigned long val = 2415920512;
switch (len) {
case 7:
do_sd(val, mem, -7);
case 6:
do_sd(val, mem, -6);
case 5:
do_sd(val, mem, -5);
case 4:
do_sd(val, mem, -4);
case 3:
do_sd(val, mem, -3);
case 2:
do_sd(val, mem, -2);
case 1:
do_sd(val, mem, -1);
}
}
(reduced from code using an out-of-tree intrinsic for a vendor-specific
machine instruction) we get 64-bit assembly at -O2 with a constant load
replicated across all the switch cases, which is terribly suboptimal.
E.g. for RISC-V this gets compiled to:
.text
.align 1
.globl sd
.type sd, @function
sd:
li a5,7
bgtu a0,a5,.L1
lui a5,%hi(.L4)
addi a5,a5,%lo(.L4)
slli a0,a0,2
add a0,a0,a5
lw a5,0(a0)
jr a5
.section .rodata
.align 2
.align 2
.L4:
.word .L1
.word .L10
.word .L9
.word .L8
.word .L7
.word .L6
.word .L5
.word .L3
.text
.L3:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-56(a1)
.L5:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-48(a1)
.L6:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-40(a1)
.L7:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-32(a1)
.L8:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-24(a1)
.L9:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-16(a1)
.L10:
li a5,9
slli a5,a5,28
addi a5,a5,1408
sd a5,-8(a1)
.L1:
ret
.size sd, .-sd
and similarly e.g. for Alpha:
[...]
$L3:
lda $1,9($31)
sll $1,28,$1
lda $1,1408($1)
stq $1,-56($17)
$L5:
lda $1,9($31)
sll $1,28,$1
lda $1,1408($1)
stq $1,-48($17)
[...]
MIPS:
[...]
.L3:
li $2,18874368 # 0x1200000
daddiu $2,$2,11
dsll $2,$2,7
sd $2,-56($5)
.L5:
li $2,18874368 # 0x1200000
daddiu $2,$2,11
dsll $2,$2,7
sd $2,-48($5)
[...]
POWER:
[...]
.L3:
lis 9,0x9000
ori 9,9,0x580
rldicl 9,9,0,32
std 9,-56(4)
.L5:
lis 9,0x9000
ori 9,9,0x580
rldicl 9,9,0,32
std 9,-48(4)
[...]
or x86 even:
[...]
.L3:
movl $2415920512, %eax
movq %rax, -56(%rsi)
.L5:
movl $2415920512, %eax
movq %rax, -48(%rsi)
[...]
I understand this is due to forward constant propagation, and it used to
be possible to circumvent it at least at -Os by disabling several of
the tree passes (with GCC 9, which this was originally discovered with),
but apparently not anymore. Not that it would be useful for regular
compilations. In that case good RISC-V code was produced (except for
some needless sign-extensions, etc. coming from -Os inefficiency):
.text
.align 1
.globl sd
.type sd, @function
sd:
addiw a0,a0,-1
sext.w a4,a0
li a5,6
bgtu a4,a5,.L1
slli a0,a0,32
lui a5,%hi(.L4)
addi a5,a5,%lo(.L4)
srli a0,a0,30
add a0,a0,a5
lw a4,0(a0)
li a5,9
slli a5,a5,28
addi a5,a5,1408
jr a4
.section .rodata
.align 2
.align 2
.L4:
.word .L10
.word .L9
.word .L8
.word .L7
.word .L6
.word .L5
.word .L3
.text
.align 2
.L3:
sd a5,-56(a1)
.align 2
.L5:
sd a5,-48(a1)
.align 2
.L6:
sd a5,-40(a1)
.align 2
.L7:
sd a5,-32(a1)
.align 2
.L8:
sd a5,-24(a1)
.align 2
.L9:
sd a5,-16(a1)
.align 2
.L10:
sd a5,-8(a1)
.align 2
.L1:
ret
.size sd, .-sd