[Bug rtl-optimization/52543] lower-subreg.c: code bloat of 300%-400% for multi-word memory splits

gjl at gcc dot gnu.org Thu, 02 Aug 2012 11:30:03 -0700

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=52543


Georg-Johann Lay <gjl at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|RESOLVED                    |REOPENED
         Resolution|FIXED                       |

--- Comment #10 from Georg-Johann Lay <gjl at gcc dot gnu.org> 2012-08-02 
18:29:41 UTC ---
(In reply to comment #9)
> Fixed.

NACK. I cannot confirm that anything changed:

If I revert the hack from comment #4 that represents a memory access as UNSPEC
instead of as MEM (so that lower-subreg won't split), the code bloat returns.

Just consider the examples given in comment #0:

long readx (const __memx long *p)
{
    return *p;
}

long read0 (const __flash long *p)
{
    return *p;
}


With "GCC: (GNU) 4.8.0 20120802 (experimental)"
and -S -Os -dp -mmcu=avr5 -fno-split-wide-types
the result is fine and the loads are kept as 32-bit accesses:

readx:
    movw r30,r22     ;  17    *movhi/1    [length = 1]
    mov r21,r24     ;  18    movqi_insn/1    [length = 1]
    call __xload_4     ;  19    xload_si_libgcc    [length = 2]
    ret     ;  24    return    [length = 1]

read0:
    movw r30,r24     ;  19    *movhi/1    [length = 1]
    lpm r22,Z+     ;  20    *movsi/3    [length = 4]
    lpm r23,Z+
    lpm r24,Z+
    lpm r25,Z+
    ret     ;  24    return    [length = 1]


Without -fno-split-wide-types the bloat is still there:

readx:
    push r12     ;  54    pushqi1/1    [length = 1]
    push r13     ;  55    pushqi1/1    [length = 1]
    push r14     ;  56    pushqi1/1    [length = 1]
    mov r26,r24     ;  2    *movpsi/1    [length = 2]
    movw r24,r22
    movw r30,r24     ;  35    *movhi/1    [length = 1]
    sbrc r26,7     ;  37    xload_8/1    [length = 4]
    ld r22,Z
    sbrs r26,7
    lpm r22,Z
    ldi r18,lo8(1)     ;  49    *movpsi/5    [length = 3]
    ldi r19,0
    ldi r20,0
    add r18,r24     ;  19    addpsi3/1    [length = 3]
    adc r19,r25
    adc r20,r26
    movw r30,r18     ;  38    *movhi/1    [length = 1]
    sbrc r20,7     ;  40    xload_8/1    [length = 4]
    ld r23,Z
    sbrs r20,7
    lpm r23,Z
    ldi r18,lo8(2)     ;  64    *reload_inpsi    [length = 4]
    mov r12,r18
    mov r13,__zero_reg__
    mov r14,__zero_reg__
    add r12,r24     ;  22    addpsi3/1    [length = 3]
    adc r13,r25
    adc r14,r26
    ldi r18,lo8(3)     ;  51    *movpsi/5    [length = 3]
    ldi r19,0
    ldi r20,0
    add r18,r24     ;  25    addpsi3/1    [length = 3]
    adc r19,r25
    adc r20,r26
    movw r30,r12     ;  41    *movhi/1    [length = 1]
    sbrc r14,7     ;  43    xload_8/1    [length = 4]
    ld r24,Z
    sbrs r14,7
    lpm r24,Z
    movw r30,r18     ;  44    *movhi/1    [length = 1]
    sbrc r20,7     ;  46    xload_8/1    [length = 4]
    ld r25,Z
    sbrs r20,7
    lpm r25,Z
    pop r14     ;  59    popqi    [length = 1]
    pop r13     ;  60    popqi    [length = 1]
    pop r12     ;  61    popqi    [length = 1]
    ret     ;  62    return_from_epilogue    [length = 1]

read0:
    movw r30,r24     ;  35    *movhi/1    [length = 1]
    lpm r22,Z+     ;  17    movqi_insn/4    [length = 1]
    lpm r23,Z     ;  20    movqi_insn/4    [length = 1]
    movw r18,r24     ;  38    *movhi/1    [length = 1]
    subi r18,-2     ;  22    addhi3_clobber/2    [length = 2]
    sbci r19,-1
    movw r20,r24     ;  39    *movhi/1    [length = 1]
    subi r20,-3     ;  25    addhi3_clobber/2    [length = 2]
    sbci r21,-1
    movw r30,r18     ;  40    *movhi/1    [length = 1]
    lpm r24,Z     ;  33    movqi_insn/4    [length = 1]
    movw r30,r20     ;  41    *movhi/1    [length = 1]
    lpm r25,Z     ;  34    movqi_insn/4    [length = 1]
    ret     ;  44    return    [length = 1]

Each 32-bit move is still split into four 8-bit moves even though that is
extremely expensive.  This happens both for PSImode and for HImode addresses.

[Bug rtl-optimization/52543] lower-subreg.c: code bloat of 300%-400% for multi-word memory splits

Reply via email to