https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92244

--- Comment #1 from Peter Cordes <peter at cordes dot ca> ---
On AArch64 (with gcc8.2), we see a similar effect: more instructions in the
strrev_explicit loop, and an indexed addressing mode.

https://godbolt.org/z/6ZVWY_


# strrev_explicit   -O3 -mcpu=cortex-a53
       ...
.L4:
        ldr     q1, [x4, x2]        # tail
        ldr     q0, [x3]            # head
        tbl     v1.16b, {v1.16b}, v2.16b    # byte shuffle
        tbl     v0.16b, {v0.16b}, v2.16b
        str     q1, [x3], 16        # post-increment store to head
        cmp     x3, x1
        str     q0, [x4, x2]
        sub     x2, x2, #16           # doesn't update flags, not SUBS
        bne     .L4                 # }while( head != end_head )



# strrev_implicit   -O3 -mcpu=cortex-a53
        ...
.L19:
        ldr     q1, [x3]
        ldr     q0, [x2]
        tbl     v1.16b, {v1.16b}, v2.16b
        tbl     v0.16b, {v0.16b}, v2.16b
        str     q1, [x2], 16           # post-increment addressing mode 
        cmp     x2, x4
        str     q0, [x3], -16          # post-decrement addressing mode 
        bne     .L19                   # }while( head != end_head )

Reply via email to