GCC 7->8 regression: ARM(64) ld3 st4 less optimized

linux at carewolf dot com Fri, 25 Jan 2019 01:39:20 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89058


            Bug ID: 89058
           Summary: GCC 7->8 regression: ARM(64) ld3 st4 less optimized
           Product: gcc
           Version: 8.2.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: linux at carewolf dot com
  Target Milestone: ---

When using the vld3_u8 and vst4_u8 intrinsics, the code generated with gcc8 is
less efficient than the code generated with gcc7: the gcc7 inner loop has 3
moves, while the gcc8 inner loop has 9 moves.

The code in question is:

#include <stdint.h>
#include <arm_neon.h>

// Converts `len` pixels from packed 3-bytes-per-pixel data at `src` into
// 32-bit words at `dst`. Every output word is
//     0xff000000 | (byte0 << 16) | (byte1 << 8) | byte2
// where byte0..byte2 are the three source bytes of that pixel.
//
// dst: output buffer, advanced by one `unsigned` per pixel.
// src: input buffer, advanced by 3 bytes per pixel.
// len: number of pixels. Only len == 0 is rejected up front; a negative len
//      is not checked (NOTE(review): callers presumably pass len >= 0, since
//      the tail loop terminates only when dst reaches dst + len).
//
// The work is split into three phases: an optional single scalar pixel to
// align dst, a NEON loop handling 8 pixels per iteration, and a scalar tail.
void qt_convert_rgb888_to_rgb32_neon(unsigned *dst, const unsigned char *src,
int len)
{
    // Nothing to convert.
    if (!len)
        return;

    // One-past-the-last output word; computed before dst is advanced.
    const unsigned *const end = dst + len;

    // align dst on 64 bits
    // The expression below extracts bit 2 of dst's address, so it is 1 when
    // dst sits on an odd 4-byte word and 0 otherwise. Converting that many
    // pixels (0 or 1) with scalar code moves dst to an 8-byte boundary,
    // assuming dst was at least 4-byte aligned to begin with.
    const int offsetToAlignOn8Bytes = (reinterpret_cast<uintptr_t>(dst) >> 2) &
0x1;
    for (int i = 0; i < offsetToAlignOn8Bytes; ++i) {
        *dst++ = 0xff000000 | (src[0] << 16) | (src[1] << 8) | src[2];
        src += 3;
    }

    // Enter the vector loop only if at least 8 pixels remain after alignment.
    if ((len - offsetToAlignOn8Bytes) >= 8) {
        // dst < simdEnd holds exactly while 8 or more output words remain.
        const unsigned *const simdEnd = end - 7;
        // non-inline asm version (uses more moves)
        uint8x8x4_t dstVector;
        // The fourth stored byte of every pixel is the constant 0xff; it is
        // set once here, outside the loop.
        dstVector.val[3] = vdup_n_u8(0xff);
        do {
            // Load 8 pixels (24 bytes) into three 8-lane vectors, one per
            // source byte position.
            uint8x8x3_t srcVector = vld3_u8(src);
            src += 3 * 8;
            // Reverse the order of the three byte planes: source byte 2 is
            // stored first and source byte 0 third. Presumably this matches
            // the scalar formula on a little-endian target.
            dstVector.val[0] = srcVector.val[2];
            dstVector.val[1] = srcVector.val[1];
            dstVector.val[2] = srcVector.val[0];
            // Store the four planes interleaved: 32 bytes, i.e. 8 output
            // words.
            vst4_u8((uint8_t*)dst, dstVector);
            dst += 8;
        } while (dst < simdEnd);
    }

    // Scalar tail: convert the remaining pixels (fewer than 8) one at a time.
    while (dst != end) {
        *dst++ = 0xff000000 | (src[0] << 16) | (src[1] << 8) | src[2];
         src += 3;
     }
}


With gcc 7.3 the inner loop is:
.L5:
        ld3     {v4.8b - v6.8b}, [x1]
        add     x1, x1, 24
        orr     v3.16b, v7.16b, v7.16b
        mov     v0.8b, v6.8b
        mov     v1.8b, v5.8b
        mov     v2.8b, v4.8b
        st4     {v0.8b - v3.8b}, [x0]
        add     x0, x0, 32
        cmp     x3, x0
        bhi     .L5

With gcc 8.2 the inner loop is:
.L5:
        ld3     {v4.8b - v6.8b}, [x1]
        adrp    x3, .LC1
        add     x1, x1, 24
        ldr     q3, [x3, #:lo12:.LC1]
        mov     v16.8b, v6.8b
        mov     v7.8b, v5.8b
        mov     v4.8b, v4.8b
        ins     v16.d[1], v17.d[0]
        ins     v7.d[1], v17.d[0]
        ins     v4.d[1], v17.d[0]
        mov     v0.16b, v16.16b
        mov     v1.16b, v7.16b
        mov     v2.16b, v4.16b
        st4     {v0.8b - v3.8b}, [x0]
        add     x0, x0, 32
        cmp     x2, x0
        bhi     .L5

[Bug target/89058] New: GCC 7->8 regression: ARM(64) ld3 st4 less optimized

Reply via email to