https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114323

            Bug ID: 114323
           Summary: [14 Regression] MVE vector load intrinsic miscompiled
                    since r14-5622-g4d7647edfd7d98
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: acoplan at gcc dot gnu.org
  Target Milestone: ---

The following testcase:

#include <arm_mve.h>

/* Reduced reproducer: load a four-lane uint32 vector from an unnamed
   compound-literal array using the MVE load intrinsic and return it.
   Correct code must have the values 1, 2, 3, 4 in memory before the
   vector load reads them.  */
uint32x4_t foo (void) {
  /* The compound literal is the load's only source; no named object
     holds the data.  */
  uint32x4_t V0 = vld1q_u32(((const uint32_t[4]){1, 2, 3, 4}));
  return V0;
}

is miscompiled with -O2 -march=armv8.1-m.main+mve -mfloat-abi=hard on
arm-none-eabi.  Since r14-5622-g4d7647edfd7d985fbefe13de03c8bc2e3a74fc61 we
generate:

foo:
        sub     sp, sp, #16
        vldrw.32        q0, [sp]
        add     sp, sp, #16
        bx      lr

i.e. the stores that initialize the compound literal's stack slot are gone, so
we do a vector load from uninitialized stack memory.  GCC 13 used to give:

foo:
        sub     sp, sp, #16
        mov     ip, sp
        ldr     r3, .L4
        ldm     r3, {r0, r1, r2, r3}
        stm     ip, {r0, r1, r2, r3}
        vldrw.32        q0, [ip]
        add     sp, sp, #16
        bx      lr
        .align  2
.L4:
        .word   .LANCHOR0
        .size   foo, .-foo
        .section        .rodata
        .align  2
        .set    .LANCHOR0,. + 0
        .word   1
        .word   2
        .word   3
        .word   4

which, while not optimal, is at least correct.  Here is a full executable
testcase for the testsuite:

#include <arm_mve.h>

/* Function under test: load a four-lane uint32 vector from an unnamed
   compound-literal array holding 1, 2, 3, 4 and return it.
   NOTE(review): noipa is presumably here so that main really calls this
   function and observes its generated code, rather than having the call
   optimised away interprocedurally -- confirm against the GCC attribute
   documentation.  */
__attribute__((noipa))
uint32x4_t foo (void) {
  /* The compound literal is the load's only source; no named object
     holds the data.  */
  uint32x4_t V0 = vld1q_u32(((const uint32_t[4]){1, 2, 3, 4}));
  return V0;
}

/* Runtime check for the testcase: store the vector returned by foo into
   a scalar buffer and abort unless every lane holds the expected value.  */
int main(void)
{
  uint32_t buf[4];
  /* Spill the returned vector to memory so each lane can be inspected.  */
  vst1q_u32 (buf, foo());

  /* Lane i must hold i+1, matching the {1, 2, 3, 4} initializer in foo.  */
  for (int i = 0; i < 4; i++)
    if (buf[i] != i+1)
      __builtin_abort ();
}

Reply via email to