https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112891

            Bug ID: 112891
           Summary: [10/11/12/13/14 Regression] Missing vzeroupper insert.
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: liuhongt at gcc dot gnu.org
  Target Milestone: ---

#include<math.h>
/* Callee reached via tail call from foo.  noinline keeps the call in
   foo's generated code; the stores compile to 128-bit XMM moves only
   (see .LC0 / vmovapd xmm0 in the asm below), so bar itself never
   touches the upper YMM halves.  */
void
__attribute__((noinline))
bar (double* a)
{
    a[0] = 1.0;
    a[1] = 2.0;
}

/* The four scalar adds are vectorized into 256-bit YMM operations
   (vmovupd/vaddpd ymm0 in the asm below), leaving the upper YMM bits
   dirty.  foo then tail-calls bar, so control leaves foo via jmp with
   no vzeroupper emitted.  __restrict on 'a' lets the compiler vectorize
   without an overlap check.  */
void
__attribute__((noinline))
foo (double* __restrict a, double* b)
{
    a[0] += b[0];
    a[1] += b[1];
    a[2] += b[2];
    a[3] += b[3];
    bar (b);
}

/* Calls foo (which dirties the upper YMM bits) and then tail-calls the
   SSE libm routine exp — this jmp is the site where a vzeroupper should
   be inserted but is not, triggering the AVX->SSE transition penalty
   described below.  */
double
foo1 (double* __restrict a, double* b)
{
    foo (a, b);
    return exp (b[1]);
}


Compiling with gcc -O3 -mavx2 produces:

bar(double*):
        vmovapd xmm0, XMMWORD PTR .LC0[rip]
        vmovupd XMMWORD PTR [rdi], xmm0
        ret
foo(double*, double*):
        mov     rax, rdi
        vmovupd ymm0, YMMWORD PTR [rsi]
        mov     rdi, rsi
        vaddpd  ymm0, ymm0, YMMWORD PTR [rax]
        vmovupd YMMWORD PTR [rax], ymm0
        jmp     bar(double*)
foo1(double*, double*):
        sub     rsp, 8
        call    foo(double*, double*)
        vmovsd  xmm0, QWORD PTR [rsi+8]
        add     rsp, 8
        jmp     exp
.LC0:
        .long   0
        .long   1072693248
        .long   0
        .long   1073741824

In foo, 256-bit ymm registers are used, and the upper bits are left dirty, but no
vzeroupper is inserted before the tail call to exp, which causes a big AVX->SSE
transition penalty inside the SSE libm code.

Reply via email to