https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80689

            Bug ID: 80689
           Summary: 128-bit loads generated for structure copying with gcc
                    7.1.0 lead to STLF stalls on AVX2 targets.
           Product: gcc
           Version: 7.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: venkataramanan.kumar at amd dot com
  Target Milestone: ---

For the test case below, GCC 7.1.0 started generating 128-bit loads and stores
while copying the structure elements.

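This pattern is observed in some benchmarks and leads to store-to-load
forwarding (STLF) stalls on a few AVX2 targets: the 128-bit loads in Set read
stack slots that test has just filled with separate 64-bit stores.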

/* Aggregate of four long-sized members (two unsigned, two signed).  This is
   the structure type whose whole-object copy in Set() is the subject of the
   report.  */
typedef struct st1
{
        long unsigned int a,b;
        long int c,d;
}R;

/* Container holding an int followed by an embedded R.  Set() writes the
   reg member; in the listings that copy lands at offset 8 from the N
   pointer.  */
typedef struct st2
{
        int  t;
        R  reg;   /* destination of the structure copy */
}N;

/* Forward declaration so that test() can call Set() before its definition.
   The parameter names here differ from those used in the definition.  */
void Set (const R *region,  N *n_info );

/* Builds a local R from the four scalar arguments, one member at a time,
   and hands it to Set() to be copied into n_obj->reg.  The report compiles
   this with -fno-inline, so Set() stays a real call.  */
void test(N  *n_obj ,const long unsigned int a, const long unsigned int b, 
const long int c,const long int d)
{
        R reg;

        /* Member-by-member initialisation; in both listings these appear as
           four separate 64-bit movq stores to the stack.  */
        reg.a=a;
        reg.b=b;
        reg.c=c;
        reg.d=d;
        /* Set() reads the local that was just written above.  */
        Set (&reg, n_obj);

}

/* Copies the whole R pointed to by reg into n_obj->reg with a single
   structure assignment.  This is the copy for which the GCC 7.1.0 listing
   shows two 128-bit load/store pairs where the GCC 6.3.0 listing shows four
   64-bit pairs.  */
void Set (const R *reg,  N *n_obj )
{
        n_obj->reg=(*reg);
}

 Compiler flags: -fno-inline -O2

 GCC 6.3.0 

Set:
.LFB1:
        .cfi_startproc
        movq    (%rdi), %rax
        movq    %rax, 8(%rsi)
        movq    8(%rdi), %rax
        movq    %rax, 16(%rsi)
        movq    16(%rdi), %rax
        movq    %rax, 24(%rsi)
        movq    24(%rdi), %rax
        movq    %rax, 32(%rsi)
        ret
        .cfi_endproc
.LFE1:
        .size   Set, .-Set
        .p2align 4,,15
        .globl  test
        .type   test, @function
test:
.LFB0:
        .cfi_startproc
        subq    $40, %rsp
       .cfi_def_cfa_offset 48
        movq    %rsi, (%rsp)
        movq    %rdi, %rsi
        movq    %rsp, %rdi
        movq    %rdx, 8(%rsp)
        movq    %rcx, 16(%rsp)
        movq    %r8, 24(%rsp)
        call    Set
        addq    $40, %rsp
        .cfi_def_cfa_offset 8
        ret


GCC 7.1.0 

Set:
.LFB1:
        .cfi_startproc
        movdqu  (%rdi), %xmm0 <== 128 bit loads
        movups  %xmm0, 8(%rsi)
        movdqu  16(%rdi), %xmm0  <== 128 bit loads
        movups  %xmm0, 24(%rsi)
        ret
        .cfi_endproc
.LFE1:
        .size   Set, .-Set
        .p2align 4,,15
        .globl  test
        .type   test, @function
test:
.LFB0:
        .cfi_startproc
        subq    $40, %rsp
        .cfi_def_cfa_offset 48
        movq    %rsi, (%rsp)
        movq    %rdi, %rsi
        movq    %rsp, %rdi
        movq    %rdx, 8(%rsp)
        movq    %rcx, 16(%rsp)
        movq    %r8, 24(%rsp)
        call    Set
        addq    $40, %rsp
        .cfi_def_cfa_offset 8
        ret

Reply via email to