https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80689
Bug ID: 80689
Summary: 128-bit loads generated for structure copying with GCC 7.1.0, leading to STLF stalls on AVX2 targets.
Product: gcc
Version: 7.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: venkataramanan.kumar at amd dot com
Target Milestone: ---

For the test case below, GCC 7.1.0 started generating 128-bit loads and stores while copying the structure elements. This pattern is observed in some benchmarks and leads to store-to-load forwarding (STLF) stalls on a few AVX2 targets.

typedef struct st1
{
        long unsigned int a,b;
        long int c,d;
}R;

typedef struct st2
{
        int t;
        R reg;
}N;

void Set (const R *region, N *n_info );

void test(N *n_obj ,const long unsigned int a, const long unsigned int b, const long int c,const long int d)
{
        R reg;
        reg.a=a;
        reg.b=b;
        reg.c=c;
        reg.d=d;
        Set (&reg, n_obj);
}

void Set (const R *reg, N *n_obj )
{
        n_obj->reg=(*reg);
}

Flags: -fno-inline -O2

GCC 6.3.0

Set:
.LFB1:
        .cfi_startproc
        movq    (%rdi), %rax
        movq    %rax, 8(%rsi)
        movq    8(%rdi), %rax
        movq    %rax, 16(%rsi)
        movq    16(%rdi), %rax
        movq    %rax, 24(%rsi)
        movq    24(%rdi), %rax
        movq    %rax, 32(%rsi)
        ret
        .cfi_endproc
.LFE1:
        .size   Set, .-Set
        .p2align 4,,15
        .globl  test
        .type   test, @function
test:
.LFB0:
        .cfi_startproc
        subq    $40, %rsp
        .cfi_def_cfa_offset 48
        movq    %rsi, (%rsp)
        movq    %rdi, %rsi
        movq    %rsp, %rdi
        movq    %rdx, 8(%rsp)
        movq    %rcx, 16(%rsp)
        movq    %r8, 24(%rsp)
        call    Set
        addq    $40, %rsp
        .cfi_def_cfa_offset 8
        ret

GCC 7.1.0

Set:
.LFB1:
        .cfi_startproc
        movdqu  (%rdi), %xmm0      <== 128-bit load
        movups  %xmm0, 8(%rsi)
        movdqu  16(%rdi), %xmm0    <== 128-bit load
        movups  %xmm0, 24(%rsi)
        ret
        .cfi_endproc
.LFE1:
        .size   Set, .-Set
        .p2align 4,,15
        .globl  test
        .type   test, @function
test:
.LFB0:
        .cfi_startproc
        subq    $40, %rsp
        .cfi_def_cfa_offset 48
        movq    %rsi, (%rsp)
        movq    %rdi, %rsi
        movq    %rsp, %rdi
        movq    %rdx, 8(%rsp)
        movq    %rcx, 16(%rsp)
        movq    %r8, 24(%rsp)
        call    Set
        addq    $40, %rsp
        .cfi_def_cfa_offset 8
        ret