https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107910
Bug ID: 107910
Summary: Missed optimization of struct members with mixed sizes
Product: gcc
Version: 12.2.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: pionere at freemail dot hu
Target Milestone: ---

Store-merging generates suboptimal code to copy members of structs with contiguous memory:

#include <stdint.h>

#pragma pack(push, 1)
typedef struct StructA {
    int32_t v00;
    int32_t v01;
    int8_t v02_0;
    int8_t v02_1;
    int8_t v02_2;
    int8_t v02_3;
    int32_t v03;
    int32_t v04;
    int32_t v05;
    int32_t v06;
    int32_t v07;
    int32_t v08;
    int32_t v09;
    int32_t v10;
    int32_t v11;
} StructA;

typedef struct StructB {
    int32_t v00;
    int32_t v01;
    int32_t v02;
    int32_t v03;
    int32_t v04;
    int32_t v05;
    int32_t v06;
    int32_t v07;
    int32_t v08;
    int32_t v09;
    int32_t v10;
    int32_t v11;
} StructB;
#pragma pack(pop)

void copyA(StructA* __restrict dest, const StructA* __restrict src)
{
    dest->v00 = src->v00;
    dest->v01 = src->v01;
    dest->v02_0 = src->v02_0;
    dest->v02_1 = src->v02_1;
    dest->v02_2 = src->v02_2;
    dest->v02_3 = src->v02_3;
    dest->v03 = src->v03;
}

void copyB(StructB* __restrict dest, const StructB* __restrict src)
{
    dest->v00 = src->v00;
    dest->v01 = src->v01;
    dest->v02 = src->v02;
    dest->v03 = src->v03;
}

void copyAA(StructA* __restrict dest, const StructA* __restrict src)
{
    dest->v00 = src->v00;
    dest->v01 = src->v01;
    dest->v02_0 = src->v02_0;
    dest->v02_1 = src->v02_1;
    dest->v02_2 = src->v02_2;
    dest->v02_3 = src->v02_3;
    dest->v03 = src->v03;
    dest->v04 = src->v04;
    dest->v05 = src->v05;
    dest->v06 = src->v06;
    dest->v07 = src->v07;
}

void copyBB(StructB* __restrict dest, const StructB* __restrict src)
{
    dest->v00 = src->v00;
    dest->v01 = src->v01;
    dest->v02 = src->v02;
    dest->v03 = src->v03;
    dest->v04 = src->v04;
    dest->v05 = src->v05;
    dest->v06 = src->v06;
    dest->v07 = src->v07;
}

void copyAAA(StructA* __restrict dest, const StructA* __restrict src)
{
    dest->v00 = src->v00;
    dest->v01 = src->v01;
    dest->v02_0 = src->v02_0;
    dest->v02_1 = src->v02_1;
    dest->v02_2 = src->v02_2;
    dest->v02_3 = src->v02_3;
    dest->v03 = src->v03;
    dest->v04 = src->v04;
    dest->v05 = src->v05;
    dest->v06 = src->v06;
    dest->v07 = src->v07;
    dest->v08 = src->v08;
    dest->v09 = src->v09;
    dest->v10 = src->v10;
    dest->v11 = src->v11;
}

void copyBBB(StructB* __restrict dest, const StructB* __restrict src)
{
    dest->v00 = src->v00;
    dest->v01 = src->v01;
    dest->v02 = src->v02;
    dest->v03 = src->v03;
    dest->v04 = src->v04;
    dest->v05 = src->v05;
    dest->v06 = src->v06;
    dest->v07 = src->v07;
    dest->v08 = src->v08;
    dest->v09 = src->v09;
    dest->v10 = src->v10;
    dest->v11 = src->v11;
}

Each copyA* function should generate the same code as its corresponding copyB* function. Currently gcc 12 (or trunk) generates the following:

copyA(StructA*, StructA const*):
        mov     rax, QWORD PTR [rsi]
        mov     QWORD PTR [rdi], rax
        mov     rax, QWORD PTR [rsi+8]
        mov     QWORD PTR [rdi+8], rax
        ret
copyB(StructB*, StructB const*):
        movdqu  xmm0, XMMWORD PTR [rsi]
        movups  XMMWORD PTR [rdi], xmm0
        ret
copyAA(StructA*, StructA const*):
        mov     rax, QWORD PTR [rsi]
        movdqu  xmm0, XMMWORD PTR [rsi+12]
        mov     QWORD PTR [rdi], rax
        mov     eax, DWORD PTR [rsi+8]
        movups  XMMWORD PTR [rdi+12], xmm0
        mov     DWORD PTR [rdi+8], eax
        mov     eax, DWORD PTR [rsi+28]
        mov     DWORD PTR [rdi+28], eax
        ret
copyBB(StructB*, StructB const*):
        movdqu  xmm0, XMMWORD PTR [rsi+16]
        movdqu  xmm1, XMMWORD PTR [rsi]
        movups  XMMWORD PTR [rdi+16], xmm0
        movups  XMMWORD PTR [rdi], xmm1
        ret
copyAAA(StructA*, StructA const*):
        mov     rax, QWORD PTR [rsi]
        movdqu  xmm0, XMMWORD PTR [rsi+12]
        movdqu  xmm1, XMMWORD PTR [rsi+28]
        mov     QWORD PTR [rdi], rax
        mov     eax, DWORD PTR [rsi+8]
        movups  XMMWORD PTR [rdi+12], xmm0
        mov     DWORD PTR [rdi+8], eax
        mov     eax, DWORD PTR [rsi+44]
        movups  XMMWORD PTR [rdi+28], xmm1
        mov     DWORD PTR [rdi+44], eax
        ret
copyBBB(StructB*, StructB const*):
        movdqu  xmm1, XMMWORD PTR [rsi+16]
        movdqu  xmm0, XMMWORD PTR [rsi+32]
        movdqu  xmm2, XMMWORD PTR [rsi]
        movups  XMMWORD PTR [rdi+16], xmm1
        movups  XMMWORD PTR [rdi], xmm2
        movups  XMMWORD PTR [rdi+32], xmm0
        ret