https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115146
--- Comment #1 from Sergei Trofimovich <slyfox at gcc dot gnu.org> ---
Specifically, if I change the original example to contain 16 bytes instead of 8:

--- bug.c.orig 2024-05-18 11:07:47.426351557 +0100
+++ bug.c 2024-05-18 11:08:02.135601287 +0100
@@ -15,2 +15,2 @@
- u8 src[8] __attribute__((aligned(16))) = { 0 };
- u8 dst[8] __attribute__((aligned(16))) = { 0 };
+ u8 src[16] __attribute__((aligned(16))) = { 0 };
+ u8 dst[16] __attribute__((aligned(16))) = { 0 };
@@ -23 +23 @@
- for (unsigned long i = 0; i < 8; i += 2) {
+ for (unsigned long i = 0; i < 16; i += 2) {

I get the expected code:

Dump of assembler code for function main:
   0x0000000000401030 <+0>: sub $0x28,%rsp
   0x0000000000401034 <+4>: pxor %xmm0,%xmm0
   0x0000000000401038 <+8>: mov %rsp,%rdi
   0x000000000040103b <+11>: movaps %xmm0,(%rsp)
   0x000000000040103f <+15>: movaps %xmm0,0x10(%rsp)
   0x0000000000401044 <+20>: call 0x401170 <fill_src>
   0x0000000000401049 <+25>: movdqa (%rsp),%xmm0
   0x000000000040104e <+30>: lea 0x10(%rsp),%rdi
   0x0000000000401053 <+35>: movdqa %xmm0,%xmm1
   0x0000000000401057 <+39>: psllw $0x8,%xmm0
   0x000000000040105c <+44>: psrlw $0x8,%xmm1
   0x0000000000401061 <+49>: por %xmm0,%xmm1
   0x0000000000401065 <+53>: movaps %xmm1,0x10(%rsp)
   0x000000000040106a <+58>: call 0x401180 <assert_dst>
   0x000000000040106f <+63>: xor %eax,%eax
   0x0000000000401071 <+65>: add $0x28,%rsp
   0x0000000000401075 <+69>: ret