https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97428
Bug ID: 97428
Summary: -O3 is great for basic AoSoA packing of complex arrays, but horrible one step above the basic
Product: gcc
Version: 10.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: already5chosen at yahoo dot com
Target Milestone: ---

This is my next example of bad handling of an AoSoA layout by the gcc optimizer/vectorizer. For a discussion of AoSoA see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97343

The issue at hand is the transformation (packing) of an AoS array of complex numbers into AoSoA format.

Compiler used: gcc 10.2
Target: AVX2 (Skylake)

Part 1.

typedef struct { double re, im;       } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;

void foo(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    dcmlx_t s00 = src[i*4+0];
    dcmlx_t s01 = src[i*4+1];
    dcmlx_t s02 = src[i*4+2];
    dcmlx_t s03 = src[i*4+3];

    dst[i].re[0] = s00.re;
    dst[i].re[1] = s01.re;
    dst[i].re[2] = s02.re;
    dst[i].re[3] = s03.re;

    dst[i].im[0] = s00.im;
    dst[i].im[1] = s01.im;
    dst[i].im[2] = s02.im;
    dst[i].im[3] = s03.im;
  }
}

-march=skylake -O2 produces the following inner loop:

.L3:
        vmovsd  (%rdx), %xmm7
        vmovsd  8(%rdx), %xmm3
        vmovsd  16(%rdx), %xmm6
        vmovsd  24(%rdx), %xmm2
        vmovsd  32(%rdx), %xmm5
        vmovsd  40(%rdx), %xmm1
        vmovsd  48(%rdx), %xmm4
        vmovsd  56(%rdx), %xmm0
        addq    $64, %rdx
        vmovsd  %xmm7, (%rcx)
        vmovsd  %xmm6, 8(%rcx)
        vmovsd  %xmm5, 16(%rcx)
        vmovsd  %xmm4, 24(%rcx)
        vmovsd  %xmm3, 32(%rcx)
        vmovsd  %xmm2, 40(%rcx)
        vmovsd  %xmm1, 48(%rcx)
        vmovsd  %xmm0, 56(%rcx)
        addq    $64, %rcx
        cmpq    %rax, %rdx
        jne     .L3

Quite reasonable for a non-vectorizing optimization level. It would be possible to save one instruction by using indexed addressing, but in the majority of situations that wouldn't be faster.

-march=skylake -O3 inner loop:

.L3:
        vmovupd   (%rdx,%rax), %ymm0
        vmovupd   32(%rdx,%rax), %ymm2
        vunpcklpd %ymm2, %ymm0, %ymm1
        vunpckhpd %ymm2, %ymm0, %ymm0
        vpermpd   $216, %ymm1, %ymm1
        vpermpd   $216, %ymm0, %ymm0
        vmovupd   %ymm1, (%rcx,%rax)
        vmovupd   %ymm0, 32(%rcx,%rax)
        addq      $64, %rax
        cmpq      %r8, %rax
        jne       .L3

That's excellent. It not only looks better; according to my measurements, with the source array in external memory and the destination in L1/L2 cache it is actually ~1.5x faster than -O2, which is no small feat.
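For illustration only (this intrinsics version is not part of the original test case; the name foo_intrin is made up): the -O3 loop above is essentially the following AVX2 sequence, using the dcmlx_t/dcmlx4_t typedefs from the test case.

#include <immintrin.h>   /* requires AVX2, e.g. -march=skylake */

void foo_intrin(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    const double *s = (const double *)&src[i*4];
    __m256d a  = _mm256_loadu_pd(s + 0);           /* s00.re s00.im s01.re s01.im */
    __m256d b  = _mm256_loadu_pd(s + 4);           /* s02.re s02.im s03.re s03.im */
    __m256d re = _mm256_unpacklo_pd(a, b);         /* s00.re s02.re s01.re s03.re */
    __m256d im = _mm256_unpackhi_pd(a, b);         /* s00.im s02.im s01.im s03.im */
    re = _mm256_permute4x64_pd(re, 0xD8);          /* 0xD8 = 216: reorder to s00 s01 s02 s03 */
    im = _mm256_permute4x64_pd(im, 0xD8);
    _mm256_storeu_pd(dst[i].re, re);
    _mm256_storeu_pd(dst[i].im, im);
  }
}

Two loads, two unpacks, two cross-lane permutes and two stores per four complex numbers, which is exactly what the vectorizer emitted.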
Part 2. A slightly more involved case: now we want to interleave 2 rows of the source matrix. An interleaved layout is sometimes desirable because it improves locality of access for the rest of the processing and can also reduce pressure on the GPRs used as pointers or indices.

typedef struct { double re, im;       } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;

void foo_i2(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    dcmlx_t s00 = src[i*4+0];
    dcmlx_t s01 = src[i*4+1];
    dcmlx_t s02 = src[i*4+2];
    dcmlx_t s03 = src[i*4+3];

    dcmlx_t s10 = src[i*4+0+n];
    dcmlx_t s11 = src[i*4+1+n];
    dcmlx_t s12 = src[i*4+2+n];
    dcmlx_t s13 = src[i*4+3+n];

    dst[i*2+0].re[0] = s00.re;
    dst[i*2+0].re[1] = s01.re;
    dst[i*2+0].re[2] = s02.re;
    dst[i*2+0].re[3] = s03.re;

    dst[i*2+0].im[0] = s00.im;
    dst[i*2+0].im[1] = s01.im;
    dst[i*2+0].im[2] = s02.im;
    dst[i*2+0].im[3] = s03.im;

    dst[i*2+1].re[0] = s10.re;
    dst[i*2+1].re[1] = s11.re;
    dst[i*2+1].re[2] = s12.re;
    dst[i*2+1].re[3] = s13.re;

    dst[i*2+1].im[0] = s10.im;
    dst[i*2+1].im[1] = s11.im;
    dst[i*2+1].im[2] = s12.im;
    dst[i*2+1].im[3] = s13.im;
  }
}

-march=skylake -O2 produces the following inner loop:

.L3:
        vmovsd  (%rdx), %xmm15
        vmovsd  8(%rdx), %xmm11
        vmovsd  16(%rdx), %xmm14
        vmovsd  24(%rdx), %xmm10
        vmovsd  32(%rdx), %xmm13
        vmovsd  40(%rdx), %xmm9
        vmovsd  48(%rdx), %xmm12
        vmovsd  56(%rdx), %xmm8
        vmovsd  (%rax), %xmm7
        vmovsd  8(%rax), %xmm3
        vmovsd  16(%rax), %xmm6
        vmovsd  24(%rax), %xmm2
        vmovsd  32(%rax), %xmm5
        vmovsd  40(%rax), %xmm1
        vmovsd  48(%rax), %xmm4
        vmovsd  56(%rax), %xmm0
        subq    $-128, %rcx
        vmovsd  %xmm15, -128(%rcx)
        vmovsd  %xmm14, -120(%rcx)
        vmovsd  %xmm13, -112(%rcx)
        vmovsd  %xmm12, -104(%rcx)
        vmovsd  %xmm11, -96(%rcx)
        vmovsd  %xmm10, -88(%rcx)
        vmovsd  %xmm9, -80(%rcx)
        vmovsd  %xmm8, -72(%rcx)
        vmovsd  %xmm7, -64(%rcx)
        vmovsd  %xmm6, -56(%rcx)
        vmovsd  %xmm5, -48(%rcx)
        vmovsd  %xmm4, -40(%rcx)
        vmovsd  %xmm3, -32(%rcx)
        vmovsd  %xmm2, -24(%rcx)
        vmovsd  %xmm1, -16(%rcx)
        vmovsd  %xmm0, -8(%rcx)
        addq    $64, %rdx
        addq    $64, %rax
        cmpq    %rcx, %r8
        jne     .L3

Once again, in the absence of the vectorizer it is very reasonable. But maybe the vectorizer can do better, as it did in Part 1?
-march=skylake -O3 inner loop: .L4: vmovupd (%rcx), %ymm5 vmovupd 64(%rcx), %ymm4 vunpcklpd 32(%rcx), %ymm5, %ymm3 vunpckhpd 32(%rcx), %ymm5, %ymm1 vmovupd 128(%rcx), %ymm5 vmovupd 192(%rcx), %ymm7 vunpcklpd 160(%rcx), %ymm5, %ymm0 vunpckhpd 160(%rcx), %ymm5, %ymm2 vmovupd 192(%rcx), %ymm5 vunpcklpd 96(%rcx), %ymm4, %ymm6 vunpcklpd 224(%rcx), %ymm5, %ymm5 vunpckhpd 96(%rcx), %ymm4, %ymm4 vunpckhpd 224(%rcx), %ymm7, %ymm7 vpermpd $216, %ymm5, %ymm5 vpermpd $216, %ymm0, %ymm0 vpermpd $216, %ymm3, %ymm3 vpermpd $216, %ymm4, %ymm4 vpermpd $216, %ymm7, %ymm7 vunpcklpd %ymm5, %ymm0, %ymm8 vpermpd $216, %ymm1, %ymm1 vunpckhpd %ymm5, %ymm0, %ymm0 vpermpd $216, %ymm6, %ymm6 vpermpd $216, %ymm2, %ymm2 vunpcklpd %ymm6, %ymm3, %ymm15 vunpcklpd %ymm4, %ymm1, %ymm5 vunpckhpd %ymm6, %ymm3, %ymm6 vunpckhpd %ymm4, %ymm1, %ymm1 vpermpd $216, %ymm0, %ymm3 vunpcklpd %ymm7, %ymm2, %ymm0 vunpckhpd %ymm7, %ymm2, %ymm2 vpermpd $216, %ymm0, %ymm0 vpermpd $216, %ymm2, %ymm2 vpermpd $216, %ymm6, %ymm6 vpermpd $216, %ymm5, %ymm5 vpermpd $216, %ymm1, %ymm1 vunpcklpd %ymm0, %ymm5, %ymm13 vunpcklpd %ymm2, %ymm1, %ymm12 vunpckhpd %ymm0, %ymm5, %ymm5 vunpckhpd %ymm2, %ymm1, %ymm1 vunpcklpd %ymm3, %ymm6, %ymm0 vmovupd (%rdx), %ymm2 vunpckhpd %ymm3, %ymm6, %ymm6 vmovupd 64(%rdx), %ymm3 vunpcklpd 32(%rdx), %ymm2, %ymm2 vpermpd $216, %ymm1, %ymm4 vunpcklpd 96(%rdx), %ymm3, %ymm1 vmovupd 128(%rdx), %ymm3 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm2, %ymm2 vunpcklpd %ymm1, %ymm2, %ymm2 vunpcklpd 160(%rdx), %ymm3, %ymm1 vmovupd 192(%rdx), %ymm3 vpermpd $216, %ymm1, %ymm1 vunpcklpd 224(%rdx), %ymm3, %ymm3 vmovupd 64(%rdx), %ymm7 vpermpd $216, %ymm3, %ymm3 vunpcklpd %ymm3, %ymm1, %ymm1 vmovupd (%rdx), %ymm3 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm2, %ymm2 vmovupd %ymm4, (%rsp) vunpcklpd %ymm1, %ymm2, %ymm2 vunpckhpd 32(%rdx), %ymm3, %ymm4 vunpckhpd 96(%rdx), %ymm7, %ymm1 vmovupd 128(%rdx), %ymm3 vmovupd 192(%rdx), %ymm7 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm4, %ymm4 vunpcklpd %ymm1, %ymm4, %ymm4 vunpckhpd 160(%rdx), %ymm3, %ymm1 vunpckhpd 224(%rdx), %ymm7, %ymm3 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm3, %ymm3 vunpcklpd %ymm3, %ymm1, %ymm1 vmovupd (%r11), %ymm3 vpermpd $216, %ymm1, %ymm1 vunpcklpd 32(%r11), %ymm3, %ymm9 vmovupd 64(%r11), %ymm7 vpermpd $216, %ymm4, %ymm4 vunpcklpd %ymm1, %ymm4, %ymm4 vunpcklpd 96(%r11), %ymm7, %ymm1 vmovupd 128(%r11), %ymm3 vmovupd 192(%r11), %ymm7 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm9, %ymm9 vunpcklpd %ymm1, %ymm9, %ymm9 vunpcklpd 160(%r11), %ymm3, %ymm1 vunpcklpd 224(%r11), %ymm7, %ymm3 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm3, %ymm3 vmovupd 64(%r11), %ymm7 vunpcklpd %ymm3, %ymm1, %ymm1 vmovupd (%r11), %ymm3 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm9, %ymm9 vunpckhpd 32(%r11), %ymm3, %ymm3 vunpcklpd %ymm1, %ymm9, %ymm9 vunpckhpd 96(%r11), %ymm7, %ymm1 vmovupd 128(%r11), %ymm7 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm3, %ymm3 vunpcklpd %ymm1, %ymm3, %ymm3 vunpckhpd 160(%r11), %ymm7, %ymm1 vmovupd 192(%r11), %ymm7 vpermpd $216, %ymm1, %ymm1 vunpckhpd 224(%r11), %ymm7, %ymm7 vpermpd $216, %ymm8, %ymm8 vpermpd $216, %ymm7, %ymm7 vunpcklpd %ymm7, %ymm1, %ymm1 vmovupd (%r10), %ymm7 vpermpd $216, %ymm15, %ymm15 vunpcklpd %ymm8, %ymm15, %ymm10 vunpckhpd %ymm8, %ymm15, %ymm15 vunpcklpd 32(%r10), %ymm7, %ymm8 vmovupd 64(%r10), %ymm7 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm3, %ymm3 vunpcklpd %ymm1, %ymm3, %ymm3 vunpcklpd 96(%r10), %ymm7, %ymm1 vmovupd 128(%r10), %ymm7 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm8, %ymm8 vunpcklpd %ymm1, %ymm8, %ymm8 vunpcklpd 
160(%r10), %ymm7, %ymm1 vmovupd 192(%r10), %ymm7 vpermpd $216, %ymm1, %ymm1 vunpcklpd 224(%r10), %ymm7, %ymm7 vpermpd $216, %ymm8, %ymm8 vpermpd $216, %ymm7, %ymm7 vunpcklpd %ymm7, %ymm1, %ymm1 vmovupd (%r10), %ymm7 vpermpd $216, %ymm1, %ymm1 vunpcklpd %ymm1, %ymm8, %ymm8 vunpckhpd 32(%r10), %ymm7, %ymm1 vmovupd 64(%r10), %ymm7 vpermpd $216, %ymm1, %ymm1 vunpckhpd 96(%r10), %ymm7, %ymm7 vmovupd 192(%r10), %ymm11 vpermpd $216, %ymm7, %ymm7 vunpcklpd %ymm7, %ymm1, %ymm1 vmovupd 128(%r10), %ymm7 vunpckhpd 224(%r10), %ymm11, %ymm11 vunpckhpd 160(%r10), %ymm7, %ymm7 vpermpd $216, %ymm11, %ymm11 vpermpd $216, %ymm7, %ymm7 vunpcklpd %ymm11, %ymm7, %ymm7 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm7, %ymm7 vunpcklpd %ymm7, %ymm1, %ymm7 vmovupd (%r9), %ymm1 vpermpd $216, %ymm7, %ymm7 vmovupd %ymm7, 32(%rsp) vunpcklpd 32(%r9), %ymm1, %ymm7 vmovupd 64(%r9), %ymm1 vpermpd $216, %ymm7, %ymm7 vunpcklpd 96(%r9), %ymm1, %ymm1 vmovupd 192(%r9), %ymm14 vpermpd $216, %ymm1, %ymm1 vunpcklpd %ymm1, %ymm7, %ymm7 vmovupd 128(%r9), %ymm1 vunpcklpd 224(%r9), %ymm14, %ymm11 vunpcklpd 160(%r9), %ymm1, %ymm1 vpermpd $216, %ymm11, %ymm11 vpermpd $216, %ymm1, %ymm1 vunpcklpd %ymm11, %ymm1, %ymm1 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm7, %ymm7 vmovupd 64(%r9), %ymm11 vunpcklpd %ymm1, %ymm7, %ymm7 vmovupd (%r9), %ymm1 vunpckhpd 96(%r9), %ymm11, %ymm11 vunpckhpd 32(%r9), %ymm1, %ymm1 vmovupd 128(%r9), %ymm14 vpermpd $216, %ymm11, %ymm11 vpermpd $216, %ymm1, %ymm1 vunpcklpd %ymm11, %ymm1, %ymm1 vunpckhpd 160(%r9), %ymm14, %ymm11 vmovupd 192(%r9), %ymm14 vpermpd $216, %ymm11, %ymm11 vunpckhpd 224(%r9), %ymm14, %ymm14 vpermpd $216, %ymm10, %ymm10 vpermpd $216, %ymm14, %ymm14 vunpcklpd %ymm14, %ymm11, %ymm11 vpermpd $216, %ymm2, %ymm2 vpermpd $216, %ymm11, %ymm11 vpermpd $216, %ymm1, %ymm1 vpermpd $68, %ymm10, %ymm14 vpermpd $216, %ymm0, %ymm0 vpermpd $216, %ymm9, %ymm9 vunpcklpd %ymm11, %ymm1, %ymm1 vpermpd $238, %ymm10, %ymm10 vpermpd $68, %ymm2, %ymm11 vpermpd $238, %ymm2, %ymm2 vshufpd $12, %ymm11, %ymm14, %ymm11 vshufpd $12, %ymm2, %ymm10, %ymm2 vpermpd $216, %ymm15, %ymm15 vpermpd $68, %ymm0, %ymm10 vpermpd $216, %ymm8, %ymm8 vpermpd $68, %ymm9, %ymm14 vpermpd $238, %ymm0, %ymm0 vpermpd $238, %ymm9, %ymm9 vshufpd $12, %ymm9, %ymm0, %ymm0 vpermpd $216, %ymm6, %ymm6 vmovupd %ymm0, 64(%rsp) vpermpd $68, %ymm15, %ymm9 vpermpd $216, %ymm7, %ymm7 vpermpd $68, %ymm8, %ymm0 vshufpd $12, %ymm14, %ymm10, %ymm14 vshufpd $12, %ymm0, %ymm9, %ymm0 vpermpd $216, %ymm13, %ymm13 vpermpd $68, %ymm7, %ymm9 vpermpd $216, %ymm4, %ymm4 vpermpd $238, %ymm8, %ymm8 vpermpd $68, %ymm6, %ymm10 vpermpd $238, %ymm7, %ymm7 vpermpd $238, %ymm6, %ymm6 vpermpd $238, %ymm15, %ymm15 vshufpd $12, %ymm8, %ymm15, %ymm15 vshufpd $12, %ymm9, %ymm10, %ymm10 vshufpd $12, %ymm7, %ymm6, %ymm8 vmovupd %ymm10, 128(%rsp) vpermpd $68, %ymm4, %ymm6 vpermpd $68, %ymm13, %ymm10 vshufpd $12, %ymm6, %ymm10, %ymm10 vpermpd $216, %ymm12, %ymm12 vmovupd 32(%rsp), %ymm6 vpermpd $216, %ymm3, %ymm3 vpermpd $238, %ymm13, %ymm13 vpermpd $238, %ymm4, %ymm4 vmovupd %ymm15, 96(%rsp) vmovupd %ymm8, 160(%rsp) vshufpd $12, %ymm4, %ymm13, %ymm15 vpermpd $216, %ymm5, %ymm5 vpermpd $68, %ymm3, %ymm4 vpermpd $68, %ymm12, %ymm8 vpermpd $238, %ymm3, %ymm3 vpermpd $238, %ymm12, %ymm12 vshufpd $12, %ymm4, %ymm8, %ymm8 vshufpd $12, %ymm3, %ymm12, %ymm12 vpermpd $68, %ymm5, %ymm4 vpermpd $68, %ymm6, %ymm3 vpermpd $238, %ymm5, %ymm5 vpermpd $238, %ymm6, %ymm6 vshufpd $12, %ymm6, %ymm5, %ymm6 vmovupd (%rsp), %ymm5 vpermpd $216, %ymm1, %ymm1 vshufpd $12, %ymm3, %ymm4, %ymm3 vpermpd $68, 
%ymm5, %ymm7 vpermpd $68, %ymm1, %ymm4 vpermpd $238, %ymm5, %ymm5 vpermpd $238, %ymm1, %ymm1 vshufpd $12, %ymm1, %ymm5, %ymm5 vshufpd $12, %ymm4, %ymm7, %ymm7 vpermpd $68, %ymm10, %ymm1 vpermpd $68, %ymm11, %ymm4 vpermpd $68, %ymm2, %ymm9 vshufpd $12, %ymm1, %ymm4, %ymm4 vpermpd $238, %ymm10, %ymm10 vpermpd $68, %ymm15, %ymm1 vpermpd $238, %ymm2, %ymm2 vpermpd $238, %ymm15, %ymm15 vpermpd $238, %ymm11, %ymm11 vshufpd $12, %ymm10, %ymm11, %ymm11 vshufpd $12, %ymm1, %ymm9, %ymm1 vshufpd $12, %ymm15, %ymm2, %ymm10 vpermpd $68, %ymm14, %ymm9 vpermpd $68, %ymm8, %ymm2 vpermpd $238, %ymm14, %ymm14 vpermpd $238, %ymm8, %ymm8 vshufpd $12, %ymm8, %ymm14, %ymm8 vmovupd 64(%rsp), %ymm14 vshufpd $12, %ymm2, %ymm9, %ymm2 vpermpd $68, %ymm14, %ymm9 vmovupd %ymm2, 32(%rsp) vpermpd $68, %ymm12, %ymm2 vshufpd $12, %ymm2, %ymm9, %ymm13 vpermpd $238, %ymm12, %ymm12 vpermpd $238, %ymm14, %ymm9 vmovupd 96(%rsp), %ymm15 vshufpd $12, %ymm12, %ymm9, %ymm14 vpermpd $68, %ymm3, %ymm2 vpermpd $68, %ymm0, %ymm9 vpermpd $238, %ymm3, %ymm3 vpermpd $238, %ymm0, %ymm0 vshufpd $12, %ymm3, %ymm0, %ymm12 vmovupd %ymm13, 64(%rsp) vpermpd $68, %ymm6, %ymm0 vpermpd $238, %ymm6, %ymm13 vmovupd 128(%rsp), %ymm6 vpermpd $68, %ymm15, %ymm3 vpermpd $238, %ymm15, %ymm15 vshufpd $12, %ymm2, %ymm9, %ymm2 vshufpd $12, %ymm13, %ymm15, %ymm13 vpermpd $238, %ymm6, %ymm9 vpermpd $68, %ymm6, %ymm15 vmovupd 160(%rsp), %ymm6 vshufpd $12, %ymm0, %ymm3, %ymm3 vpermpd $68, %ymm7, %ymm0 vpermpd $238, %ymm7, %ymm7 vshufpd $12, %ymm7, %ymm9, %ymm9 vshufpd $12, %ymm0, %ymm15, %ymm15 vpermpd $68, %ymm6, %ymm7 vpermpd $68, %ymm5, %ymm0 vshufpd $12, %ymm0, %ymm7, %ymm7 vmovupd %ymm14, 192(%rsp) vpermpd $68, %ymm2, %ymm0 vpermpd $238, %ymm5, %ymm14 vpermpd $238, %ymm2, %ymm2 vpermpd $68, %ymm4, %ymm5 vpermpd $238, %ymm4, %ymm4 vshufpd $12, %ymm0, %ymm5, %ymm5 vshufpd $12, %ymm2, %ymm4, %ymm4 vpermpd $68, %ymm12, %ymm0 vpermpd $68, %ymm3, %ymm2 vmovupd %ymm5, (%rsp) vmovupd %ymm4, 96(%rsp) vpermpd $68, %ymm11, %ymm5 vpermpd $68, %ymm1, %ymm4 vpermpd $238, %ymm3, %ymm3 vpermpd $238, %ymm1, %ymm1 vshufpd $12, %ymm0, %ymm5, %ymm5 vshufpd $12, %ymm2, %ymm4, %ymm4 vshufpd $12, %ymm3, %ymm1, %ymm1 vpermpd $68, %ymm13, %ymm2 vpermpd $238, %ymm12, %ymm0 vpermpd $68, %ymm10, %ymm3 vmovupd 32(%rsp), %ymm12 vshufpd $12, %ymm2, %ymm3, %ymm3 vpermpd $238, %ymm6, %ymm6 vpermpd $238, %ymm10, %ymm2 vpermpd $238, %ymm13, %ymm13 vshufpd $12, %ymm14, %ymm6, %ymm14 vshufpd $12, %ymm13, %ymm2, %ymm2 vpermpd $68, %ymm15, %ymm6 vpermpd $68, %ymm12, %ymm13 vpermpd $238, %ymm15, %ymm15 vpermpd $238, %ymm12, %ymm12 vshufpd $12, %ymm15, %ymm12, %ymm12 vpermpd $238, %ymm11, %ymm11 vmovupd 64(%rsp), %ymm15 vshufpd $12, %ymm0, %ymm11, %ymm0 vpermpd $238, %ymm9, %ymm10 vpermpd $68, %ymm8, %ymm11 vpermpd $238, %ymm8, %ymm8 vshufpd $12, %ymm6, %ymm13, %ymm13 vshufpd $12, %ymm10, %ymm8, %ymm10 vpermpd $68, %ymm9, %ymm6 vpermpd $238, %ymm15, %ymm8 vpermpd $68, %ymm15, %ymm9 vmovupd 192(%rsp), %ymm15 vshufpd $12, %ymm6, %ymm11, %ymm11 vpermpd $68, %ymm7, %ymm6 vpermpd $238, %ymm7, %ymm7 vshufpd $12, %ymm6, %ymm9, %ymm9 vshufpd $12, %ymm7, %ymm8, %ymm8 vpermpd $68, %ymm14, %ymm6 vpermpd $68, %ymm15, %ymm7 vshufpd $12, %ymm6, %ymm7, %ymm7 vpermpd $238, %ymm14, %ymm14 vpermpd $238, %ymm15, %ymm6 vshufpd $12, %ymm14, %ymm6, %ymm6 vpermpd $68, (%rsp), %ymm14 vpermpd $68, %ymm13, %ymm15 vshufpd $12, %ymm15, %ymm14, %ymm14 vmovupd 96(%rsp), %ymm15 vmovupd %ymm14, (%rax) vpermpd $238, (%rsp), %ymm14 vpermpd $238, %ymm13, %ymm13 vshufpd $12, %ymm13, %ymm14, %ymm13 vpermpd $68, %ymm12, %ymm14 
vmovupd %ymm13, 32(%rax) vpermpd $68, %ymm15, %ymm13 vshufpd $12, %ymm14, %ymm13, %ymm13 vpermpd $238, %ymm12, %ymm12 vmovupd %ymm13, 64(%rax) vpermpd $238, %ymm15, %ymm13 vshufpd $12, %ymm12, %ymm13, %ymm12 vpermpd $68, %ymm11, %ymm13 vmovupd %ymm12, 96(%rax) vpermpd $238, %ymm11, %ymm11 vpermpd $68, %ymm5, %ymm12 vpermpd $238, %ymm5, %ymm5 vshufpd $12, %ymm11, %ymm5, %ymm5 vpermpd $68, %ymm10, %ymm11 vmovupd %ymm5, 160(%rax) vpermpd $238, %ymm10, %ymm10 vpermpd $68, %ymm0, %ymm5 vpermpd $238, %ymm0, %ymm0 vshufpd $12, %ymm11, %ymm5, %ymm5 vshufpd $12, %ymm10, %ymm0, %ymm0 vmovupd %ymm5, 192(%rax) vmovupd %ymm0, 224(%rax) vpermpd $68, %ymm9, %ymm5 vpermpd $68, %ymm4, %ymm0 vpermpd $238, %ymm9, %ymm9 vpermpd $238, %ymm4, %ymm4 vshufpd $12, %ymm5, %ymm0, %ymm0 vshufpd $12, %ymm9, %ymm4, %ymm4 vmovupd %ymm0, 256(%rax) vmovupd %ymm4, 288(%rax) vpermpd $68, %ymm1, %ymm0 vpermpd $68, %ymm8, %ymm4 vpermpd $238, %ymm1, %ymm1 vpermpd $238, %ymm8, %ymm8 vshufpd $12, %ymm4, %ymm0, %ymm0 vshufpd $12, %ymm8, %ymm1, %ymm1 vmovupd %ymm0, 320(%rax) vmovupd %ymm1, 352(%rax) vpermpd $68, %ymm3, %ymm0 vpermpd $68, %ymm7, %ymm1 vshufpd $12, %ymm1, %ymm0, %ymm0 vpermpd $238, %ymm3, %ymm3 vmovupd %ymm0, 384(%rax) vpermpd $68, %ymm6, %ymm1 vpermpd $68, %ymm2, %ymm0 vpermpd $238, %ymm7, %ymm7 vpermpd $238, %ymm2, %ymm2 vpermpd $238, %ymm6, %ymm6 addq $256, %rcx vshufpd $12, %ymm13, %ymm12, %ymm12 vshufpd $12, %ymm7, %ymm3, %ymm3 vmovupd %ymm12, 128(%rax) vmovupd %ymm3, 416(%rax) vshufpd $12, %ymm1, %ymm0, %ymm0 vshufpd $12, %ymm6, %ymm2, %ymm2 vmovupd %ymm0, 448(%rax) vmovupd %ymm2, 480(%rax) addq $256, %rdx addq $256, %r11 addq $256, %r10 addq $256, %r9 addq $512, %rax cmpq %rcx, %rbp jne .L4

I am not kidding. gcc 10.2 -O3 really generates code that is approximately 3 times slower than the scalar output of -O2, and maybe 4-4.5 times slower than good SIMD code similar to what was generated in Part 1. My guess is that, once again, as in nearly all my complaints of recent months, this is a case of an earlier optimization phase producing a mess that totally confuses a later stage. I just can't guess which stage is at fault this time. You have so many.
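For comparison, "good SIMD code similar to what was generated in Part 1" could look roughly like the sketch below (illustration only, untested; the name foo_i2_intrin is made up): the same unpack + cross-lane-permute pattern, applied once per source row.

#include <immintrin.h>   /* requires AVX2, e.g. -march=skylake */

void foo_i2_intrin(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  const double *row0 = (const double *)&src[0];   /* first source row  */
  const double *row1 = (const double *)&src[n];   /* second source row */
  for (int i = 0; i < n; ++i) {
    __m256d a0 = _mm256_loadu_pd(row0 + i*8 + 0); /* s00 s01 as re,im pairs */
    __m256d b0 = _mm256_loadu_pd(row0 + i*8 + 4); /* s02 s03 */
    __m256d a1 = _mm256_loadu_pd(row1 + i*8 + 0); /* s10 s11 */
    __m256d b1 = _mm256_loadu_pd(row1 + i*8 + 4); /* s12 s13 */
    __m256d re0 = _mm256_permute4x64_pd(_mm256_unpacklo_pd(a0, b0), 0xD8);
    __m256d im0 = _mm256_permute4x64_pd(_mm256_unpackhi_pd(a0, b0), 0xD8);
    __m256d re1 = _mm256_permute4x64_pd(_mm256_unpacklo_pd(a1, b1), 0xD8);
    __m256d im1 = _mm256_permute4x64_pd(_mm256_unpackhi_pd(a1, b1), 0xD8);
    _mm256_storeu_pd(dst[i*2+0].re, re0);
    _mm256_storeu_pd(dst[i*2+0].im, im0);
    _mm256_storeu_pd(dst[i*2+1].re, re1);
    _mm256_storeu_pd(dst[i*2+1].im, im1);
  }
}

That is on the order of 20 instructions per iteration, versus the several hundred in the -O3 loop shown above.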