https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80634
            Bug ID: 80634
           Summary: strangely missed vectorization optimizations
           Product: gcc
           Version: 6.3.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: other
          Assignee: unassigned at gcc dot gnu.org
          Reporter: steven at uplinklabs dot net
  Target Milestone: ---

Created attachment 41322
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=41322&action=edit
gcc 6.3.1 outputs for ELEMS=1 through ELEMS=32

(Not sure which component is the correct one for this issue.)

I've noticed that the vectorizer occasionally makes some strange choices,
turning straightforward code into a large, branchy code sequence. Take this,
for example:

void saxpy(float a, float * restrict x, float * restrict y)
{
    for (int i = 0; i < ELEMS; ++i)
        y[i] = a*x[i] + y[i];
}

If I use the flags "-O3 -march=haswell" (or "-O3 -xAVX2" on ICC) and vary the
definition of ELEMS, I sometimes get odd results with GCC 6.3.1.

Here's -DELEMS=6 with GCC 6.3.1:

saxpy:
        vshufps $0, %xmm0, %xmm0, %xmm1
        vmovups (%rsi), %xmm2
        vfmadd132ps (%rdi), %xmm2, %xmm1
        vmovss  20(%rsi), %xmm3
        vmovups %xmm1, (%rsi)
        vmovss  16(%rdi), %xmm1
        vfmadd213ss 16(%rsi), %xmm0, %xmm1
        vfmadd132ss 20(%rdi), %xmm3, %xmm0
        vmovss  %xmm1, 16(%rsi)
        vmovss  %xmm0, 20(%rsi)
        ret

Seems reasonable. Here's -DELEMS=7 with GCC 6.3.1:

saxpy:
        movq    %rsi, %rax
        shrq    $2, %rax
        negq    %rax
        andl    $3, %eax
        je      .L7
        vmovss  (%rdi), %xmm1
        vfmadd213ss (%rsi), %xmm0, %xmm1
        vmovss  %xmm1, (%rsi)
        cmpl    $1, %eax
        je      .L8
        vmovss  4(%rdi), %xmm1
        vfmadd213ss 4(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 4(%rsi)
        cmpl    $3, %eax
        jne     .L9
        vmovss  8(%rdi), %xmm1
        vfmadd213ss 8(%rsi), %xmm0, %xmm1
        movl    $4, %r8d
        movl    $3, %edx
        vmovss  %xmm1, 8(%rsi)
.L3:
        movl    $7, %ecx
        movl    %eax, %r9d
        subl    %eax, %ecx
.L2:
        leaq    (%rsi,%r9,4), %rax
        vshufps $0, %xmm0, %xmm0, %xmm1
        vmovaps (%rax), %xmm3
        vfmadd132ps (%rdi,%r9,4), %xmm3, %xmm1
        vmovaps %xmm1, (%rax)
        leal    4(%rdx), %eax
        cmpl    $4, %ecx
        je      .L19
        cltq
        addl    $5, %edx
        leaq    (%rsi,%rax,4), %rcx
        vmovss  (%rdi,%rax,4), %xmm1
        vfmadd213ss (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        cmpl    $5, %r8d
        je      .L17
        movslq  %edx, %rdx
        leaq    (%rsi,%rdx,4), %rax
        vmovss  (%rdi,%rdx,4), %xmm1
        vfmadd213ss (%rax), %xmm0, %xmm1
        vmovss  %xmm1, (%rax)
        cmpl    $6, %r8d
        je      .L17
        vmovss  24(%rsi), %xmm2
        vfmadd132ss 24(%rdi), %xmm2, %xmm0
        vmovss  %xmm0, 24(%rsi)
        ret
.L17:
        ret
.L7:
        movl    $7, %ecx
        xorl    %r9d, %r9d
        movl    $7, %r8d
        xorl    %edx, %edx
        jmp     .L2
.L19:
        ret
.L8:
        movl    $6, %r8d
        movl    $1, %edx
        jmp     .L3
.L9:
        movl    $5, %r8d
        movl    $2, %edx
        jmp     .L3

This might be explained away by ELEMS being an odd number just short of a
power of two, but ICC does an apparently better job (one packed FMA plus
three scalar FMAs):

saxpy:
        vbroadcastss %xmm0, %xmm2
        vmovups (%rdi), %xmm1
        vmovss  16(%rdi), %xmm3
        vmovss  20(%rdi), %xmm4
        vmovss  24(%rdi), %xmm5
        vfmadd213ps (%rsi), %xmm1, %xmm2
        vfmadd213ss 16(%rsi), %xmm0, %xmm3
        vfmadd213ss 20(%rsi), %xmm0, %xmm4
        vfmadd213ss 24(%rsi), %xmm5, %xmm0
        vmovups %xmm2, (%rsi)
        vmovss  %xmm3, 16(%rsi)
        vmovss  %xmm4, 20(%rsi)
        vmovss  %xmm0, 24(%rsi)
        ret

The results from GCC 6.3.1 for ELEMS values 8 through 14 look fine (short,
branchless code sequences similar to what ICC emits), but the output degrades
again for what seems to be *any* value of ELEMS=15 or above.
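For reference, ICC's branch-free ELEMS=7 strategy above corresponds roughly
to the following intrinsics sketch (my own hypothetical illustration, not
either compiler's actual source; assumes AVX2+FMA, e.g. -O2 -mavx2 -mfma):

#include <immintrin.h>

void saxpy7_sketch(float a, float * restrict x, float * restrict y)
{
    __m128 va = _mm_set1_ps(a);                   /* broadcast a          */
    __m128 v  = _mm_fmadd_ps(va, _mm_loadu_ps(x), /* a*x[0..3] + y[0..3]  */
                                 _mm_loadu_ps(y));
    _mm_storeu_ps(y, v);                          /* packed store         */
    for (int i = 4; i < 7; ++i)                   /* three-element tail   */
        y[i] = a * x[i] + y[i];
}

No runtime alignment check is needed here: the unaligned loads/stores are
cheap on Haswell, which is presumably why ICC doesn't bother peeling.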
It even misses the opportunity with ELEMS=16 to just do two packed FMAs with
YMM registers:

saxpy:
        movq    %rsi, %rax
        shrq    $2, %rax
        negq    %rax
        andl    $7, %eax
        je      .L7
        vmovss  (%rdi), %xmm1
        vfmadd213ss (%rsi), %xmm0, %xmm1
        vmovss  %xmm1, (%rsi)
        cmpl    $1, %eax
        je      .L8
        vmovss  4(%rdi), %xmm1
        vfmadd213ss 4(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 4(%rsi)
        cmpl    $2, %eax
        je      .L9
        vmovss  8(%rdi), %xmm1
        vfmadd213ss 8(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 8(%rsi)
        cmpl    $3, %eax
        je      .L10
        vmovss  12(%rdi), %xmm1
        vfmadd213ss 12(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 12(%rsi)
        cmpl    $4, %eax
        je      .L11
        vmovss  16(%rdi), %xmm1
        vfmadd213ss 16(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 16(%rsi)
        cmpl    $5, %eax
        je      .L12
        vmovss  20(%rdi), %xmm1
        vfmadd213ss 20(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 20(%rsi)
        cmpl    $7, %eax
        jne     .L13
        vmovss  24(%rdi), %xmm1
        vfmadd213ss 24(%rsi), %xmm0, %xmm1
        movl    $9, %r9d
        movl    $7, %r10d
        vmovss  %xmm1, 24(%rsi)
.L3:
        movl    $16, %ecx
        movl    %eax, %edx
        movl    $8, %r8d
        movl    $1, %r11d
        subl    %eax, %ecx
.L2:
        salq    $2, %rdx
        vbroadcastss %xmm0, %ymm1
        leaq    (%rdi,%rdx), %rax
        addq    %rsi, %rdx
        vmovups (%rax), %ymm2
        vfmadd213ps (%rdx), %ymm1, %ymm2
        vmovaps %ymm2, (%rdx)
        cmpl    $2, %r11d
        jne     .L4
        vmovaps 32(%rdx), %ymm4
        vfmadd132ps 32(%rax), %ymm4, %ymm1
        vmovaps %ymm1, 32(%rdx)
.L4:
        movl    %r9d, %edx
        leal    (%r8,%r10), %eax
        subl    %r8d, %edx
        cmpl    %r8d, %ecx
        je      .L29
        movslq  %eax, %r8
        leaq    (%rsi,%r8,4), %rcx
        vmovss  (%rdi,%r8,4), %xmm1
        vfmadd213ss (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        leal    1(%rax), %ecx
        cmpl    $1, %edx
        je      .L29
        movslq  %ecx, %rcx
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        leal    2(%rax), %ecx
        vfmadd213ss (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $2, %edx
        je      .L29
        movslq  %ecx, %rcx
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        leal    3(%rax), %ecx
        vfmadd213ss (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $3, %edx
        je      .L29
        movslq  %ecx, %rcx
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        leal    4(%rax), %ecx
        vfmadd213ss (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $4, %edx
        je      .L29
        movslq  %ecx, %rcx
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        leal    5(%rax), %ecx
        vfmadd213ss (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $5, %edx
        je      .L29
        movslq  %ecx, %rcx
        addl    $6, %eax
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        vfmadd213ss (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $6, %edx
        je      .L29
        cltq
        leaq    (%rsi,%rax,4), %rdx
        vmovss  (%rdx), %xmm3
        vfmadd132ss (%rdi,%rax,4), %xmm3, %xmm0
        vmovss  %xmm0, (%rdx)
.L29:
        vzeroupper
        ret
.L7:
        movl    $16, %r8d
        movl    $16, %ecx
        xorl    %edx, %edx
        xorl    %r10d, %r10d
        movl    $2, %r11d
        movl    $16, %r9d
        jmp     .L2
.L13:
        movl    $10, %r9d
        movl    $6, %r10d
        jmp     .L3
.L8:
        movl    $15, %r9d
        movl    $1, %r10d
        jmp     .L3
.L9:
        movl    $14, %r9d
        movl    $2, %r10d
        jmp     .L3
.L10:
        movl    $13, %r9d
        movl    $3, %r10d
        jmp     .L3
.L11:
        movl    $12, %r9d
        movl    $4, %r10d
        jmp     .L3
.L12:
        movl    $11, %r9d
        movl    $5, %r10d
        jmp     .L3

ICC gets ELEMS=16 right:

saxpy:
        vmovups (%rdi), %ymm1
        vmovups 32(%rdi), %ymm2
        vbroadcastss %xmm0, %ymm3
        vfmadd213ps (%rsi), %ymm3, %ymm1
        vfmadd213ps 32(%rsi), %ymm2, %ymm3
        vmovups %ymm1, (%rsi)
        vmovups %ymm3, 32(%rsi)
        vzeroupper
        ret

I'll attach the code outputs for ELEMS values 1 through 32 using GCC 6.3.1
and ICC 17.0.1.
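For completeness, the two-packed-FMA code ICC emits for ELEMS=16 corresponds
roughly to this intrinsics sketch (again a hypothetical illustration, not
ICC's source; assumes AVX2+FMA):

#include <immintrin.h>

void saxpy16_sketch(float a, float * restrict x, float * restrict y)
{
    __m256 va = _mm256_set1_ps(a);                     /* vbroadcastss */
    __m256 lo = _mm256_fmadd_ps(va, _mm256_loadu_ps(x),
                                    _mm256_loadu_ps(y));      /* y[0..7]  */
    __m256 hi = _mm256_fmadd_ps(va, _mm256_loadu_ps(x + 8),
                                    _mm256_loadu_ps(y + 8));  /* y[8..15] */
    _mm256_storeu_ps(y, lo);
    _mm256_storeu_ps(y + 8, hi);
}

Since 16 floats fill exactly two YMM registers, no prologue, epilogue, or
alignment dispatch should be necessary at all.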