[Bug c/79938] gcc unnecessarily spills xmm register to stack when inserting vector items
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938 --- Comment #3 from postmaster at raasu dot org --- With -mssse3 instead of -msse4.1, the issue gets even worse: --- ... pxor %xmm1, %xmm1 movl $.LC0, %esi movl $1, %edi movd %eax, %xmm0 movdqa %xmm0, %xmm4 pshufb %xmm1, %xmm4 movaps %xmm4, (%rsp) movzbl (%rsp), %eax movaps %xmm4, 224(%rsp) movzbl 225(%rsp), %edx movaps %xmm4, 208(%rsp) movaps %xmm4, 192(%rsp) movaps %xmm4, 176(%rsp) addl %edx, %eax movzbl 210(%rsp), %edx movaps %xmm4, 160(%rsp) movaps %xmm4, 144(%rsp) movaps %xmm4, 128(%rsp) movaps %xmm4, 112(%rsp) addl %edx, %eax movzbl 195(%rsp), %edx movaps %xmm4, 96(%rsp) movzbl 105(%rsp), %ecx movaps %xmm4, 80(%rsp) movaps %xmm4, 64(%rsp) movaps %xmm4, 48(%rsp) addl %edx, %eax movzbl 165(%rsp), %edx movaps %xmm4, 32(%rsp) movd %eax, %xmm0 movzbl 180(%rsp), %eax movaps %xmm4, 16(%rsp) movaps %xmm4, 240(%rsp) addl %edx, %eax movzbl 150(%rsp), %edx addl %edx, %eax movzbl 135(%rsp), %edx addl %eax, %edx movzbl 120(%rsp), %eax movd %edx, %xmm6 punpckldq %xmm6, %xmm0 addl %ecx, %eax movzbl 90(%rsp), %ecx addl %ecx, %eax movzbl 75(%rsp), %ecx addl %ecx, %eax movzbl 45(%rsp), %ecx movd %eax, %xmm1 movzbl 60(%rsp), %eax addl %ecx, %eax movzbl 30(%rsp), %ecx addl %ecx, %eax movzbl 15(%rsp), %ecx addl %ecx, %eax movd %eax, %xmm5 xorl %eax, %eax punpckldq %xmm5, %xmm1 punpcklqdq %xmm1, %xmm0 movdqa %xmm0, %xmm2 movd %xmm0, %edx pshufd $255, %xmm0, %xmm3 punpckhdq %xmm0, %xmm2 pshufd $85, %xmm0, %xmm1 ... --- Notice all the lines starting with "movaps %xmm4,". The same register contents are spilled all over the stack.
[Bug c/79938] gcc unnecessarily spills xmm register to stack when inserting vector items
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938 --- Comment #2 from postmaster at raasu dot org --- (In reply to Richard Biener from comment #1) > The situation is slightly better with GCC 7, only two spill/loads are > remaining. > Possibly BIT_INSERT_EXPR helps here. With gcc 6.2.0 and gcc -msse4.1 -mtune=core2 -O3 -S hadd.c -Wall -Wextra -fno-strict-aliasing -fwrapv -o hadd.s, the resulting assembler output is almost perfect, but adding -mtune=core2 makes the code optimal only for Intel processors. --- ... pxor %xmm1, %xmm1 movl $1, %edi movd %eax, %xmm0 pshufb %xmm1, %xmm0 pextrb $1, %xmm0, %edx pextrb $0, %xmm0, %eax addl %edx, %eax pextrb $2, %xmm0, %edx addl %edx, %eax pextrb $4, %xmm0, %ecx pextrb $3, %xmm0, %edx addl %eax, %edx pextrb $5, %xmm0, %eax addl %eax, %ecx pextrb $6, %xmm0, %eax addl %eax, %ecx pextrb $9, %xmm0, %esi pextrb $7, %xmm0, %eax addl %eax, %ecx pextrb $8, %xmm0, %eax addl %esi, %eax pextrb $10, %xmm0, %esi addl %esi, %eax pextrb $11, %xmm0, %esi addl %esi, %eax pextrb $13, %xmm0, %esi movd %eax, %xmm1 pextrb $12, %xmm0, %eax addl %esi, %eax pextrb $14, %xmm0, %esi addl %eax, %esi pextrb $15, %xmm0, %eax movd %edx, %xmm0 addl %esi, %eax pinsrd $1, %ecx, %xmm0 movl $.LC0, %esi pinsrd $1, %eax, %xmm1 xorl %eax, %eax punpcklqdq %xmm1, %xmm0 ...
[Bug c/79938] gcc unnecessarily spills xmm register to stack when inserting vector items
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938 --- Comment #1 from Richard Biener --- The situation is slightly better with GCC 7, only two spill/loads are remaining. Possibly BIT_INSERT_EXPR helps here. For the testcase you want to add __attribute__((noinline)) to haddd_epu8 as otherwise eliding the result vector after inlining into main is desirable (but not done). Then with GCC 7 we get: haddd_epu8: .LFB4966: .cfi_startproc pextrb $1, %xmm0, %edx pextrb $0, %xmm0, %eax addl %edx, %eax pextrb $2, %xmm0, %edx addl %edx, %eax pextrb $3, %xmm0, %edx addl %edx, %eax pextrb $5, %xmm0, %edx movl %eax, -12(%rsp) pextrb $4, %xmm0, %eax movd -12(%rsp), %xmm1 addl %edx, %eax pextrb $6, %xmm0, %edx addl %edx, %eax pextrb $7, %xmm0, %edx addl %edx, %eax pextrb $9, %xmm0, %edx pinsrd $1, %eax, %xmm1 pextrb $8, %xmm0, %eax addl %edx, %eax pextrb $10, %xmm0, %edx addl %edx, %eax pextrb $11, %xmm0, %edx addl %edx, %eax pextrb $13, %xmm0, %edx pinsrd $2, %eax, %xmm1 pextrb $12, %xmm0, %eax addl %edx, %eax pextrb $14, %xmm0, %edx addl %eax, %edx pextrb $15, %xmm0, %eax addl %edx, %eax pinsrd $3, %eax, %xmm1 movdqa %xmm1, %xmm0 ret which looks optimal to me. The single stack use is because by default inter-unit moves are disabled. With -mtune=core-avx2 you'd get haddd_epu8: .LFB4966: .cfi_startproc pextrb $1, %xmm0, %edx pextrb $0, %xmm0, %eax addl %edx, %eax pextrb $2, %xmm0, %edx addl %edx, %eax pextrb $3, %xmm0, %edx addl %edx, %eax pextrb $5, %xmm0, %edx movd %eax, %xmm1 pextrb $4, %xmm0, %eax addl %edx, %eax pextrb $6, %xmm0, %edx addl %edx, %eax pextrb $7, %xmm0, %edx addl %edx, %eax pextrb $9, %xmm0, %edx pinsrd $1, %eax, %xmm1 pextrb $8, %xmm0, %eax addl %edx, %eax pextrb $10, %xmm0, %edx addl %edx, %eax pextrb $11, %xmm0, %edx addl %edx, %eax pextrb $13, %xmm0, %edx pinsrd $2, %eax, %xmm1 pextrb $12, %xmm0, %eax addl %edx, %eax pextrb $14, %xmm0, %edx addl %eax, %edx pextrb $15, %xmm0, %eax addl %edx, %eax pinsrd $3, %eax, %xmm1 movdqa %xmm1, %xmm0 ret