[Bug c/79938] gcc unnecessarily spills xmm register to stack when inserting vector items

2017-03-07 Thread postmaster at raasu dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

--- Comment #3 from postmaster at raasu dot org ---
With -mssse3 instead of -msse4.1, the issue gets even worse:

---
...
pxor    %xmm1, %xmm1
movl    $.LC0, %esi
movl    $1, %edi
movd    %eax, %xmm0
movdqa  %xmm0, %xmm4
pshufb  %xmm1, %xmm4
movaps  %xmm4, (%rsp)
movzbl  (%rsp), %eax
movaps  %xmm4, 224(%rsp)
movzbl  225(%rsp), %edx
movaps  %xmm4, 208(%rsp)
movaps  %xmm4, 192(%rsp)
movaps  %xmm4, 176(%rsp)
addl    %edx, %eax
movzbl  210(%rsp), %edx
movaps  %xmm4, 160(%rsp)
movaps  %xmm4, 144(%rsp)
movaps  %xmm4, 128(%rsp)
movaps  %xmm4, 112(%rsp)
addl    %edx, %eax
movzbl  195(%rsp), %edx
movaps  %xmm4, 96(%rsp)
movzbl  105(%rsp), %ecx
movaps  %xmm4, 80(%rsp)
movaps  %xmm4, 64(%rsp)
movaps  %xmm4, 48(%rsp)
addl    %edx, %eax
movzbl  165(%rsp), %edx
movaps  %xmm4, 32(%rsp)
movd    %eax, %xmm0
movzbl  180(%rsp), %eax
movaps  %xmm4, 16(%rsp)
movaps  %xmm4, 240(%rsp)
addl    %edx, %eax
movzbl  150(%rsp), %edx
addl    %edx, %eax
movzbl  135(%rsp), %edx
addl    %eax, %edx
movzbl  120(%rsp), %eax
movd    %edx, %xmm6
punpckldq   %xmm6, %xmm0
addl    %ecx, %eax
movzbl  90(%rsp), %ecx
addl    %ecx, %eax
movzbl  75(%rsp), %ecx
addl    %ecx, %eax
movzbl  45(%rsp), %ecx
movd    %eax, %xmm1
movzbl  60(%rsp), %eax
addl    %ecx, %eax
movzbl  30(%rsp), %ecx
addl    %ecx, %eax
movzbl  15(%rsp), %ecx
addl    %ecx, %eax
movd    %eax, %xmm5
xorl    %eax, %eax
punpckldq   %xmm5, %xmm1
punpcklqdq  %xmm1, %xmm0
movdqa  %xmm0, %xmm2
movd    %xmm0, %edx
pshufd  $255, %xmm0, %xmm3
punpckhdq   %xmm0, %xmm2
pshufd  $85, %xmm0, %xmm1
...
---

Notice all the lines starting with "movaps  %xmm4,": the same register
contents are spilled redundantly all over the stack.
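
The testcase hadd.c is not attached in these comments, so the following is
only a guessed sketch of its shape: a horizontal byte-add written with GCC's
generic vector extensions, where each r[i / 4] += x[i] is the "inserting
vector items" operation the bug title refers to. The noinline attribute
follows the advice in comment #1 below; all names here are assumptions.

---
#include <stdint.h>

typedef uint8_t  u8x16 __attribute__((vector_size(16)));
typedef uint32_t u32x4 __attribute__((vector_size(16)));

/* Hypothetical reconstruction: sum each group of four unsigned bytes
   into one 32-bit lane of the result vector. */
__attribute__((noinline))
u32x4 haddd_epu8(u8x16 x)
{
    u32x4 r = { 0, 0, 0, 0 };
    for (int i = 0; i < 16; i++)
        r[i / 4] += x[i];   /* vector element extract + insert */
    return r;
}
---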

[Bug c/79938] gcc unnecessarily spills xmm register to stack when inserting vector items

2017-03-07 Thread postmaster at raasu dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

--- Comment #2 from postmaster at raasu dot org ---
(In reply to Richard Biener from comment #1)
> The situation is slightly better with GCC 7; only two spills/reloads
> remain.
> Possibly BIT_INSERT_EXPR helps here.

With gcc 6.2.0 and
gcc -msse4.1 -mtune=core2 -O3 -S hadd.c -Wall -Wextra -fno-strict-aliasing
-fwrapv -o hadd.s

The resulting assembler output is almost perfect, but adding -mtune=core2
effectively makes the code optimal only for Intel processors.

---
...
pxor    %xmm1, %xmm1
movl    $1, %edi
movd    %eax, %xmm0
pshufb  %xmm1, %xmm0
pextrb  $1, %xmm0, %edx
pextrb  $0, %xmm0, %eax
addl    %edx, %eax
pextrb  $2, %xmm0, %edx
addl    %edx, %eax
pextrb  $4, %xmm0, %ecx
pextrb  $3, %xmm0, %edx
addl    %eax, %edx
pextrb  $5, %xmm0, %eax
addl    %eax, %ecx
pextrb  $6, %xmm0, %eax
addl    %eax, %ecx
pextrb  $9, %xmm0, %esi
pextrb  $7, %xmm0, %eax
addl    %eax, %ecx
pextrb  $8, %xmm0, %eax
addl    %esi, %eax
pextrb  $10, %xmm0, %esi
addl    %esi, %eax
pextrb  $11, %xmm0, %esi
addl    %esi, %eax
pextrb  $13, %xmm0, %esi
movd    %eax, %xmm1
pextrb  $12, %xmm0, %eax
addl    %esi, %eax
pextrb  $14, %xmm0, %esi
addl    %eax, %esi
pextrb  $15, %xmm0, %eax
movd    %edx, %xmm0
addl    %esi, %eax
pinsrd  $1, %ecx, %xmm0
movl    $.LC0, %esi
pinsrd  $1, %eax, %xmm1
xorl    %eax, %eax
punpcklqdq  %xmm1, %xmm0
...
---
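
For reference, the pextrb/pinsrd sequence above is what explicit SSE4.1
intrinsics would spell out by hand; a hypothetical sketch (not the original
testcase), with one dword insert per group of four extracted bytes:

---
#include <smmintrin.h>  /* SSE4.1: _mm_extract_epi8, _mm_insert_epi32 */

__m128i haddd_epu8_sse41(__m128i x)
{
    __m128i r = _mm_setzero_si128();
    /* pextrb pulls each byte into a GPR; pinsrd puts the dword sum back. */
    int s0 = _mm_extract_epi8(x, 0) + _mm_extract_epi8(x, 1)
           + _mm_extract_epi8(x, 2) + _mm_extract_epi8(x, 3);
    r = _mm_insert_epi32(r, s0, 0);
    int s1 = _mm_extract_epi8(x, 4) + _mm_extract_epi8(x, 5)
           + _mm_extract_epi8(x, 6) + _mm_extract_epi8(x, 7);
    r = _mm_insert_epi32(r, s1, 1);
    int s2 = _mm_extract_epi8(x, 8) + _mm_extract_epi8(x, 9)
           + _mm_extract_epi8(x, 10) + _mm_extract_epi8(x, 11);
    r = _mm_insert_epi32(r, s2, 2);
    int s3 = _mm_extract_epi8(x, 12) + _mm_extract_epi8(x, 13)
           + _mm_extract_epi8(x, 14) + _mm_extract_epi8(x, 15);
    r = _mm_insert_epi32(r, s3, 3);
    return r;
}
---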

[Bug c/79938] gcc unnecessarily spills xmm register to stack when inserting vector items

2017-03-07 Thread rguenth at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

--- Comment #1 from Richard Biener ---
The situation is slightly better with GCC 7; only two spills/reloads
remain.
Possibly BIT_INSERT_EXPR helps here.  For the testcase you want to add
__attribute__((noinline)) to haddd_epu8, as otherwise eliding the result
vector after inlining into main would be desirable (but is not done).  Then
with GCC 7 we get:

haddd_epu8:
.LFB4966:
.cfi_startproc
pextrb  $1, %xmm0, %edx
pextrb  $0, %xmm0, %eax
addl    %edx, %eax
pextrb  $2, %xmm0, %edx
addl    %edx, %eax
pextrb  $3, %xmm0, %edx
addl    %edx, %eax
pextrb  $5, %xmm0, %edx
movl    %eax, -12(%rsp)
pextrb  $4, %xmm0, %eax
movd    -12(%rsp), %xmm1
addl    %edx, %eax
pextrb  $6, %xmm0, %edx
addl    %edx, %eax
pextrb  $7, %xmm0, %edx
addl    %edx, %eax
pextrb  $9, %xmm0, %edx
pinsrd  $1, %eax, %xmm1
pextrb  $8, %xmm0, %eax
addl    %edx, %eax
pextrb  $10, %xmm0, %edx
addl    %edx, %eax
pextrb  $11, %xmm0, %edx
addl    %edx, %eax
pextrb  $13, %xmm0, %edx
pinsrd  $2, %eax, %xmm1
pextrb  $12, %xmm0, %eax
addl    %edx, %eax
pextrb  $14, %xmm0, %edx
addl    %eax, %edx
pextrb  $15, %xmm0, %eax
addl    %edx, %eax
pinsrd  $3, %eax, %xmm1
movdqa  %xmm1, %xmm0
ret

which looks optimal to me.  The single stack use is because inter-unit
moves are disabled by default.  With -mtune=core-avx2 you'd get

haddd_epu8:
.LFB4966:
.cfi_startproc
pextrb  $1, %xmm0, %edx
pextrb  $0, %xmm0, %eax
addl    %edx, %eax
pextrb  $2, %xmm0, %edx
addl    %edx, %eax
pextrb  $3, %xmm0, %edx
addl    %edx, %eax
pextrb  $5, %xmm0, %edx
movd    %eax, %xmm1
pextrb  $4, %xmm0, %eax
addl    %edx, %eax
pextrb  $6, %xmm0, %edx
addl    %edx, %eax
pextrb  $7, %xmm0, %edx
addl    %edx, %eax
pextrb  $9, %xmm0, %edx
pinsrd  $1, %eax, %xmm1
pextrb  $8, %xmm0, %eax
addl    %edx, %eax
pextrb  $10, %xmm0, %edx
addl    %edx, %eax
pextrb  $11, %xmm0, %edx
addl    %edx, %eax
pextrb  $13, %xmm0, %edx
pinsrd  $2, %eax, %xmm1
pextrb  $12, %xmm0, %eax
addl    %edx, %eax
pextrb  $14, %xmm0, %edx
addl    %eax, %edx
pextrb  $15, %xmm0, %eax
addl    %edx, %eax
pinsrd  $3, %eax, %xmm1
movdqa  %xmm1, %xmm0
ret
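
For reference, combining the flags from comment #2 with this tuning change
would presumably be:

gcc -msse4.1 -mtune=core-avx2 -O3 -S hadd.c -Wall -Wextra -fno-strict-aliasing
-fwrapv -o hadd.s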