[Bug target/114944] Codegen of __builtin_shuffle for a 16-byte uint8_t vector is suboptimal on SSE2

2024-05-06 Thread amonakov at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

--- Comment #4 from Alexander Monakov  ---
Like this:

pand    xmm1, XMMWORD PTR .LC0[rip]
movaps  XMMWORD PTR [rsp-40], xmm0
xor eax, eax
xor edx, edx
movaps  XMMWORD PTR [rsp-24], xmm1
movzx   ecx, BYTE PTR [rsp-17]
mov al, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-9]
mov dl, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-18]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-10]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-19]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-11]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-20]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-12]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-21]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-13]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-22]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-14]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-23]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-15]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-24]
sal rax, 8
mov al, BYTE PTR [rsp-40+rcx]
movzx   ecx, BYTE PTR [rsp-16]
sal rdx, 8
mov dl, BYTE PTR [rsp-40+rcx]
movq    xmm0, rax
movq    xmm2, rdx
punpcklqdq  xmm0, xmm2
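
For comparison, here is a rough C++ intrinsics equivalent of the sequence above (a sketch, not from the bug report; the function name is mine, and it assumes x86-64 SSE2 so that _mm_cvtsi64_si128 is available):

#include <emmintrin.h>
#include <cstdint>

__m128i ShuffleViaGPRs(__m128i a, __m128i b) {
  alignas(16) uint8_t lanes[16];
  alignas(16) uint8_t idx[16];
  _mm_store_si128(reinterpret_cast<__m128i*>(lanes), a);
  _mm_store_si128(reinterpret_cast<__m128i*>(idx),
                  _mm_and_si128(b, _mm_set1_epi8(15)));
  uint64_t lo = 0, hi = 0;
  // Build each 64-bit half most-significant byte first, mirroring the
  // 'sal rax, 8; mov al, byte [...]' chains in the asm above.
  for (int i = 7; i >= 0; --i) {
    lo = (lo << 8) | lanes[idx[i]];
    hi = (hi << 8) | lanes[idx[i + 8]];
  }
  // punpcklqdq: merge the two GPR halves into one 128-bit vector.
  return _mm_unpacklo_epi64(_mm_cvtsi64_si128(static_cast<long long>(lo)),
                            _mm_cvtsi64_si128(static_cast<long long>(hi)));
}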

[Bug target/114944] Codegen of __builtin_shuffle for a 16-byte uint8_t vector is suboptimal on SSE2

2024-05-06 Thread amonakov at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

Alexander Monakov  changed:

   What|Removed |Added

 CC||amonakov at gcc dot gnu.org

--- Comment #3 from Alexander Monakov  ---
Throughput-wise, the code in comment 2 has a significant bottleneck on port 5
on Haswell and Skylake (31 uops out of 70 go to port 5). Straightforward code
that does 16x movzx-movzx-movb for each byte should fare better, even
considering the load-store penalty for retrieving the vector from memory:

pand    xmm1, XMMWORD PTR .LC0[rip]
movaps  XMMWORD PTR [rsp-56], xmm0
movaps  XMMWORD PTR [rsp-40], xmm1
movzx   eax, BYTE PTR [rsp-40]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-24], al
movzx   eax, BYTE PTR [rsp-39]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-23], al
movzx   eax, BYTE PTR [rsp-38]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-22], al
movzx   eax, BYTE PTR [rsp-37]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-21], al
movzx   eax, BYTE PTR [rsp-36]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-20], al
movzx   eax, BYTE PTR [rsp-35]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-19], al
movzx   eax, BYTE PTR [rsp-34]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-18], al
movzx   eax, BYTE PTR [rsp-33]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-17], al
movzx   eax, BYTE PTR [rsp-32]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-16], al
movzx   eax, BYTE PTR [rsp-31]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-15], al
movzx   eax, BYTE PTR [rsp-30]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-14], al
movzx   eax, BYTE PTR [rsp-29]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-13], al
movzx   eax, BYTE PTR [rsp-28]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-12], al
movzx   eax, BYTE PTR [rsp-27]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-11], al
movzx   eax, BYTE PTR [rsp-26]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-10], al
movzx   eax, BYTE PTR [rsp-25]
movzx   eax, BYTE PTR [rsp-56+rax]
mov BYTE PTR [rsp-9], al
movdqa  xmm0, XMMWORD PTR [rsp-24]
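
The pattern above is just a byte-at-a-time table lookup through the stack; a minimal C++ sketch of the same idea (the function name is mine):

#include <emmintrin.h>
#include <cstdint>

__m128i ShuffleViaStack(__m128i a, __m128i b) {
  alignas(16) uint8_t lanes[16], idx[16], out[16];
  _mm_store_si128(reinterpret_cast<__m128i*>(lanes), a);
  _mm_store_si128(reinterpret_cast<__m128i*>(idx),
                  _mm_and_si128(b, _mm_set1_epi8(15)));
  for (int i = 0; i < 16; ++i)
    out[i] = lanes[idx[i]];  // movzx index, movzx table byte, mov store
  // Reloading out[] as one vector is where the store-forwarding stall hits.
  return _mm_load_si128(reinterpret_cast<__m128i*>(out));
}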

If you want to avoid the load-store forwarding stall, perhaps you can assemble
the two halves of the shuffled vector in GPRs (e.g. do 'movzx ecx, byte [...];
shl eax, 8; mov al, byte [...+rcx]'), then merge the two 64-bit GPRs into one
128-bit vector.

[Bug target/114944] Codegen of __builtin_shuffle for a 16-byte uint8_t vector is suboptimal on SSE2

2024-05-06 Thread john_platts at hotmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

--- Comment #2 from John Platts  ---
Here is more optimal codegen for SSE2ShuffleI8 on x86_64:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
pand    xmm1, XMMWORD PTR .LC0[rip]
movaps  XMMWORD PTR [rsp-24], xmm0
movd    eax, xmm1
movzx   eax, al
movzx   eax, BYTE PTR [rsp-24+rax]
movaps  XMMWORD PTR [rsp-40], xmm1
movd    xmm0, eax
movzx   eax, BYTE PTR [rsp-39]
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp-38]
punpcklbw   xmm0, xmm2
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm8, eax
movzx   eax, BYTE PTR [rsp-37]
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp-36]
punpcklbw   xmm8, xmm2
movzx   eax, BYTE PTR [rsp-24+rax]
punpcklwd   xmm0, xmm8
movd    xmm5, eax
movzx   eax, BYTE PTR [rsp-35]
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp-34]
punpcklbw   xmm5, xmm2
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm7, eax
movzx   eax, BYTE PTR [rsp-33]
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp-32]
punpcklbw   xmm7, xmm2
movzx   eax, BYTE PTR [rsp-24+rax]
punpcklwd   xmm5, xmm7
punpckldq   xmm0, xmm5
movd    xmm3, eax
movzx   eax, BYTE PTR [rsp-31]
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm4, eax
movzx   eax, BYTE PTR [rsp-30]
punpcklbw   xmm3, xmm4
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm6, eax
movzx   eax, BYTE PTR [rsp-29]
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp-28]
punpcklbw   xmm6, xmm2
movzx   eax, BYTE PTR [rsp-24+rax]
punpcklwd   xmm3, xmm6
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp-27]
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm4, eax
movzx   eax, BYTE PTR [rsp-26]
punpcklbw   xmm2, xmm4
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm4, eax
movzx   eax, BYTE PTR [rsp-25]
movzx   eax, BYTE PTR [rsp-24+rax]
movd    xmm1, eax
punpcklbw   xmm4, xmm1
movdqa  xmm1, xmm2
movdqa  xmm2, xmm3
punpcklwd   xmm1, xmm4
punpckldq   xmm2, xmm1
punpcklqdq  xmm0, xmm2
ret
.LC0:
.quad   1085102592571150095
.quad   1085102592571150095
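
(For reference: 1085102592571150095 is 0x0F0F0F0F0F0F0F0F, i.e. the
_mm_set1_epi8(15) index mask from the source.)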

[Bug target/114944] Codegen of __builtin_shuffle for a 16-byte uint8_t vector is suboptimal on SSE2

2024-05-04 Thread john_platts at hotmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

John Platts  changed:

   What|Removed |Added

 Target||x86_64-*-*, i?86-*-*

--- Comment #1 from John Platts  ---
Here is another snippet of code that has suboptimal codegen on SSE2 with GCC
13.2.0:
#include <emmintrin.h>
#include <cstdint>

__m128i SSE2ShuffleI8(__m128i a, __m128i b) {
  alignas(16) uint8_t a_lanes[16];
  alignas(16) uint8_t b_lanes[16];

  _mm_store_si128(reinterpret_cast<__m128i*>(a_lanes), a);
  _mm_store_si128(reinterpret_cast<__m128i*>(b_lanes),
  _mm_and_si128(b, _mm_set1_epi8(static_cast<char>(15))));

  __m128i v0 = _mm_cvtsi32_si128(a_lanes[b_lanes[0]]);
  __m128i v1 = _mm_cvtsi32_si128(a_lanes[b_lanes[1]]);
  __m128i v2 = _mm_cvtsi32_si128(a_lanes[b_lanes[2]]);
  __m128i v3 = _mm_cvtsi32_si128(a_lanes[b_lanes[3]]);
  __m128i v4 = _mm_cvtsi32_si128(a_lanes[b_lanes[4]]);
  __m128i v5 = _mm_cvtsi32_si128(a_lanes[b_lanes[5]]);
  __m128i v6 = _mm_cvtsi32_si128(a_lanes[b_lanes[6]]);
  __m128i v7 = _mm_cvtsi32_si128(a_lanes[b_lanes[7]]);
  __m128i v8 = _mm_cvtsi32_si128(a_lanes[b_lanes[8]]);
  __m128i v9 = _mm_cvtsi32_si128(a_lanes[b_lanes[9]]);
  __m128i v10 = _mm_cvtsi32_si128(a_lanes[b_lanes[10]]);
  __m128i v11 = _mm_cvtsi32_si128(a_lanes[b_lanes[11]]);
  __m128i v12 = _mm_cvtsi32_si128(a_lanes[b_lanes[12]]);
  __m128i v13 = _mm_cvtsi32_si128(a_lanes[b_lanes[13]]);
  __m128i v14 = _mm_cvtsi32_si128(a_lanes[b_lanes[14]]);
  __m128i v15 = _mm_cvtsi32_si128(a_lanes[b_lanes[15]]);

  v0 = _mm_unpacklo_epi8(v0, v1);
  v2 = _mm_unpacklo_epi8(v2, v3);
  v4 = _mm_unpacklo_epi8(v4, v5);
  v6 = _mm_unpacklo_epi8(v6, v7);
  v8 = _mm_unpacklo_epi8(v8, v9);
  v10 = _mm_unpacklo_epi8(v10, v11);
  v12 = _mm_unpacklo_epi8(v12, v13);
  v14 = _mm_unpacklo_epi8(v14, v15);

  v0 = _mm_unpacklo_epi16(v0, v2);
  v4 = _mm_unpacklo_epi16(v4, v6);
  v8 = _mm_unpacklo_epi16(v8, v10);
  v12 = _mm_unpacklo_epi16(v12, v14);

  v0 = _mm_unpacklo_epi32(v0, v4);
  v8 = _mm_unpacklo_epi32(v8, v12);

  return _mm_unpacklo_epi64(v0, v8);
}
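
A quick sanity check, if appended to the snippet above (a hypothetical test
harness, not part of the report): reverse the lanes and print the result.

#include <cstdio>

int main() {
  alignas(16) uint8_t src[16], idx[16], out[16];
  for (int i = 0; i < 16; ++i) {
    src[i] = static_cast<uint8_t>(i * 16);
    idx[i] = static_cast<uint8_t>(15 - i);  // reversed indices
  }
  __m128i r = SSE2ShuffleI8(_mm_load_si128(reinterpret_cast<__m128i*>(src)),
                            _mm_load_si128(reinterpret_cast<__m128i*>(idx)));
  _mm_store_si128(reinterpret_cast<__m128i*>(out), r);
  for (int i = 0; i < 16; ++i)
    std::printf("%02x ", out[i]);  // expect: f0 e0 d0 ... 10 00
  std::printf("\n");
}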

Here is the code that is generated when the above code is compiled on x86_64
GCC 13.2.0 with the -O2 option:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
sub rsp, 144
pand    xmm1, XMMWORD PTR .LC0[rip]
movaps  XMMWORD PTR [rsp+120], xmm0
movd    eax, xmm1
movzx   eax, al
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp+104], xmm1
movd    xmm0, eax
movzx   eax, BYTE PTR [rsp+105]
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp+88], xmm1
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp+90]
punpcklbw   xmm0, xmm2
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp+72], xmm1
movd    xmm8, eax
movzx   eax, BYTE PTR [rsp+75]
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp+56], xmm1
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp+60]
punpcklbw   xmm8, xmm2
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp+40], xmm1
punpcklwd   xmm0, xmm8
movd    xmm5, eax
movzx   eax, BYTE PTR [rsp+45]
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp+24], xmm1
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp+30]
punpcklbw   xmm5, xmm2
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp+8], xmm1
movd    xmm7, eax
movzx   eax, BYTE PTR [rsp+15]
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp-8], xmm1
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp]
punpcklbw   xmm7, xmm2
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp-24], xmm1
punpcklwd   xmm5, xmm7
punpckldq   xmm0, xmm5
movd    xmm3, eax
movzx   eax, BYTE PTR [rsp-15]
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp-40], xmm1
movd    xmm4, eax
movzx   eax, BYTE PTR [rsp-30]
punpcklbw   xmm3, xmm4
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp-56], xmm1
movd    xmm6, eax
movzx   eax, BYTE PTR [rsp-45]
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp-72], xmm1
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp-60]
punpcklbw   xmm6, xmm2
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp-88], xmm1
punpcklwd   xmm3, xmm6
movd    xmm2, eax
movzx   eax, BYTE PTR [rsp-75]
movzx   eax, BYTE PTR [rsp+120+rax]
movaps  XMMWORD PTR [rsp-104], xmm1
movd    xmm4, eax
movzx   eax, BYTE PTR [rsp-90]
punpcklbw   xmm2, xmm4