https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944

--- Comment #2 from John Platts <john_platts at hotmail dot com> ---
Here is more efficient codegen for SSE2ShuffleI8 on x86_64:
# SSE2ShuffleI8: variable byte shuffle built from SSE2-only instructions.
# For every byte lane k in 0..15:  r[k] = table[index[k] & 0x0F].
#   In:      xmm0 = table bytes, xmm1 = index bytes
#            (presumably the two vector arguments under the SysV AMD64 ABI - confirm)
#   Out:     xmm0 = r[0..15]
#   Scratch: eax, xmm1-xmm8, and the 32 bytes at [rsp-40 .. rsp-9].
#            rsp itself is never adjusted, so this relies on the memory just
#            below rsp being usable (SysV red zone in a leaf function - confirm
#            for the target ABI). The movaps stores also need rsp-24 and rsp-40
#            to be 16-byte aligned.
# Method: spill table and masked indices to the stack, do 16 scalar byte
# lookups, and rebuild the vector with a punpckl{bw,wd,dq,qdq} merge tree.
# The merges are interleaved with the loads for the following lanes.
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
        # Clamp every index byte to 0..15 (.LC0 holds 0x0F in each byte).
        pand    xmm1, XMMWORD PTR .LC0[rip]
        # Spill the table: table[k] lives at [rsp-24+k].
        movaps  XMMWORD PTR [rsp-24], xmm0
        # r0: index[0] is taken straight from the register instead of memory.
        movd    eax, xmm1
        movzx   eax, al
        movzx   eax, BYTE PTR [rsp-24+rax]
        # Spill the masked indices: index[k] lives at [rsp-40+k].
        movaps  XMMWORD PTR [rsp-40], xmm1
        # xmm0 = {r0}; movd zeroes all lanes above the low dword.
        movd    xmm0, eax
        # r1 -> xmm2.
        movzx   eax, BYTE PTR [rsp-39]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        # Load index[2], then pair the previous two results: xmm0 = {r0, r1}.
        movzx   eax, BYTE PTR [rsp-38]
        punpcklbw       xmm0, xmm2
        # r2 -> xmm8.
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm8, eax
        # r3 -> xmm2.
        movzx   eax, BYTE PTR [rsp-37]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        # Load index[4]; xmm8 = {r2, r3}.
        movzx   eax, BYTE PTR [rsp-36]
        punpcklbw       xmm8, xmm2
        # Look up r4; xmm0 = {r0..r3}; r4 -> xmm5.
        movzx   eax, BYTE PTR [rsp-24+rax]
        punpcklwd       xmm0, xmm8
        movd    xmm5, eax
        # r5 -> xmm2.
        movzx   eax, BYTE PTR [rsp-35]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        # Load index[6]; xmm5 = {r4, r5}.
        movzx   eax, BYTE PTR [rsp-34]
        punpcklbw       xmm5, xmm2
        # r6 -> xmm7.
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm7, eax
        # r7 -> xmm2.
        movzx   eax, BYTE PTR [rsp-33]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        # Load index[8]; xmm7 = {r6, r7}.
        movzx   eax, BYTE PTR [rsp-32]
        punpcklbw       xmm7, xmm2
        # Look up r8; xmm5 = {r4..r7}; xmm0 = {r0..r7} (low half complete).
        movzx   eax, BYTE PTR [rsp-24+rax]
        punpcklwd       xmm5, xmm7
        punpckldq       xmm0, xmm5
        # r8 -> xmm3.
        movd    xmm3, eax
        # r9 -> xmm4.
        movzx   eax, BYTE PTR [rsp-31]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm4, eax
        # Load index[10]; xmm3 = {r8, r9}.
        movzx   eax, BYTE PTR [rsp-30]
        punpcklbw       xmm3, xmm4
        # r10 -> xmm6.
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm6, eax
        # r11 -> xmm2.
        movzx   eax, BYTE PTR [rsp-29]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm2, eax
        # Load index[12]; xmm6 = {r10, r11}.
        movzx   eax, BYTE PTR [rsp-28]
        punpcklbw       xmm6, xmm2
        # Look up r12; xmm3 = {r8..r11}; r12 -> xmm2.
        movzx   eax, BYTE PTR [rsp-24+rax]
        punpcklwd       xmm3, xmm6
        movd    xmm2, eax
        # r13 -> xmm4.
        movzx   eax, BYTE PTR [rsp-27]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm4, eax
        # Load index[14]; xmm2 = {r12, r13}.
        movzx   eax, BYTE PTR [rsp-26]
        punpcklbw       xmm2, xmm4
        # r14 -> xmm4 (xmm4 is reused now that r13 has been merged).
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm4, eax
        # r15 -> xmm1 (the index vector in xmm1 is no longer needed).
        movzx   eax, BYTE PTR [rsp-25]
        movzx   eax, BYTE PTR [rsp-24+rax]
        movd    xmm1, eax
        # xmm4 = {r14, r15}.
        punpcklbw       xmm4, xmm1
        # Assemble the high half: copy {r12,r13} to xmm1 and {r8..r11} to xmm2,
        # then xmm1 = {r12..r15} and xmm2 = {r8..r15}.
        movdqa  xmm1, xmm2
        movdqa  xmm2, xmm3
        punpcklwd       xmm1, xmm4
        punpckldq       xmm2, xmm1
        # Join both halves: xmm0 = {r0..r15}.
        punpcklqdq      xmm0, xmm2
        ret
# Index mask: each quadword is 0x0F0F0F0F0F0F0F0F, i.e. 0x0F in all 16 bytes.
.LC0:
        .quad   1085102592571150095
        .quad   1085102592571150095

Reply via email to