https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110986

--- Comment #11 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
So actually, for x86_64, the trunk produces better code (figures):

.L2:
        movdqu  (%rdx,%rax), %xmm0
        movdqu  (%rsi,%rax), %xmm2
        movdqu  (%rsi,%rax), %xmm1
        pcmpeqw %xmm3, %xmm0
        pxor    %xmm4, %xmm2
        pand    %xmm0, %xmm1
        pandn   %xmm2, %xmm0
        por     %xmm1, %xmm0
        movups  %xmm0, (%rdi,%rax)
        addq    $16, %rax
        cmpq    $256, %rax
        jne     .L2

.L2:
        movdqu  (%rdx,%rax), %xmm0
        movdqu  (%rsi,%rax), %xmm2
        pcmpeqw %xmm1, %xmm0
        pcmpeqw %xmm1, %xmm0
        pxor    %xmm2, %xmm0
        movups  %xmm0, (%rdi,%rax)
        addq    $16, %rax
        cmpq    $256, %rax
        jne     .L2

Reply via email to