https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
The situation is slightly better with GCC 7: only two spills/loads remain.
Possibly BIT_INSERT_EXPR helps here.  For the testcase you want to add
__attribute__((noinline)) to haddd_epu8, since otherwise eliding the result
vector after inlining into main would be desirable (but is not done); see the
sketch below.
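The testcase itself is attached to the bug; purely as illustration, here is
my reconstruction of its likely shape from the generated code (not the
attached source; SSE4.1 intrinsics are assumed, since pextrb/pinsrd need at
least SSE4.1):

#include <smmintrin.h>

__attribute__((noinline))
__m128i
haddd_epu8 (__m128i x)
{
  /* Sum each group of four unsigned bytes into one 32-bit lane.  */
  return _mm_setr_epi32 (_mm_extract_epi8 (x, 0) + _mm_extract_epi8 (x, 1)
                         + _mm_extract_epi8 (x, 2) + _mm_extract_epi8 (x, 3),
                         _mm_extract_epi8 (x, 4) + _mm_extract_epi8 (x, 5)
                         + _mm_extract_epi8 (x, 6) + _mm_extract_epi8 (x, 7),
                         _mm_extract_epi8 (x, 8) + _mm_extract_epi8 (x, 9)
                         + _mm_extract_epi8 (x, 10) + _mm_extract_epi8 (x, 11),
                         _mm_extract_epi8 (x, 12) + _mm_extract_epi8 (x, 13)
                         + _mm_extract_epi8 (x, 14) + _mm_extract_epi8 (x, 15));
}

Then with GCC 7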
we get:

haddd_epu8:
.LFB4966:
        .cfi_startproc
        pextrb  $1, %xmm0, %edx
        pextrb  $0, %xmm0, %eax
        addl    %edx, %eax
        pextrb  $2, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $3, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $5, %xmm0, %edx
        movl    %eax, -12(%rsp)
        pextrb  $4, %xmm0, %eax
        movd    -12(%rsp), %xmm1
        addl    %edx, %eax
        pextrb  $6, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $7, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $9, %xmm0, %edx
        pinsrd  $1, %eax, %xmm1
        pextrb  $8, %xmm0, %eax
        addl    %edx, %eax
        pextrb  $10, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $11, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $13, %xmm0, %edx
        pinsrd  $2, %eax, %xmm1
        pextrb  $12, %xmm0, %eax
        addl    %edx, %eax
        pextrb  $14, %xmm0, %edx
        addl    %eax, %edx
        pextrb  $15, %xmm0, %eax
        addl    %edx, %eax
        pinsrd  $3, %eax, %xmm1
        movdqa  %xmm1, %xmm0
        ret

which looks optimal to me.  The single stack use (the movl/movd round trip
through -12(%rsp)) is there because, by default, inter-unit moves between
general and SSE registers are disabled.  With -mtune=core-avx2 you'd get

haddd_epu8:
.LFB4966:
        .cfi_startproc
        pextrb  $1, %xmm0, %edx
        pextrb  $0, %xmm0, %eax
        addl    %edx, %eax
        pextrb  $2, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $3, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $5, %xmm0, %edx
        movd    %eax, %xmm1
        pextrb  $4, %xmm0, %eax
        addl    %edx, %eax
        pextrb  $6, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $7, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $9, %xmm0, %edx
        pinsrd  $1, %eax, %xmm1
        pextrb  $8, %xmm0, %eax
        addl    %edx, %eax
        pextrb  $10, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $11, %xmm0, %edx
        addl    %edx, %eax
        pextrb  $13, %xmm0, %edx
        pinsrd  $2, %eax, %xmm1
        pextrb  $12, %xmm0, %eax
        addl    %edx, %eax
        pextrb  $14, %xmm0, %edx
        addl    %eax, %edx
        pextrb  $15, %xmm0, %eax
        addl    %edx, %eax
        pinsrd  $3, %eax, %xmm1
        movdqa  %xmm1, %xmm0
        ret
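
For reference, a compile line along these lines should reproduce the second
variant (only -mtune=core-avx2 is taken from above; -O2, -msse4.1 and the
file name are my assumptions):

  gcc -O2 -msse4.1 -mtune=core-avx2 -S haddd.c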
