https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92645
--- Comment #13 from Richard Biener <rguenth at gcc dot gnu.org> --- So with all tricks I arrive at the following for the reduced testcase f: .LFB2: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movl %ecx, %r9d vpxor %xmm4, %xmm4, %xmm4 movl $255, %eax shrl $24, %r9d subl %r9d, %eax movq %rsp, %rbp .cfi_def_cfa_register 6 andq $-32, %rsp movl %eax, %r8d vmovdqa %ymm4, -32(%rsp) vmovd %ecx, %xmm4 shrl $7, %eax vpshufd $0, %xmm4, %xmm4 addl %r8d, %eax vmovaps %xmm4, -32(%rsp) vpmovzxbw -32(%rsp), %ymm2 vmovd %eax, %xmm0 vpbroadcastb %xmm0, %xmm0 vpsllw $8, %ymm2, %ymm2 vpaddw .LC1(%rip), %ymm2, %ymm2 testq %rdx, %rdx je .L10 vpxor %xmm5, %xmm5, %xmm5 vmovdqa .LC0(%rip), %xmm3 xorl %eax, %eax vmovdqa %ymm5, -32(%rsp) vmovaps %xmm0, -32(%rsp) vpmovzxbw -32(%rsp), %ymm4 .p2align 4,,10 .p2align 3 .L7: vmovdqu (%rsi,%rax), %xmm6 vpxor %xmm5, %xmm5, %xmm5 vmovdqa %ymm5, -32(%rsp) vmovaps %xmm6, -32(%rsp) vpmovzxbw -32(%rsp), %ymm0 vpmullw %ymm4, %ymm0, %ymm0 vpaddw %ymm2, %ymm0, %ymm0 vpsrlw $8, %ymm0, %ymm0 vmovdqa %xmm0, %xmm1 vextracti128 $0x1, %ymm0, %xmm0 vpand %xmm1, %xmm3, %xmm1 vpand %xmm0, %xmm3, %xmm0 vpackuswb %xmm0, %xmm1, %xmm0 vmovups %xmm0, (%rdi,%rax) addq $16, %rax subq $4, %rdx jne .L7 .L10: vzeroupper leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE2: .size f, .-f where the stack spills still look bad - somehow we don't like _60 = BIT_INSERT_EXPR <{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, _4, 0>; _61 = [vec_unpack_lo_expr] _60; which is "widening" _4 to double vector size when we know we'll just need the lowpart for the VEC_UNPACK_LO_EXPR. This _should_ translate to a mov %xmm, %ymm but somehow it doesn't. A small testcase for that is the zxt() function in the reduced testcase. Using an undef SSA name in place of the { 0, ... } vector doesn't help either. A simple VIEW_CONVERT isn't valid (it changes size).