https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115531
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
AVX512 produces

.L3:
        vmovdqu8        (%rsi), %zmm9{%k1}
        kshiftrq        $32, %k1, %k5
        kshiftrq        $48, %k1, %k4
        movl    %r9d, %eax
        vmovdqu32       128(%rcx), %zmm7{%k5}
        subl    %esi, %eax
        movl    $64, %edi
        vmovdqu32       128(%rdx), %zmm3{%k5}
        kshiftrq        $16, %k1, %k6
        addl    %r10d, %eax
        vmovdqu32       192(%rcx), %zmm8{%k4}
        cmpl    %edi, %eax
        vmovdqu32       192(%rdx), %zmm4{%k4}
        cmova   %edi, %eax
        addq    $64, %rsi
        addq    $256, %rcx
        vmovdqu32       -256(%rcx), %zmm5{%k1}
        vmovdqu32       (%rdx), %zmm1{%k1}
        vmovdqu32       -192(%rcx), %zmm6{%k6}
        vmovdqu32       64(%rdx), %zmm2{%k6}
        vpcmpb  $4, %zmm14, %zmm9, %k2
        kshiftrq        $32, %k2, %k3
        vpblendmd       %zmm7, %zmm3, %zmm10{%k3}
        kshiftrd        $16, %k3, %k3
        vpblendmd       %zmm8, %zmm4, %zmm0{%k3}
        vpblendmd       %zmm5, %zmm1, %zmm12{%k2}
        vmovdqu32       %zmm10, 128(%rdx){%k5}
        kshiftrd        $16, %k2, %k2
        vmovdqu32       %zmm0, 192(%rdx){%k4}
        vpblendmd       %zmm6, %zmm2, %zmm11{%k2}
        vpbroadcastb    %eax, %zmm0
        movl    %r9d, %eax
        vmovdqu32       %zmm12, (%rdx){%k1}
        subl    %esi, %eax
        addl    %r8d, %eax
        vmovdqu32       %zmm11, 64(%rdx){%k6}
        addq    $256, %rdx
        vpcmpub $6, %zmm13, %zmm0, %k1
        cmpl    $64, %eax
        ja      .L3

The vectorizer sees

  <bb 4> [local count: 955630225]:
  # i_26 = PHI <i_23(9), 0(16)>
  _1 = (long unsigned int) i_26;
  _2 = _1 * 4;
  _3 = c_17(D) + _2;
  res_18 = *_3;
  _4 = stride_14(D) + i_26;
  _5 = (long unsigned int) _4;
  _6 = _5 * 4;
  _7 = b_19(D) + _6;
  t_20 = *_7;
  _8 = a_21(D) + _1;
  _9 = *_8;
  _34 = _9 != 0;
  res_11 = _34 ? t_20 : res_18;
  *_3 = res_11;
  i_23 = i_26 + 1;
  if (n_16(D) > i_23)

I believe that to get proper vectorizer costing we want to have an
optimization phase that can take into account whether we use a masked
loop or not.  Note that your intended transform relies on identifying
the open-coded "conditional store"

  int res = c[i];
  if (a[i] != 0)
    res = t;
  c[i] = res;

As Andrew says when that's a .MASK_STORE it's going to be easier to
identify the opportunity.  So yeah, if-conversion could recognize this
pattern and produce a .MASK_STORE from it as a first step.