https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
Adding fully masked AVX512 and AVX512 with a masked epilog data:

  size   scalar     128     256     512    512e    512f
     1     9.42   11.32    9.35   11.17   15.13   16.89
     2     5.72    6.53    6.66    6.66    7.62    8.56
     3     4.49    5.10    5.10    5.74    5.08    5.73
     4     4.10    4.33    4.29    5.21    3.79    4.25
     6     3.78    3.85    3.86    4.76    2.54    2.85
     8     3.64    1.89    3.76    4.50    1.92    2.16
    12     3.56    2.21    3.75    4.26    1.26    1.42
    16     3.36    0.83    1.06    4.16    0.95    1.07
    20     3.39    1.42    1.33    4.07    0.75    0.85
    24     3.23    0.66    1.72    4.22    0.62    0.70
    28     3.18    1.09    2.04    4.20    0.54    0.61
    32     3.16    0.47    0.41    0.41    0.47    0.53
    34     3.16    0.67    0.61    0.56    0.44    0.50
    38     3.19    0.95    0.95    0.82    0.40    0.45
    42     3.09    0.58    1.21    1.13    0.36    0.40

Text sizes are not much different:

           1389    1837    2125    1629    1721    1689

The AVX2 size is large because we completely peel the scalar epilogue,
same for the SSE case.  The scalar epilogue of the 512 loop iterates
32 times (too many for peeling); the masked loop/epilogue are quite
large due to the EVEX-encoded instructions, so the saved scalar/vector
epilogues do not show.

The AVX512 masked epilogue case now looks like:

        .p2align 3
.L5:
        vmovdqu8        (%r8,%rax), %zmm0
        vpavgb  (%rsi,%rax), %zmm0, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax)
        addq    $64, %rax
        cmpq    %rcx, %rax
        jne     .L5
        movl    %edx, %ecx
        andl    $-64, %ecx
        testb   $63, %dl
        je      .L19
.L4:
        movl    %ecx, %eax
        subl    %ecx, %edx
        movl    $255, %ecx
        cmpl    %ecx, %edx
        cmova   %ecx, %edx
        vpbroadcastb    %edx, %zmm0
        vpcmpub $6, .LC0(%rip), %zmm0, %k1
        vmovdqu8        (%rsi,%rax), %zmm0{%k1}{z}
        vmovdqu8        (%r8,%rax), %zmm1{%k1}{z}
        vpavgb  %zmm1, %zmm0, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax){%k1}
.L19:
        vzeroupper
        ret

where there's a missed optimization around the saturation to 255.

The fully masked AVX512 loop is

        vmovdqa64       .LC0(%rip), %zmm3
        movl    $255, %eax
        cmpl    %eax, %ecx
        cmovbe  %ecx, %eax
        vpbroadcastb    %eax, %zmm0
        vpcmpub $6, %zmm3, %zmm0, %k1
        .p2align 4
        .p2align 3
.L4:
        vmovdqu8        (%rsi,%rax), %zmm1{%k1}
        vmovdqu8        (%r8,%rax), %zmm2{%k1}
        movl    %r10d, %edx
        movl    $255, %ecx
        subl    %eax, %edx
        cmpl    %ecx, %edx
        cmova   %ecx, %edx
        vpavgb  %zmm2, %zmm1, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax){%k1}
        vpbroadcastb    %edx, %zmm0
        addq    $64, %rax
        movl    %r9d, %edx
        subl    %eax, %edx
        vpcmpub $6, %zmm3, %zmm0, %k1
        cmpl    $64, %edx
        ja      .L4
        vzeroupper
        ret

which is a much larger loop body due to the mask creation.  At least
that interleaves nicely (dependence-wise) with the loop control and
the vectorized stmts.  What needs to be optimized somehow is what
IVOPTs makes out of the decreasing remaining-scalar-iterations IV
combined with the IV required for the memory accesses.  Without
IVOPTs the body looks like

.L4:
        vmovdqu8        (%rsi), %zmm1{%k1}
        vmovdqu8        (%rdx), %zmm2{%k1}
        movl    $255, %eax
        movl    %ecx, %r8d
        subl    $64, %ecx
        addq    $64, %rsi
        addq    $64, %rdx
        vpavgb  %zmm2, %zmm1, %zmm0
        vmovdqu8        %zmm0, (%rdi){%k1}
        addq    $64, %rdi
        cmpl    %eax, %ecx
        cmovbe  %ecx, %eax
        vpbroadcastb    %eax, %zmm0
        vpcmpub $6, %zmm3, %zmm0, %k1
        cmpl    $64, %r8d
        ja      .L4

and the key thing to optimize is

  ivtmp_78 = ivtmp_77 + 4294967232;  // -64
  _79 = MIN_EXPR <ivtmp_78, 255>;
  _80 = (unsigned char) _79;
  _81 = {_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
         _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
         _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
         _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
         _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
         _80, _80, _80, _80};

that is, we want to broadcast a saturated (to the vector element
precision) value.
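To make that concrete, here is a minimal sketch of what the per-iteration
mask computation amounts to, written with AVX512BW intrinsics.  The function
and variable names are purely illustrative (this is not code from the PR);
the point is that the MIN against 255 followed by the truncating broadcast
is exactly the "broadcast a saturated value" operation asked for above.

  #include <immintrin.h>

  /* The {0, 1, ..., 63} byte constant the generated code keeps in .LC0.  */
  static inline __m512i
  iota_bytes (void)
  {
    unsigned char idx[64];
    for (int i = 0; i < 64; i++)
      idx[i] = (unsigned char) i;
    return _mm512_loadu_si512 (idx);
  }

  /* Mask for one 64-byte iteration: lane i is active iff
     i < MIN (remaining, 255).  */
  static inline __mmask64
  remaining_to_mask (unsigned int remaining, __m512i iota_0_63)
  {
    /* _79 = MIN_EXPR <ivtmp, 255>; saturate so the byte broadcast
       below cannot wrap around.  */
    unsigned char sat = remaining > 255 ? 255 : (unsigned char) remaining;
    /* _81 = {sat, sat, ...};  vpbroadcastb  */
    __m512i bcast = _mm512_set1_epi8 ((char) sat);
    /* vpcmpub $6 (unsigned greater-than) against {0, ..., 63}.  */
    return _mm512_cmpgt_epu8_mask (bcast, iota_0_63);
  }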
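Assuming the masked epilogue is only ever entered with fewer than 64
elements left (the main loop handled all full 64-byte chunks), the MIN
against 255 is a no-op on that path, which is presumably the missed
optimization around the saturation noted above.  A sketch of the
simplified epilogue mask, in the same illustrative style as the
previous snippet:

  #include <immintrin.h>

  /* Epilogue mask when 1 <= remaining <= 63 is known: no saturation
     needed before the byte broadcast.  */
  static inline __mmask64
  epilogue_mask (unsigned int remaining, __m512i iota_0_63)
  {
    return _mm512_cmpgt_epu8_mask (_mm512_set1_epi8 ((char) remaining),
                                   iota_0_63);
  }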
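For completeness, the scalar kernel behind the vpavgb sequences is
presumably an unsigned-char averaging loop of roughly the following
shape; this is a guess for illustration, not necessarily the PR's
actual testcase.

  /* Rounding average of two byte arrays; the (a + b + 1) >> 1 idiom is
     what the vectorizer turns into vpavgb.  */
  void
  avg_loop (unsigned char *restrict dst, const unsigned char *restrict a,
            const unsigned char *restrict b, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = (a[i] + b[i] + 1) >> 1;
  }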