https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105066
--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
pinsrw is available under SSE2 for both register and memory operands, unlike pextrw, which requires SSE4.1 for memory operands.

10593(define_insn "vec_set<mode>_0"
10594  [(set (match_operand:V8_128 0 "register_operand"
10595          "=v,v,v,x,x,Yr,*x,x,x,x,v,v")
10596        (vec_merge:V8_128
10597          (vec_duplicate:V8_128
10598            (match_operand:<ssescalarmode> 2 "nonimmediate_operand"
10599          " r,m,v,r,m,Yr,*x,r,m,x,r,m"))
10600          (match_operand:V8_128 1 "reg_or_0_operand"
10601          " C,C,v,0,0,0 ,0 ,x,x,x,v,v")
10602          (const_int 1)))]
10603  "TARGET_SSE2"
10604  "@
10605   vmovw\t{%k2, %0|%0, %k2}
10606   vmovw\t{%2, %0|%0, %2}
10607   vmovsh\t{%2, %1, %0|%0, %1, %2}
10608   pinsrw\t{$0, %k2, %0|%0, %k2, 0}
10609   pinsrw\t{$0, %2, %0|%0, %2, 0}
10610   pblendw\t{$1, %2, %0|%0, %2, 1}
10611   pblendw\t{$1, %2, %0|%0, %2, 1}
10612   vpinsrw\t{$0, %k2, %1, %0|%0, %1, %k2, 0}
10613   vpinsrw\t{$0, %2, %1, %0|%0, %1, %2, 0}
10614   vpblendw\t{$1, %2, %1, %0|%0, %1, %2, 1}
10615   vpinsrw\t{$0, %k2, %1, %0|%0, %1, %k2, 0}
10616   vpinsrw\t{$0, %2, %1, %0|%0, %1, %2, 0}"
10617  [(set (attr "isa")
10618        (cond [(eq_attr "alternative" "0,1,2")
10619                 (const_string "avx512fp16")
10620               (eq_attr "alternative" "3")
10621                 (const_string "noavx")
10622               (eq_attr "alternative" "4,5,6")
10623                 (const_string "sse4_noavx")

Alternative 4 (pinsrw with a memory operand) doesn't require SSE4, yet it is grouped with alternatives 5 and 6 under "sse4_noavx".

As for performance: pinsrw mem > vmovd reg > pinsrw reg.

And yes, the code generated for the case below is suboptimal:

pmovzxbq(void*):  # -O3 -msse4.1 -mtune=haswell
        pxor    %xmm0, %xmm0          # 1 uop
        pinsrw  $0, (%rdi), %xmm0     # 2 uops, one for shuffle port
        pmovzxbq        %xmm0, %xmm0  # 1 uop for the same shuffle port
        ret