https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92819
--- Comment #8 from Richard Biener <rguenth at gcc dot gnu.org> ---
Update:

--- t2.s.9      2019-12-06 09:03:03.104331362 +0100
+++ t2.s.10     2019-12-06 09:02:52.700181055 +0100
@@ -6,8 +6,9 @@
 foo:
 .LFB0:
         .cfi_startproc
-        vunpckhpd       %xmm0, %xmm0, %xmm0
-        vmovhpd (%rdi), %xmm0, %xmm0
+        vbroadcastsd    (%rdi), %ymm1
+        vperm2f128      $48, %ymm1, %ymm0, %ymm0
+        vshufpd $11, %ymm1, %ymm0, %ymm0
         ret
         .cfi_endproc
 .LFE0:
@@ -29,8 +30,8 @@
 baz:
 .LFB2:
         .cfi_startproc
-        vunpckhpd       %xmm0, %xmm0, %xmm0
-        vmovhpd (%rdi), %xmm0, %xmm0
+        vmovddup        (%rdi), %xmm1
+        vunpckhpd       %xmm1, %xmm0, %xmm0
         ret
         .cfi_endproc
 .LFE2:
@@ -52,12 +53,12 @@
 corge:
 .LFB4:
         .cfi_startproc
-        vextractf128    $0x1, %ymm0, %xmm0
-        vunpckhpd       %xmm0, %xmm0, %xmm0
-        vmovhpd (%rdi), %xmm0, %xmm0
+        vbroadcastsd    (%rdi), %ymm1
+        vperm2f128      $49, %ymm1, %ymm0, %ymm0
+        vshufpd $11, %ymm1, %ymm0, %ymm0
         ret
         .cfi_endproc
 .LFE4:
         .size   corge, .-corge
-        .ident  "GCC: (GNU) 9.2.0"
+        .ident  "GCC: (GNU) 10.0.0 20191206 (experimental) [trunk revision 270575]"
         .section        .note.GNU-stack,"",@progbits

"foo" is clearly worse, "corge" very likely worse as well, and "baz" _might_
be slightly better (the load happens one instruction earlier, eventually
hiding some latency, and a dup from a load should be able to micro-fuse with
the load op, in my naive wishful thinking).

foo (v4df x, double * p)
{
  double _1;
  vector(4) double _5;
  vector(4) double _6;
  vector(2) double _7;

  <bb 2> [local count: 1073741824]:
  _1 = *p_4(D);
  _5 = {_1, _1, _1, _1};
  _6 = VEC_PERM_EXPR <x_2(D), _5, { 1, 5, 6, 7 }>;
  _7 = BIT_FIELD_REF <_6, 128, 0>;
  return _7;
}

baz (v2df x, double * p)
{
  double _1;
  vector(2) double _5;
  vector(2) double _6;

  <bb 2> [local count: 1073741824]:
  _1 = *p_4(D);
  _5 = {_1, _1};
  _6 = VEC_PERM_EXPR <x_2(D), _5, { 1, 3 }>;
  return _6;
}

corge (v4df x, double * p, v2df y)
{
  double _1;
  vector(4) double _5;
  vector(4) double _6;
  vector(2) double _7;

  <bb 2> [local count: 1073741824]:
  _1 = *p_4(D);
  _5 = {_1, _1, _1, _1};
  _6 = VEC_PERM_EXPR <x_2(D), _5, { 3, 5, 6, 7 }>;
  _7 = BIT_FIELD_REF <_6, 128, 0>;
  return _7;
}

Now disabling VEC_PERM generation won't be a 100% solution in general since
the user can present us with the above form directly as well (see the
__builtin_shuffle sketch at the end of this comment).

Looking at "corge", gcc9 expanded this as

  extract upper half of x
  extract lane 1 from this
  load the scalar
  concat both

so it performed the permute on smaller vectors rather than permuting the
originals and then selecting the lowpart (spelled out as intrinsics at the
end of this comment).

I'll see if I can cook up something for simplify_bitfield_ref.
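
For reference, the GIMPLE above corresponds to plain two-element
constructors taking one lane from a vector and one element from memory; a
minimal sketch reconstructed from the dumps (not necessarily this PR's
exact testcase, type names assumed):

typedef double v2df __attribute__ ((vector_size (16)));
typedef double v4df __attribute__ ((vector_size (32)));

/* Sketch only: forwprop turns constructors like { x[lane], *p } into the
   splat + VEC_PERM_EXPR (+ lowpart BIT_FIELD_REF) form quoted above.  */
v2df foo (v4df x, double *p) { return (v2df) { x[1], *p }; }
v2df baz (v2df x, double *p) { return (v2df) { x[1], *p }; }
v2df corge (v4df x, double *p, v2df y) { return (v2df) { x[3], *p }; }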
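
And the permute form the user can hand us directly, reusing the typedefs
above plus an index vector type (hypothetical user code, not from the PR):

typedef long long v4di __attribute__ ((vector_size (32)));

/* Hypothetical: the splat + permute + lowpart of "foo" written out by
   hand, so not generating VEC_PERM_EXPR in forwprop doesn't make the
   pattern go away.  */
v2df user_foo (v4df x, double *p)
{
  v4df b = { *p, *p, *p, *p };
  v4df t = __builtin_shuffle (x, b, (v4di) { 1, 5, 6, 7 });
  return (v2df) { t[0], t[1] };
}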
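
Finally, the gcc9 strategy for "corge" spelled out with intrinsics, just to
illustrate the smaller-vector permute (a sketch mirroring the old assembly
above, not GCC-internal code):

#include <immintrin.h>

/* Mirrors gcc9's vextractf128 / vunpckhpd / vmovhpd sequence: permute
   within 128 bits instead of permuting the full 256-bit vectors and
   selecting the lowpart.  */
__m128d corge_gcc9 (__m256d x, double *p)
{
  __m128d hi = _mm256_extractf128_pd (x, 1);  /* extract upper half of x */
  __m128d t = _mm_unpackhi_pd (hi, hi);       /* extract lane 1 from this */
  return _mm_loadh_pd (t, p);                 /* load scalar into the high lane */
}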