https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88278

--- Comment #6 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
I wonder about something like:
--- gcc/config/i386/sse.md.jj   2018-11-29 23:16:06.481301632 +0100
+++ gcc/config/i386/sse.md      2018-11-30 16:21:21.480379008 +0100
@@ -7248,6 +7248,17 @@
    (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
    (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])

+(define_insn "*vec_concatv4sf_0"
+  [(set (match_operand:V4SF 0 "register_operand"       "=v")
+       (vec_concat:V4SF
+         (match_operand:V2SF 1 "nonimmediate_operand" "xm")
+         (match_operand:V2SF 2 "const0_operand"       " C")))]
+  "TARGET_SSE2"
+  "%vmovq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DF")])
+
 ;; Avoid combining registers from different units in a single alternative,
 ;; see comment above inline_secondary_memory_needed function in i386.c
 (define_insn "vec_set<mode>_0"
@@ -14409,6 +14420,23 @@
    (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
    (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])

+(define_insn "*vec_concatv4si_0"
+  [(set (match_operand:V4SI 0 "register_operand"       "=v,x")
+       (vec_concat:V4SI
+         (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
+         (match_operand:V2SI 2 "const0_operand"       " C,C")))]
+  "TARGET_SSE2"
+  "@
+   %vmovq\t{%1, %0|%0, %1}
+   movq2dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex,orig")
+   (set_attr "mode" "TI")
+   (set (attr "preferred_for_speed")
+     (if_then_else (eq_attr "alternative" "1")
+       (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+       (symbol_ref "true")))])
+
 ;; movd instead of movq is required to handle broken assemblers.
 (define_insn "vec_concatv2di"
   [(set (match_operand:V2DI 0 "register_operand"

but the testcases from comment #0 don't compile for me with -O2 -msse2 -fgimple
(nor with -mavx), so I can't easily verify.

I don't see how we could get rid of those movq instructions for the v <- v,C
cases, unless we analyze whatever instruction generated the source operand and
prove that it leaves all the higher bits set to zero.  E.g. one could have a
v4si to v2si downcast (just picking the lowpart subreg) followed by
concatenating it with zero, and if we blindly drop the movq instruction, the
upper bits might be non-zero.

Reply via email to