On Mon, Feb 11, 2019 at 11:55 PM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> Emulate MMX punpcklXX/punpckhXX with SSE punpcklXX.  For MMX punpckhXX,
> move bits 64:127 to bits 0:63 in SSE register.  Only SSE register source
> operand is allowed.
>
>         PR target/89021
>         * config/i386/i386-protos.h (ix86_split_mmx_punpck): New
>         prototype.
>         * config/i386/i386.c (ix86_split_mmx_punpck): New function.
>         * config/i386/mmx.md (mmx_punpckhbw): Changed to
>         define_insn_and_split to support SSE emulation.
>         (mmx_punpcklbw): Likewise.
>         (mmx_punpckhwd): Likewise.
>         (mmx_punpcklwd): Likewise.
>         (mmx_punpckhdq): Likewise.
>         (mmx_punpckldq): Likewise.
> ---
>  gcc/config/i386/i386-protos.h |   1 +
>  gcc/config/i386/i386.c        |  77 +++++++++++++++++++
>  gcc/config/i386/mmx.md        | 138 ++++++++++++++++++++++------------
>  3 files changed, 168 insertions(+), 48 deletions(-)
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index bb96a420a85..dc7fc38d8e4 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -202,6 +202,7 @@ extern rtx ix86_split_stack_guard (void);
>
>  extern void ix86_move_vector_high_sse_to_mmx (rtx);
>  extern void ix86_split_mmx_pack (rtx[], enum rtx_code);
> +extern void ix86_split_mmx_punpck (rtx[], bool);
>
>  #ifdef TREE_CODE
>  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index b8d5ba7f28f..7d65192c1cd 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -20009,6 +20009,83 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
>    ix86_move_vector_high_sse_to_mmx (op0);
>  }
>
> +/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
> +
> +void
> +ix86_split_mmx_punpck (rtx operands[], bool high_p)
> +{
> +  rtx op0 = operands[0];
> +  rtx op1 = operands[1];
> +  rtx op2 = operands[2];
> +  machine_mode mode = GET_MODE (op0);
> +  rtx mask;
> +  /* The corresponding SSE mode.  */
> +  machine_mode sse_mode, double_sse_mode;
> +
> +  switch (mode)
> +    {
> +    case E_V8QImode:
> +      sse_mode = V16QImode;
> +      double_sse_mode = V32QImode;
> +      mask = gen_rtx_PARALLEL (VOIDmode,
> +                              gen_rtvec (16,
> +                                         GEN_INT (0), GEN_INT (16),
> +                                         GEN_INT (1), GEN_INT (17),
> +                                         GEN_INT (2), GEN_INT (18),
> +                                         GEN_INT (3), GEN_INT (19),
> +                                         GEN_INT (4), GEN_INT (20),
> +                                         GEN_INT (5), GEN_INT (21),
> +                                         GEN_INT (6), GEN_INT (22),
> +                                         GEN_INT (7), GEN_INT (23)));
> +      break;
> +
> +    case E_V4HImode:
> +      sse_mode = V8HImode;
> +      double_sse_mode = V16HImode;
> +      mask = gen_rtx_PARALLEL (VOIDmode,
> +                              gen_rtvec (8,
> +                                         GEN_INT (0), GEN_INT (8),
> +                                         GEN_INT (1), GEN_INT (9),
> +                                         GEN_INT (2), GEN_INT (10),
> +                                         GEN_INT (3), GEN_INT (11)));
> +      break;
> +
> +    case E_V2SImode:
> +      sse_mode = V4SImode;
> +      double_sse_mode = V8SImode;
> +      mask = gen_rtx_PARALLEL (VOIDmode,
> +                              gen_rtvec (4,
> +                                         GEN_INT (0), GEN_INT (4),
> +                                         GEN_INT (1), GEN_INT (5)));
> +      break;
> +
> +    default:
> +      gcc_unreachable ();
> +    }
> +
> +  /* Generate SSE punpcklXX.  */
> +  rtx dest = gen_rtx_REG (sse_mode, REGNO (op0));
> +  op1 = gen_rtx_REG (sse_mode, REGNO (op1));
> +  op2 = gen_rtx_REG (sse_mode, REGNO (op2));

Please use lowpart_subreg here instead of rebuilding the registers with gen_rtx_REG (sse_mode, REGNO (...)).

Uros.

> +
> +  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
> +  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
> +  rtx insn = gen_rtx_SET (dest, op2);
> +  emit_insn (insn);
> +
> +  if (high_p)
> +    {
> +      /* Move bits 64:127 to bits 0:63.  */
> +      mask = gen_rtx_PARALLEL (VOIDmode,
> +                              gen_rtvec (4, GEN_INT (2), GEN_INT (3),
> +                                         GEN_INT (0), GEN_INT (0)));
> +      dest = gen_rtx_REG (V4SImode, REGNO (dest));
> +      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
> +      insn = gen_rtx_SET (dest, op1);
> +      emit_insn (insn);
> +    }
> +}
> +
>  /* Helper function of ix86_fixup_binary_operands to canonicalize
>     operand order.  Returns true if the operands should be swapped.  */
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index 840d369ab02..034c6a855e0 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -1089,87 +1089,129 @@
>     (set_attr "type" "mmxshft,sselog,sselog")
>     (set_attr "mode" "DI,TI,TI")])
>
> -(define_insn "mmx_punpckhbw"
> -  [(set (match_operand:V8QI 0 "register_operand" "=y")
> +(define_insn_and_split "mmx_punpckhbw"
> +  [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv")
>         (vec_select:V8QI
>           (vec_concat:V16QI
> -           (match_operand:V8QI 1 "register_operand" "0")
> -           (match_operand:V8QI 2 "nonimmediate_operand" "ym"))
> +           (match_operand:V8QI 1 "register_operand" "0,0,Yv")
> +           (match_operand:V8QI 2 "nonimmediate_operand" "ym,x,Yv"))
>            (parallel [(const_int 4) (const_int 12)
>                       (const_int 5) (const_int 13)
>                       (const_int 6) (const_int 14)
>                       (const_int 7) (const_int 15)])))]
> -  "TARGET_MMX"
> -  "punpckhbw\t{%2, %0|%0, %2}"
> -  [(set_attr "type" "mmxcvt")
> -   (set_attr "mode" "DI")])
> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"
> +  "@
> +   punpckhbw\t{%2, %0|%0, %2}
> +   #
> +   #"
> +  "TARGET_MMX_WITH_SSE && reload_completed"
> +  [(const_int 0)]
> +  "ix86_split_mmx_punpck (operands, true); DONE;"
> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
> +   (set_attr "type" "mmxcvt,sselog,sselog")
> +   (set_attr "mode" "DI,TI,TI")])
>
> -(define_insn "mmx_punpcklbw"
> -  [(set (match_operand:V8QI 0 "register_operand" "=y")
> +(define_insn_and_split "mmx_punpcklbw"
> +  [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv")
>         (vec_select:V8QI
>           (vec_concat:V16QI
> -           (match_operand:V8QI 1 "register_operand" "0")
> -           (match_operand:V8QI 2 "nonimmediate_operand" "ym"))
> +           (match_operand:V8QI 1 "register_operand" "0,0,Yv")
> +           (match_operand:V8QI 2 "nonimmediate_operand" "ym,x,Yv"))
>            (parallel [(const_int 0) (const_int 8)
>                       (const_int 1) (const_int 9)
>                       (const_int 2) (const_int 10)
>                       (const_int 3) (const_int 11)])))]
> -  "TARGET_MMX"
> -  "punpcklbw\t{%2, %0|%0, %k2}"
> -  [(set_attr "type" "mmxcvt")
> -   (set_attr "mode" "DI")])
> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"
> +  "@
> +   punpcklbw\t{%2, %0|%0, %k2}
> +   #
> +   #"
> +  "TARGET_MMX_WITH_SSE && reload_completed"
> +  [(const_int 0)]
> +  "ix86_split_mmx_punpck (operands, false); DONE;"
> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
> +   (set_attr "type" "mmxcvt,sselog,sselog")
> +   (set_attr "mode" "DI,TI,TI")])
>
> -(define_insn "mmx_punpckhwd"
> -  [(set (match_operand:V4HI 0 "register_operand" "=y")
> +(define_insn_and_split "mmx_punpckhwd"
> +  [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv")
>         (vec_select:V4HI
>           (vec_concat:V8HI
> -           (match_operand:V4HI 1 "register_operand" "0")
> -           (match_operand:V4HI 2 "nonimmediate_operand" "ym"))
> +           (match_operand:V4HI 1 "register_operand" "0,0,Yv")
> +           (match_operand:V4HI 2 "nonimmediate_operand" "ym,x,Yv"))
>            (parallel [(const_int 2) (const_int 6)
>                       (const_int 3) (const_int 7)])))]
> -  "TARGET_MMX"
> -  "punpckhwd\t{%2, %0|%0, %2}"
> -  [(set_attr "type" "mmxcvt")
> -   (set_attr "mode" "DI")])
> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"
> +  "@
> +   punpckhwd\t{%2, %0|%0, %2}
> +   #
> +   #"
> +  "TARGET_MMX_WITH_SSE && reload_completed"
> +  [(const_int 0)]
> +  "ix86_split_mmx_punpck (operands, true); DONE;"
> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
> +   (set_attr "type" "mmxcvt,sselog,sselog")
> +   (set_attr "mode" "DI,TI,TI")])
>
> -(define_insn "mmx_punpcklwd"
> -  [(set (match_operand:V4HI 0 "register_operand" "=y")
> +(define_insn_and_split "mmx_punpcklwd"
> +  [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv")
>         (vec_select:V4HI
>           (vec_concat:V8HI
> -           (match_operand:V4HI 1 "register_operand" "0")
> -           (match_operand:V4HI 2 "nonimmediate_operand" "ym"))
> +           (match_operand:V4HI 1 "register_operand" "0,0,Yv")
> +           (match_operand:V4HI 2 "nonimmediate_operand" "ym,x,Yv"))
>            (parallel [(const_int 0) (const_int 4)
>                       (const_int 1) (const_int 5)])))]
> -  "TARGET_MMX"
> -  "punpcklwd\t{%2, %0|%0, %k2}"
> -  [(set_attr "type" "mmxcvt")
> -   (set_attr "mode" "DI")])
> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"
> +  "@
> +   punpcklwd\t{%2, %0|%0, %k2}
> +   #
> +   #"
> +  "TARGET_MMX_WITH_SSE && reload_completed"
> +  [(const_int 0)]
> +  "ix86_split_mmx_punpck (operands, false); DONE;"
> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
> +   (set_attr "type" "mmxcvt,sselog,sselog")
> +   (set_attr "mode" "DI,TI,TI")])
>
> -(define_insn "mmx_punpckhdq"
> -  [(set (match_operand:V2SI 0 "register_operand" "=y")
> +(define_insn_and_split "mmx_punpckhdq"
> +  [(set (match_operand:V2SI 0 "register_operand" "=y,x,Yv")
>         (vec_select:V2SI
>           (vec_concat:V4SI
> -           (match_operand:V2SI 1 "register_operand" "0")
> -           (match_operand:V2SI 2 "nonimmediate_operand" "ym"))
> +           (match_operand:V2SI 1 "register_operand" "0,0,Yv")
> +           (match_operand:V2SI 2 "nonimmediate_operand" "ym,x,Yv"))
>           (parallel [(const_int 1)
>                      (const_int 3)])))]
> -  "TARGET_MMX"
> -  "punpckhdq\t{%2, %0|%0, %2}"
> -  [(set_attr "type" "mmxcvt")
> -   (set_attr "mode" "DI")])
> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"
> +  "@
> +   punpckhdq\t{%2, %0|%0, %2}
> +   #
> +   #"
> +  "TARGET_MMX_WITH_SSE && reload_completed"
> +  [(const_int 0)]
> +  "ix86_split_mmx_punpck (operands, true); DONE;"
> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
> +   (set_attr "type" "mmxcvt,sselog,sselog")
> +   (set_attr "mode" "DI,TI,TI")])
>
> -(define_insn "mmx_punpckldq"
> -  [(set (match_operand:V2SI 0 "register_operand" "=y")
> +(define_insn_and_split "mmx_punpckldq"
> +  [(set (match_operand:V2SI 0 "register_operand" "=y,x,Yv")
>         (vec_select:V2SI
>           (vec_concat:V4SI
> -           (match_operand:V2SI 1 "register_operand" "0")
> -           (match_operand:V2SI 2 "nonimmediate_operand" "ym"))
> +           (match_operand:V2SI 1 "register_operand" "0,0,Yv")
> +           (match_operand:V2SI 2 "nonimmediate_operand" "ym,x,Yv"))
>           (parallel [(const_int 0)
>                      (const_int 2)])))]
> -  "TARGET_MMX"
> -  "punpckldq\t{%2, %0|%0, %k2}"
> -  [(set_attr "type" "mmxcvt")
> -   (set_attr "mode" "DI")])
> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"
> +  "@
> +   punpckldq\t{%2, %0|%0, %k2}
> +   #
> +   #"
> +  "TARGET_MMX_WITH_SSE && reload_completed"
> +  [(const_int 0)]
> +  "ix86_split_mmx_punpck (operands, false); DONE;"
> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
> +   (set_attr "type" "mmxcvt,sselog,sselog")
> +   (set_attr "mode" "DI,TI,TI")])
>
>  (define_expand "mmx_pinsrw"
>    [(set (match_operand:V4HI 0 "register_operand")
> --
> 2.20.1
>

Reply via email to