--- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.c | 76 +++++++++++++++++++++++++++++++++++++++++ gcc/config/i386/predicates.md | 7 ++++ gcc/config/i386/sse.md | 72 +++++++------------------------------- 4 files changed, 97 insertions(+), 59 deletions(-)
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index f300a56..431db6c 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -222,6 +222,7 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
 
 extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
 extern bool ix86_expand_pinsr (rtx *);
+extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 578a756..0dc08f3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -38438,6 +38438,82 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
   expand_vec_perm_even_odd_1 (&d, odd);
 }
 
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+  rtx op1_m1, op1_m2;	/* OP1, and OP1 shifted down one element.  */
+  rtx op2_m1, op2_m2;	/* Likewise for OP2 (or constant equivalents).  */
+  rtx res_1, res_2;	/* Products of elements 0+2 and of 1+3.  */
+
+  /* Compute OP0 = OP1 * OP2 (V4SImode) with two widening multiplies.
+     Shift each input down one element, so elements 3 and 1 occupy the
+     slots of elements 2 and 0; for K8, at least, faster than a shuffle.  */
+  op1_m1 = op1 = force_reg (V4SImode, op1);
+  op1_m2 = gen_reg_rtx (V4SImode);
+  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
+				 gen_lowpart (V1TImode, op1),
+				 GEN_INT (32)));
+
+  if (GET_CODE (op2) == CONST_VECTOR)
+    {
+      rtvec v;
+
+      /* Constant propagate the vector shift, leaving the don't-care
+	 vector elements as zero.  */
+      v = rtvec_alloc (4);
+      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
+      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
+      RTVEC_ELT (v, 1) = const0_rtx;
+      RTVEC_ELT (v, 3) = const0_rtx;
+      op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
+      op2_m1 = force_reg (V4SImode, op2_m1);
+
+      v = rtvec_alloc (4);
+      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
+      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
+      RTVEC_ELT (v, 1) = const0_rtx;
+      RTVEC_ELT (v, 3) = const0_rtx;
+      op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
+      op2_m2 = force_reg (V4SImode, op2_m2);
+    }
+  else
+    {
+      op2_m1 = op2 = force_reg (V4SImode, op2);
+      op2_m2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
+				     gen_lowpart (V1TImode, op2),
+				     GEN_INT (32)));
+    }
+
+  /* Widening multiply of elements 0+2, and 1+3.  */
+  res_1 = gen_reg_rtx (V4SImode);
+  res_2 = gen_reg_rtx (V4SImode);
+  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
+				     op1_m1, op2_m1));
+  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
+				     op1_m2, op2_m2));
+
+  /* Move the results in element 2 down to element 1; we don't care
+     what goes in elements 2 and 3.  Then we can merge the parts
+     back together with an interleave.
+
+     Note that two other sequences were tried:
+     (1) Use interleaves at the start instead of psrldq, which allows
+	 us to use a single shufps to merge things back at the end.
+     (2) Use shufps here to combine the two vectors, then pshufd to
+	 put the elements in the correct order.
+     In both cases the cost of the reformatting stall was too high
+     and the overall sequence slower.  */
+
+  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+				const0_rtx, const0_rtx));
+  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+				const0_rtx, const0_rtx));
+  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
+  /* RES_1 now refers to the interleave insn; note it as OP1 * OP2.  */
+  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
+}
+
 /* Expand an insert into a vector register through pinsr insn.
    Return true if successful.  */
 
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 92db809..f23e932 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -816,6 +816,13 @@
   return false;
 })
 
+;; Return true when OP is a nonimmediate or a vector constant.  Note
+;; that most vector constants are not legitimate operands, so we need
+;; to special-case this.
+(define_predicate "nonimmediate_or_const_vector_operand"
+  (ior (match_code "const_vector")
+       (match_operand 0 "nonimmediate_operand")))
+
 ;; Return true if OP is a register or a zero.
 (define_predicate "reg_or_0_operand"
   (ior (match_operand 0 "register_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6a8206a..1f6fdb4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5610,12 +5610,22 @@
 
 (define_expand "mul<mode>3"
   [(set (match_operand:VI4_AVX2 0 "register_operand")
-	(mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
-		       (match_operand:VI4_AVX2 2 "register_operand")))]
+	(mult:VI4_AVX2
+	  (match_operand:VI4_AVX2 1 "nonimmediate_operand")
+	  (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
   "TARGET_SSE2"
 {
   if (TARGET_SSE4_1 || TARGET_AVX)
-    ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    {
+      if (CONSTANT_P (operands[2]))	/* Allowed by operand 2's predicate.  */
+	operands[2] = force_const_mem (<MODE>mode, operands[2]);
+      ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    }
+  else	/* Neither SSE4.1 nor AVX: expand via the C helper and finish.  */
+    {
+      ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
+      DONE;
+    }
 })
 
 (define_insn "*<sse4_1_avx2>_mul<mode>3"
@@ -5633,62 +5643,6 @@
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn_and_split "*sse2_mulv4si3"
-  [(set (match_operand:V4SI 0 "register_operand")
-	(mult:V4SI (match_operand:V4SI 1 "register_operand")
-		   (match_operand:V4SI 2 "register_operand")))]
-  "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
-   && can_create_pseudo_p ()"
-  "#"
-  "&& 1"
-  [(const_int 0)]
-{
-  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
-  rtx op0, op1, op2;
-
-  op0 = operands[0];
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-  t3 = gen_reg_rtx (V4SImode);
-  t4 = gen_reg_rtx (V4SImode);
-  t5 = gen_reg_rtx (V4SImode);
-  t6 = gen_reg_rtx (V4SImode);
-  thirtytwo = GEN_INT (32);
-
-  /* Multiply elements 2 and 0.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
-				     op1, op2));
-
-  /* Shift both input vectors down one element, so that elements 3
-     and 1 are now in the slots for elements 2 and 0.  For K8, at
-     least, this is faster than using a shuffle.  */
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2),
-				 gen_lowpart (V1TImode, op1),
-				 thirtytwo));
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3),
-				 gen_lowpart (V1TImode, op2),
-				 thirtytwo));
-  /* Multiply elements 3 and 1.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
-				     t2, t3));
-
-  /* Move the results in element 2 down to element 1; we don't care
-     what goes in elements 2 and 3.  */
-  emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
-				const0_rtx, const0_rtx));
-  emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
-				const0_rtx, const0_rtx));
-
-  /* Merge the parts back together.  */
-  emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6));
-
-  set_unique_reg_note (get_last_insn (), REG_EQUAL,
-		       gen_rtx_MULT (V4SImode, operands[1], operands[2]));
-  DONE;
-})
-
 (define_insn_and_split "mul<mode>3"
   [(set (match_operand:VI8_AVX2 0 "register_operand")
 	(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
-- 
1.7.7.6