This patch would add a new middle-end representation for matching the
x264 narrowing clip idiom:
inline U_NT
clip_uint8 (S_WT x)
{
  return x & (~((U_NT)-1)) ? (-x) >> 31 : x;
}
which would be accessible through the <us>clip<m1><m2>2 define_expand
optabs.
For example, clipping int32_t to uint8_t would produce the following
results (a standalone scalar sketch follows the list):
* .NARROW_CLIP (254) => 254
* .NARROW_CLIP (255) => 255
* .NARROW_CLIP (65535) => 255
* .NARROW_CLIP (-1) => 0
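As a concrete illustration (not part of the patch), here is a minimal
standalone sketch of the idiom instantiated for int32_t -> uint8_t, checked
against the values above. It relies on GCC's documented arithmetic right
shift of negative values:

#include <assert.h>
#include <stdint.h>

/* The idiom with S_WT = int32_t and U_NT = uint8_t.  After integer
   promotion, ~((uint8_t)-1) is just ~255, i.e. the bits that do not fit
   in a uint8_t.  */
static inline uint8_t
clip_uint8 (int32_t x)
{
  /* Out of range: (-x) >> 31 is 0 for negative x and -1 (truncated to
     255) for positive x.  */
  return x & ~255 ? (-x) >> 31 : x;
}

int
main (void)
{
  assert (clip_uint8 (254) == 254);
  assert (clip_uint8 (255) == 255);
  assert (clip_uint8 (65535) == 255);
  assert (clip_uint8 (-1) == 0);
  return 0;
}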
Currently this patch only supports clipping to, and returning, an unsigned
narrow type. I'm unsure whether this is the best way to approach the
problem, since the existing .SAT_TRUNC optab performs a similar operation.
The main difference between .NARROW_CLIP and .SAT_TRUNC shows up in the
example above (clipping int32_t to uint8_t):
* .SAT_TRUNC (-1) => 255
* .NARROW_CLIP (-1) => 0
Using .SAT_TRUNC here would break the intended semantics of the source
code, which is why I thought a separate optab made sense. If there is a
better way to approach this that would reuse more of the .SAT_TRUNC
machinery, please let me know.
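For reference, here is a rough scalar model of the two behaviours for
int32_t -> uint8_t. It reflects my reading of the results quoted above
rather than either optab's actual implementation:

#include <assert.h>
#include <stdint.h>

/* Saturating truncation: the input is viewed as unsigned, so -1 becomes
   0xffffffff and saturates at the maximum.  */
static uint8_t
sat_trunc_u8 (int32_t x)
{
  uint32_t ux = (uint32_t) x;
  return ux > 255 ? 255 : (uint8_t) ux;
}

/* Narrow clip: clamp while the value is still signed, so negative
   inputs go to 0.  */
static uint8_t
narrow_clip_u8 (int32_t x)
{
  if (x < 0)
    return 0;
  return x > 255 ? 255 : (uint8_t) x;
}

int
main (void)
{
  assert (sat_trunc_u8 (-1) == 255);
  assert (narrow_clip_u8 (-1) == 0);
  assert (sat_trunc_u8 (300) == 255 && narrow_clip_u8 (300) == 255);
  return 0;
}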
PR target/120378
gcc/ChangeLog:
* config/riscv/autovec.md (sclip<mode><v_oct_trunc>2): New
pattern.
(uclip<mode><v_oct_trunc>2): Ditto.
(sclip<mode><v_quad_trunc>2): Ditto.
(uclip<mode><v_quad_trunc>2): Ditto.
(sclip<mode><v_double_trunc>2): Ditto.
(uclip<mode><v_double_trunc>2): Ditto.
* internal-fn.def (NARROW_CLIP): New ifn.
* match.pd: Match narrow clip idiom.
* optabs.def (uclip_optab, sclip_optab): New (un)signed narrow clip
optabs.
* rtl.def (S_NARROW_CLIP): New rtx code for narrow clip.
(U_NARROW_CLIP): Ditto.
* simplify-rtx.cc (simplify_const_unary_operation): Handle the new codes.
* tree-vect-patterns.cc (gimple_unsigned_integer_narrow_clip):
New pattern.
(gimple_signed_integer_narrow_clip): New pattern.
(vect_recog_narrow_clip_pattern): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/pr120378.c: New test.
Signed-off-by: Edwin Lu <[email protected]>
---
gcc/config/riscv/autovec.md | 73 +++++++++++++++++
gcc/internal-fn.def | 2 +
gcc/match.pd | 24 ++++++
gcc/optabs.def | 3 +
gcc/rtl.def | 6 ++
gcc/simplify-rtx.cc | 16 ++++
.../gcc.target/riscv/rvv/autovec/pr120378.c | 20 +++++
gcc/tree-vect-patterns.cc | 82 +++++++++++++++++++
8 files changed, 226 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 1fff8ac2fc4..95394f4dd15 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -3029,3 +3029,76 @@ (define_expand "uabd<mode>3"
DONE;
});
+
+; ========
+; == Narrow clip
+; ========
+
+(define_expand "sclip<mode><v_oct_trunc>2"
+ [(match_operand:<V_OCT_TRUNC> 0 "register_operand")
+ (match_operand:VOEXTI 1 "register_operand")]
+ "TARGET_VECTOR && 0"
+ {
+ gcc_assert(0);
+ });
+
+(define_expand "uclip<mode><v_oct_trunc>2"
+ [(match_operand:<V_OCT_TRUNC> 0 "register_operand")
+ (match_operand:VOEXTI 1 "register_operand") ]
+ "TARGET_VECTOR"
+ {
+ rtx max = gen_reg_rtx (<MODE>mode);
+ insn_code icode = code_for_pred (SMAX, <MODE>mode);
+ rtx ops1[] = {max, operands[1], CONST0_RTX (<MODE>mode)};
+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
+
+ riscv_vector::expand_vec_oct_ustrunc (operands[0], max, <MODE>mode,
+ <V_DOUBLE_TRUNC>mode,
+ <V_QUAD_TRUNC>mode);
+ DONE;
+ });
+
+(define_expand "sclip<mode><v_quad_trunc>2"
+ [(match_operand:<V_QUAD_TRUNC> 0 "register_operand")
+ (match_operand:VQEXTI 1 "register_operand")]
+ "TARGET_VECTOR && 0"
+ {
+ gcc_assert(0);
+ });
+
+(define_expand "uclip<mode><v_quad_trunc>2"
+ [(match_operand:<V_QUAD_TRUNC> 0 "register_operand")
+ (match_operand:VQEXTI 1 "register_operand") ]
+ "TARGET_VECTOR"
+ {
+ rtx max = gen_reg_rtx (<MODE>mode);
+ insn_code icode = code_for_pred (SMAX, <MODE>mode);
+ rtx ops1[] = {max, operands[1], CONST0_RTX (<MODE>mode)};
+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
+
+ riscv_vector::expand_vec_quad_ustrunc (operands[0], max, <MODE>mode,
+ <V_DOUBLE_TRUNC>mode);
+ DONE;
+ });
+
+(define_expand "sclip<mode><v_double_trunc>2"
+ [(match_operand:<V_DOUBLE_TRUNC> 0 "register_operand")
+ (match_operand:VWEXTI 1 "register_operand")]
+ "TARGET_VECTOR && 0"
+ {
+ gcc_assert(0);
+ });
+
+(define_expand "uclip<mode><v_double_trunc>2"
+ [(match_operand:<V_DOUBLE_TRUNC> 0 "register_operand")
+ (match_operand:VWEXTI 1 "register_operand") ]
+ "TARGET_VECTOR"
+ {
+ rtx max = gen_reg_rtx (<MODE>mode);
+ insn_code icode = code_for_pred (SMAX, <MODE>mode);
+ rtx ops1[] = {max, operands[1], CONST0_RTX (<MODE>mode)};
+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
+
+ riscv_vector::expand_vec_double_ustrunc (operands[0], max, <MODE>mode);
+ DONE;
+ });
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index d2480a1bf79..85f44a53729 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -286,6 +286,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_MUL, ECF_CONST, first, ssmul, usmul, binary)
DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_TRUNC, ECF_CONST, first, sstrunc, ustrunc,
unary_convert)
+DEF_INTERNAL_SIGNED_OPTAB_FN (NARROW_CLIP, ECF_CONST, first, sclip, uclip, unary_convert)
+
DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index 4903552c82a..73013bc1e29 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3360,6 +3360,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
}
(if (wi::eq_p (sum, wi::uhwi (0, precision))))))))
+/* Narrow clip for unsigned integer. */
+(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
+ (match (unsigned_integer_narrow_clip @0)
+ /* NARROW_CLIP = X & ~((NT)-1) ? (-X) >> 31 : X
+
+ The gimple representation uses X > (NT)(-1) instead of
+ using & so match on gt instead of bit_and. */
+ (convert (cond^ (gt (nop_convert? @0) INTEGER_CST@1)
+ (rshift:s (nop_convert? (negate (nop_convert? @0))) INTEGER_CST@2)
+ @0))
+ (if (! TYPE_UNSIGNED (TREE_TYPE (@0)))
+ (with
+ {
+ unsigned itype_precision = TYPE_PRECISION (TREE_TYPE (@0));
+ unsigned otype_precision = TYPE_PRECISION (type);
+ wide_int trunc_max = wi::mask (otype_precision, false, itype_precision);
+ wide_int int_cst_1 = wi::to_wide (@1, itype_precision);
+ wide_int int_cst_2 = wi::to_wide (@2, itype_precision);
+ wide_int shift_amount = wi::uhwi ((HOST_WIDE_INT_1U << 5) - 1,
+ itype_precision); // Aka 31
+ }
+ (if (otype_precision < itype_precision && wi::eq_p (trunc_max,
+ int_cst_1) && wi::eq_p(int_cst_2, shift_amount)))))))
+
/* Saturation truncate for unsigned integer. */
(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
(match (unsigned_integer_sat_trunc @0)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 87a8b85da15..b56e9e75a75 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -70,6 +70,9 @@ OPTAB_CL(satfractuns_optab, "satfractuns$I$b$Q$a2", UNSIGNED_SAT_FRACT, "satfrac
OPTAB_CL(ustrunc_optab, "ustrunc$b$a2", US_TRUNCATE, "ustrunc", NULL)
OPTAB_CL(sstrunc_optab, "sstrunc$b$a2", SS_TRUNCATE, "sstrunc", NULL)
+OPTAB_CL(uclip_optab, "uclip$b$a2", U_NARROW_CLIP, "uclip", NULL)
+OPTAB_CL(sclip_optab, "sclip$b$a2", S_NARROW_CLIP, "sclip", NULL)
+
OPTAB_CD(sfixtrunc_optab, "fix_trunc$F$b$I$a2")
OPTAB_CD(ufixtrunc_optab, "fixuns_trunc$F$b$I$a2")
diff --git a/gcc/rtl.def b/gcc/rtl.def
index 15ae7d10fcc..f3387aa8ea7 100644
--- a/gcc/rtl.def
+++ b/gcc/rtl.def
@@ -753,6 +753,12 @@ DEF_RTL_EXPR(SS_TRUNCATE, "ss_truncate", "e", RTX_UNARY)
/* Unsigned saturating truncate. */
DEF_RTL_EXPR(US_TRUNCATE, "us_truncate", "e", RTX_UNARY)
+/* Signed narrowing clip. */
+DEF_RTL_EXPR(S_NARROW_CLIP, "s_narrow_clip", "e", RTX_UNARY)
+
+/* Unsigned narrowing clip. */
+DEF_RTL_EXPR(U_NARROW_CLIP, "u_narrow_clip", "e", RTX_UNARY)
+
/* Floating point multiply/add combined instruction. */
DEF_RTL_EXPR(FMA, "fma", "eee", RTX_TERNARY)
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index cbe61b49bf6..a195ec502c5 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2179,6 +2179,22 @@ simplify_const_unary_operation (enum rtx_code code, machine_mode mode,
result = wide_int::from (result, width, sgn);
break;
}
+
+ case U_NARROW_CLIP:
+ case S_NARROW_CLIP:
+ {
+ signop sgn = code == U_NARROW_CLIP ? UNSIGNED : SIGNED;
+ wide_int nmax
+ = wide_int::from (wi::max_value (width, sgn),
+ GET_MODE_PRECISION (imode), sgn);
+ wide_int nmin
+ = wide_int::from (wi::min_value (width, sgn),
+ GET_MODE_PRECISION (imode), sgn);
+ result = wi::min (wi::max (op0, nmin, sgn), nmax, sgn);
+ result = wide_int::from (result, width, sgn);
+ break;
+ }
+
case SIGN_EXTEND:
result = wide_int::from (op0, width, SIGNED);
break;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c
new file mode 100644
index 00000000000..4cfedde99ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+inline uint8_t
+clip_uint8 (int x)
+{
+ return x & (~255) ? (-x) >> 31 : x;
+}
+
+void __attribute__ ((noipa))
+clip_loop (uint8_t *res, int *x, int w)
+{
+ for (int i = 0; i < w; i++)
+ res[i] = clip_uint8 (x[i]);
+}
+
+/* { dg-final { scan-tree-dump-times ".NARROW_CLIP " 1 "optimized" } } */
+/* { dg-final { scan-assembler-times {vnclipu\.wi} 2 } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 0f6d6b77ea1..31629a31b93 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3675,6 +3675,87 @@ vect_recog_cast_forwprop_pattern (vec_info *vinfo,
return pattern_stmt;
}
+extern bool gimple_unsigned_integer_narrow_clip (tree, tree*, tree (*)(tree));
+extern bool gimple_signed_integer_narrow_clip (tree, tree*, tree (*)(tree));
+
+/* Function vect_recog_narrow_clip_pattern
+
+ Try to find the following narrow clip pattern:
+
+ type x_t;
+ TYPE x_T, clip = init;
+ loop:
+ clip_0 = phi <init, clip_1>
+ S1 x_t = *p;
+ S2 temp_t = type_u x_t;
+ S3 neg_x_t = -temp_t;
+ S4 neg_signed_x_t = (type) neg_x_t;
+ S5 x_shifted = neg_signed_x_t >> 31;
+ S6 is_greater = x_shifted > 255;
+ S7 cond = is_greater ? x_shifted : x_t;
+ S8 clip_1 = (TYPE) cond;
+
+ where 'TYPE' is at most half the width of type 'type'.
+
+ Input:
+
+ * STMT_VINFO: The stmt from which the pattern search begins. In the
+ example, when this function is called with S8, the pattern
+ {S3,S4,S5,S6,S7,S8} will be detected.
+
+ Output:
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the sequence of
+ stmts that constitute the pattern. In this case it will be:
+ .NARROW_CLIP <x_t>
+ */
+
+static gimple *
+vect_recog_narrow_clip_pattern (vec_info *vinfo,
+ stmt_vec_info stmt_vinfo, tree *type_out)
+{
+
+ gimple *last_stmt = STMT_VINFO_STMT (stmt_vinfo);
+
+ if (!is_gimple_assign (last_stmt))
+ return NULL;
+
+ tree ops[1];
+ tree lhs = gimple_assign_lhs (last_stmt);
+ tree otype = TREE_TYPE (lhs);
+
+ if ((gimple_unsigned_integer_narrow_clip (lhs, ops, NULL))
+ // || gimple_signed_integer_narrow_clip (lhs, ops, NULL))
+ && type_has_mode_precision_p (otype))
+ {
+ tree itype = TREE_TYPE (ops[0]);
+ tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
+ tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
+ internal_fn fn = IFN_NARROW_CLIP;
+
+ if (v_itype != NULL_TREE && v_otype != NULL_TREE
+ && direct_internal_fn_supported_p (fn, tree_pair (v_otype, v_itype),
+ OPTIMIZE_FOR_BOTH))
+ {
+ gcall *call = gimple_build_call_internal (fn, 1, ops[0]);
+ tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
+
+ gimple_call_set_lhs (call, out_ssa);
+ gimple_call_set_nothrow (call, /* nothrow_p */ false);
+ gimple_set_location (call, gimple_location (last_stmt));
+
+ *type_out = v_otype;
+ vect_pattern_detected ("vect_recog_narrow_clip_pattern",
+ stmt_vinfo->stmt);
+
+ return call;
+ }
+ }
+
+ return NULL;
+}
+
/* Try to detect a shift left of a widened input, converting LSHIFT_EXPR
to WIDEN_LSHIFT_EXPR. See vect_recog_widen_op_pattern for details. */
@@ -6917,6 +6998,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
{ vect_recog_sat_add_pattern, "sat_add" },
{ vect_recog_sat_sub_pattern, "sat_sub" },
{ vect_recog_sat_trunc_pattern, "sat_trunc" },
+ { vect_recog_narrow_clip_pattern, "narrow_clip" },
{ vect_recog_gcond_pattern, "gcond" },
{ vect_recog_bool_pattern, "bool" },
/* This must come before mask conversion, and includes the parts
--
2.43.0