This patch adds support for vectors of booleans and uses them to represent MVE predicates, instead of HImode. Since the ABI mandates pred16_t (aka uint16_t) to represent predicates in intrinsics prototypes, we introduce a new "predicate" type qualifier so that we can map the relevant builtins' HImode arguments and return values to the appropriate vector of booleans (VxBI).
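For illustration only (this snippet is not part of the patch, and the helper name is made up), here is how a predicate crosses an intrinsic boundary at the C level, using the existing arm_mve.h intrinsics vcmpeqq_u32 and vaddq_m_u32 and assuming MVE is enabled (e.g. -march=armv8.1-m.main+mve): user code only ever manipulates mve_pred16_t (HImode), while the builtin expander can now use V4BI internally:

#include <arm_mve.h>

uint32x4_t
add_where_equal (uint32x4_t a, uint32x4_t b, uint32x4_t inactive)
{
  /* The comparison returns mve_pred16_t, i.e. an unsigned short
     (HImode), as the ABI mandates...  */
  mve_pred16_t p = vcmpeqq_u32 (a, b);
  /* ...and the predicated addition takes it back; with this patch the
     argument is remapped internally to a V4BI vector of booleans.  */
  return vaddq_m_u32 (inactive, a, b, p);
}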
We also have to update test_vector_ops_duplicate, because it iterates using an offset in bytes, whereas vectors of booleans would need an offset in bits: we simply skip the vec_merge test for such vectors (see the illustrative sketch after the patch).

2021-10-13  Christophe Lyon  <christophe.l...@foss.st.com>

gcc/
	PR target/100757
	PR target/101325
	* config/arm/arm-builtins.c (arm_type_qualifiers): Add
	qualifier_predicate.
	(arm_init_simd_builtin_types): Add new simd types.
	(arm_init_builtin): Map predicate vector arguments to HImode.
	(arm_expand_builtin_args): Move HImode predicate arguments to VxBI
	rtx.  Move return value to HImode rtx.
	* config/arm/arm-modes.def (V16BI, V8BI, V4BI): New modes.
	* config/arm/arm-simd-builtin-types.def (Pred1x16_t, Pred2x8_t,
	Pred4x4_t): New.
	* simplify-rtx.c (test_vector_ops_duplicate): Skip vec_merge test
	with vectors of booleans.

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 3a9ff8f26b8..771759f0cdd 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -92,7 +92,9 @@ enum arm_type_qualifiers
   qualifier_lane_pair_index = 0x1000,
   /* Lane indices selected in quadtuplets - must be within range of previous
      argument = a vector.  */
-  qualifier_lane_quadtup_index = 0x2000
+  qualifier_lane_quadtup_index = 0x2000,
+  /* MVE vector predicates.  */
+  qualifier_predicate = 0x4000
 };
 
 /* The qualifier_internal allows generation of a unary builtin from
@@ -1633,6 +1635,13 @@ arm_init_simd_builtin_types (void)
   arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node;
 
+  if (TARGET_HAVE_MVE)
+    {
+      arm_simd_types[Pred1x16_t].eltype = unsigned_intHI_type_node;
+      arm_simd_types[Pred2x8_t].eltype = unsigned_intHI_type_node;
+      arm_simd_types[Pred4x4_t].eltype = unsigned_intHI_type_node;
+    }
+
   for (i = 0; i < nelts; i++)
     {
       tree eltype = arm_simd_types[i].eltype;
@@ -1780,6 +1789,11 @@ arm_init_builtin (unsigned int fcode, arm_builtin_datum *d,
       if (qualifiers & qualifier_map_mode)
 	op_mode = d->mode;
 
+      /* MVE Predicates use HImode as mandated by the ABI: pred16_t is
+	 unsigned short.  */
+      if (qualifiers & qualifier_predicate)
+	op_mode = HImode;
+
       /* For pointers, we want a pointer to the basic type
 	 of the vector.  */
       if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode))
@@ -3024,6 +3038,11 @@ arm_expand_builtin_args (rtx target, machine_mode map_mode, int fcode,
 	case ARG_BUILTIN_COPY_TO_REG:
 	  if (POINTER_TYPE_P (TREE_TYPE (arg[argc])))
 	    op[argc] = convert_memory_address (Pmode, op[argc]);
+
+	  /* MVE uses mve_pred16_t (aka HImode) for vectors of predicates.  */
+	  if (GET_MODE_CLASS (mode[argc]) == MODE_VECTOR_BOOL)
+	    op[argc] = gen_lowpart (mode[argc], op[argc]);
+
 	  /*gcc_assert (GET_MODE (op[argc]) == mode[argc]); */
 	  if (!(*insn_data[icode].operand[opno].predicate)
 	      (op[argc], mode[argc]))
@@ -3229,6 +3248,13 @@ constant_arg:
       else
 	emit_insn (insn);
 
+  if (GET_MODE_CLASS (tmode) == MODE_VECTOR_BOOL)
+    {
+      rtx HItarget = gen_reg_rtx (HImode);
+      emit_move_insn (HItarget, gen_lowpart (HImode, target));
+      return HItarget;
+    }
+
   return target;
 }
 
diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def
index a5e74ba3943..b414a709a62 100644
--- a/gcc/config/arm/arm-modes.def
+++ b/gcc/config/arm/arm-modes.def
@@ -84,6 +84,11 @@ VECTOR_MODE (FLOAT, BF, 2);   /* V2BF.  */
 VECTOR_MODE (FLOAT, BF, 4);   /* V4BF.  */
 VECTOR_MODE (FLOAT, BF, 8);   /* V8BF.  */
 
+/* Predicates for MVE.  */
+VECTOR_BOOL_MODE (V16BI, 16, 2);
+VECTOR_BOOL_MODE (V8BI, 8, 2);
+VECTOR_BOOL_MODE (V4BI, 4, 2);
+
 /* Fraction and accumulator vector modes.  */
 VECTOR_MODES (FRACT, 4);      /* V4QQ  V2HQ */
 VECTOR_MODES (UFRACT, 4);     /* V4UQQ V2UHQ */
diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def
index c19a1b6e3eb..d3987985b4c 100644
--- a/gcc/config/arm/arm-simd-builtin-types.def
+++ b/gcc/config/arm/arm-simd-builtin-types.def
@@ -51,3 +51,7 @@
   ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20)
   ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20)
   ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20)
+
+  ENTRY (Pred1x16_t, V16BI, unsigned, 16, uint16, 21)
+  ENTRY (Pred2x8_t, V8BI, unsigned, 8, uint16, 21)
+  ENTRY (Pred4x4_t, V4BI, unsigned, 4, uint16, 21)
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index a719f57870f..7b3962339ba 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -7634,17 +7634,23 @@ test_vector_ops_duplicate (machine_mode mode, rtx scalar_reg)
 						  duplicate, last_par));
 
   /* Test a scalar subreg of a VEC_MERGE of a VEC_DUPLICATE.  */
-  rtx vector_reg = make_test_reg (mode);
-  for (unsigned HOST_WIDE_INT i = 0; i < const_nunits; i++)
+  /* Skip this test for vectors of booleans, because offset is in bytes,
+     while vec_merge indices are in elements (usually bits).  */
+  if (GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
     {
-      if (i >= HOST_BITS_PER_WIDE_INT)
-	break;
-      rtx mask = GEN_INT ((HOST_WIDE_INT_1U << i) | (i + 1));
-      rtx vm = gen_rtx_VEC_MERGE (mode, duplicate, vector_reg, mask);
-      poly_uint64 offset = i * GET_MODE_SIZE (inner_mode);
-      ASSERT_RTX_EQ (scalar_reg,
-		     simplify_gen_subreg (inner_mode, vm,
-					  mode, offset));
+      rtx vector_reg = make_test_reg (mode);
+      for (unsigned HOST_WIDE_INT i = 0; i < const_nunits; i++)
+	{
+	  if (i >= HOST_BITS_PER_WIDE_INT)
+	    break;
+	  rtx mask = GEN_INT ((HOST_WIDE_INT_1U << i) | (i + 1));
+	  rtx vm = gen_rtx_VEC_MERGE (mode, duplicate, vector_reg, mask);
+	  poly_uint64 offset = i * GET_MODE_SIZE (inner_mode);
+
+	  ASSERT_RTX_EQ (scalar_reg,
+			 simplify_gen_subreg (inner_mode, vm,
+					      mode, offset));
+	}
     }
 }
-- 
2.25.1
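To make the skipped test concrete, here is a tiny standalone sketch (assumed mode sizes, not part of the patch) of why byte offsets cannot address the elements of a vector of booleans: V16BI occupies 2 bytes in total, but its element mode BImode still reports a size of 1 byte, so the old loop's offsets overrun the vector after two iterations, whereas vec_merge indices count elements (here, bits):

#include <stdio.h>

int
main (void)
{
  const unsigned vector_bytes = 2;   /* GET_MODE_SIZE (V16BI): 16 1-bit elements.  */
  const unsigned element_bytes = 1;  /* GET_MODE_SIZE (BImode).  */

  for (unsigned i = 0; i < 16; i++)
    printf ("element %2u -> byte offset %2u (%s)\n",
	    i, i * element_bytes,
	    i * element_bytes < vector_bytes ? "in range" : "past the end");
  return 0;
}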