Hi,The attached patch implements vec_init for AArch64. This has been tested on aarch64-none-elf with no regressions. OK for trunk?
Thanks, Tejas Belagod ARM. 2013-01-07 Tejas Belagod <tejas.bela...@arm.com> gcc/ * config/aarch64/aarch64-simd.md (vec_init<mode>): New. * config/aarch64/aarch64-protos.h (aarch64_expand_vector_init): Declare. * config/aarch64/aarch64.c (aarch64_simd_dup_constant, aarch64_simd_make_constant, aarch64_expand_vector_init): New.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index bcd3bb1..e8859a0 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -138,6 +138,7 @@ HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned); bool aarch64_bitmask_imm (HOST_WIDE_INT val, enum machine_mode); bool aarch64_const_double_zero_rtx_p (rtx); bool aarch64_constant_address_p (rtx); +void aarch64_expand_vector_init (rtx, rtx); bool aarch64_function_arg_regno_p (unsigned); bool aarch64_gen_movmemqi (rtx *); bool aarch64_is_extend_from_extract (enum machine_mode, rtx, rtx); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index febf71d..c630808 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3527,3 +3527,14 @@ DONE; }) +;; Standard pattern name vec_init<mode>. + +(define_expand "vec_init<mode>" + [(match_operand:VALL 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_SIMD" +{ + aarch64_expand_vector_init (operands[0], operands[1]); + DONE; +}) + diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 7bc2f6b..29b8e64 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -6423,6 +6423,166 @@ aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed) return true; } +/* If VALS is a vector constant that can be loaded into a register + using DUP, generate instructions to do so and return an RTX to + assign to the register. Otherwise return NULL_RTX. */ +static rtx +aarch64_simd_dup_constant (rtx vals) +{ + enum machine_mode mode = GET_MODE (vals); + enum machine_mode inner_mode = GET_MODE_INNER (mode); + int n_elts = GET_MODE_NUNITS (mode); + bool all_same = true; + rtx x; + int i; + + if (GET_CODE (vals) != CONST_VECTOR) + return NULL_RTX; + + for (i = 1; i < n_elts; ++i) + { + x = CONST_VECTOR_ELT (vals, i); + if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0))) + all_same = false; + } + + if (!all_same) + return NULL_RTX; + + /* We can load this constant by using DUP and a constant in a + single ARM register. This will be cheaper than a vector + load. */ + x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0)); + return gen_rtx_VEC_DUPLICATE (mode, x); +} + + +/* Generate code to load VALS, which is a PARALLEL containing only + constants (for vec_init) or CONST_VECTOR, efficiently into a + register. Returns an RTX to copy into the register, or NULL_RTX + for a PARALLEL that can not be converted into a CONST_VECTOR. */ +rtx +aarch64_simd_make_constant (rtx vals) +{ + enum machine_mode mode = GET_MODE (vals); + rtx const_dup; + rtx const_vec = NULL_RTX; + int n_elts = GET_MODE_NUNITS (mode); + int n_const = 0; + int i; + + if (GET_CODE (vals) == CONST_VECTOR) + const_vec = vals; + else if (GET_CODE (vals) == PARALLEL) + { + /* A CONST_VECTOR must contain only CONST_INTs and + CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF). + Only store valid constants in a CONST_VECTOR. */ + for (i = 0; i < n_elts; ++i) + { + rtx x = XVECEXP (vals, 0, i); + if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) + n_const++; + } + if (n_const == n_elts) + const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)); + } + else + gcc_unreachable (); + + if (const_vec != NULL_RTX + && aarch64_simd_immediate_valid_for_move (const_vec, mode, NULL, NULL, + NULL, NULL, NULL)) + /* Load using MOVI/MVNI. */ + return const_vec; + else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX) + /* Loaded using DUP. */ + return const_dup; + else if (const_vec != NULL_RTX) + /* Load from constant pool. We can not take advantage of single-cycle + LD1 because we need a PC-relative addressing mode. */ + return const_vec; + else + /* A PARALLEL containing something not valid inside CONST_VECTOR. + We can not construct an initializer. */ + return NULL_RTX; +} + +void +aarch64_expand_vector_init (rtx target, rtx vals) +{ + enum machine_mode mode = GET_MODE (target); + enum machine_mode inner_mode = GET_MODE_INNER (mode); + int n_elts = GET_MODE_NUNITS (mode); + int n_var = 0, one_var = -1; + bool all_same = true; + rtx x, mem; + int i; + + x = XVECEXP (vals, 0, 0); + if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x)) + n_var = 1, one_var = 0; + + for (i = 1; i < n_elts; ++i) + { + x = XVECEXP (vals, 0, i); + if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x)) + ++n_var, one_var = i; + + if (!rtx_equal_p (x, XVECEXP (vals, 0, 0))) + all_same = false; + } + + if (n_var == 0) + { + rtx constant = aarch64_simd_make_constant (vals); + if (constant != NULL_RTX) + { + emit_move_insn (target, constant); + return; + } + } + + /* Splat a single non-constant element if we can. */ + if (all_same) + { + x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0)); + aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x)); + return; + } + + /* One field is non-constant. Load constant then overwrite varying + field. This is more efficient than using the stack. */ + if (n_var == 1) + { + rtx copy = copy_rtx (vals); + rtx index = GEN_INT (one_var); + enum insn_code icode; + + /* Load constant part of vector, substitute neighboring value for + varying element. */ + XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1); + aarch64_expand_vector_init (target, copy); + + /* Insert variable. */ + x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var)); + icode = optab_handler (vec_set_optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + emit_insn (GEN_FCN (icode) (target, x, index)); + return; + } + + /* Construct the vector in memory one field at a time + and load the whole vector. */ + mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); + for (i = 0; i < n_elts; i++) + emit_move_insn (adjust_address_nv (mem, inner_mode, + i * GET_MODE_SIZE (inner_mode)), + XVECEXP (vals, 0, i)); + emit_move_insn (target, mem); + +} + static unsigned HOST_WIDE_INT aarch64_shift_truncation_mask (enum machine_mode mode) {