In `store_bit_field_1`, when the value to be written and/or the
destination bitfield have vector modes, non-canonical subregs such as
`(subreg:V4SI (reg:V8SI x) 0)` are generated.  If one of them is a
scalar, this happens only when the scalar mode differs from the
vector's inner mode.
This patch prevents that by expanding through vec_set, vec_select and
vec_concat patterns where possible.
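
For illustration (little-endian, based on the new test below): storing a
V4SI value into the upper half of a V8SI register previously went through
a subreg of the destination; with this patch the expansion instead emits
roughly

  (set (reg:V8SI x)
       (vec_concat:V8SI
         (vec_select:V4SI (reg:V8SI x)
           (parallel [(const_int 0) (const_int 1)
                      (const_int 2) (const_int 3)]))
         (reg:V4SI y)))

which recog can match directly (e.g. vinserti128 on x86).
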
Bootstrapped/regtested on AArch64 and x86_64.
PR rtl-optimization/118873
gcc/ChangeLog:
* expmed.cc (generate_vec_concat): New function.
(store_bit_field_1): Check for cases where the value
to be written and/or the bitfield have vector modes
and try to generate the corresponding vec_set patterns
instead of subregs.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr118873.c: New test.
---
gcc/expmed.cc                            | 188 ++++++++++++++++++++++-
gcc/testsuite/gcc.target/i386/pr118873.c |  33 +++++
2 files changed, 214 insertions(+), 7 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr118873.c
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8cf10d9c73bf..8c641f55b9c6 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -740,6 +740,50 @@ store_bit_field_using_insv (const extraction_insn *insv, rtx op0,
return false;
}
+/* Helper function for store_bit_field_1, used when the value to be written
+ and the destination are both vectors. It extracts the elements of OP from
+ LOWER_BOUND to UPPER_BOUND using a vec_select and concatenates the
+ extracted elements with VALUE using a vec_concat.
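+
+ For example (illustrative, little-endian): with OP in V8SImode, VALUE in
+ V4SImode, LOWER_BOUND 0 and UPPER_BOUND 4, the returned rtx is roughly
+
+ (vec_concat:V8SI
+ (vec_select:V4SI OP (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)]))
+ VALUE). */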
+
+static rtx
+generate_vec_concat (machine_mode fieldmode, rtx op, rtx value,
+ HOST_WIDE_INT lower_bound,
+ HOST_WIDE_INT upper_bound)
+{
+ if (!VECTOR_MODE_P (fieldmode))
+ return NULL_RTX;
+
+ rtvec vec = rtvec_alloc (upper_bound - lower_bound);
+ machine_mode outermode = GET_MODE (op);
+
+ for (HOST_WIDE_INT i = lower_bound; i < upper_bound; ++i)
+ RTVEC_ELT (vec, i - lower_bound) = GEN_INT (i);
+ rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
+ rtx select = gen_rtx_VEC_SELECT (fieldmode, op, par);
+ if (BYTES_BIG_ENDIAN)
+ {
+ if (lower_bound > 0)
+ return gen_rtx_VEC_CONCAT (outermode, select, value);
+ else
+ return gen_rtx_VEC_CONCAT (outermode, value, select);
+ }
+ else
+ {
+ if (lower_bound > 0)
+ return gen_rtx_VEC_CONCAT (outermode, value, select);
+ else
+ return gen_rtx_VEC_CONCAT (outermode, select, value);
+ }
+}
+
/* A subroutine of store_bit_field, with the same arguments. Return true
if the operation could be implemented.
@@ -778,18 +822,148 @@ store_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum,
if (VECTOR_MODE_P (outermode)
&& !MEM_P (op0)
&& optab_handler (vec_set_optab, outermode) != CODE_FOR_nothing
- && fieldmode == innermode
- && known_eq (bitsize, GET_MODE_PRECISION (innermode))
&& multiple_p (bitnum, GET_MODE_PRECISION (innermode), &pos))
{
+ /* Cases where the destination's inner mode is not equal to the
+ value's mode need special treatment. */
+
class expand_operand ops[3];
enum insn_code icode = optab_handler (vec_set_optab, outermode);
- create_fixed_operand (&ops[0], op0);
- create_input_operand (&ops[1], value, innermode);
- create_integer_operand (&ops[2], pos);
- if (maybe_expand_insn (icode, 3, ops))
- return true;
+ /* A vector-mode subreg of a scalar register is not canonical RTL;
+ insertions into and extractions from vectors should use vector
+ operations instead. */
+ bool is_non_canon_subreg
+ = (GET_CODE (value) == SUBREG
+ && VECTOR_MODE_P (fieldmode)
+ && !VECTOR_MODE_P (GET_MODE (SUBREG_REG (value))));
+
+ /* If the value to be written is a memory expression or a non-canonical
+ scalar-to-vector subreg, don't try to generate a vec_set pattern;
+ fall through to the plain vec_set fallback below without touching
+ the operands. */
+ if (!MEM_P (value) && !is_non_canon_subreg)
+ {
+ if (VECTOR_MODE_P (fieldmode))
+ {
+ /* Handle the case where both the value to be written and the
+ destination are vectors. */
+
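+ /* E.g. (illustrative): inserting a V4SI value at element 4 of a V8SI
+ destination selects elements 0-3 of the destination and concatenates
+ the value after them, producing a full-width set that a single
+ instruction can match (vinserti128 on x86). */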
+ HOST_WIDE_INT op_elem_num
+ = GET_MODE_NUNITS (outermode).to_constant ();
+ rtx concat_rtx = value;
+ rtx_insn *last_insn = get_last_insn ();
+ /* Element index just past the inserted value. */
+ HOST_WIDE_INT index
+ = pos.to_constant ()
+ + (bitsize.to_constant () / GET_MODE_UNIT_BITSIZE (outermode));
+ /* If the store position is not at the start of the vector, keep the
+ pos elements that precede the value by selecting them and placing
+ the value after them, using a vec_concat. */
+ if (pos.to_constant () > 0)
+ concat_rtx = generate_vec_concat (fieldmode, op0, value, 0,
+ pos.to_constant ());
+
+ /* Reconstruct the rest of the vector, after the value. */
+ if (index < op_elem_num)
+ concat_rtx = generate_vec_concat (fieldmode, op0, concat_rtx,
+ index, op_elem_num);
+
+ rtx_insn *set_insn = emit_insn (gen_rtx_SET (op0, concat_rtx));
+
+ if (recog_memoized (set_insn) >= 0)
+ return true;
+ else
+ delete_insns_since (last_insn);
+ }
+ else if (fieldmode != innermode)
+ {
+ /* Handle the case where the destination is a vector and
+ the value's mode is different from the vector's inner
+ mode. We have to treat the bitfield insertion differently
+ depending on which of those modes is wider than the other. */
+
+ if (known_gt (GET_MODE_SIZE (fieldmode),
+ GET_MODE_SIZE (innermode)))
+ {
+ /* If the value's mode is wider than the vector's inner
+ mode, extract a part from the value with size equal
+ to the vector's inner mode size and write it in the
+ appropriate position inside the vector, using a vec_set
+ pattern. Repeat, until the whole value is written. */
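+ /* E.g. (illustrative): writing a DImode value into a V4SImode
+ destination becomes two SImode vec_set insertions. */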
+
+ unsigned int curr_pos = 0;
+ bool failed = false;
+ rtx_insn *last_insn = get_last_insn ();
+ while (curr_pos < bitsize.to_constant ())
+ {
+ rtx tmp_reg = gen_reg_rtx (innermode);
+ unsigned int innermode_size = GET_MODE_BITSIZE (innermode);
+ rtx bitfield
+ = extract_bit_field (value, innermode_size,
+ curr_pos, 0, NULL_RTX, innermode,
+ innermode, false, 0);
+
+ store_bit_field_1 (tmp_reg, innermode_size, 0, 0,
+ 0, innermode, bitfield, false, false,
+ false);
+
+ HOST_WIDE_INT index
+ = pos.to_constant () + curr_pos / innermode_size;
+ create_fixed_operand (&ops[0], op0);
+ create_input_operand (&ops[1], tmp_reg, innermode);
+ create_integer_operand (&ops[2], index);
+ if (!maybe_expand_insn (icode, 3, ops))
+ {
+ failed = true;
+ break;
+ }
+
+ curr_pos += innermode_size;
+ }
+
+ if (!failed)
+ return true;
+ else
+ delete_insns_since (last_insn);
+ }
+ else if (known_lt (GET_MODE_SIZE (fieldmode),
+ GET_MODE_SIZE (innermode)))
+ {
+ /* If the value's mode is narrower than the vector's inner
+ mode, extend the value's mode to the vector's inner
+ mode and use a vec_set pattern for the insertion. */
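+ /* E.g. (illustrative): a HImode value stored into a V4SImode
+ destination is first zero-extended to SImode. */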
+
+ rtx ext_value = gen_rtx_ZERO_EXTEND (innermode, value);
+
+ create_fixed_operand (&ops[0], op0);
+ create_input_operand (&ops[1], ext_value, innermode);
+ create_integer_operand (&ops[2], pos);
+ if (maybe_expand_insn (icode, 3, ops))
+ return true;
+ }
+ }
+ }
+
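+ /* Finally, the pre-existing path: the value's mode matches the
+ vector's inner mode, so a single vec_set suffices. */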
+ if (fieldmode == innermode
+ && known_eq (bitsize, GET_MODE_PRECISION (innermode)))
+ {
+ create_fixed_operand (&ops[0], op0);
+ create_input_operand (&ops[1], value, innermode);
+ create_integer_operand (&ops[2], pos);
+ if (maybe_expand_insn (icode, 3, ops))
+ return true;
+ }
}
/* If the target is a register, overwriting the entire object, or storing
diff --git a/gcc/testsuite/gcc.target/i386/pr118873.c b/gcc/testsuite/gcc.target/i386/pr118873.c
new file mode 100644
index 000000000000..3a07c7cc87f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr118873.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -favoid-store-forwarding" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+
+v8si a;
+v4si b;
+
+/*
+** foo:
+** ...
+** vmovdqa a\(%rip\), %ymm0
+** vmovdqa b\(%rip\), %xmm1
+** vmovdqa %ymm0, \(%rdi\)
+** vmovdqa 16\(%rdi\), %ymm0
+** vmovdqa %xmm1, 32\(%rdi\)
+** vinserti128 \$0x1, %xmm1, %ymm0, %ymm0
+** vmovdqa %ymm0, a\(%rip\)
+** vzeroupper
+** ret
+** ...
+*/
+void foo (int *p)
+{
+ v8si aa = a;
+ v4si bb = b;
+ *(v8si *)p = a;
+ *(v4si *)(p + 8) = b;
+ a = *(v8si *)(p + 4);
+}
+
--
2.49.0