May I please ping this one? https://gcc.gnu.org/pipermail/gcc-patches/2023-February/612152.html
Many Thanks! Victor On 2/16/23 15:48, Victor L. Do Nascimento wrote: > Hi all, > > The back-end pattern for mapping the auto-vectorized representation of > vector * scalar to the machine instruction VMUL was missing, and > multiple instructions were needed to reproduce this behavior as a > result of a failed RTL pattern match in the combine pass. > > RTL patterns were introduced to reproduce the behavior of the > intrinsics vmulq_n_<mode> and vmulq_n_f<mode>. > > In the case of literal constants, an intermediate instruction was > added to the initial RTL expansion to ensure a general-purpose register > was allocated to store the constant, which could then be extracted > from the constant vector. > > For the function > > void test_vmulimm_s32x4 (int32_t * __restrict__ dest, int32_t *a) > { > int i; > for (i=0; i<4; i++) { > dest[i] = a[i] * 5; > } > } > > > The GIMPLE -> RTL expansion is modified to produce: > (set (reg:SI 119) > (const_int 5 [0x5])) > (set (reg:V4SI 118) > (mult:V4SI (vec_duplicate:V4SI (reg:SI 119)) > (reg:V4SI 117))) > > instead of: > (set (reg:V4SI 119) > (const_vector:V4SI [ > (const_int 5 [0x5]) repeated x4 > ])) > (set (reg:V4SI 118) > (mult:V4SI (reg:V4SI 117) > (reg:V4SI 119))) > > The end assembly for the above function introduces the emission of the > following insn: > vmul.i32 q3, q3, r3 > > as opposed to: > vmul.i32 q3, q3, q2 > > All tests in gcc.target/arm/simd/mve-vmul-scalar-1.c now pass. > > Added new RTL templates, amended unit test and checked for regressions on > arm-none-eabi. > > Thanks, > Victor > > gcc: > * gcc/config/arm/arm.cc (neon_vdup_constant): static keyword > removed. > * gcc/config/arm/arm-protos.h (neon_vdup_constant): prototype > added. > * gcc/config/arm/mve.md (@mve_vmulq_n_<mode>2): New. > * gcc/config/arm/predicates.md (reg_or_mve_replicated_const_operand): > New. > * gcc/config/arm/vec-common.md (mul<mode>3): Modify to use > `reg_or_mve_replicated_const_operand'. 
> > testsuite: > * gcc.target/arm/simd/mve-vmul-scalar-1.c: Corrected typo, > xfails removed. > --- > gcc/config/arm/arm-protos.h | 1 + > gcc/config/arm/arm.cc | 2 +- > gcc/config/arm/mve.md | 11 +++++++++++ > gcc/config/arm/predicates.md | 8 ++++++++ > gcc/config/arm/vec-common.md | 14 ++++++++++++-- > .../gcc.target/arm/simd/mve-vmul-scalar-1.c | 13 ++++++------- > 6 files changed, 39 insertions(+), 10 deletions(-) > > diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h > index aea472bfbb9..4cf9fb00e01 100644 > --- a/gcc/config/arm/arm-protos.h > +++ b/gcc/config/arm/arm-protos.h > @@ -199,6 +199,7 @@ extern rtx arm_load_tp (rtx); > extern bool arm_coproc_builtin_available (enum unspecv); > extern bool arm_coproc_ldc_stc_legitimate_address (rtx); > extern rtx arm_stack_protect_tls_canary_mem (bool); > +extern rtx neon_vdup_constant (rtx, bool); > > > #if defined TREE_CODE > diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc > index efc48349dd3..7d9d265b0a7 100644 > --- a/gcc/config/arm/arm.cc > +++ b/gcc/config/arm/arm.cc > @@ -13301,7 +13301,7 @@ neon_pairwise_reduce (rtx op0, rtx op1, machine_mode > mode, > If this is the case, and GENERATE is set, we also generate > instructions to do this and return an RTX to assign to the register. 
*/ > > -static rtx > +rtx > neon_vdup_constant (rtx vals, bool generate) > { > machine_mode mode = GET_MODE (vals); > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md > index 555ad1b66c8..806c24e33aa 100644 > --- a/gcc/config/arm/mve.md > +++ b/gcc/config/arm/mve.md > @@ -1376,6 +1376,17 @@ > [(set_attr "type" "mve_move") > ]) > > +(define_insn "@mve_vmulq_n_<mode>2" > + [ > + (set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w") > + (mult:MVE_VLD_ST (vec_duplicate:MVE_VLD_ST (match_operand:<V_elem> 1 > "s_register_operand" "r")) > + (match_operand:MVE_VLD_ST 2 > "s_register_operand" "w"))) > + ] > + "TARGET_HAVE_MVE" > + "vmul.%#<V_if_elem>\t%q0, %q2, %r1" > + [(set_attr "type" "mve_move") > +]) > + > ;; > ;; [vmulq_u, vmulq_s]) > ;; > diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md > index 3139750c606..31eadfa2d3b 100644 > --- a/gcc/config/arm/predicates.md > +++ b/gcc/config/arm/predicates.md > @@ -113,6 +113,14 @@ > && neon_immediate_valid_for_logic (op, mode, 1, NULL, NULL)); > }) > > +(define_predicate "reg_or_mve_replicated_const_operand" > + (if_then_else (and (match_test "TARGET_HAVE_MVE") > + (match_code "const_vector") > + (match_test "const_vec_duplicate_p (op)")) > + (match_operand 0 "immediate_operand") > + (match_operand 0 "s_register_operand")) > +) > + > (define_predicate "neon_inv_logic_op2" > (ior (match_operand 0 "imm_for_neon_inv_logic_operand") > (match_operand 0 "s_register_operand"))) > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > index f06df4db636..17b67c214b4 100644 > --- a/gcc/config/arm/vec-common.md > +++ b/gcc/config/arm/vec-common.md > @@ -102,12 +102,22 @@ > (define_expand "mul<mode>3" > [(set (match_operand:VDQWH 0 "s_register_operand") > (mult:VDQWH (match_operand:VDQWH 1 "s_register_operand") > - (match_operand:VDQWH 2 "s_register_operand")))] > + (match_operand:VDQWH 2 > "reg_or_mve_replicated_const_operand")))] > "ARM_HAVE_<MODE>_ARITH > && 
(!TARGET_REALLY_IWMMXT > || <MODE>mode == V4HImode > || <MODE>mode == V2SImode)" > -) > +{ > + if ((GET_CODE (operands[2]) == CONST_VECTOR) && can_create_pseudo_p () > + && (VALID_MVE_SI_MODE (<MODE>mode) || VALID_MVE_SF_MODE (<MODE>mode))) > + { > + rtx tmp = gen_reg_rtx (<V_elem>mode); > + emit_move_insn (tmp, neon_vdup_constant (operands[2], 0)); > + emit_insn (maybe_gen_mve_vmulq_n_2 (<MODE>mode, operands[0], tmp, > + operands[1])); > + DONE; > + } > +}) > > (define_expand "smin<mode>3" > [(set (match_operand:VALLW 0 "s_register_operand") > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c > b/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c > index 22be452e8d9..0736847a96d 100644 > --- a/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c > @@ -24,9 +24,9 @@ FUNC_IMM(u, uint, 8, 16, *, vmulimm) > > /* For the moment we do not select the T2 vmul variant operating on a scalar > final argument. */ > -/* { dg-final { scan-assembler-times {vmul\.i32\tq[0-9]+, q[0-9]+, r[0-9]+} > 2 { xfail *-*-* } } } */ > -/* { dg-final { scan-assembler-times {vmul\.i16\tq[0-9]+, q[0-9]+, r[0-9]+} > 2 { xfail *-*-* } } } */ > -/* { dg-final { scan-assembler-times {vmul\.i8\tq[0-9]+, q[0-9]+, r[0-9]+} 2 > { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times {vmul\.i32\tq[0-9]+, q[0-9]+, r[0-9]+} > 2 } } */ > +/* { dg-final { scan-assembler-times {vmul\.i16\tq[0-9]+, q[0-9]+, r[0-9]+} > 2 } } */ > +/* { dg-final { scan-assembler-times {vmul\.i8\tq[0-9]+, q[0-9]+, r[0-9]+} 2 > } } */ > > void test_vmul_f32 (float * dest, float * a, float * b) { > int i; > @@ -40,16 +40,15 @@ void test_vmulimm_f32 (float * dest, float * a) { > dest[i] = a[i] * 5.0; > } > } > -/* { dg-final { scan-assembler-times {vmul\.f32\tq[0-9]+, q[0-9]+, r[0-9]+} > 2 { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times {vmul\.f32\tq[0-9]+, q[0-9]+, r[0-9]+} > 2 } } */ > > void test_vmul_f16 (__fp16 * dest, 
__fp16 * a, __fp16 * b) { > int i; > for (i=0; i<8; i++) { > - dest[i] = a[i] * b[i]; > + dest[i] = a[i] * b[1]; > } > } > > -/* Note that dest[i] = a[i] * 5.0f16 is not vectorized. */ > void test_vmulimm_f16 (__fp16 * dest, __fp16 * a) { > int i; > __fp16 b = 5.0f16; > @@ -57,4 +56,4 @@ void test_vmulimm_f16 (__fp16 * dest, __fp16 * a) { > dest[i] = a[i] * b; > } > } > -/* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, r[0-9]+} > 2 { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, r[0-9]+} > 2 } } */