Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Richard Biener <richard.guent...@gmail.com> writes:
>> On Tue, Aug 3, 2021 at 2:10 PM Richard Sandiford via Gcc-patches
>> <gcc-patches@gcc.gnu.org> wrote:
>>>
>>> The issue-based vector costs currently assume that a multiply-add
>>> sequence can be implemented using a single instruction.  This is
>>> generally true for scalars (which have a 4-operand instruction)
>>> and SVE (which allows the output to be tied to any input).
>>> However, for Advanced SIMD, multiplying two values and adding
>>> an invariant will end up being a move and an MLA.
>>>
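
[For reference, the shape being costed here is the usual two-statement
multiply-add, i.e. GIMPLE along the lines of (hypothetical SSA names):

    _1 = _2 * _3;        // MULT_EXPR
    _4 = _1 + _inv;      // PLUS_EXPR; _inv defined outside the loop

Scalar FMADD writes to an independent fourth register, and SVE can use
MOVPRFX to the same effect, but Advanced SIMD FMLA must overwrite its
accumulator operand, hence the extra MOV in the example below when the
invariant has to stay live across iterations.]
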
>>> The only target to use the issue-based vector costs is Neoverse V1,
>>> which would generally prefer SVE in this case anyway.  I therefore
>>> don't have a self-contained testcase.  However, the distinction
>>> becomes more important with a later patch.
>>
>> But we do cost any invariants separately (for the prologue), so they
>> should be available in a register.  How doesn't that work?
>
> Yeah, that works, and the prologue part is costed correctly.  But the
> result of an Advanced SIMD FMLA is tied to the accumulator input, so if
> the accumulator input is an invariant, we need a register move (in the
> loop body) before the FMLA.
>
> E.g. for:
>
> void
> f (float *restrict x, float *restrict y, float *restrict z, float a)
> {
>   for (int i = 0; i < 100; ++i)
>     x[i] = y[i] * z[i];

That line should end with "+ 1.0"; not sure where that went.

> }
>
> the scalar code is:
>
> .L2:
>         ldr     s1, [x1, x3]
>         ldr     s2, [x2, x3]
>         fmadd   s1, s1, s2, s0
>         str     s1, [x0, x3]
>         add     x3, x3, 4
>         cmp     x3, 400
>         bne     .L2
>
> the SVE code is:
>
> .L2:
>         ld1w    z1.s, p0/z, [x1, x3, lsl 2]
>         ld1w    z0.s, p0/z, [x2, x3, lsl 2]
>         fmad    z0.s, p1/m, z1.s, z2.s
>         st1w    z0.s, p0, [x0, x3, lsl 2]
>         add     x3, x3, x5
>         whilelo p0.s, w3, w4
>         b.any   .L2
>
> but the Advanced SIMD code is:
>
> .L2:
>         mov     v0.16b, v3.16b   // <-------- boo
>         ldr     q2, [x2, x3]
>         ldr     q1, [x1, x3]
>         fmla    v0.4s, v2.4s, v1.4s
>         str     q0, [x0, x3]
>         add     x3, x3, 16
>         cmp     x3, 400
>         bne     .L2
>
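
[Two asides on the listings above.  First, note how SVE sidesteps the
problem: FMAD ties the result to one of the multiplicand inputs (z0
here), so the invariant addend can stay untouched in z2.s across
iterations.  Second, for contrast, consider a hypothetical variant in
which the accumulator is not invariant:

  void
  g (float *restrict x, float *restrict y, float *restrict z)
  {
    /* Hypothetical example, not from the original message: the
       accumulator x[i] is reloaded every iteration.  */
    for (int i = 0; i < 100; ++i)
      x[i] += y[i] * z[i];
  }

Here the FMLA can overwrite the accumulator vector loaded from x[]
each iteration, so the Advanced SIMD loop shouldn't need the extra
MOV.]
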
> Thanks,
> Richard
>
>
>>
>>> gcc/
>>>         * config/aarch64/aarch64.c (aarch64_multiply_add_p): Add a vec_flags
>>>         parameter.  Detect cases in which an Advanced SIMD MLA would almost
>>>         certainly require a MOV.
>>>         (aarch64_count_ops): Update accordingly.
>>> ---
>>>  gcc/config/aarch64/aarch64.c | 25 ++++++++++++++++++++++---
>>>  1 file changed, 22 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>>> index 084f8caa0da..19045ef6944 100644
>>> --- a/gcc/config/aarch64/aarch64.c
>>> +++ b/gcc/config/aarch64/aarch64.c
>>> @@ -14767,9 +14767,12 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
>>>
>>>  /* Return true if STMT_INFO is the second part of a two-statement multiply-add
>>>     or multiply-subtract sequence that might be suitable for fusing into a
>>> -   single instruction.  */
>>> +   single instruction.  If VEC_FLAGS is zero, analyze the operation as
>>> +   a scalar one, otherwise analyze it as an operation on vectors with those
>>> +   VEC_* flags.  */
>>>  static bool
>>> -aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info)
>>> +aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
>>> +                       unsigned int vec_flags)
>>>  {
>>>    gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
>>>    if (!assign)
>>> @@ -14797,6 +14800,22 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info)
>>>        if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
>>>         continue;
>>>
>>> +      if (vec_flags & VEC_ADVSIMD)
>>> +       {
>>> +         /* Scalar and SVE code can tie the result to any FMLA input (or none,
>>> +            although that requires a MOVPRFX for SVE).  However, Advanced SIMD
>>> +            only supports MLA forms, so will require a move if the result
>>> +            cannot be tied to the accumulator.  The most important case in
>>> +            which this is true is when the accumulator input is invariant.  */
>>> +         rhs = gimple_op (assign, 3 - i);
>>> +         if (TREE_CODE (rhs) != SSA_NAME)
>>> +           return false;
>>> +         def_stmt_info = vinfo->lookup_def (rhs);
>>> +         if (!def_stmt_info
>>> +             || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
>>> +           return false;
>>> +       }
>>> +
>>>        return true;
>>>      }
>>>    return false;
>>> @@ -15232,7 +15251,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
>>>      }
>>>
>>>    /* Assume that multiply-adds will become a single operation.  */
>>> -  if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info))
>>> +  if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
>>>      return;
>>>
>>>    /* When costing scalar statements in vector code, the count already
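
[A note on the new hunk, for anyone reading along: i indexes the two
operands of the PLUS_EXPR (1 and 2), so

  rhs = gimple_op (assign, 3 - i);

picks out the other operand, i.e. the accumulator candidate.  Rejecting
accumulators with vect_external_def definitions is what catches the
invariant case from the example above: aarch64_multiply_add_p then
returns false, and aarch64_count_ops stops assuming a single fused
operation.]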
