https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106553

            Bug ID: 106553
           Summary: pre-register allocation scheduler is not RMW aware
           Product: gcc
           Version: 11.3.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64*

The following example is minimized from the math routines in glibc:

#include <arm_neon.h>

/* Short alias for the NEON float32x4_t vector type used by the reproducer.  */
typedef float32x4_t v_f32_t;

/* Thin wrapper around vfmaq_f32 that takes the addend Z last and forwards it
   as the intrinsic's first (accumulator) argument, with X and Y as the two
   multiplicands.  */
static inline v_f32_t
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
{
  return vfmaq_f32 (z, x, y);
}

/* Reduced test case: a chain of dependent fused multiply-adds built on
   r2 = r * r.  The statement order and expression shapes are part of the
   reproducer, so they should be kept exactly as written.  */
v_f32_t
__v_sinf (v_f32_t x,v_f32_t z, v_f32_t n, v_f32_t r)
{
  v_f32_t r2, y;
  r2 = r * r;
  y = v_fma_f32 (n, r2, x);
  y = v_fma_f32 (y, r2, x);
  /* The next two statements consume the same y and r2; only the addend
     differs (z for r, x for the new y).  */
  r = v_fma_f32 (y, r2, z);
  y = v_fma_f32 (y, r2, x);
  /* Here r is both a multiplicand and the addend.  */
  y = v_fma_f32 (y * r2, r, r);

  return y;
}

Here we generate at -O2:

__v_sinf(__Float32x4_t, __Float32x4_t, __Float32x4_t, __Float32x4_t):
        fmul    v3.4s, v3.4s, v3.4s
        mov     v5.16b, v0.16b
        mov     v4.16b, v0.16b
        fmla    v5.4s, v2.4s, v3.4s
        fmla    v4.4s, v5.4s, v3.4s
        fmla    v0.4s, v4.4s, v3.4s
        mov     v2.16b, v0.16b
        mov     v0.16b, v1.16b
        fmla    v0.4s, v4.4s, v3.4s
        fmul    v3.4s, v3.4s, v2.4s
        fmla    v0.4s, v3.4s, v0.4s
        ret

The 3rd move is there because the pre-register allocation scheduler created a
false dependency by scheduling the fmul after the fmla. This forces reload
to insert a copy to keep `v0` alive after the destructive operation.

With -O2 -fno-schedule-insns we get:

__v_sinf(__Float32x4_t, __Float32x4_t, __Float32x4_t, __Float32x4_t):
        fmul    v3.4s, v3.4s, v3.4s
        mov     v4.16b, v0.16b
        fmla    v0.4s, v2.4s, v3.4s
        mov     v2.16b, v4.16b
        fmla    v2.4s, v0.4s, v3.4s
        mov     v0.16b, v1.16b
        fmla    v4.4s, v2.4s, v3.4s
        fmla    v0.4s, v2.4s, v3.4s
        fmul    v3.4s, v3.4s, v4.4s
        fmla    v0.4s, v3.4s, v0.4s
        ret

In glibc these additional moves cost double-digit percentages in performance
by breaking up the fmla chains.


Should we perhaps use a special RMW scheduling attribute to make it treat the
last input as an output too?

Reply via email to