https://gcc.gnu.org/bugzilla/show_bug.cgi?id=74585

            Bug ID: 74585
           Summary: [5/6/7] Tree-sra forces parameters to memory causing
                    awful code generation
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: wschmidt at gcc dot gnu.org
  Target Milestone: ---

For the PowerPC64 ELF V2 ABI, homogeneous aggregates of vectors (any
combination of structs and arrays containing only vectors) are passed in the
first eight vector registers.  Tree-sra does not understand this and performs
scalarization on such aggregates, forcing them to memory in the process.  This
causes unnecessary stores/reloads to the stack on function entry and exit,
resulting in terrible performance.
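
(To illustrate the "any combination" part of that rule -- this is a sketch,
not the failing case below -- a nested aggregate such as

typedef struct
{
  __vector double pair[2];              /* two vector members */
  struct { __vector double v; } inner;  /* a third vector member */
} vnested_t;

still counts as a homogeneous aggregate of vectors and is passed entirely in
vector registers.)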

As an example, consider:

--- SNIP ---
#define VEC_DW_H (1)
#define VEC_DW_L (0)

typedef struct
{
  __vector double vx0;
  __vector double vx1;
  __vector double vx2;
  __vector double vx3;
} vdoublex8_t;

vdoublex8_t
test_vecd8_rotate_left (vdoublex8_t a)
{
  __vector double temp;
  vdoublex8_t result;

  temp = a.vx0;

  /* Copy low dword of vx0 and high dword of vx1 to vx0 high / low.  */
  result.vx0[VEC_DW_H] = a.vx0[VEC_DW_L];
  result.vx0[VEC_DW_L] = a.vx1[VEC_DW_H];
  /* Copy low dword of vx1 and high dword of vx2 to vx1 high / low.  */
  result.vx1[VEC_DW_H] = a.vx1[VEC_DW_L];
  result.vx1[VEC_DW_L] = a.vx2[VEC_DW_H];
  /* Copy low dword of vx2 and high dword of vx3 to vx2 high / low.  */
  result.vx2[VEC_DW_H] = a.vx2[VEC_DW_L];
  result.vx2[VEC_DW_L] = a.vx3[VEC_DW_H];
  /* Copy low dword of vx3 and high dword of vx0 to vx3 high / low.  */
  result.vx3[VEC_DW_H] = a.vx3[VEC_DW_L];
  result.vx3[VEC_DW_L] = temp[VEC_DW_H];

  return (result);
}
--- SNIP ---

After 031t.forwprop, we have:


;; Function test_vecd8_rotate_left (test_vecd8_rotate_left, funcdef_no=0,
decl_uid=2364, cgraph_uid=0, symbol_order=0)

test_vecd8_rotate_left (struct vdoublex8_t a)
{
  struct vdoublex8_t result;
  __vector double temp;
  struct vdoublex8_t D.2369;
  __vector double _1;
  double _2;
  double _3;
  double _4;
  double _5;
  double _6;
  double _7;
  double _8;
  double _9;

  <bb 2>:
  _1 = a.vx0;
  temp_23 = _1;
  _2 = BIT_FIELD_REF <a.vx0, 64, 0>;
  BIT_FIELD_REF <result.vx0, 64, 64> = _2;
  _3 = BIT_FIELD_REF <a.vx1, 64, 64>;
  BIT_FIELD_REF <result.vx0, 64, 0> = _3;
  _4 = BIT_FIELD_REF <a.vx1, 64, 0>;
  BIT_FIELD_REF <result.vx1, 64, 64> = _4;
  _5 = BIT_FIELD_REF <a.vx2, 64, 64>;
  BIT_FIELD_REF <result.vx1, 64, 0> = _5;
  _6 = BIT_FIELD_REF <a.vx2, 64, 0>;
  BIT_FIELD_REF <result.vx2, 64, 64> = _6;
  _7 = BIT_FIELD_REF <a.vx3, 64, 64>;
  BIT_FIELD_REF <result.vx2, 64, 0> = _7;
  _8 = BIT_FIELD_REF <a.vx3, 64, 0>;
  BIT_FIELD_REF <result.vx3, 64, 64> = _8;
  _9 = BIT_FIELD_REF <_1, 64, 64>;
  BIT_FIELD_REF <result.vx3, 64, 0> = _9;
  D.2369 = result;
  result ={v} {CLOBBER};
  return D.2369;

}

but after 032t.esra, we have:

test_vecd8_rotate_left (struct vdoublex8_t a)
{
  __vector double result$vx3;
  __vector double result$vx2;
  __vector double result$vx1;
  __vector double result$vx0;
  __vector double a$vx3;
  __vector double a$vx2;
  __vector double a$vx1;
  __vector double a$vx0;
  struct vdoublex8_t result;
  __vector double temp;
  struct vdoublex8_t D.2369;
  __vector double _1;
  double _2;
  double _3;
  double _4;
  double _5;
  double _6;
  double _7;
  double _8;
  double _9;
  __vector double _11;
  __vector double _21;
  __vector double _25;
  __vector double _26;

  <bb 2>:
  a$vx0_27 = MEM[(struct  *)&a];
  a$vx1_28 = MEM[(struct  *)&a + 16B];
  a$vx2_29 = MEM[(struct  *)&a + 32B];
  a$vx3_30 = MEM[(struct  *)&a + 48B];
  _1 = a$vx0_27;
  temp_23 = _1;
  _2 = BIT_FIELD_REF <a$vx0_27, 64, 0>;
  BIT_FIELD_REF <result$vx0, 64, 64> = _2;
  _3 = BIT_FIELD_REF <a$vx1_28, 64, 64>;
  BIT_FIELD_REF <result$vx0, 64, 0> = _3;
  _4 = BIT_FIELD_REF <a$vx1_28, 64, 0>;
  BIT_FIELD_REF <result$vx1, 64, 64> = _4;
  _5 = BIT_FIELD_REF <a$vx2_29, 64, 64>;
  BIT_FIELD_REF <result$vx1, 64, 0> = _5;
  _6 = BIT_FIELD_REF <a$vx2_29, 64, 0>;
  BIT_FIELD_REF <result$vx2, 64, 64> = _6;
  _7 = BIT_FIELD_REF <a$vx3_30, 64, 64>;
  BIT_FIELD_REF <result$vx2, 64, 0> = _7;
  _8 = BIT_FIELD_REF <a$vx3_30, 64, 0>;
  BIT_FIELD_REF <result$vx3, 64, 64> = _8;
  _9 = BIT_FIELD_REF <_1, 64, 64>;
  BIT_FIELD_REF <result$vx3, 64, 0> = _9;
  _21 = result$vx0;
  MEM[(struct  *)&D.2369] = _21;
  _11 = result$vx1;
  MEM[(struct  *)&D.2369 + 16B] = _11;
  _25 = result$vx2;
  MEM[(struct  *)&D.2369 + 32B] = _25;
  _26 = result$vx3;
  MEM[(struct  *)&D.2369 + 48B] = _26;
  result$vx0 ={v} {CLOBBER};
  result$vx1 ={v} {CLOBBER};
  result$vx2 ={v} {CLOBBER};
  result$vx3 ={v} {CLOBBER};
  return D.2369;

}

I will spare you the terrible assembly code we get as a result; suffice it to
say that it is filled with unnecessary memory accesses, even though all of this
logic can be done efficiently in registers.
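
For comparison, here is a whole-vector formulation of the same rotate using
GCC's generic __builtin_shuffle (my own sketch, not part of the test case);
each step is a single doubleword permute, so nothing in the logic requires
memory:

/* Sketch only: whole-vector form of the rotate above.  The selector
   indices follow the VEC_DW_H/VEC_DW_L numbering used in the test case
   (element 1 = high doubleword, element 0 = low doubleword).  */
typedef __vector long long vselect_t;

vdoublex8_t
test_vecd8_rotate_left_shuffle (vdoublex8_t a)
{
  /* Result element 0 takes the high doubleword of the second operand
     (index 2 + VEC_DW_H in the concatenation of the two operands);
     result element 1 takes the low doubleword of the first operand
     (index VEC_DW_L).  */
  const vselect_t sel = { 2 + VEC_DW_H, VEC_DW_L };
  vdoublex8_t result;

  result.vx0 = __builtin_shuffle (a.vx0, a.vx1, sel);
  result.vx1 = __builtin_shuffle (a.vx1, a.vx2, sel);
  result.vx2 = __builtin_shuffle (a.vx2, a.vx3, sel);
  result.vx3 = __builtin_shuffle (a.vx3, a.vx0, sel);
  return result;
}

Of course, the point of this report is that the element-wise original should
not need to be rewritten this way to get reasonable code.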

I'm not familiar with the tree-sra code, but a quick scan indicates there isn't
any sort of target hook to indicate when pushing parameters to memory is a bad
idea.  I'm guessing we need one.  Or is there another way that this behavior
can be avoided?

There are a number of criteria in find_param_candidates to preclude
scalarization, but nothing seems to obviously fit the condition of "the ABI in
effect requests that you leave this guy alone."  The issue is complicated by
the fact that scalarization would indeed be a reasonable thing to do if we have
run out of protocol registers for a parameter; but I am willing to give that up
if we can avoid lousy code in the common case.
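
(For concreteness -- illustrative only, if I am reading the eight-register
rule correctly -- something like

typedef struct
{
  __vector double v[9];
} vdoublex18_t;

has more vector members than there are parameter registers available for it,
so it would not be passed entirely in registers, and scalarizing such a
parameter is far less of a concern.)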

Our ABI has the same issues if we replace the vectors with float, double, or
long double.
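
For example, a double-based analogue (sketch only) would look like:

typedef struct
{
  double d0;
  double d1;
  double d2;
  double d3;
} ddoublex4_t;

ddoublex4_t
test_d4_swap_pairs (ddoublex4_t a)
{
  /* Swap each pair of doubles; the argument and return value are
     homogeneous aggregates of doubles, passed in FPRs under ELF V2.  */
  ddoublex4_t result = { a.d1, a.d0, a.d3, a.d2 };
  return result;
}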

Any thoughts on how this should be addressed?
