https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88760

            Bug ID: 88760
           Summary: GCC unrolling is suboptimal
           Product: gcc
           Version: 9.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
                CC: rguenth at gcc dot gnu.org
  Target Milestone: ---

One of the hot loops in 510.parest_r from SPEC2017 can be approximated by the following testcase, essentially a sparse matrix-vector product over a CSR-style layout:
unsigned int *colnums;
double *val;

struct foostruct
{
  unsigned int rows;
  unsigned int *colnums;
  unsigned int *rowstart;
};

struct foostruct *cols;

void
foo (double *dst, const double *src)
{
  const unsigned int n_rows = cols->rows;
  const double *val_ptr = &val[cols->rowstart[0]];
  const unsigned int *colnum_ptr = &cols->colnums[cols->rowstart[0]];

  double *dst_ptr = dst;
  for (unsigned int row = 0; row < n_rows; ++row)
    {
      double s = 0.;
      const double *const val_end_of_row = &val[cols->rowstart[row+1]];
      /* One FP accumulator: a single fmadd dependency chain per row.  */
      while (val_ptr != val_end_of_row)
        s += *val_ptr++ * src[*colnum_ptr++];
      *dst_ptr++ = s;
    }
}
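
For reference, a minimal driver along these lines (values invented here, not taken from parest) exercises the kernel when appended to the testcase above:

#include <stdio.h>

/* Hypothetical 3x3 sparse matrix: row 0 has entries in columns
   0 and 2, row 1 in column 1, row 2 in columns 0, 1 and 2.  */
static unsigned int colnums_data[] = { 0, 2, 1, 0, 1, 2 };
static unsigned int rowstart_data[] = { 0, 2, 3, 6 };
static double val_data[] = { 1., 2., 3., 4., 5., 6. };
static struct foostruct cols_data = { 3, colnums_data, rowstart_data };

int
main (void)
{
  cols = &cols_data;
  val = val_data;
  double src[3] = { 1., 1., 1. };
  double dst[3];
  foo (dst, src);   /* dst = M * src; expect 3 3 15.  */
  printf ("%g %g %g\n", dst[0], dst[1], dst[2]);
  return 0;
}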


At -Ofast -mcpu=cortex-a57 on aarch64, GCC generates a tight FMA loop for the inner reduction:
.L4:
        ldr     w3, [x7, x2, lsl 2]
        cmp     x6, x2
        ldr     d2, [x5, x2, lsl 3]
        add     x2, x2, 1
        ldr     d1, [x1, x3, lsl 3]
        fmadd   d0, d2, d1, d0
        bne     .L4


LLVM unrolls the loop more intelligently. In GCC's loop every fmadd feeds the single accumulator d0, so throughput is bounded by the fmadd latency; LLVM unrolls by four and splits the reduction across two accumulators (d1 and d2), overlapping the two dependency chains:
.LBB0_8:                                // %vector.body
                                        //   Parent Loop BB0_2 Depth=1
                                        // =>  This Inner Loop Header: Depth=2
        ldp     w21, w22, [x20, #-8]
        ldr     d5, [x1, x21, lsl #3]
        ldp     d3, d4, [x7, #-16]
        ldr     d6, [x1, x22, lsl #3]
        ldp     w21, w22, [x20], #16
        fmadd   d2, d6, d4, d2
        fmadd   d1, d5, d3, d1
        ldr     d5, [x1, x21, lsl #3]
        ldr     d6, [x1, x22, lsl #3]
        add     x5, x5, #4              // =4
        adds    x19, x19, #2            // =2
        ldp     d3, d4, [x7], #32
        fmadd   d1, d5, d3, d1
        fmadd   d2, d6, d4, d2
        b.ne    .LBB0_8

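That corresponds roughly to the following source-level rewrite (a sketch only, not LLVM's actual output; row_sum and its parameters are made-up names, and the reassociation is only valid under -ffast-math, which -Ofast implies):

static double
row_sum (const double *v, const unsigned int *c,
         const double *src, unsigned int n)
{
  /* Unroll by four with two independent accumulators so the
     two FMA chains can run in parallel; a scalar loop handles
     the remainder.  */
  double s0 = 0., s1 = 0.;
  unsigned int i = 0;
  for (; i + 4 <= n; i += 4)
    {
      s0 += v[i]     * src[c[i]];
      s1 += v[i + 1] * src[c[i + 1]];
      s0 += v[i + 2] * src[c[i + 2]];
      s1 += v[i + 3] * src[c[i + 3]];
    }
  double s = s0 + s1;
  for (; i < n; ++i)
    s += v[i] * src[c[i]];
  return s;
}
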

With -funroll-loops GCC does unroll, but differently: it branches on the iteration count modulo 8 through a chain of compares into single-element blocks that fall through into the unrolled body, Duff's-device style, and every fmadd still accumulates into d0, leaving one long dependency chain:
<snip>
        ands    x12, x11, 7
        beq     .L70
        cmp     x12, 1
        beq     .L55
        cmp     x12, 2
        beq     .L57
        cmp     x12, 3
        beq     .L59
        cmp     x12, 4
        beq     .L61
        cmp     x12, 5
        beq     .L63
        cmp     x12, 6
        bne     .L72
.L65:
        ldr     w14, [x4, x2, lsl 2]
        ldr     d3, [x3, x2, lsl 3]
        add     x2, x2, 1
        ldr     d4, [x1, x14, lsl 3]
        fmadd   d0, d3, d4, d0
.L63:
        ldr     w5, [x4, x2, lsl 2]
        ldr     d5, [x3, x2, lsl 3]
        add     x2, x2, 1
        ldr     d6, [x1, x5, lsl 3]
        fmadd   d0, d5, d6, d0
.L61:
        ldr     w9, [x4, x2, lsl 2]
        ldr     d7, [x3, x2, lsl 3]
        add     x2, x2, 1
        ldr     d16, [x1, x9, lsl 3]
        fmadd   d0, d7, d16, d0
<snip>
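
Schematically, GCC's version corresponds to something like the sketch below (the switch models the compare-and-branch dispatch above; row_sum_gcc is a made-up name):

static double
row_sum_gcc (const double *v, const unsigned int *c,
             const double *src, unsigned int n)
{
  double s = 0.;
  unsigned int i = 0;
  /* Peel n mod 8 iterations via fall-through dispatch.  */
  switch (n & 7)
    {
    case 7: s += v[i] * src[c[i]]; ++i; /* fall through */
    case 6: s += v[i] * src[c[i]]; ++i; /* fall through */
    case 5: s += v[i] * src[c[i]]; ++i; /* fall through */
    case 4: s += v[i] * src[c[i]]; ++i; /* fall through */
    case 3: s += v[i] * src[c[i]]; ++i; /* fall through */
    case 2: s += v[i] * src[c[i]]; ++i; /* fall through */
    case 1: s += v[i] * src[c[i]]; ++i;
    }
  /* Main loop (8x unrolled in the generated code), still a
     single accumulator, so no FMA latency is hidden.  */
  while (i < n)
    {
      s += v[i] * src[c[i]];
      ++i;
    }
  return s;
}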

On the whole of 510.parest_r this makes LLVM about 6% faster than GCC on
Cortex-A57.

Perhaps this can be used as a motivating testcase to move the GCC unrolling
discussions forward?
