http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53791

             Bug #: 53791
           Summary: Branches not re-ordered using profile-information
    Classification: Unclassified
           Product: gcc
           Version: 4.8.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: ste...@gcc.gnu.org


Consider the following test case:

extern void abort(void) __attribute__((__noreturn__));

static int __attribute__((__noinline__,__noclone__))
candidate_for_reordering (int x)
{
  asm volatile ("" : : : "memory");
  if (x == 1)
    return 2;
  else if (x == 2)
    return 4;
  else if (x == 3)
    return 8;
  abort ();
}

int accum = 0;

int main (int argc __attribute__((__unused__)),
          char **argv __attribute__((__unused__)))
{
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);

  accum += candidate_for_reordering (1);

  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);

  accum += candidate_for_reordering (2);

  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);
  accum += candidate_for_reordering (3);

  return accum; /* 30*8 + 4 + 2 = 246 */
}


Compiling with -fprofile-generate, doing 3 test runs, and compiling with
-fprofile-use, results in this .040t.feedback_fnsplit dump for the
candidate_for_reordering function:

candidate_for_reordering (intD.0 xD.1241)
{
  intD.0 D.1319;

  # BLOCK 2 freq:10000 count:96
  # PRED: ENTRY [100.0%]  count:96 (fallthru,exec)
  # .MEMD.1325_7 = VDEF <.MEMD.1325_6(D)>
  __asm__ __volatile__("" :  :  : "memory");
  # SUCC: 3 [100.0%]  count:96 (fallthru)

  # BLOCK 3 freq:10000 count:96
  # PRED: 2 [100.0%]  count:96 (fallthru)
  if (xD.1241_2(D) == 1)
    goto <bb 7>;
  else
    goto <bb 4>;
  # SUCC: 7 [3.1%]  count:3 (true,exec) 4 [96.9%]  count:93 (false,exec)

  # BLOCK 4 freq:9688 count:93
  # PRED: 3 [96.9%]  count:93 (false,exec)
  if (xD.1241_2(D) == 2)
    goto <bb 7>;
  else
    goto <bb 5>;
  # SUCC: 7 [3.2%]  count:3 (true,exec) 5 [96.8%]  count:90 (false,exec)

  # BLOCK 5 freq:9375 count:90
  # PRED: 4 [96.8%]  count:90 (false,exec)
  if (xD.1241_2(D) == 3)
    goto <bb 7>;
  else
    goto <bb 6>;
  # SUCC: 7 [100.0%]  count:90 (true,exec) 6 (false,exec)


  # BLOCK 6
  # PRED: 5 (false,exec)
  # VUSE <.MEMD.1325_7>
  # USE = nonlocal
  # CLB = nonlocal
  abortD.830 ();
  # SUCC:

  # BLOCK 7 freq:10000 count:96
  # PRED: 3 [3.1%]  count:3 (true,exec) 4 [3.2%]  count:3 (true,exec) 5
[100.0%]  count:90 (true,exec)
  # D.1319_1 = PHI <2(3), 4(4), 8(5)>
  return D.1319_1;
  # SUCC: EXIT [100.0%]  count:96

}




... and the following assembly (powerpc64):

candidate_for_reordering:
        .quad   .L.candidate_for_reordering,.TOC.@tocbase
        .previous
        .type   candidate_for_reordering, @function
.L.candidate_for_reordering:
        mflr 0
        std 0,16(1)
        stdu 1,-112(1)
        cmpwi 7,3,1
        beq- 7,.L4
        cmpwi 0,3,2
        beq- 0,.L5
        cmpwi 1,3,3
        bne- 1,.L3
        li 3,8
.L1:   
        addi 1,1,112
        ld 4,16(1)
        mtlr 4
        blr
.L4:   
        li 3,2
        b .L1
.L5:   
        li 3,4
        b .L1
.L3:   
        bl abort
        nop


Note the very obvious (to the human eye, anyway) optimization opportunity to
test for (x == 3) first. GCC is currently (AFAICT) not able to optimize the
order of branches for mutually exclusive conditions using profile info.

This prevents my work to re-implement emit_case_bit_tests as a GIMPLE lowering
pass to have any meaningful impact on the compiler speed with
profiledbootstrap: There are a couple of tree and GIMPLE predicates that would
benefit from this transformation, but it's not happening...

Reply via email to