[Bug tree-optimization/78916] New: suboptimal code for x % C1 == C2

rv at rasmusvillemoes dot dk Fri, 23 Dec 2016 15:46:45 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78916


            Bug ID: 78916
           Summary: suboptimal code for x % C1 == C2
           Product: gcc
           Version: 5.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rv at rasmusvillemoes dot dk
  Target Milestone: ---

Everything below applies to the case of unsigned operands; signed division is a
whole other matter.

When a is an odd constant, the expression x % a == 0 is equivalent to x*inv(a)
<= ~0U/a, where inv(a) is the mod-2^32 multiplicative inverse of a. More
generally, x % a == b can be rewritten as x*inv(a) - b*inv(a) <= (~0U - b)/a,
where everything but the first multiplication are compile-time constants, and
the comparison is unsigned. gcc seems to always compute the full remainder and
compare that to the RHS.

For example,

int f(unsigned x) { return x % 3 == 0; }

could compile to

  20:   69 ff ab aa aa aa       imul   $0xaaaaaaab,%edi,%edi
  26:   31 c0                   xor    %eax,%eax
  28:   81 ff 55 55 55 55       cmp    $0x55555555,%edi
  2e:   0f 96 c0                setbe  %al
  31:   c3                      retq   

but gcc generates

   0:   89 f8                   mov    %edi,%eax
   2:   ba ab aa aa aa          mov    $0xaaaaaaab,%edx
   7:   f7 e2                   mul    %edx
   9:   d1 ea                   shr    %edx
   b:   8d 04 52                lea    (%rdx,%rdx,2),%eax
   e:   39 c7                   cmp    %eax,%edi
  10:   0f 94 c0                sete   %al
  13:   0f b6 c0                movzbl %al,%eax
  16:   c3                      retq   

If gcc needs to compute the quotient x/a anyway, computing the remainder from
that may be optimal, but in the somewhat realistic case where the quotient is
only used if the division is exact, it's much better to compute the tentative
quotient as above and test that, e.g.

unsigned f(unsigned x);
unsigned g(unsigned x)
{
        return (x % 7U == 0) ? f(x / 7U) : 0;
}

should/could be compiled as

unsigned h(unsigned x)
{
        return (x * 3067833783U <= (~0U)/7) ? f(x * 3067833783U) : 0;
}

0000000000000080 <g>:
  80:   89 f8                   mov    %edi,%eax
  82:   ba 25 49 92 24          mov    $0x24924925,%edx
  87:   f7 e2                   mul    %edx
  89:   89 f8                   mov    %edi,%eax
  8b:   29 d0                   sub    %edx,%eax
  8d:   d1 e8                   shr    %eax
  8f:   01 c2                   add    %eax,%edx
  91:   c1 ea 02                shr    $0x2,%edx
  94:   8d 04 d5 00 00 00 00    lea    0x0(,%rdx,8),%eax
  9b:   29 d0                   sub    %edx,%eax
  9d:   39 c7                   cmp    %eax,%edi
  9f:   74 07                   je     a8 <g+0x28>
  a1:   31 c0                   xor    %eax,%eax
  a3:   c3                      retq   
  a4:   0f 1f 40 00             nopl   0x0(%rax)
  a8:   89 d7                   mov    %edx,%edi
  aa:   e9 00 00 00 00          jmpq   af <g+0x2f>
  af:   90                      nop

00000000000000b0 <h>:
  b0:   69 ff b7 6d db b6       imul   $0xb6db6db7,%edi,%edi
  b6:   81 ff 24 49 92 24       cmp    $0x24924924,%edi
  bc:   76 0a                   jbe    c8 <h+0x18>
  be:   31 c0                   xor    %eax,%eax
  c0:   c3                      retq   
  c1:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  c8:   e9 00 00 00 00          jmpq   cd <h+0x1d>

[Bug tree-optimization/78916] New: suboptimal code for x % C1 == C2

Reply via email to