=i; }

peter at cordes dot ca Wed, 11 Apr 2018 23:56:32 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85366


            Bug ID: 85366
           Summary: Failure to use both div and mod results of one IDIV in
                    a prime-factor loop while(n%i==0) { n/=i; }
           Product: gcc
           Version: 8.0.1
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: peter at cordes dot ca
  Target Milestone: ---
            Target: x86_64-*-*, i?86-*-*

From
https://codereview.stackexchange.com/questions/191792/find-prime-factors-in-c/191801#191801,
simplified to use a pointer instead of returning std::vector<int>. 
Interestingly, the version with std::vector can be more easily coaxed to use
both results of one idiv, see the Godbolt link.

void find_prime_factors_ptr(int n, int *p)
{
    // inefficient to test even numbers > 2, but that's a separate missed
optimization.
    for (int i = 2; i <= n; i++) {
        while (n % i == 0) {
            *p++ = i;
            n /= i;   // reordering the loop body doesn't help
        }
    }
}

https://godbolt.org/g/ogyZW8

g++ 8.0.1 20180411 -O3 -march=haswell gives us this inner loop:

     ...
     # outer loop
     movl    %edi, %eax
        # idiv to test if inner loop should even run once, leaving n/i in eax
.L4:
        movl    %edi, %eax        # but instead we discard it
        addq    $4, %rsi
        movl    %ecx, -4(%rsi)
        cltd
        idivl   %ecx
        cltd                      # then modulo that division result to see if
the next iteration should run
        movl    %eax, %edi
        idivl   %ecx              # leaves n/i in eax, ready for next
iteration...
        testl   %edx, %edx
        je      .L4
     ...

So both ways to get to .L4 (fall in or loop) have n/i in EAX from an idiv
already!  The loop doesn't need to be re-structured to take advantage, gcc just
needs to keep track of what it's doing.

## Hand optimized version of the whole function:
        cmpl    $1, %edi
        jle     .L9
        movl    $2, %ecx
.L5:
        movl    %edi, %eax
        cltd
        idivl   %ecx          # eax = tmp = n/i
        testl   %edx, %edx
        jne     .L3
.L4:
        movl    %ecx, (%rsi)
        addq    $4, %rsi      # we're tuning for Haswell, no register-read
stalls so increment after reading and save a byte in the addressing mode
        movl    %eax, %edi    # n = tmp
        cltd
        idivl   %ecx          # eax = tmp = n/i
        testl   %edx, %edx
        je      .L4
.L3:
        incl    %ecx
        cmpl    %edi, %ecx
        jle     .L5
.L9:
        ret


I didn't make *any* changes to the code outside the inner loop.  I ended up
just removing movl %edi, %eax / cltd / idiv %ecx.

Changing the inner loop to

        int tmp;
        while (tmp = n/i, n % i == 0) {
            *p++ = i;
            n = tmp;
        }

gives us the asm almost that good (an extra mov inside the loop), but we get a
jmp into the loop instead of peeling the while condition from before the first
iteration:


# gcc8.0.1 -O3 -march=haswell output, commented but unmodified
find_prime_factors_ptr_opt(int, int*):
        cmpl    $1, %edi
        jle     .L18
        movl    $2, %ecx
        jmp     .L19
.L16:                                 # top of inner loop
        addq    $4, %rsi
        movl    %ecx, -4(%rsi)
        movl    %eax, %edi            # extra mov puts this and the next mov on
the critical path
.L19:                        # inner loop entry point
        movl    %edi, %eax
        cltd
        idivl   %ecx
        testl   %edx, %edx
        je      .L16                  # bottom of inner
        incl    %ecx
        cmpl    %edi, %ecx
        jle     .L19               # bottom of outer
.L18:
        ret

Saving code-size here with the dependent chain of movl %eax, %edi / movl %edi,
%eax is pretty minor even on CPUs like original Sandybridge, or Bulldozer,
without mov-elimination, because idiv's latency dominates.  But it could easily
be taken out of the inner loop by duplicating it outside the outer loop, then
moving it to the outer-only part of the loop body, like this:

        cmpl    $1, %edi
        jle     .L18
        movl    $2, %ecx
        movl    %edi, %eax   # eax = n added here
        jmp     .L19
.L16:                                 # top of inner loop
        addq    $4, %rsi
        movl    %ecx, -4(%rsi)
        movl    %eax, %edi     # n = tmp  still here
.L19:                        # inner loop entry point
         #movl    %edi, %eax  # eax = n removed from here in inner/outer loop
        cltd
        idivl   %ecx
        testl   %edx, %edx
        je      .L16                  # bottom of inner

        movl    %edi, %eax    # eax = n also added here, in the outer-only part
        incl    %ecx
        cmpl    %edi, %ecx
        jle     .L19               # bottom of outer
.L18:
        ret

In the inner loop, the loop-carried dep chain is just idiv -> cdq -> idiv via
eax, without any mov instructions.

In the outer loop (.L19: to jle .L19), the loop-carried dep is only inc %ecx. 
(Bottleneck on idiv throughput, not latency, like before.)

[Bug target/85366] New: Failure to use both div and mod results of one IDIV in a prime-factor loop while(n%i==0) { n/=i; }

Reply via email to