Consider the following functions:

// Unsigned integer type used by every function in this test case
// (64 bits on the x86_64 target of this report).
typedef unsigned long long int u64;
// Stores the upper 64 bits of the 128-bit product (*s * k) into *d.
void foo(u64* d, u64 const* s, u64 k) {
    // The cast binds tighter than '*', so *s is widened to 128 bits first and
    // the multiply is done at full width; ">> 64" then selects the high half.
    *d = ((__uint128_t) *s*k) >> 64;
}
// Masked overload: stores the upper 64 bits of the 128-bit product
// ((*s & m) * k) into *d.
void foo(u64* d, u64 const* s, u64 k, u64 m) {
    // The AND is performed on u64 operands; only its result is widened to
    // 128 bits before the multiply.
    *d = ((__uint128_t) (*s&m)*k) >> 64;
}
// Applies the three-argument foo() to two consecutive elements:
// d[0] is computed from s[0] and d[1] from s[1], both with multiplier k.
void foo2(u64* d, u64 const* s, u64 k) {
    foo(d,  s,  k);
    foo(d+1,s+1,k);
}
// Applies the four-argument (masked) foo() to two consecutive elements:
// d[0] is computed from s[0] and d[1] from s[1], sharing multiplier k and
// mask m.
void foo2(u64* d, u64 const* s, u64 k, u64 m) {
    foo(d,  s,  k, m);
    foo(d+1,s+1,k, m);
}

Compiling them with "g++ -O3" gives:

# foo(u64* d, u64 const* s, u64 k)
# SysV AMD64 argument registers: rdi = d, rsi = s, rdx = k
_Z3fooPyPKyy:
        movq    %rdx, %rax              # rax = k (mulq takes one factor from rax)
        mulq    (%rsi)                  # rdx:rax = k * *s, unsigned 128-bit product
        movq    %rdx, (%rdi)            # *d = high 64 bits of the product
        ret
# foo(u64* d, u64 const* s, u64 k, u64 m)
# SysV AMD64 argument registers: rdi = d, rsi = s, rdx = k, rcx = m
_Z3fooPyPKyyy:
        andq    (%rsi), %rcx            # rcx = *s & m
        movq    %rcx, %rax              # rax = masked value (implicit mulq factor)
        mulq    %rdx                    # rdx:rax = (*s & m) * k, unsigned
        movq    %rdx, (%rdi)            # *d = high 64 bits of the product
        ret
# foo2(u64* d, u64 const* s, u64 k)
# SysV AMD64 argument registers: rdi = d, rsi = s, rdx = k
# Each element costs one imulq in addition to the expected mulq; the imulq
# always has a zero operand, so it contributes nothing to the result.
# NOTE(review): the imulq/leaq pair looks like the high-word cross term of a
# generic 128-bit multiply whose high input word is constant zero -- confirm
# against the compiler's expansion.
_Z4foo2PyPKyy:
        movq    (%rsi), %rax            # rax = s[0]
        xorl    %r9d, %r9d              # r9 = 0 (32-bit write clears bits 63:32 too)
        movq    %rdx, %r8               # r8 = k, saved because mulq overwrites rdx
        movq    %r9, %rcx               # rcx = 0
        imulq   %rax, %rcx              # rcx = 0 * s[0] = 0   (redundant)
        mulq    %rdx                    # rdx:rax = s[0] * k, unsigned
        leaq    (%rcx,%rdx), %rdx       # rdx = 0 + high half = high half
        movq    %r9, %rcx               # rcx = 0 again for the second element
        movq    %rdx, (%rdi)            # d[0] = high 64 bits
        movq    8(%rsi), %rax           # rax = s[1]
        imulq   %rax, %rcx              # rcx = 0 * s[1] = 0   (redundant)
        mulq    %r8                     # rdx:rax = s[1] * k, unsigned
        leaq    (%rcx,%rdx), %rdx       # rdx = 0 + high half = high half
        movq    %rdx, 8(%rdi)           # d[1] = high 64 bits
        ret
# foo2(u64* d, u64 const* s, u64 k, u64 m)
# SysV AMD64 argument registers: rdi = d, rsi = s, rdx = k, rcx = m
# Each element costs two imulq in addition to the expected mulq; every imulq
# has a zero operand, so all four contribute nothing to the result.
# NOTE(review): the imulq/imulq/addq/leaq sequence looks like both high-word
# cross terms of a generic 128-bit multiply whose high input words are
# constant zero -- confirm against the compiler's expansion.
_Z4foo2PyPKyyy:
        movq    %rcx, %rax              # rax = m
        andq    (%rsi), %rax            # rax = s[0] & m
        movq    %rdx, %r10              # r10 = k, saved because rdx is reused below
        xorl    %r11d, %r11d            # r11 = 0 (32-bit write clears bits 63:32 too)
        xorl    %edx, %edx              # rdx = 0
        movq    %rdx, %r8               # r8 = 0
        movq    %r11, %r9               # r9 = 0
        imulq   %r10, %r8               # r8 = 0 * k = 0            (redundant)
        imulq   %rax, %r9               # r9 = 0 * (s[0] & m) = 0   (redundant)
        mulq    %r10                    # rdx:rax = (s[0] & m) * k, unsigned
        addq    %r9, %r8                # r8 = 0 + 0 = 0
        leaq    (%r8,%rdx), %rdx        # rdx = 0 + high half = high half
        movq    %rdx, (%rdi)            # d[0] = high 64 bits
        andq    8(%rsi), %rcx           # rcx = s[1] & m
        xorl    %edx, %edx              # rdx = 0
        movq    %r11, %rsi              # rsi = 0 (s is not read after this point)
        movq    %rcx, %rax              # rax = s[1] & m
        movq    %rdx, %rcx              # rcx = 0
        imulq   %rax, %rsi              # rsi = 0 * (s[1] & m) = 0  (redundant)
        imulq   %r10, %rcx              # rcx = 0 * k = 0           (redundant)
        mulq    %r10                    # rdx:rax = (s[1] & m) * k, unsigned
        addq    %rsi, %rcx              # rcx = 0 + 0 = 0
        leaq    (%rcx,%rdx), %rdx       # rdx = 0 + high half = high half
        movq    %rdx, 8(%rdi)           # d[1] = high 64 bits
        ret

The two versions of foo() do exactly what you would expect: a single MUL
(preceded by an AND in the masked version), then a store of the high 64 bits
of the product. The two versions of foo2(), on the other hand, perform two and
four signed multiplies (IMUL) respectively, in addition to the two unsigned
multiplies that would be expected. Writing a 32-bit register on x86-64 clears
the upper 32 bits of the full register -- my debugger confirms that
"xorl %edx, %edx" zeros out all 64 bits -- so every one of those signed
multiplies has a zero operand and yields zero, making them completely
redundant.

Compiling without optimizations gives the IMUL+IMUL+MUL combination even for
foo(), so it appears that the optimizer is missing something once it has more
than one multiply to deal with.


-- 
           Summary: Significant extra code generation for 64x64=>128-bit
                    multiply
           Product: gcc
           Version: 4.1.2
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: scovich at gmail dot com
GCC target triplet: x86_64-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=32662

Reply via email to