At -O3, the following function is optimized to:

long long tmod2(long long x)
{
        return x % 2;
}


mov    %rdi,%rdx                                                   
shr    $0x3f,%rdx                                                  
lea    (%rdi,%rdx,1),%rax                                          
and    $0x1,%eax                                                   
sub    %rdx,%rax                                                   
retq
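
For reference, the generated sequence implements the branch-free identity
x % 2 == ((x + s) & 1) - s, where s is the sign bit of x.  A minimal C sketch
of that rewrite (the name tmod2_branchfree is made up here for illustration):

#include <assert.h>

/* Branch-free equivalent of x % 2 for signed 64-bit x.  s is 1 for
   negative x and 0 otherwise; adding s before the mask and subtracting
   it afterwards maps the negative-odd case to -1. */
long long tmod2_branchfree(long long x)
{
        unsigned long long s = (unsigned long long)x >> 63;    /* shr $0x3f */
        return (long long)(((unsigned long long)x + s) & 1) - (long long)s;
}

int main(void)
{
        long long tests[] = { -5, -4, -1, 0, 1, 4, 5 };
        for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
                assert(tmod2_branchfree(tests[i]) == tests[i] % 2);
        return 0;
}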

This is very good code.  Unfortunately, the 128-bit version is not optimized
nearly as well.

__int128_t tmod2(__int128_t x)
{
        return x % 2;
}

mov    %rsi,%rdx
mov    %rdi,%r8
xor    %ecx,%ecx
shr    $0x3f,%rdx
push   %rbx
add    %rdx,%r8
xor    %edi,%edi
mov    %r8,%rsi
mov    %rdi,%r9
and    $0x1,%esi
mov    %rsi,%r8
sub    %rdx,%r8
sbb    %rcx,%r9
mov    %r8,%rax
mov    %r9,%rdx
pop    %rbx
retq

It looks like a simple variation of the 64-bit algorithm will work for the
128-bit version: take the sign bit from %rsi instead of %rdi, and sign-extend
the 64-bit result into %rdx to form the high half of the return value:

mov    %rsi,%rdx    <--- just changed rdi into rsi to pick up the sign bit
shr    $0x3f,%rdx
lea    (%rdi,%rdx,1),%rax
and    $0x1,%eax
sub    %rdx,%rax
cqto                <--- sign-extends %rax into %rdx for the high 64 bits
retq
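
The same identity carries over to __int128_t, since only the low bit of x + s
matters for the mask; the extra cqto supplies the high half by sign-extending
the 64-bit result.  As a quick sanity check of the identity (not of the
generated code), here is a sketch assuming a target with __int128 support;
the helper name tmod2_128 is invented:

#include <assert.h>

/* Branch-free x % 2 for __int128_t, identical to the 64-bit rewrite
   except that the sign bit comes from the high word of x. */
__int128_t tmod2_128(__int128_t x)
{
        unsigned __int128 s = (unsigned __int128)x >> 127;     /* sign bit */
        return (__int128_t)(((unsigned __int128)x + s) & 1) - (__int128_t)s;
}

int main(void)
{
        __int128_t tests[] = { -5, -4, -1, 0, 1, 4, 5,
                               ((__int128_t)1 << 100) + 1,
                               -(((__int128_t)1 << 100) + 1) };
        for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
                assert(tmod2_128(tests[i]) == tests[i] % 2);
        return 0;
}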


-- 
           Summary: missed optimization of constant __int128_t modulus
           Product: gcc
           Version: 4.5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: svfuerst at gmail dot com
 GCC build triplet: x86_64-linux
  GCC host triplet: x86_64-linux
GCC target triplet: x86_64-linux


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43883
