http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44281

--- Comment #6 from Adam Warner <adam at consulting dot net.nz> 2011-03-04 
07:22:47 UTC ---
Below is a very simple test case in which an ordinary input argument to a
function is:

(a) copied to a spare register; and
(b) copied straight back from that spare register

even though the input argument is:

(a) never modified; and
(b) held in an ordinary register (not a global register variable)

unmodified_ordinary_register_is_copied.c:


#include <stdint.h>

/* Six caller-saved registers as input arguments */
/* CALLER_SAVED expands to a parameter list of six uint64_t values named
   REG0..REG5.  It is used both for the fn_t typedef and for every function
   definition below, so they all share exactly the same signature. */
#define CALLER_SAVED uint64_t REG0, uint64_t REG1, uint64_t REG2, \
                     uint64_t REG3, uint64_t REG4, uint64_t REG5
/* fn_t: pointer to a void function taking the CALLER_SAVED parameter list. */
typedef void (*fn_t)(CALLER_SAVED);

/* Six callee-saved registers as global register variables */
/* Each declaration pins a file-scope variable to the named x86-64 register
   using the GNU C `register ... __asm__("reg")` syntax.  REG7 (rbp) is the
   only one the functions below actually use: it is a pointer to fn_t that
   is indexed to fetch the next function to call.  The other five are
   declared but never referenced in this file; the report notes that
   removing any one of the six makes the redundant copy disappear. */
register uint64_t REG6 __asm__("rbx");
register fn_t    *REG7 __asm__("rbp");
register uint64_t REG8 __asm__("r12");
register uint64_t REG9 __asm__("r13");
register uint64_t REG10 __asm__("r14");
register uint64_t REG11 __asm__("r15");

/* Free general purpose registers are RSP, RAX, R10 and R11 */

/* Baseline case: fetch the function pointer stored at REG7[1] and call it,
   forwarding all six incoming arguments unchanged.  REG7 itself is only
   read, never written.  The call is the last statement of a void function;
   in the listings in this report it is emitted as a load followed by an
   indirect jump. */
void optimal_code_generation(CALLER_SAVED) {
  fn_t next=REG7[1];
  next(REG0, REG1, REG2, REG3, REG4, REG5);
}

/* Problem case: identical to optimal_code_generation except that REG7 is
   advanced by one element between the load and the call.  None of the six
   arguments REG0..REG5 is modified; they are forwarded as received.  This
   is the function for which the report shows GCC 4.6 emitting a redundant
   copy of the first argument register to %r10 and back. */
void unmodified_input_arg_is_copied(CALLER_SAVED) {
  /* Read the target before REG7 changes: this is the pre-increment REG7[1]. */
  fn_t next=REG7[1];
  /* Advance the global register variable (rbp) by one fn_t slot. */
  ++REG7;
  next(REG0, REG1, REG2, REG3, REG4, REG5);
}

/* Empty entry point so the file links into an executable (a.out) that can
   be disassembled; neither of the functions above is called. */
int main() {
  return 0;
}


gcc-4.5 generates optimal code for both functions:
$ gcc-4.5 -O3 unmodified_ordinary_register_is_copied.c && objdump -d -m i386:x86-64 a.out|less
...
00000000004004a0 <optimal_code_generation>:
  4004a0:       48 8b 45 08             mov    0x8(%rbp),%rax
  4004a4:       ff e0                   jmpq   *%rax
...
00000000004004b0 <unmodified_input_arg_is_copied>:
  4004b0:       48 8b 45 08             mov    0x8(%rbp),%rax
  4004b4:       48 83 c5 08             add    $0x8,%rbp
  4004b8:       ff e0                   jmpq   *%rax
...

Compare with GCC 4.6:
$ gcc-4.6 --version
gcc-4.6 (Debian 4.6-20110227-1) 4.6.0 20110227 (experimental) [trunk revision
170543]
...

$ gcc-4.6 -O3 unmodified_ordinary_register_is_copied.c && objdump -d -m i386:x86-64 a.out|less
...
00000000004004a0 <optimal_code_generation>:
  4004a0:       48 8b 45 08             mov    0x8(%rbp),%rax
  4004a4:       ff e0                   jmpq   *%rax
...
00000000004004b0 <unmodified_input_arg_is_copied>:
  4004b0:       49 89 fa                mov    %rdi,%r10
  4004b3:       48 8b 45 08             mov    0x8(%rbp),%rax
  4004b7:       48 8d 6d 08             lea    0x8(%rbp),%rbp
  4004bb:       4c 89 d7                mov    %r10,%rdi
  4004be:       ff e0                   jmpq   *%rax
...

According to the Linux x86-64 ABI, %rdi holds the first argument passed to a
function. For some reason it is being copied to %r10 and then copied straight
back from %r10 to %rdi. At no stage is %rdi modified.

(Minor aside:
lea 0x8(%rbp),%rbp has also replaced add $0x8,%rbp. My Intel Core 2 hardware
can execute a maximum of one LEA instruction per clock cycle compared to three
ADD instructions per clock cycle. If I add -march=core2 -mtune=core2 the code
generation becomes:
00000000004004b0 <unmodified_input_arg_is_copied>:
  4004b0:       48 8b 45 08             mov    0x8(%rbp),%rax
  4004b4:       48 8d 6d 08             lea    0x8(%rbp),%rbp
  4004b8:       49 89 fa                mov    %rdi,%r10
  4004bb:       4c 89 d7                mov    %r10,%rdi
  4004be:       ff e0                   jmpq   *%rax
)

This bizarre register copying goes away if I comment out one of the six global
register variables (i.e. five callee-saved global register variables instead of
six). For some reason GCC 4.6 cannot generate sensible code with %rsp, %rax,
%r10 and %r11 available---but can generate sensible code when an additional
register (%rbx, %r12, %r13, %r14 or %r15) is available.

Reply via email to