http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44281

--- Comment #10 from Adam Warner <adam at consulting dot net.nz> 2011-03-05 
02:01:04 UTC ---
Jakub,

Thanks for the explanation [The "weird" saving/restoring of %rdi into/from %r10
is because the RA chose to use %rdi for a temporary used in incrementing of
REG7 and loading the next pointer from it, while postreload managed to remove
all needs for such a temporary register, it is too late for the save/restore
code not to be emitted.]

I've replaced the memory lookup and REG7 increment with equivalent inline
assembly to help clarify this explanation. With one remaining source code
variable (next of type fn_t) and everything else opaque assembly, the code
generation is worse.


#include <stdint.h>

/* Six caller-saved registers as input arguments */
/* CALLER_SAVED expands to a six-parameter list (REG0..REG5, all uint64_t) and
   is reused as the parameter list of fn_t and of every test function. */
#define CALLER_SAVED uint64_t REG0, uint64_t REG1, uint64_t REG2, \
                     uint64_t REG3, uint64_t REG4, uint64_t REG5
/* Pointer to a function taking those six arguments and returning nothing. */
typedef void (*fn_t)(CALLER_SAVED);

/* Six callee-saved registers as global register variables */
register uint64_t REG6 __asm__("rbx");
/* REG7 (rbp) is the only global register variable the test functions use:
   it points into a sequence of fn_t entries that they read and advance. */
register fn_t    *REG7 __asm__("rbp");
register uint64_t REG8 __asm__("r12");
register uint64_t REG9 __asm__("r13");
register uint64_t REG10 __asm__("r14");
register uint64_t REG11 __asm__("r15");

/* Free general purpose registers are RSP, RAX, R10 and R11 */

/* Baseline case: load the function pointer stored at REG7[1] and call it,
   forwarding all six arguments unchanged.  REG7 itself is not modified. */
void optimal_code_generation(CALLER_SAVED) {
  fn_t next=REG7[1];
  next(REG0, REG1, REG2, REG3, REG4, REG5);
}

/* Same as the baseline, except that REG7 is also advanced by one fn_t slot
   after the load.  The six arguments are still forwarded unchanged. */
void unmodified_input_arg_is_copied(CALLER_SAVED) {
  fn_t next=REG7[1];
  ++REG7; /* plain C increment of the global register variable */
  next(REG0, REG1, REG2, REG3, REG4, REG5);
}

/* Variant in which the REG7 increment is written as inline assembly instead
   of C.  The load of REG7[1] is still ordinary C, and the six arguments are
   forwarded unchanged. */
void unmodified_input_arg_is_copied_alt(CALLER_SAVED) {
  fn_t next=REG7[1];
  /* "+r" makes REG7 a read-write register operand, so the add is in place. */
  __asm__("add $8, %0" : "+r" (REG7));
  next(REG0, REG1, REG2, REG3, REG4, REG5);
}

/* Variant in which both the load of the next function pointer and the REG7
   increment are written as inline assembly, leaving `next` as the only C
   variable.  The six arguments are forwarded unchanged. */
void unmodified_input_arg_is_copied_alt2(CALLER_SAVED) {
  fn_t next;
  /* Load the pointer at offset 8 from REG7; "=a" pins the result to the
     a register (%rax here). */
  __asm__("mov 0x8(%[from]), %[to]" : [to] "=a" (next) : [from] "r" (REG7));
  /* "+r" makes REG7 a read-write register operand, so the add is in place. */
  __asm__("add $8, %0" : "+r" (REG7));
  next(REG0, REG1, REG2, REG3, REG4, REG5);
}

/* Empty entry point so the file builds into an executable; it does not call
   any of the test functions. */
int main() {
  return 0;
}


$ gcc-4.6 -O3 unmodified_ordinary_register_is_copied_with_pure_asm.c && objdump
-d -m i386:x86-64 a.out|less

00000000004004a0 <optimal_code_generation>:
  4004a0:       48 8b 45 08             mov    0x8(%rbp),%rax
  4004a4:       ff e0                   jmpq   *%rax
  4004a6:       66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
  4004ad:       00 00 00 

00000000004004b0 <unmodified_input_arg_is_copied>:
  4004b0:       49 89 fa                mov    %rdi,%r10
  4004b3:       48 8b 45 08             mov    0x8(%rbp),%rax
  4004b7:       48 8d 6d 08             lea    0x8(%rbp),%rbp
  4004bb:       4c 89 d7                mov    %r10,%rdi
  4004be:       ff e0                   jmpq   *%rax

00000000004004c0 <unmodified_input_arg_is_copied_alt>:
  4004c0:       49 89 fa                mov    %rdi,%r10
  4004c3:       48 8b 45 08             mov    0x8(%rbp),%rax
  4004c7:       4c 89 d7                mov    %r10,%rdi
  4004ca:       48 83 c5 08             add    $0x8,%rbp
  4004ce:       ff e0                   jmpq   *%rax

00000000004004d0 <unmodified_input_arg_is_copied_alt2>:
  4004d0:       49 89 fa                mov    %rdi,%r10
  4004d3:       48 89 f7                mov    %rsi,%rdi
  4004d6:       48 89 d6                mov    %rdx,%rsi
  4004d9:       48 8b 45 08             mov    0x8(%rbp),%rax
  4004dd:       48 89 f2                mov    %rsi,%rdx
  4004e0:       48 89 fe                mov    %rdi,%rsi
  4004e3:       4c 89 d7                mov    %r10,%rdi
  4004e6:       48 83 c5 08             add    $0x8,%rbp
  4004ea:       ff e0                   jmpq   *%rax

unmodified_input_arg_is_copied_alt2() specifies a variable next of type fn_t.
The first assembly statement __asm__("mov 0x8(%[from]), %[to]" : [to] "=a"
(next) : [from] "r" (REG7)); directly translates to mov 0x8(%rbp),%rax. Note
use of the "=a" machine constraint to force use of the free %rax register.

The second assembly statement __asm__("add $8, %0" : "+r" (REG7)); directly
translates to add $0x8,%rbp. This is in-place register mutation which does not
require a temporary for incrementing.

While I suspected I might be able to work around the spurious saving/restoring
of unmodified registers with inline assembly, the results are far worse. mov
%rdi,%r10; mov %rsi,%rdi; mov %rdx,%rsi is maximally serialized. One cannot
move %rdx into %rsi until %rsi is moved into %rdi. But one cannot move %rsi
into %rdi until %rdi is moved into %r10. Restoring the unmodified registers is
also maximally serialized.

Reply via email to