On Jun 8, 2010, at 9:53 AM, George Bosilca wrote:
> As you can see there is no explicit call, the opal_atomic_cmpset_32 is really
> inlined. I think the problem is that you didn't specify the -O3 flag on your
> command line.
Ah, you wanted me to compile the OMPI code itself and send you the assembly.
That's not what you asked for. :-)
(I just took the code you sent in the mail, stuffed it into george.c, and
compiled that with -S -- outside of the context of the Open MPI code tree)
Here's the new output. It still didn't inline, but you can see the code for
the _cmpset function:
-----
[7:13] svbu-mpi:~/tmp % cat george.c
#include <stdint.h>
#include "opal/sys/atomic.h"
int foo(void) { /* minimal test case: exists only to show how the cmpset call is compiled */
int32_t oldval, delta; /* NOTE: delta is never assigned, so its value is indeterminate */
int32_t *addr = 0; /* NOTE: null pointer, dereferenced below -- not meant to be executed */
do {
oldval = *addr; /* snapshot the current value at addr */
} while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta)); /* retry while cmpset returns 0 (presumably "compare failed" -- confirm in opal/sys/atomic.h) */
return (oldval + delta); /* the value this routine tried to store */
}
[7:13] svbu-mpi:~/tmp % pgcc -O3 -I /home/jsquyres/svn/ompi4
-I/home/jsquyres/svn/ompi4/opal/include -c -s george.c
[7:13] svbu-mpi:~/tmp % cat george.s
.file "george.c"
.version "01.01"
## PGC 7.0 -opt 1
## PGC 06/08/2010 05:10:04
## pgcc george.c -c -S
## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc
## george.c -opt 1 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x 123 0x1000
## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 -quad
## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include
## -def unix -def __unix -def __unix__ -def linux -def __linux -def __linux__
## -def __NO_MATH_INLINES -def __x86_64__ -def __LONG_MAX__=9223372036854775807L
## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int -def __THROW=
## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def __SSE2__
## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) #cpu(x86_64)
## -cmdline +pgcc george.c -c -S -x 123 4 -x 123 0x80000000 -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
## -asm george.s
## lineno: 3
## ---------------------------------------------------------------------
## foo: compiler-generated body of `int foo(void)` from george.c
## Syntax: AT&T (GAS), x86-64.
## Frame layout, as used by the instructions below:
##    -8(%rbp)  addr   (8 bytes, initialised to 0)
##   -12(%rbp)  oldval (4 bytes)
##   -16(%rbp)  delta  (4 bytes, read but never written in this routine)
## Out: %eax = oldval + delta
## NOTE(review): the listing header records `-opt 1` and the command
## `pgcc george.c -c -S`, which differs from the `-O3 ... -c -s` command
## shown in the transcript; the out-of-line call and stack-resident locals
## below may therefore come from an earlier compile -- confirm.
## ---------------------------------------------------------------------
.text
.align 16
.globl foo
foo:
..Dcfb0:
pushq %rbp ## save caller's frame pointer
..Dcfi0:
movq %rsp, %rbp ## establish frame pointer
..Dcfi1:
subq $16, %rsp ## 16 bytes of locals (addr, oldval, delta)
..EN1:
## lineno: 5
movq $0, -8(%rbp) ## addr = 0
.p2align 4,,3
.LB157:
## lineno: 6
## Loop head: reload addr, read *addr, and set up the three call arguments.
movq -8(%rbp), %rdi ## arg 1 (%rdi) = addr
movl (%rdi), %esi ## arg 2 (%esi) = *addr
movl %esi, -12(%rbp) ## oldval = *addr (spilled so it survives the call)
movl -16(%rbp), %edx ## %edx = delta
addl %esi, %edx ## arg 3 (%edx) = oldval + delta
xorl %eax, %eax ## %eax = 0 before the call (presumably the SysV %al vector-arg count, as for an unprototyped/variadic callee -- confirm)
call opal_atomic_cmpset_32 ## out-of-line call: the function was not inlined here
testl %eax, %eax ## did the call return 0?
je .LB157 ## yes: retry the loop
movl -16(%rbp), %eax ## %eax = delta (reloaded from the stack after the call)
addl -12(%rbp), %eax ## %eax = delta + oldval -> return value
## lineno: 10
leave ## tear down the frame (restore %rsp and %rbp)
ret
.type foo,@function
.size foo,.-foo
..Dcfe0:
__fooEND:
## Compiler-emitted record in the .pgi_trace section describing routine foo:
## its address, its size, a flags word, and its name as a NUL-terminated string.
.section .pgi_trace
.align 8
.quad foo ## address of routine
.quad __fooEND - foo ## size of routine
.2byte 0 ## flags for future use
.2byte 3 ## length of following string
## name:foo:
.byte 0x66,0x6f,0x6f,0x00
.data
## The callee is only declared here; no definition of it appears in this listing.
.globl opal_atomic_cmpset_32
## DWARF call-frame information for foo: one CIE (common rules) followed by
## one FDE covering ..Dcfb0 through ..Dcfe0 (the start and end labels of foo).
.section .debug_frame
..Dcieb0:
.4byte ..Dciee0-..Dcieb0-4 ## CIE length
.4byte 0xffffffff ## CIE ID
.byte 0x1 ## CIE version
.byte 0x0 ## no augmentation
.byte 0x1 ## ULEB128 1, code alignment factor
.byte 0x78 ## SLEB128 -8, data alignment factor
.byte 0x10 ## return address column
## Initial rule: CFA = column 7 + 8; column 16 (the return address column)
## is saved at factored offset 1, i.e. 1 * (-8) from the CFA.
.byte 0xc ## DW_CFA_def_cfa (col 7)
.byte 0x7 ## ULEB128 7
.byte 0x8 ## ULEB128 8
.byte 0x90 ## DW_CFA_offset (col 16)
.byte 0x1 ## ULEB128 1
.align 8
..Dciee0:
.4byte ..Dfdee0-..Dfdeb0 ## FDE length
..Dfdeb0:
.4byte ..Dcieb0 ## CIE pointer
.quad ..Dcfb0 ## initial location
.quad ..Dcfe0-..Dcfb0 ## address range
## Advance to ..Dcfi0 (the label after `pushq %rbp`): CFA offset becomes 16
## and column 6 is saved at factored offset 2, i.e. 2 * (-8) from the CFA.
.byte 0x4 ## DW_CFA_advance_loc4
.4byte ..Dcfi0-..Dcfb0
.byte 0xe ## DW_CFA_def_cfa_offset
.byte 0x10 ## ULEB128 16
.byte 0x86 ## DW_CFA_offset (col 6)
.byte 0x2 ## ULEB128 2
## Advance to ..Dcfi1 (the label after `movq %rsp, %rbp`): the CFA is now
## computed from column 6 instead of column 7.
.byte 0x4 ## DW_CFA_advance_loc4
.4byte ..Dcfi1-..Dcfi0
.byte 0xd ## DW_CFA_def_cfa_register (col 6)
.byte 0x6 ## ULEB128 6
.align 8
..Dfdee0:
.ident "PGC 7.0-7"
[7:13] svbu-mpi:~/tmp %
-----
--
Jeff Squyres
[email protected]
For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/