A simple test program, a.c:

#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
  double a[2];
  __m128d x = _mm_set1_pd(3);
  _mm_storeu_pd(a,x);
  printf("%f %f\n",a[0],a[1]);
  return 0;
}
$ gcc-4.5 -O0 -march=k8 a.c && ./a.out # broken 0.000000 0.000000 $ gcc-4.5 -O1 -march=k8 a.c && ./a.out # good 3.000000 3.000000 $ gcc-4.5 -O0 -march=core2 a.c && ./a.out # good 3.000000 3.000000 $ gcc-4.5 -O0 -march=k8 -c a.c && objdump -d -M intel --prefix-addresses a.o | grep main 0000000000000000 <main> push rbp 0000000000000001 <main+0x1> mov rbp,rsp 0000000000000004 <main+0x4> sub rsp,0x40 0000000000000008 <main+0x8> mov rax,0x4008000000000000 0000000000000012 <main+0x12> mov QWORD PTR [rbp-0x8],rax 0000000000000016 <main+0x16> movsd xmm2,xmm1 000000000000001a <main+0x1a> unpcklpd xmm2,xmm2 000000000000001e <main+0x1e> movapd xmm0,xmm2 0000000000000022 <main+0x22> movlpd xmm1,QWORD PTR [rbp-0x8] 0000000000000027 <main+0x27> movaps XMMWORD PTR [rbp-0x40],xmm0 000000000000002b <main+0x2b> movapd xmm0,XMMWORD PTR [rbp-0x40] 0000000000000030 <main+0x30> lea rax,[rbp-0x30] 0000000000000034 <main+0x34> mov QWORD PTR [rbp-0x10],rax 0000000000000038 <main+0x38> movaps XMMWORD PTR [rbp-0x20],xmm0 000000000000003c <main+0x3c> mov rax,QWORD PTR [rbp-0x10] 0000000000000040 <main+0x40> movapd xmm0,XMMWORD PTR [rbp-0x20] 0000000000000045 <main+0x45> movupd XMMWORD PTR [rax],xmm0 0000000000000049 <main+0x49> movlpd xmm1,QWORD PTR [rbp-0x28] 000000000000004e <main+0x4e> movlpd xmm0,QWORD PTR [rbp-0x30] 0000000000000053 <main+0x53> mov eax,0x0 0000000000000058 <main+0x58> mov rdi,rax 000000000000005b <main+0x5b> mov eax,0x2 0000000000000060 <main+0x60> call 0000000000000065 <main+0x65> 0000000000000065 <main+0x65> mov eax,0x0 000000000000006a <main+0x6a> leave 000000000000006b <main+0x6b> ret $ gcc-4.5 -O0 -march=core2 -c a.c && objdump -d -M intel --prefix-addresses a.o | grep main 0000000000000000 <main> push rbp 0000000000000001 <main+0x1> mov rbp,rsp 0000000000000004 <main+0x4> sub rsp,0x40 0000000000000008 <main+0x8> mov rax,0x4008000000000000 0000000000000012 <main+0x12> mov QWORD PTR [rbp-0x8],rax 0000000000000016 <main+0x16> movddup xmm0,QWORD PTR [rbp-0x8] 
000000000000001b <main+0x1b> movapd XMMWORD PTR [rbp-0x40],xmm0 0000000000000020 <main+0x20> movapd xmm0,XMMWORD PTR [rbp-0x40] 0000000000000025 <main+0x25> lea rax,[rbp-0x30] 0000000000000029 <main+0x29> mov QWORD PTR [rbp-0x10],rax 000000000000002d <main+0x2d> movapd XMMWORD PTR [rbp-0x20],xmm0 0000000000000032 <main+0x32> mov rax,QWORD PTR [rbp-0x10] 0000000000000036 <main+0x36> movapd xmm0,XMMWORD PTR [rbp-0x20] 000000000000003b <main+0x3b> movupd XMMWORD PTR [rax],xmm0 000000000000003f <main+0x3f> mov rdx,QWORD PTR [rbp-0x28] 0000000000000043 <main+0x43> movsd xmm0,QWORD PTR [rbp-0x30] 0000000000000048 <main+0x48> mov eax,0x0 000000000000004d <main+0x4d> movq xmm1,rdx 0000000000000052 <main+0x52> mov rdi,rax 0000000000000055 <main+0x55> mov eax,0x2 000000000000005a <main+0x5a> call 000000000000005f <main+0x5f> 000000000000005f <main+0x5f> mov eax,0x0 0000000000000064 <main+0x64> leave 0000000000000065 <main+0x65> ret The incorrect bit is 0000000000000016 <main+0x16> movsd xmm2,xmm1 000000000000001a <main+0x1a> unpcklpd xmm2,xmm2 000000000000001e <main+0x1e> movapd xmm0,xmm2 0000000000000022 <main+0x22> movlpd xmm1,QWORD PTR [rbp-0x8] 0000000000000027 <main+0x27> movaps XMMWORD PTR [rbp-0x40],xmm0 (the movsd at main+0x16 reads xmm1 before the movlpd at main+0x22 has loaded the constant from [rbp-0x8] into it, so the value duplicated into xmm0 is whatever xmm1 happened to hold) which is corrected by -march=core2 to 0000000000000016 <main+0x16> movddup xmm0,QWORD PTR [rbp-0x8] 000000000000001b <main+0x1b> movapd XMMWORD PTR [rbp-0x40],xmm0 Of course all the redundant stores are collapsed at any positive optimization level, and the result becomes correct regardless of -march. Unfortunately, the bug is in the generic x86-64 target so it's highly visible. This bug is not present in 4.4.2. $ gcc-4.5 -v Using built-in specs. 
COLLECT_GCC=gcc-4.5 COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/lto-wrapper Target: x86_64-unknown-linux-gnu Configured with: ../configure --prefix=/usr --enable-languages=c,c++,fortran --enable-gold --enable-plugin --enable-threads=posix --enable-__cxa_atexit --enable-clocale=gnu --enable-lto --enable-gnu-unique-object --disable-multilib --disable-libstdcxx-pch --with-tune=generic --with-system-zlib --with-ppl --with-cloog --libdir=/usr/lib --libexecdir=/usr/lib --mandir=/usr/share/man --infodir=/usr/share/info --disable-werror --enable-checking=release --program-suffix=-4.5 --enable-version-specific-runtime-libs : (reconfigured) ../configure --prefix=/usr --enable-languages=c,c++,fortran --enable-gold --enable-plugin --enable-threads=posix --enable-__cxa_atexit --enable-clocale=gnu --enable-lto --enable-gnu-unique-object --disable-multilib --disable-libstdcxx-pch --with-system-zlib --with-ppl --with-cloog --libdir=/usr/lib --libexecdir=/usr/lib --mandir=/usr/share/man --infodir=/usr/share/info --disable-werror --enable-checking=release --program-suffix=-4.5 --enable-version-specific-runtime-libs Thread model: posix gcc version 4.5.0 20100121 (experimental) (GCC) -- Summary: SSE2 intrinsics miscompiled at -O0 -march=k8 Product: gcc Version: 4.5.0 Status: UNCONFIRMED Severity: major Priority: P3 Component: inline-asm AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: bugs at 59A2 dot org GCC build triplet: x86_64-unknown-linux-gnu GCC host triplet: x86_64-unknown-linux-gnu GCC target triplet: x86_64-unknown-linux-gnu http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42881