Sometimes the temporary in the composite intrinsic _mm_set_ps1 doesn't get eliminated. Test case (compiled with -O3 -march=k8 -fomit-frame-pointer): #include <xmmintrin.h>
__m128 not_eliminated(const float f1, const float f2) { const __m128 a = _mm_set_ps1(f1), b = _mm_set_ps1(f2), c = _mm_mul_ps(a, b); return c; } #define broadcast(f) _mm_shuffle_ps(_mm_load_ss(&(f)),_mm_load_ss(&(f)),0) __m128 eliminated(const float f1, const float f2) { const __m128 a = broadcast(f1), b = broadcast(f2), c = _mm_mul_ps(a, b); return c; } int main() { return 0; } With gcc4-20040102 + patch http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19240 (this happens in older versions too), I still get: 00401070 <not_eliminated(float, float)>: 401070: sub $0x10,%esp 401073: mov 0x14(%esp),%eax 401077: mov %eax,0xc(%esp) 40107b: mov 0x18(%esp),%eax 40107f: movss 0xc(%esp),%xmm0 401085: shufps $0x0,%xmm0,%xmm0 401089: mov %eax,0xc(%esp) 40108d: movss 0xc(%esp),%xmm1 401093: add $0x10,%esp 401096: shufps $0x0,%xmm1,%xmm1 40109a: mulps %xmm1,%xmm0 40109d: ret and: 004010a0 <eliminated(float, float)>: 4010a0: movss 0x4(%esp),%xmm0 4010a6: movss 0x8(%esp),%xmm1 4010ac: shufps $0x0,%xmm0,%xmm0 4010b0: shufps $0x0,%xmm1,%xmm1 4010b4: mulps %xmm1,%xmm0 4010b7: ret It might happen with other intrinsics, but I haven't spotted it yet :) -- Summary: temporary not eliminated in composite _mm_set_ps1 Product: gcc Version: 4.0.0 Status: UNCONFIRMED Severity: normal Priority: P2 Component: rtl-optimization AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: tbptbp at gmail dot com CC: gcc-bugs at gcc dot gnu dot org GCC host triplet: cygwin http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19274