------- Comment #9 from tim at klingt dot org 2008-11-17 18:49 -------
I have updated the test program and attached the preprocessed sources for gcc 4.3 and 4.4.

The loop prefix contains:

4.4 (9 invariant loads, one store of a generated constant to the stack):

    pxor %xmm5, %xmm5
    xorl %eax, %eax
    movdqa %xmm5, %xmm0
    xorl %edx, %edx
    pcmpeqd %xmm5, %xmm0
    movaps .LC2(%rip), %xmm14
    psrld $31, %xmm0
    movdqa .LC3(%rip), %xmm13
    pslld $31, %xmm0
    movaps .LC4(%rip), %xmm12
    movaps .LC5(%rip), %xmm11
    movaps .LC6(%rip), %xmm10
    movaps .LC7(%rip), %xmm9
    movaps .LC8(%rip), %xmm8
    movaps .LC9(%rip), %xmm7
    movaps .LC16(%rip), %xmm6
    movdqa %xmm0, -24(%rsp)

4.3 (8 invariant loads, one generated constant kept in a register):

    pxor %xmm6, %xmm6
    xorl %edx, %edx
    movdqa %xmm6, %xmm0
    xorl %eax, %eax
    pcmpeqd %xmm6, %xmm0
    movaps .LC9(%rip), %xmm15
    psrld $31, %xmm0
    movaps .LC10(%rip), %xmm14
    pslld $31, %xmm0
    movaps .LC11(%rip), %xmm13
    movaps .LC12(%rip), %xmm12
    movaps .LC13(%rip), %xmm11
    movdqa .LC14(%rip), %xmm10
    movaps .LC15(%rip), %xmm9
    movaps .LC16(%rip), %xmm8
    movdqa %xmm0, %xmm7

The loop body:

4.3 (7 loads from memory, 2 loads are used in the next instruction, the others are used later):

.L48:
    movaps in(%rax), %xmm2
    movaps .LC2(%rip), %xmm0
    movdqa %xmm2, %xmm5
    movdqa .LC3(%rip), %xmm4
    pand %xmm7, %xmm5
    movaps .LC4(%rip), %xmm1
    addl $4, %edx
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    xorps %xmm5, %xmm2
# 0 "" 2
#NO_APP
    mulps %xmm2, %xmm0
    movaps %xmm2, %xmm3
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    cvttps2dq %xmm0, %xmm0
# 0 "" 2
#NO_APP
    pand %xmm0, %xmm4
    paddd %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    cvtdq2ps %xmm4, %xmm0
# 0 "" 2
#NO_APP
    pand %xmm10, %xmm4
    mulps %xmm0, %xmm1
    psrld $1, %xmm4
    subps %xmm1, %xmm3
    movaps .LC5(%rip), %xmm1
    mulps %xmm0, %xmm1
    mulps .LC6(%rip), %xmm0
    subps %xmm1, %xmm3
    subps %xmm0, %xmm3
    movaps .LC7(%rip), %xmm0
    movaps %xmm3, %xmm1
    cmpltps %xmm2, %xmm0
    mulps %xmm3, %xmm1
    movaps %xmm0, %xmm2
    movaps .LC8(%rip), %xmm0
    mulps %xmm1, %xmm0
    addps %xmm15, %xmm0
    mulps %xmm1, %xmm0
    addps %xmm14, %xmm0
    mulps %xmm1, %xmm0
    addps %xmm13, %xmm0
    mulps %xmm1, %xmm0
    addps %xmm12, %xmm0
    mulps %xmm1, %xmm0
    addps %xmm11, %xmm0
    mulps %xmm1, %xmm0
    mulps %xmm3, %xmm0
    addps %xmm3, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    andnps %xmm3, %xmm2
# 0 "" 2
#NO_APP
    movaps %xmm8, %xmm3
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    orps %xmm2, %xmm0
# 0 "" 2
#NO_APP
    movdqa %xmm6, %xmm2
    movaps %xmm0, %xmm1
    psubd %xmm4, %xmm2
    addps %xmm9, %xmm1
    divps %xmm1, %xmm3
    movaps %xmm3, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
    andps %xmm2, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
    andnps %xmm0, %xmm2
# 0 "" 2
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
    orps %xmm2, %xmm1
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    xorps %xmm5, %xmm1
# 0 "" 2
#NO_APP
    movaps %xmm1, out(%rax)
    addq $16, %rax
    cmpl %edi, %edx
    jne .L48

4.4 (6 loads from memory, 5 loads are used as memory operands of instructions):

.L54:
    movaps in(%rax), %xmm2
    movdqa -24(%rsp), %xmm3
    addl $4, %edx
    pand %xmm2, %xmm3
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    xorps %xmm3, %xmm2
# 0 "" 2
#NO_APP
    movaps %xmm2, %xmm4
    movaps %xmm2, %xmm15
    mulps %xmm14, %xmm4
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    cvttps2dq %xmm4, %xmm4
# 0 "" 2
#NO_APP
    movdqa %xmm4, %xmm0
    pand %xmm13, %xmm0
    paddd %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    cvtdq2ps %xmm4, %xmm0
# 0 "" 2
#NO_APP
    pand .LC14(%rip), %xmm4
    movaps %xmm0, %xmm1
    psrld $1, %xmm4
    mulps %xmm12, %xmm1
    subps %xmm1, %xmm15
    movaps %xmm15, %xmm1
    movaps %xmm0, %xmm15
    mulps %xmm10, %xmm0
    mulps %xmm11, %xmm15
    subps %xmm15, %xmm1
    movaps %xmm9, %xmm15
    subps %xmm0, %xmm1
    cmpltps %xmm2, %xmm15
    movaps %xmm1, %xmm0
    movaps %xmm15, %xmm2
    mulps %xmm1, %xmm0
    movaps %xmm0, %xmm15
    mulps %xmm8, %xmm15
    addps %xmm7, %xmm15
    mulps %xmm0, %xmm15
    addps .LC10(%rip), %xmm15
    mulps %xmm0, %xmm15
    addps .LC11(%rip), %xmm15
    mulps %xmm0, %xmm15
    addps .LC12(%rip), %xmm15
    mulps %xmm0, %xmm15
    addps .LC13(%rip), %xmm15
    mulps %xmm15, %xmm0
    mulps %xmm1, %xmm0
    addps %xmm1, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    andnps %xmm1, %xmm2
# 0 "" 2
#NO_APP
    movdqa %xmm5, %xmm1
    psubd %xmm4, %xmm1
    movdqa %xmm1, %xmm4
    movaps .LC15(%rip), %xmm1
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    orps %xmm2, %xmm0
# 0 "" 2
#NO_APP
    movaps %xmm6, %xmm2
    addps %xmm0, %xmm1
    divps %xmm1, %xmm2
    movaps %xmm2, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
    andps %xmm4, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
    andnps %xmm0, %xmm4
# 0 "" 2
#NO_APP
    movaps %xmm1, %xmm0
#APP
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
    orps %xmm4, %xmm0
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
    xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
    movaps %xmm0, out(%rax)
    addq $16, %rax
    cmpl %edi, %edx
    jne .L54

Hope this helps.

--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134