Awesome. Thanks :) Committed revision 1048.
On 8/26/07, Lenard Lindstrom <[EMAIL PROTECTED]> wrote:
> I am including a new patch for transform.c. It replaces the patches I
> submitted earlier. It cleans up the assembly code and omits the
> NO_SSE flag. Profiling shows that filter_shrink_X_MMX is still nearly 4X
> faster than filter_shrink_X_ONLYC. No extra tweaking, by interleaving
> instructions, was done since that showed a less than 1% performance
> change (below the margin of error for the profile).
>
> --
> Lenard Lindstrom
> <[EMAIL PROTECTED]>
>
>
> 1018c1018
> < long long One64 = 0x4000400040004000;
> ---
> > long long One64 = 0x4000400040004000ULL;
> 1025,1026c1025,1026
> < " movq %2, %%mm6; " /* mm6 = 2^14 */
> < " pshufw $0, %%mm7, %%mm7; "
> ---
> > " punpcklwd %%mm7, %%mm7; "
> > " punpckldq %%mm7, %%mm7; "
> 1042,1043c1042,1044
> < " movq %%mm6, %%mm3; " /* mm3 = 2^14 */
> < " pshufw $0, %%mm2, %%mm2; "
> ---
> > " movq %2, %%mm3; " /* mm3 = 2^14 */
> > " punpcklwd %%mm2, %%mm2; "
> > " punpckldq %%mm2, %%mm2; "
> 1049,1050c1050,1067
> < " pmulhuw %%mm4, %%mm2; " /* mm2 = (srcpix * xcounter >> 16) */
> < " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * xfrac) >> 16 */
> ---
> > " movq %%mm4, %%mm5; " /* mm2 = (srcpix * xcounter >> 16) */
> > " psraw $15, %%mm5; "
> > " pand %%mm2, %%mm5; "
> > " movq %%mm2, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm4, %%mm6; "
> > " pmulhw %%mm4, %%mm2; "
> > " paddw %%mm5, %%mm2; "
> > " paddw %%mm6, %%mm2; "
> > " movq %%mm4, %%mm5; " /* mm3 = (srcpix * xfrac) >> 16) */
> > " psraw $15, %%mm5; "
> > " pand %%mm3, %%mm5; "
> > " movq %%mm3, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm4, %%mm6; "
> > " pmulhw %%mm4, %%mm3; "
> > " paddw %%mm5, %%mm3; "
> > " paddw %%mm6, %%mm3; "
> 1053c1070,1078
> < " pmulhuw %%mm7, %%mm2; "
> ---
> > " movq %%mm7, %%mm5; "
> > " psraw $15, %%mm5; "
> > " pand %%mm2, %%mm5; "
> > " movq %%mm2, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm7, %%mm6; "
> > " pmulhw %%mm7, %%mm2; "
> > " paddw %%mm5, %%mm2; "
> > " paddw %%mm6, %%mm2; "
> 1076,1077c1101,1102
> < " movq %2, %%mm6; " /* mm6 = 2^14 */
> < " pshufw $0, %%mm7, %%mm7; "
> ---
> > " punpcklwd %%mm7, %%mm7; "
> > " punpckldq %%mm7, %%mm7; "
> 1093,1094c1118,1120
> < " movq %%mm6, %%mm3; " /* mm3 = 2^14 */
> < " pshufw $0, %%mm2, %%mm2; "
> ---
> > " movq %2, %%mm3; " /* mm3 = 2^14 */
> > " punpcklwd %%mm2, %%mm2; "
> > " punpckldq %%mm2, %%mm2; "
> 1100,1101c1126,1143
> < " pmulhuw %%mm4, %%mm2; " /* mm2 = (srcpix * xcounter >> 16) */
> < " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * xfrac) >> 16 */
> ---
> > " movq %%mm4, %%mm5; " /* mm2 = (srcpix * xcounter >> 16) */
> > " psraw $15, %%mm5; "
> > " pand %%mm2, %%mm5; "
> > " movq %%mm2, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm4, %%mm6; "
> > " pmulhw %%mm4, %%mm2; "
> > " paddw %%mm5, %%mm2; "
> > " paddw %%mm6, %%mm2; "
> > " movq %%mm4, %%mm5; " /* mm3 = (srcpix * xfrac) >> 16) */
> > " psraw $15, %%mm5; "
> > " pand %%mm3, %%mm5; "
> > " movq %%mm3, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm4, %%mm6; "
> > " pmulhw %%mm4, %%mm3; "
> > " paddw %%mm5, %%mm3; "
> > " paddw %%mm6, %%mm3; "
> 1104c1146,1154
> < " pmulhuw %%mm7, %%mm2; "
> ---
> > " movq %%mm7, %%mm5; "
> > " psraw $15, %%mm5; "
> > " pand %%mm2, %%mm5; "
> > " movq %%mm2, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm7, %%mm6; "
> > " pmulhw %%mm7, %%mm2; "
> > " paddw %%mm5, %%mm2; "
> > " paddw %%mm6, %%mm2; "
> 1202c1252
> < long long One64 = 0x4000400040004000;
> ---
> > long long One64 = 0x4000400040004000ULL;
> 1210c1260,1261
> < " pshufw $0, %%mm7, %%mm7; "
> ---
> > " punpcklwd %%mm7, %%mm7; "
> > " punpckldq %%mm7, %%mm7; "
> 1232c1283,1284
> < " pshufw $0, %%mm1, %%mm1; "
> ---
> > " punpcklwd %%mm1, %%mm1; "
> > " punpckldq %%mm1, %%mm1; "
> 1241,1242c1293,1310
> < " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * yfrac) >> 16 */
> < " pmulhuw %%mm1, %%mm4; " /* mm4 = (srcpix * ycounter >> 16) */
> ---
> > " movq %%mm4, %%mm0; " /* mm3 = (srcpix * yfrac) >> 16) */
> > " psraw $15, %%mm0; "
> > " pand %%mm3, %%mm0; "
> > " movq %%mm3, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm4, %%mm2; "
> > " pmulhw %%mm4, %%mm3; "
> > " paddw %%mm0, %%mm3; "
> > " paddw %%mm2, %%mm3; "
> > " movq %%mm1, %%mm0; " /* mm4 = (srcpix * ycounter >> 16) */
> > " psraw $15, %%mm0; "
> > " pand %%mm4, %%mm0; "
> > " movq %%mm4, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm1, %%mm2; "
> > " pmulhw %%mm1, %%mm4; "
> > " paddw %%mm0, %%mm4; "
> > " paddw %%mm2, %%mm4; "
> 1246c1314,1323
> < " pmulhuw %%mm7, %%mm4; "
> ---
> > " movq %%mm7, %%mm0; "
> > " psraw $15, %%mm0; "
> > " pand %%mm4, %%mm0; "
> > " movq %%mm4, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm7, %%mm2; "
> > " pmulhw %%mm7, %%mm4; "
> > " paddw %%mm0, %%mm4; "
> > " paddw %%mm2, %%mm4; "
> > " pxor %%mm0, %%mm0; "
> 1270c1347,1348
> < " pshufw $0, %%mm7, %%mm7; "
> ---
> > " punpcklwd %%mm7, %%mm7; "
> > " punpckldq %%mm7, %%mm7; "
> 1292c1370,1371
> < " pshufw $0, %%mm1, %%mm1; "
> ---
> > " punpcklwd %%mm1, %%mm1; "
> > " punpckldq %%mm1, %%mm1; "
> 1301,1302c1380,1397
> < " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * yfrac) >> 16 */
> < " pmulhuw %%mm1, %%mm4; " /* mm4 = (srcpix * ycounter >> 16) */
> ---
> > " movq %%mm4, %%mm0; " /* mm3 = (srcpix * yfrac) >> 16) */
> > " psraw $15, %%mm0; "
> > " pand %%mm3, %%mm0; "
> > " movq %%mm3, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm4, %%mm2; "
> > " pmulhw %%mm4, %%mm3; "
> > " paddw %%mm0, %%mm3; "
> > " paddw %%mm2, %%mm3; "
> > " movq %%mm1, %%mm0; " /* mm4 = (srcpix * ycounter >> 16) */
> > " psraw $15, %%mm0; "
> > " pand %%mm4, %%mm0; "
> > " movq %%mm4, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm1, %%mm2; "
> > " pmulhw %%mm1, %%mm4; "
> > " paddw %%mm0, %%mm4; "
> > " paddw %%mm2, %%mm4; "
> 1306c1401,1410
> < " pmulhuw %%mm7, %%mm4; "
> ---
> > " movq %%mm7, %%mm0; "
> > " psraw $15, %%mm0; "
> > " pand %%mm4, %%mm0; "
> > " movq %%mm4, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm7, %%mm2; "
> > " pmulhw %%mm7, %%mm4; "
> > " paddw %%mm0, %%mm4; "
> > " paddw %%mm2, %%mm4; "
> > " pxor %%mm0, %%mm0; "
> 1535,1536c1639,1642
> < " pshufw $0, %%mm1, %%mm1; "
> < " pshufw $0, %%mm2, %%mm2; "
> ---
> > " punpcklwd %%mm1, %%mm1; "
> > " punpckldq %%mm1, %%mm1; "
> > " punpcklwd %%mm2, %%mm2; "
> > " punpckldq %%mm2, %%mm2; "
> 1564,1565c1670,1673
> < " pshufw $0, %%mm1, %%mm1; "
> < " pshufw $0, %%mm2, %%mm2; "
> ---
> > " punpcklwd %%mm1, %%mm1; "
> > " punpckldq %%mm1, %%mm1; "
> > " punpcklwd %%mm2, %%mm2; "
> > " punpckldq %%mm2, %%mm2; "
>
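For anyone reading the diff later: the long nine-instruction runs each replace a single pmulhuw, the unsigned 16-bit high multiply that SSE added and baseline MMX lacks. MMX only has the signed pmulhw, so the patch adds two correction terms: psraw $15 turns each word's sign bit into an all-ones mask, pand uses that mask to select the other operand, and the two paddw instructions add those selections back onto the signed high product. Here is a minimal C model of that identity on one 16-bit lane, assuming arithmetic right shift of signed values as on mainstream compilers (function names are mine, not from transform.c):

    #include <assert.h>
    #include <stdint.h>

    /* Reference: what pmulhuw computes per 16-bit lane. */
    static uint16_t mulhi_u16(uint16_t a, uint16_t b)
    {
        return (uint16_t)(((uint32_t)a * b) >> 16);
    }

    /* MMX-only emulation, mirroring the patch: psraw $15 builds an
       all-ones mask from the sign bit, pand selects the other operand,
       pmulhw gives the signed high product, two paddw fix it up. */
    static uint16_t mulhi_u16_via_signed(uint16_t a, uint16_t b)
    {
        uint16_t mask_a = (uint16_t)-(a >> 15);        /* psraw $15 on a */
        uint16_t mask_b = (uint16_t)-(b >> 15);        /* psraw $15 on b */
        uint16_t fix1 = mask_a & b;                    /* pand           */
        uint16_t fix2 = mask_b & a;                    /* pand           */
        int32_t hi = ((int32_t)(int16_t)a * (int16_t)b) >> 16; /* pmulhw */
        return (uint16_t)((uint16_t)hi + fix1 + fix2); /* paddw (wraps)  */
    }

    int main(void)
    {
        /* Exhaustive over one operand, sampled over the other. */
        for (uint32_t a = 0; a < 0x10000; a++)
            for (uint32_t b = 0; b < 0x10000; b += 257)
                assert(mulhi_u16((uint16_t)a, (uint16_t)b) ==
                       mulhi_u16_via_signed((uint16_t)a, (uint16_t)b));
        return 0;
    }

The identity holds because paddw wraps modulo 2^16, which absorbs the cross term that appears when a word is reinterpreted as signed.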

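The pshufw $0 removals are the same story: pshufw is also an SSE addition, so the patch broadcasts word 0 across the register with two baseline-MMX unpacks instead. A sketch of the equivalence on a 64-bit register value (helper names are mine):

    #include <assert.h>
    #include <stdint.h>

    /* punpcklwd mm,mm with itself: w0,w1,w2,w3 -> w0,w0,w1,w1 */
    static uint64_t punpcklwd_self(uint64_t v)
    {
        uint64_t w0 = v & 0xFFFFu;
        uint64_t w1 = (v >> 16) & 0xFFFFu;
        return w0 | (w0 << 16) | (w1 << 32) | (w1 << 48);
    }

    /* punpckldq mm,mm with itself: d0,d1 -> d0,d0 */
    static uint64_t punpckldq_self(uint64_t v)
    {
        uint64_t d0 = v & 0xFFFFFFFFu;
        return d0 | (d0 << 32);
    }

    int main(void)
    {
        uint64_t v = 0x1111222233334444u;   /* words w3,w2,w1,w0 */
        /* pshufw $0, mm, mm would put w0 in all four lanes;
           the two self-unpacks compose to the same broadcast. */
        uint64_t broadcast = punpckldq_self(punpcklwd_self(v));
        assert(broadcast == 0x4444444444444444u);
        return 0;
    }

It costs one extra instruction per broadcast, but keeps the routine runnable on plain-MMX processors, which is the point of dropping the NO_SSE flag.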