'Morning! I guess there are a number of people on here who are experts at writing optimised code exploiting every bit of a processor's instruction set. The code I recently isolated from the Perian project also attempts this, and I just came across something that flabbergasted me. Perian is a Mac-only project, so it can make a number of safe assumptions about the CPUs it'll run on. Not so the split-off I created, which is Mac + Win32, and likely to run on (lowly) AMD CPUs as well as high-end Intel ones. So I looked at the actual performance of a few of the SSE-optimised functions, knowing from experience that using SIMD also introduces overhead, and that a good compiler is likely to create better assembly than a hand-coding non-expert tinkerer.
So I have this little benchmark comparing 2 routines that convert yuv420 to yuv422, running them on data that's allocated only once and should contain random bits, counting the number of calls made during a 5 second period (using tested, high resolution, low-latency timing functions): AVPicture pict; uint8_t *baseAddr = NULL; int width = 720, height = 576, outRB = 1440; double t; unsigned long N = 0; memset( &pict.data, 0, sizeof(pict.data) ); pict.linesize[0] = 752, pict.linesize[1] = 376; init_HRTime(); if( (pict.data[0] = (uint8_t*) malloc( width * height * 2 * sizeof(uint8_t) )) && (pict.data[1] = (uint8_t*) malloc( width * height * 2 * sizeof(uint8_t) )) && (pict.data[2] = (uint8_t*) malloc( width * height * 2 * sizeof(uint8_t) )) && (baseAddr = (uint8_t*) malloc( width * height * 4 * 2 * sizeof(uint8_t) )) ){ double startT = HRTime_Time(); do{ Y420toY422_sse2( &pict, baseAddr, outRB, width, height, &N ); } while( (t = HRTime_Time() - startT) < 5 ); fprintf( stderr, "%lu Y420toY422_sse2(outRB=%d,width=%d,height=%d) conversions in %gs; %gHz\n", N, outRB, width, height, t, N / t ); startT = HRTime_Time(); N = 0; do{ Y420toY422_x86_scalar( &pict, baseAddr, outRB, width, height, &N ); N += 1; } while( (t = HRTime_Time() - startT) < 5 ); fprintf( stderr, "%lu Y420toY422_x86_scalar(outRB=%d,width=%d,height=%d) conversions in %gs; %gHz\n", N, outRB, width, height, t, N / t ); } On my 2.7Ghz dual-core i7 MBP, I get about 10000Hz for the SSE version, and roughly half that for the generic, scalar function, using gcc-4.2 as well as using MSVC 2010 Express running under WinXP in VirtualBox. The factor 2 speed gain for SSE code also applies on 2 AMD machines (mid-end laptop and C62 netbook). Then I installed a new mingw32 cross-compiler based on gcc 4.7 and for the heck of it compiled my benchmark with it ... and found same factor 2 ... but in favour of the scalar code, on my i7 . It's more like a factor 2.5, actually. 
Same thing after installing the native OS X gcc 4.7 version. The question: is gcc-4.7 clever enough to do a better optimisation of the 2nd benchmark loop than the 1st loop, or does it really generate so much better assembly from the scalar function? NB, -fno-inline-functions has no effect here. Not that it matters much, as even on the C62 netbook the SSE function runs at almost 700Hz. The functions: //Handles the last row for Y420 videos with an odd number of luma rows //FIXME: odd number of luma columns is not handled and they will be lost static void Y420toY422_lastrow(uint8_t *o, uint8_t *yc, uint8_t *uc, uint8_t *vc, int halfWidth) { int x; for(x=0; x < halfWidth; x++) { int x4 = x*4, x2 = x*2; o[x4] = uc[x]; o[++x4] = yc[x2]; o[++x4] = vc[x]; o[++x4] = yc[++x2]; } } #define HandleLastRow(o, yc, uc, vc, halfWidth, height) if (unlikely(height & 1)) Y420toY422_lastrow(o, yc, uc, vc, halfWidth) #include <emmintrin.h> #ifdef _MSCVER # define FASTCALL __fastcall #elif defined(__i386__) && !defined(__llvm__) && !defined(_MSC_VER) # define FASTCALL __attribute__((fastcall)) #else # define FASTCALL #endif static FASTCALL void Y420toY422_sse2(AVPicture *picture, uint8_t *o, int outRB, int width, int height, unsigned long *N) { uint8_t *yc = picture->data[0], *uc = picture->data[1], *vc = picture->data[2]; int rY = picture->linesize[0], rUV = picture->linesize[1]; int y, x, halfwidth = width >> 1, halfheight = height >> 1; int vWidth = width >> 5; for (y = 0; y < halfheight; y++) { uint8_t *o2 = o + outRB, *yc2 = yc + rY; __m128i *ov = (__m128i*)o, *ov2 = (__m128i*)o2, *yv = (__m128i*)yc, *yv2 = (__m128i*)yc2; __m128i *uv = (__m128i*)uc,*vv = (__m128i*)vc; #if defined(__i386__) && !defined(_MSC_VER) //FIXMERJVB int vWidth_ = vWidth; asm volatile( "\n0: \n\t" "movdqa (%2), %%xmm0 \n\t" "movdqa 16(%2), %%xmm2 \n\t" "movdqa (%3), %%xmm1 \n\t" "movdqa 16(%3), %%xmm3 \n\t" "movdqu (%4), %%xmm4 \n\t" "movdqu (%5), %%xmm5 \n\t" "addl $32, %2 \n\t" "addl $32, %3 \n\t" "addl $16, 
%4 \n\t" "addl $16, %5 \n\t" "movdqa %%xmm4, %%xmm6 \n\t" "punpcklbw %%xmm5, %%xmm4 \n\t" /*chroma_l*/ "punpckhbw %%xmm5, %%xmm6 \n\t" /*chroma_h*/ "movdqa %%xmm4, %%xmm5 \n\t" "punpcklbw %%xmm0, %%xmm5 \n\t" "movntdq %%xmm5, (%0) \n\t" /*ov[x4]*/ "movdqa %%xmm4, %%xmm5 \n\t" "punpckhbw %%xmm0, %%xmm5 \n\t" "movntdq %%xmm5, 16(%0) \n\t" /*ov[x4+1]*/ "movdqa %%xmm6, %%xmm5 \n\t" "punpcklbw %%xmm2, %%xmm5 \n\t" "movntdq %%xmm5, 32(%0) \n\t" /*ov[x4+2]*/ "movdqa %%xmm6, %%xmm5 \n\t" "punpckhbw %%xmm2, %%xmm5 \n\t" "movntdq %%xmm5, 48(%0) \n\t" /*ov[x4+3]*/ "addl $64, %0 \n\t" "movdqa %%xmm4, %%xmm5 \n\t" "punpcklbw %%xmm1, %%xmm5 \n\t" "movntdq %%xmm5, (%1) \n\t" /*ov2[x4]*/ "punpckhbw %%xmm1, %%xmm4 \n\t" "movntdq %%xmm4, 16(%1) \n\t" /*ov2[x4+1]*/ "movdqa %%xmm6, %%xmm5 \n\t" "punpcklbw %%xmm3, %%xmm5 \n\t" "movntdq %%xmm5, 32(%1) \n\t" /*ov2[x4+2]*/ "punpckhbw %%xmm3, %%xmm6 \n\t" "movntdq %%xmm6, 48(%1) \n\t" /*ov2[x4+3]*/ "addl $64, %1 \n\t" "decl %6 \n\t" "jnz 0b \n\t" : "+r" (ov), "+r" (ov2), "+r" (yv), "+r" (yv2), "+r" (uv), "+r" (vv), "+m"(vWidth_) : : "memory", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); #else for (x = 0; x < vWidth; x++) { int x2 = x*2, x4 = x*4; __m128i tmp_y = yv[x2], tmp_y3 = yv[x2+1], tmp_y2 = yv2[x2], tmp_y4 = yv2[x2+1], tmp_u = _mm_loadu_si128(&uv[x]), tmp_v = _mm_loadu_si128(&vv[x]), chroma_l = _mm_unpacklo_epi8(tmp_u, tmp_v), chroma_h = _mm_unpackhi_epi8(tmp_u, tmp_v); _mm_stream_si128(&ov[x4], _mm_unpacklo_epi8(chroma_l, tmp_y)); _mm_stream_si128(&ov[x4+1], _mm_unpackhi_epi8(chroma_l, tmp_y)); _mm_stream_si128(&ov[x4+2], _mm_unpacklo_epi8(chroma_h, tmp_y3)); _mm_stream_si128(&ov[x4+3], _mm_unpackhi_epi8(chroma_h, tmp_y3)); _mm_stream_si128(&ov2[x4], _mm_unpacklo_epi8(chroma_l, tmp_y2)); _mm_stream_si128(&ov2[x4+1],_mm_unpackhi_epi8(chroma_l, tmp_y2)); _mm_stream_si128(&ov2[x4+2],_mm_unpacklo_epi8(chroma_h, tmp_y4)); _mm_stream_si128(&ov2[x4+3],_mm_unpackhi_epi8(chroma_h, tmp_y4)); } #endif for (x=vWidth * 16; x < halfwidth; 
x++) { int x4 = x*4, x2 = x*2; o2[x4] = o[x4] = uc[x]; x4++; o [x4] = yc[x2], o2[x4] = yc2[x2]; x4++; o2[x4] = o[x4] = vc[x]; x4++, x2++; o [x4] = yc[x2], o2[x4] = yc2[x2]; } o += outRB*2; yc += rY*2; uc += rUV; vc += rUV; } HandleLastRow(o, yc, uc, vc, halfwidth, height); *N += 1; } static FASTCALL void Y420toY422_x86_scalar(AVPicture *picture, uint8_t *o, int outRB, int width, int height, unsigned long *N) { uint8_t *yc = picture->data[0], *u = picture->data[1], *v = picture->data[2]; int rY = picture->linesize[0], rUV = picture->linesize[1]; int halfheight = height >> 1, halfwidth = width >> 1; int y, x; for (y = 0; y < halfheight; y ++) { uint8_t *o2 = o + outRB, *yc2 = yc + rY; for (x = 0; x < halfwidth; x++) { int x4 = x*4, x2 = x*2; o2[x4] = o[x4] = u[x]; o [++x4] = yc[x2]; o2[x4] = yc2[x2]; x4++; o2[x4] = o[x4] = v[x]; o [++x4] = yc[++x2]; o2[x4] = yc2[x2]; } o += outRB*2; yc += rY*2; u += rUV; v += rUV; } HandleLastRow(o, yc, u, v, halfwidth, height); *N += 1; } _______________________________________________ Libav-user mailing list Libav-user@ffmpeg.org http://ffmpeg.org/mailman/listinfo/libav-user