On Thu, Oct 17, 2013 at 7:17 AM, <[email protected]> wrote:
> # HG changeset patch
> # User Yuvaraj Venkatesh <[email protected]>
> # Date 1382012201 -19800
> # Thu Oct 17 17:46:41 2013 +0530
> # Node ID 77f60b1e4441ab947f75291eadf199d2f3ad1057
> # Parent fc9dbd798ac37ec1acc0596aa179f0deb586c092
> pixel16: converted sad_4 from vector class to intrinsic
>

Queued, but this is not where you should be spending your time. We have an urgent need to convert the remaining vector class 8bpp primitives (HIGH_BIT_DEPTH=0) to intrinsics, and we are almost done. The only ones left are in intra-sse3.cpp, blockcopy-avx2.cpp, and pixel-avx2.cpp. Of those, the intra DC and planar primitives have the highest priority, followed by AVX2, followed by the intra-angular functions. I am very tempted to just delete the 8bpp vector class intra-angular functions and use the C references until we generate assembly for those because I think their general concept needs to be redesigned. The HIGH_BIT_DEPTH=1 primitives will all go directly to assembly code because we are in no rush for those.

> diff -r fc9dbd798ac3 -r 77f60b1e4441 source/common/vec/pixel16-sse41.cpp > --- a/source/common/vec/pixel16-sse41.cpp Thu Oct 17 14:14:40 2013 > +0530 > +++ b/source/common/vec/pixel16-sse41.cpp Thu Oct 17 17:46:41 2013 > +0530 > @@ -41,42 +41,38 @@ > template<int ly> > int sad_4(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t > frefstride) > { > - Vec8s m1, n1; > + __m128i sum1 = _mm_setzero_si128(); > + __m128i T00, T01, T02, T03; > + __m128i T10, T11, T12, T13; > + __m128i T20, T21; > > - Vec4i sum(0); > - Vec8us sad(0); > - int max_iterators = (ly >> 4) << 4; > - int row; > + for (int i = 0; i < ly; i += 4) > + { > + T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * fencstride)); > + T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 1) * fencstride)); > + T01 = _mm_unpacklo_epi64(T00, T01); > + T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 2) * fencstride)); > + T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 3) * fencstride)); > + T03 = _mm_unpacklo_epi64(T02, T03); > > - for (row = 0; row < max_iterators; row += 16) > - { > - for (int i = 0; i < 16; i++) > - { > - m1.load_a(fenc); > - n1.load(fref); > - sad += abs(m1 - n1); > + T10 = _mm_loadl_epi64((__m128i*)(fref + (i + 0) * frefstride)); > + T11 = _mm_loadl_epi64((__m128i*)(fref + (i + 1) * frefstride)); > + T11 = _mm_unpacklo_epi64(T10, T11); > + T12 = _mm_loadl_epi64((__m128i*)(fref + (i + 2) * frefstride)); > + T13 = _mm_loadl_epi64((__m128i*)(fref + (i + 3) * frefstride)); > + T13 = _mm_unpacklo_epi64(T12, T13); > + T20 = _mm_sub_epi16(T01, T11); > + T20 = _mm_abs_epi16(T20); > + T21 = _mm_sub_epi16(T03, T13); > + T21 = _mm_abs_epi16(T21); > + T21 = _mm_add_epi16(T20, T21); > + sum1 = _mm_add_epi16(sum1, T21); > + } > + sum1 = _mm_hadd_epi16(sum1, sum1); > + sum1 = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); > + sum1 = _mm_hadd_epi32(_mm_hadd_epi32(sum1, sum1), sum1); > > - fenc += fencstride; > - fref += frefstride; > - } > - > - sum += extend_low(sad); > - sad = 0; > - } > - > - while (row++ < 
ly) > - { > - m1.load_a(fenc); > - n1.load(fref); > - sad += abs(m1 - n1); > - > - fenc += fencstride; > - fref += frefstride; > - } > - > - sum += extend_low(sad); > - > - return horizontal_add(sum); > + return _mm_cvtsi128_si32(sum1); > } > > template<int ly> > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > -- Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
