vlc | branch: master | Lyndon Brown <jnq...@gmail.com> | Fri Jan 25 06:05:54 2019 +0000| [c29fcde3887e4fbccc6af4a860a59febfe837f4d] | committer: Jean-Baptiste Kempf
packetizer/startcode_helper: enhance with AVX2 Signed-off-by: Jean-Baptiste Kempf <j...@videolan.org> > http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=c29fcde3887e4fbccc6af4a860a59febfe837f4d --- modules/packetizer/startcode_helper.h | 94 +++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 5 deletions(-) diff --git a/modules/packetizer/startcode_helper.h b/modules/packetizer/startcode_helper.h index 2b61e5cf98..3391c68db3 100644 --- a/modules/packetizer/startcode_helper.h +++ b/modules/packetizer/startcode_helper.h @@ -1,7 +1,7 @@ /***************************************************************************** * startcode_helper.h: Startcodes helpers ***************************************************************************** - * Copyright (C) 2016 VideoLAN Authors + * Copyright (C) 2016, 2019 VideoLAN Authors * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by @@ -22,6 +22,9 @@ #include <vlc_cpu.h> +#if !defined(CAN_COMPILE_AVX2) && defined(HAVE_AVX2_INTRINSICS) + #include <immintrin.h> +#endif #if !defined(CAN_COMPILE_SSE2) && defined(HAVE_SSE2_INTRINSICS) #include <emmintrin.h> #endif @@ -44,15 +47,91 @@ }\ } +#if defined(CAN_COMPILE_AVX2) || defined(HAVE_AVX2_INTRINSICS) + +__attribute__ ((__target__ ("avx2"))) +static inline const uint8_t * startcode_FindAnnexB_AVX2( const uint8_t *p, const uint8_t *end ) +{ + end -= 3; + + /* First align to 32 */ + const uint8_t *alignedend = p + 32 - ((intptr_t)p & 31); + for (; p < alignedend && p <= end; p++) { + if (p[0] == 0 && p[1] == 0 && p[2] == 1) + return p; + } + + if( p == end ) + return NULL; + + alignedend = end - ((intptr_t) end & 31); + if( alignedend > p ) + { +#ifdef CAN_COMPILE_AVX2 + asm volatile( + "vpxor %%ymm1, %%ymm1\n" + ::: "ymm1" + ); +#else + __m256i zeros = _mm256_set1_epi8( 0x00 ); +#endif + for( ; p < alignedend; p += 32) + { + uint32_t match; +#ifdef CAN_COMPILE_AVX2 + asm volatile( + "vmovdqa 0(%[v]), %%ymm0\n" + "vpcmpeqb %%ymm1, %%ymm0\n" + "vpmovmskb %%ymm0, %[match]\n" + : [match]"=r"(match) + : [v]"r"(p) + : "ymm0" + ); +#else + __m256i v = _mm256_load_si256((__m256i*)p); + __m256i res = _mm256_cmpeq_epi8( zeros, v ); + match = _mm256_movemask_epi8( res ); /* mask will be in reversed match order */ +#endif + if( match & 0x0000000F ) + TRY_MATCH(p, 0); + if( match & 0x000000F0 ) + TRY_MATCH(p, 4); + if( match & 0x00000F00 ) + TRY_MATCH(p, 8); + if( match & 0x0000F000 ) + TRY_MATCH(p, 12); + if( match & 0x000F0000 ) + TRY_MATCH(p, 16); + if( match & 0x00F00000 ) + TRY_MATCH(p, 20); + if( match & 0x0F000000 ) + TRY_MATCH(p, 24); + if( match & 0xF0000000 ) + TRY_MATCH(p, 28); + } + } + + for (; p <= end; p++) { + if (p[0] == 0 && p[1] == 0 && p[2] == 1) + return p; + } + + return NULL; +} + +#endif + #if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS) __attribute__ ((__target__ ("sse2"))) static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p, const uint8_t *end ) { + end -= 3; + /* First align to 16 */ /* Skipping this step and doing unaligned loads isn't faster */ const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15); - for (end -= 3; p < alignedend && p <= end; p++) { + for (; p < alignedend && p <= end; p++) { if (p[0] == 0 && p[1] == 0 && p[2] == 1) return p; } @@ -140,13 +219,18 @@ static inline const uint8_t * startcode_FindAnnexB_Bits( const uint8_t *p, const } #undef TRY_MATCH -#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS) +#if defined(CAN_COMPILE_AVX2) || defined(HAVE_AVX2_INTRINSICS) || defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS) static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end ) { +#if defined(CAN_COMPILE_AVX2) || defined(HAVE_AVX2_INTRINSICS) + if (vlc_CPU_AVX2()) + return startcode_FindAnnexB_AVX2(p, end); +#endif +#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS) if (vlc_CPU_SSE2()) return startcode_FindAnnexB_SSE2(p, end); - else - return startcode_FindAnnexB_Bits(p, end); +#endif + return startcode_FindAnnexB_Bits(p, end); } #else #define startcode_FindAnnexB startcode_FindAnnexB_Bits _______________________________________________ vlc-commits mailing list vlc-commits@videolan.org https://mailman.videolan.org/listinfo/vlc-commits