From: Maxime Taisant <maximetais...@hotmail.fr> This code aim to improve the performances of the mct using SSE instructions. It was submitted by Nicolas Bertrand a while ago and was rejected. I would like to have some informations on what needs to be modified or improved. Thank you. --- libavcodec/jpeg2000dsp.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-)
diff --git a/libavcodec/jpeg2000dsp.c b/libavcodec/jpeg2000dsp.c index 6e04c3a..a546b7d 100644 --- a/libavcodec/jpeg2000dsp.c +++ b/libavcodec/jpeg2000dsp.c @@ -24,6 +24,10 @@ #include "libavutil/attributes.h" #include "jpeg2000dsp.h" +#ifdef __SSE__ +#include <xmmintrin.h> +#endif + /* Inverse ICT parameters in float and integer. * int value = (float value) * (1<<16) */ static const float f_ict_params[4] = { @@ -40,9 +44,56 @@ static const int i_ict_params[4] = { 116130 }; +static void mct_decode_sse( + float* restrict c0, + float* restrict c1, + float* restrict c2, + int n) +{ + int i; + __m128 vrv, vgu, vgv, vbu; + vrv = _mm_set1_ps(1.402f); + vgu = _mm_set1_ps(0.34413f); + vgv = _mm_set1_ps(0.71414f); + vbu = _mm_set1_ps(1.772f); + for (i = 0; i < (n >> 3); ++i) { + __m128 vy, vu, vv; + __m128 vr, vg, vb; + + vy = _mm_load_ps(c0); + vu = _mm_load_ps(c1); + vv = _mm_load_ps(c2); + vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); + vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); + vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); + _mm_store_ps(c0, vr); + _mm_store_ps(c1, vg); + _mm_store_ps(c2, vb); + c0 += 4; + c1 += 4; + c2 += 4; + vy = _mm_load_ps(c0); + vu = _mm_load_ps(c1); + vv = _mm_load_ps(c2); + vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); + vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); + vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); + _mm_store_ps(c0, vr); + _mm_store_ps(c1, vg); + _mm_store_ps(c2, vb); + c0 += 4; + c1 += 4; + c2 += 4; + } + n &= 7; +} + static void ict_float(void *_src0, void *_src1, void *_src2, int csize) { - float *src0 = _src0, *src1 = _src1, *src2 = _src2; + float *src0 = _src0, *src1 = _src1, *src2 = _src2; +#ifdef __SSE__ + mct_decode_sse(src0, src1, src2, csize); +#else float i0f, i1f, i2f; int i; @@ -55,6 +106,7 @@ static void ict_float(void *_src0, void *_src1, void *_src2, int csize) *src1++ = i1f; *src2++ = i2f; } +#endif } static void ict_int(void *_src0, void *_src1, void *_src2, int csize) -- 2.7.4 _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel