Re: [libav-devel] [PATCH 13/15] lavr: x86: optimized 6-channel flt to s16p conversion
Hi, On Sun, Aug 5, 2012 at 9:52 PM, Justin Ruggles wrote: > --- > libavresample/x86/audio_convert.asm | 74 > libavresample/x86/audio_convert_init.c | 13 ++ > 2 files changed, 87 insertions(+), 0 deletions(-) LGTM. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 13/15] lavr: x86: optimized 6-channel flt to s16p conversion
--- libavresample/x86/audio_convert.asm| 74 libavresample/x86/audio_convert_init.c | 13 ++ 2 files changed, 87 insertions(+), 0 deletions(-) diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 2908cbf..c666da0 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -1091,3 +1091,77 @@ CONV_FLT_TO_S16P_2CH INIT_XMM avx CONV_FLT_TO_S16P_2CH %endif + +;-- +; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len, +; int channels); +;-- + +%macro CONV_FLT_TO_S16P_6CH 0 +%if ARCH_X86_64 +cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 +%else +cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5 +%define lend dword r2m +%endif +mov dst1q, [dstq+ gprsize] +mov dst2q, [dstq+2*gprsize] +mov dst3q, [dstq+3*gprsize] +mov dst4q, [dstq+4*gprsize] +mov dst5q, [dstq+5*gprsize] +mov dstq, [dstq ] +sub dst1q, dstq +sub dst2q, dstq +sub dst3q, dstq +sub dst4q, dstq +sub dst5q, dstq +mova m6, [pf_s16_scale] +.loop: +mulps m0, m6, [srcq+0*mmsize] +mulps m3, m6, [srcq+1*mmsize] +mulps m1, m6, [srcq+2*mmsize] +mulps m4, m6, [srcq+3*mmsize] +mulps m2, m6, [srcq+4*mmsize] +mulps m5, m6, [srcq+5*mmsize] +cvtps2dq m0, m0 +cvtps2dq m1, m1 +cvtps2dq m2, m2 +cvtps2dq m3, m3 +cvtps2dq m4, m4 +cvtps2dq m5, m5 +packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 +packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15 +packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 +PALIGNRm3, m1, m0, 12, m4 ; m3 = 6, 7, 8, 9, 10, 11, x, x +shufps m1, m2, q1032; m1 = 12, 13, 14, 15, 16, 17, 18, 19 +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23, x, x +SBUTTERFLY2 wd, 0, 3, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9 +; m3 = 4, 10, 5, 11, x, x, x, x +SBUTTERFLY2 wd, 1, 2, 4 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21 +; m2 = 16, 22, 17, 23, x, x, x, x +SBUTTERFLY2 dq, 0, 1, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19 +; m1 = 2, 8, 14, 20, 3, 9, 15, 21 +punpckldq m3, m2 ; m3 = 4, 10, 16, 22, 5, 11, 
17, 23 +movq[dstq ], m0 +movhps [dstq+dst1q], m0 +movq[dstq+dst2q], m1 +movhps [dstq+dst3q], m1 +movq[dstq+dst4q], m3 +movhps [dstq+dst5q], m3 +add srcq, mmsize*6 +add dstq, mmsize/2 +sub lend, mmsize/4 +jg .loop +REP_RET +%endmacro + +%define PALIGNR PALIGNR_MMX +INIT_XMM sse2 +CONV_FLT_TO_S16P_6CH +%define PALIGNR PALIGNR_SSSE3 +INIT_XMM ssse3 +CONV_FLT_TO_S16P_6CH +%if HAVE_AVX +INIT_XMM avx +CONV_FLT_TO_S16P_6CH +%endif diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c index d623543..944f1cd 100644 --- a/libavresample/x86/audio_convert_init.c +++ b/libavresample/x86/audio_convert_init.c @@ -127,6 +127,13 @@ extern void ff_conv_flt_to_s16p_2ch_sse2(int16_t *const *dst, float *src, extern void ff_conv_flt_to_s16p_2ch_avx (int16_t *const *dst, float *src, int len, int channels); +extern void ff_conv_flt_to_s16p_6ch_sse2 (int16_t *const *dst, float *src, + int len, int channels); +extern void ff_conv_flt_to_s16p_6ch_ssse3(int16_t *const *dst, float *src, + int len, int channels); +extern void ff_conv_flt_to_s16p_6ch_avx (int16_t *const *dst, float *src, + int len, int channels); + av_cold void ff_audio_convert_init_x86(AudioConvert *ac) { #if HAVE_YASM @@ -184,6 +191,8 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac) 6, 16, 4, "SSE2", ff_conv_s16_to_fltp_6ch_sse2); ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, 2, 16, 8, "SSE2", ff_conv_flt_to_s16p_2ch_sse2); +ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, + 6, 16, 4, "SSE2", ff_conv_flt_to_s16p_6ch_sse2); } if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSE) { ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16P, @@ -196,6 +205,8 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac) 6, 16, 4, "SSSE3", ff_conv_s16_to_s16p_6ch_
Re: [libav-devel] [PATCH 13/15] lavr: x86: optimized 6-channel flt to s16p conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles wrote: > +movhlps m3, m1 > +movlhps m3, m2 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19 > +movlhps m1, m1 > +movhlps m1, m0 ; m1 = 4, 5, 6, 7, 8, 9, 10, 11 > +psrldq m1, 4 ; m1 = 6, 7, 8, 9, 10, 11, x, x > +psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x This can be done with 2x palignr + 1x psrldq, which saves 3 instructions. > +add srcq, mmsize*6 > +add dstq, mmsize/2 > +sub lend, mmsize/4 Use pointer munging here to remove one add/sub. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 13/15] lavr: x86: optimized 6-channel flt to s16p conversion
--- libavresample/x86/audio_convert.asm| 74 libavresample/x86/audio_convert_init.c |9 2 files changed, 83 insertions(+), 0 deletions(-) diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index c371aa3..4ce490c 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -1007,3 +1007,77 @@ CONV_FLT_TO_S16P_2CH INIT_XMM avx CONV_FLT_TO_S16P_2CH %endif + +;-- +; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len, +; int channels); +;-- + +%macro CONV_FLT_TO_S16P_6CH 0 +cglobal conv_flt_to_s16p_6ch, 2,8,7, dst, src, dst1, dst2, dst3, dst4, dst5, len +%if ARCH_X86_64 +mov lend, r2d +%else +%define lend dword r2m +%endif +mov dst1q, [dstq+ gprsize] +mov dst2q, [dstq+2*gprsize] +mov dst3q, [dstq+3*gprsize] +mov dst4q, [dstq+4*gprsize] +mov dst5q, [dstq+5*gprsize] +mov dstq, [dstq ] +sub dst1q, dstq +sub dst2q, dstq +sub dst3q, dstq +sub dst4q, dstq +sub dst5q, dstq +mova m6, [pf_s16_scale] +ALIGN 16 +.loop: +mulps m0, m6, [srcq ] +mulps m3, m6, [srcq+ mmsize] +mulps m1, m6, [srcq+2*mmsize] +mulps m4, m6, [srcq+3*mmsize] +mulps m2, m6, [srcq+4*mmsize] +mulps m5, m6, [srcq+5*mmsize] +cvtps2dq m0, m0 +cvtps2dq m1, m1 +cvtps2dq m2, m2 +cvtps2dq m3, m3 +cvtps2dq m4, m4 +cvtps2dq m5, m5 +packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 +packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15 +packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 +movhlpsm3, m1 +movlhpsm3, m2 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19 +movlhpsm1, m1 +movhlpsm1, m0 ; m1 = 4, 5, 6, 7, 8, 9, 10, 11 +psrldq m1, 4; m1 = 6, 7, 8, 9, 10, 11, x, x +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23, x, x +punpcklwd m4, m0, m1 ; m4 = 0, 6, 1, 7, 2, 8, 3, 9 +punpckhwd m0, m1 ; m0 = 4, 10, 5, 11, x, x, x, x +punpcklwd m1, m3, m2 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21 +punpckhwd m3, m2 ; m3 = 16, 22, 17, 23, x, x, x, x +punpckldq m2, m4, m1 ; m2 = 0, 6, 12, 18, 1, 7, 13, 19 +punpckhdq m4, m1 ; m4 = 2, 8, 14, 20, 3, 9, 15, 21 +punpckldq m0, 
m3 ; m0 = 4, 10, 16, 22, 5, 11, 17, 23 +movq[dstq ], m2 +movhps [dstq+dst1q], m2 +movq[dstq+dst2q], m4 +movhps [dstq+dst3q], m4 +movq[dstq+dst4q], m0 +movhps [dstq+dst5q], m0 +add srcq, mmsize*6 +add dstq, mmsize/2 +sub lend, mmsize/4 +jg .loop +REP_RET +%endmacro + +INIT_XMM sse2 +CONV_FLT_TO_S16P_6CH +%if HAVE_AVX +INIT_XMM avx +CONV_FLT_TO_S16P_6CH +%endif diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c index d2732cd..a985d7d 100644 --- a/libavresample/x86/audio_convert_init.c +++ b/libavresample/x86/audio_convert_init.c @@ -117,6 +117,11 @@ extern void ff_conv_flt_to_s16p_2ch_sse2(int16_t *const *dst, float *src, extern void ff_conv_flt_to_s16p_2ch_avx (int16_t *const *dst, float *src, int len, int channels); +extern void ff_conv_flt_to_s16p_6ch_sse2(int16_t *const *dst, float *src, + int len, int channels); +extern void ff_conv_flt_to_s16p_6ch_avx (int16_t *const *dst, float *src, + int len, int channels); + av_cold void ff_audio_convert_init_x86(AudioConvert *ac) { #if HAVE_YASM @@ -171,6 +176,8 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac) 6, 16, 4, "SSE2", ff_conv_s16_to_fltp_6ch_sse2); ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, 2, 16, 8, "SSE2", ff_conv_flt_to_s16p_2ch_sse2); +ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, + 6, 16, 4, "SSE2", ff_conv_flt_to_s16p_6ch_sse2); } if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) { ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16, @@ -213,6 +220,8 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac) 6, 16, 4, "AVX", ff_conv_s16_to_fltp_6ch_avx); ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, 2, 16, 8, "AVX", ff_conv_flt_to_s16p_2ch_avx)