Re: [libav-devel] [GASPP PATCH] Comment out "it" instructions for armasm
On 2019-10-02 11:53:28 +0300, Martin Storsjö wrote: > Armasm implicitly adds it instructions as needed. In VS 2019 16.3, > there's a bug [1] in armasm making it fail to parse these it instructions > (but it can still add them implicitly just fine). > > I'm not sure if it really is worth working around this issue, or just > wait for it to hopefully be fixed by the next release again. > > [1] > https://developercommunity.visualstudio.com/content/problem/757709/armasm-fails-to-handle-it-instructions.html > --- > gas-preprocessor.pl | 4 > 1 file changed, 4 insertions(+) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index b6c2786..9d8fb5d 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1168,6 +1168,10 @@ sub handle_serialized_line { > $line =~ s/fmxr/vmsr/; > $line =~ s/fmrx/vmrs/; > $line =~ s/fadds/vadd.f32/; > +# Armasm in VS 2019 16.3 errors out on "it" instructions. But > +# armasm implicitly adds the necessary it instructions anyway, so we > +# can just filter it out. > +$line =~ s/^\s*it[te]*\s+/$comm$&/; > } > if ($as_type eq "armasm" and $arch eq "aarch64") { > # Convert "b.eq" into "beq" I guess this is ok-ish since armasm can handle implicit it instructions. Do you have any expectation of when a fixed version might be released? If it's more than a couple of weeks I'd say the workaround is worth it. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH] Filter out the armasm argument "-oldit" from the preprocessor
On 2019-10-02 10:58:46 +0300, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 6da37c1..b6c2786 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -162,6 +162,7 @@ if ($as_type ne "armasm") { > @preprocess_c_cmd = grep ! /^-fp/, @preprocess_c_cmd; > @preprocess_c_cmd = grep ! /^-EHsc$/, @preprocess_c_cmd; > @preprocess_c_cmd = grep ! /^-O/, @preprocess_c_cmd; > +@preprocess_c_cmd = grep ! /^-oldit/, @preprocess_c_cmd; > > @gcc_cmd = grep ! /^-G/, @gcc_cmd; > @gcc_cmd = grep ! /^-W/, @gcc_cmd; ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] aarch64: Add assembly support for -fsanitize=hwaddress tagged globals.
On 2019-08-21 22:40:13 +0300, Martin Storsjö wrote: > From: Peter Collingbourne > > As of LLVM r368102, Clang will set a pointer tag in bits 56-63 of the > address of a global when compiling with -fsanitize=hwaddress. This requires > an adjustment to assembly code that takes the address of such globals: the > code cannot use the regular R_AARCH64_ADR_PREL_PG_HI21 relocation to refer > to the global, since the tag would take the address out of range. Instead, > the code must use the non-checking (_NC) variant of the relocation (the > link-time check is substituted by a runtime check). > > This change makes the necessary adjustment in the movrel macro, where it is > needed when compiling with -fsanitize=hwaddress. > > Signed-off-by: Peter Collingbourne > Signed-off-by: Martin Storsjö > --- > libavutil/aarch64/asm.S | 8 > 1 file changed, 8 insertions(+) > > diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S > index bf5c1b7ee1..81d723b9b3 100644 > --- a/libavutil/aarch64/asm.S > +++ b/libavutil/aarch64/asm.S > @@ -32,6 +32,10 @@ > # define FUNC # > #endif > > +#ifndef __has_feature > +# define __has_feature(x) 0 > +#endif > + > .macro function name, export=0, align=2 > .macro endfunc > ELF .size \name, . - \name > @@ -94,7 +98,11 @@ ELF .size \name, . - \name > add \rd, \rd, :lo12:\val+(\offset) > .endif > #elif CONFIG_PIC > +# if __has_feature(hwaddress_sanitizer) > +adrp\rd, :pg_hi21_nc:\val+(\offset) > +# else > adrp\rd, \val+(\offset) > +# endif > add \rd, \rd, :lo12:\val+(\offset) > #else > ldr \rd, =\val+\offset ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/2] h264/arm64: implement missing 4:2:2 chroma loop filter neon functions
--- libavcodec/aarch64/h264dsp_init_aarch64.c | 18 ++-- libavcodec/aarch64/h264dsp_neon.S | 36 +++ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c index 07bda2ff07..85fea8e040 100644 --- a/libavcodec/aarch64/h264dsp_init_aarch64.c +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -37,10 +37,14 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma422_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, int stride, int alpha, int beta); void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride, int alpha, int beta); +void ff_h264_h_loop_filter_chroma422_intra_neon(uint8_t *pix, int stride, +int alpha, int beta); void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride, int alpha, int beta); @@ -91,10 +95,18 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; -c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; -c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; -c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon; + +if (chroma_format_idc <= 1) { +c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; +c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; +c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon; +} else { +c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon; 
+c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon; +c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon; +c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon; +} c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index 448e575b8c..bcce7e7da5 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -28,9 +28,9 @@ ldr w6, [x4] ccmpw3, #0, #0, ne mov v24.S[0], w6 -and w6, w6, w6, lsl #16 +and w8, w6, w6, lsl #16 b.eq1f -andsw6, w6, w6, lsl #8 +andsw8, w8, w8, lsl #8 b.ge2f 1: ret @@ -394,10 +394,10 @@ endfunc usubw v4.8H, v4.8H, v16.8B and v26.8B, v26.8B, v30.8B shl v4.8H, v4.8H, #2 -mov x2, v26.d[0] +mov x8, v26.d[0] sli v24.8H, v24.8H, #8 uaddw v4.8H, v4.8H, v18.8B -cbz x2, 9f +cbz x8, 9f usubw v4.8H, v4.8H, v2.8B rshrn v4.8B, v4.8H, #3 sminv4.8B, v4.8B, v24.8B @@ -436,6 +436,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1 sxtwx1, w1 sub x0, x0, #2 +h_loop_filter_chroma420: ld1 {v18.S}[0], [x0], x1 ld1 {v16.S}[0], [x0], x1 ld1 {v0.S}[0], [x0], x1 @@ -464,6 +465,19 @@ function ff_h264_h_loop_filter_chroma_neon, export=1 ret endfunc +function ff_h264_h_loop_filter_chroma422_neon, export=1 +sxtwx1, w1 +h264_loop_filter_start +add x5, x0, x1 +sub x0, x0, #2 +add x1, x1, x1 +mov x7, x30 +bl h_loop_filter_chroma420 +mov x30, x7 +sub x0, x5, #2 +mov v24.s[0], w6 +b h_loop_filter_chroma420 +endfunc .macro h264_loop_filter_chroma_intra uabd
[libav-devel] [PATCH 2/2] checkasm/h264: test 4:2:2 chroma loop filter functions
--- tests/checkasm/h264dsp.c | 44 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c index 706fc79397..ee07121ab4 100644 --- a/tests/checkasm/h264dsp.c +++ b/tests/checkasm/h264dsp.c @@ -341,9 +341,9 @@ static void check_loop_filter(void) c = c*9/10; } -#define CHECK_LOOP_FILTER(name, align, ...) \ +#define CHECK_LOOP_FILTER(name, align, idc) \ do {\ -if (check_func(h.name, #name "_%dbpp", bit_depth)) {\ +if (check_func(h.name, #name #idc "_%dbpp", bit_depth)) { \ for (j = 0; j < 36; j++) { \ intptr_t off = 8 * 32 + (j & 15) * 4 * !align; \ for (i = 0; i < 1024; i+=4) { \ @@ -355,7 +355,7 @@ static void check_loop_filter(void) call_ref(dst0 + off, 32, alphas[j], betas[j], tc0[j]); \ call_new(dst1 + off, 32, alphas[j], betas[j], tc0[j]); \ if (memcmp(dst0, dst1, 32 * 16 * SIZEOF_PIXEL)) { \ -fprintf(stderr, #name ": j:%d, alpha:%d beta:%d " \ +fprintf(stderr, #name #idc ": j:%d, alpha:%d beta:%d " \ "tc0:{%d,%d,%d,%d}\n", j, alphas[j], betas[j], \ tc0[j][0], tc0[j][1], tc0[j][2], tc0[j][3]); \ fail(); \ @@ -365,12 +365,16 @@ static void check_loop_filter(void) } \ } while (0) -CHECK_LOOP_FILTER(h264_v_loop_filter_luma, 1); -CHECK_LOOP_FILTER(h264_h_loop_filter_luma, 0); -CHECK_LOOP_FILTER(h264_h_loop_filter_luma_mbaff, 0); -CHECK_LOOP_FILTER(h264_v_loop_filter_chroma, 1); -CHECK_LOOP_FILTER(h264_h_loop_filter_chroma, 0); -CHECK_LOOP_FILTER(h264_h_loop_filter_chroma_mbaff, 0); +CHECK_LOOP_FILTER(h264_v_loop_filter_luma, 1,); +CHECK_LOOP_FILTER(h264_h_loop_filter_luma, 0,); +CHECK_LOOP_FILTER(h264_h_loop_filter_luma_mbaff, 0,); +CHECK_LOOP_FILTER(h264_v_loop_filter_chroma, 1,); +CHECK_LOOP_FILTER(h264_h_loop_filter_chroma, 0,); +CHECK_LOOP_FILTER(h264_h_loop_filter_chroma_mbaff, 0,); + +ff_h264dsp_init(&h, bit_depth, 2); +CHECK_LOOP_FILTER(h264_h_loop_filter_chroma, 0, 422); +CHECK_LOOP_FILTER(h264_h_loop_filter_chroma_mbaff, 0, 422); #undef CHECK_LOOP_FILTER } } @@ -397,9 +401,9 @@ static void 
check_loop_filter_intra(void) a = a*9/10; } -#define CHECK_LOOP_FILTER(name, align) \ +#define CHECK_LOOP_FILTER(name, align, idc) \ do {\ -if (check_func(h.name, #name "_%dbpp", bit_depth)) {\ +if (check_func(h.name, #name #idc "_%dbpp", bit_depth)) { \ for (j = 0; j < 36; j++) { \ intptr_t off = 8 * 32 + (j & 15) * 4 * !align; \ for (i = 0; i < 1024; i+=4) { \ @@ -411,7 +415,7 @@ static void check_loop_filter_intra(void) call_ref(dst0 + off, 32, alphas[j], betas[j]); \ call_new(dst1 + off, 32, alphas[j], betas[j]); \ if (memcmp(dst0, dst1, 32 * 16 * SIZEOF_PIXEL)) { \ -fprintf(stderr, #name ": j:%d, alpha:%d beta:%d\n", \ +fprintf(stderr, #name #idc ": j:%d, alpha:%d beta:%d\n", \ j, alphas[j], betas[j]);\ fail(); \ } \ @@ -420,12 +424,16 @@ static void check_loop_filter_intra(void) } \ } while (0) -CHECK_LOOP_FILTER(h264_v_loop_filter_luma_intra, 1); -CHECK_LOOP_FILTER(h264_h_loop_filter_luma_intra, 0); -CHECK_LOOP_FILTER(h264_h_loop_filter_luma_mbaff_intra, 0); -CHECK_LOOP_FILTER(h264_v_loop_filter_chroma_intra, 1); -CHECK_LOOP_FILTER(h264_h_loop_filter_chroma_intra, 0); -CHECK_LOOP_FILTER(h264_h_loop_filter_chroma_mbaff_intra, 0); +CHECK_LOOP_FILTER(h264_v_loop_filter_luma_intra, 1,); +CHECK_LOOP_FILTER(h264_h_loop_filter_luma_intra, 0,); +CHECK_LOOP_FILTER(h264_h_loop_filter_luma_mbaff_intra, 0,); +CHECK_LOOP_FILTER(h264_v_loop_filter_chroma_intra, 1,); +CHECK_LOOP_FILTER(h264_h_loop_filter_chroma_intr
Re: [libav-devel] [PATCH 1/1] h264/x86: sign extend int stride in deblock functions
On 2019-01-27 11:39:13 +0100, Diego Biurrun wrote: > On Sun, Jan 27, 2019 at 11:18:41AM +0100, Janne Grunau wrote: > > Fixes checkasm errors after adding the h264 deblock tests. > > --- > > libavcodec/x86/h264_deblock.asm | 8 > > libavcodec/x86/h264_deblock_10bit.asm | 9 + > > 2 files changed, 17 insertions(+) > > Shouldn't some int types be converted to ptrdiff_t instead? That would be another possible solution, but h264 seems to use int for stride consistently, so changing stride to ptrdiff_t is much more effort and riskier since it touches more functions and architectures. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/1] h264/x86: sign extend int stride in deblock functions
Fixes checkasm errors after adding the h264 deblock tests. --- libavcodec/x86/h264_deblock.asm | 8 libavcodec/x86/h264_deblock_10bit.asm | 9 + 2 files changed, 17 insertions(+) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 33fd5a9dd7..4b9cf85d16 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -288,6 +288,7 @@ cextern pb_3 ;- %macro DEBLOCK_LUMA 0 cglobal deblock_v_luma_8, 5,5,10 +movsxdifnidn r1, r1d movdm8, [r4] ; tc0 lea r4, [r1*3] dec r2d; alpha-1 @@ -335,6 +336,7 @@ cglobal deblock_v_luma_8, 5,5,10 INIT_MMX cpuname cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64 movsxd r7, r1d +movsxdifnidn r1, r1d lear8, [r7+r7*2] lear6, [r0-4] lear5, [r0-4+r8] @@ -395,6 +397,7 @@ DEBLOCK_LUMA ; int8_t *tc0) ;- cglobal deblock_%1_luma_8, 5,5,8,2*%2 +movsxdifnidn r1, r1d lea r4, [r1*3] dec r2 ; alpha-1 neg r4 @@ -445,6 +448,7 @@ cglobal deblock_%1_luma_8, 5,5,8,2*%2 ;- INIT_MMX cpuname cglobal deblock_h_luma_8, 0,5,8,0x60+12 +movsxdifnidn r1, r1d movr0, r0mp movr3, r1m lear4, [r3*3] @@ -646,6 +650,7 @@ cglobal deblock_%1_luma_intra_8, 4,6,16,0x10 %else cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50 %endif +movsxdifnidn r1, r1d lea r4, [r1*4] lea r5, [r1*3] ; 3*stride dec r2d; alpha-1 @@ -703,6 +708,7 @@ INIT_MMX cpuname ;- cglobal deblock_h_luma_intra_8, 4,9,0,0x80 movsxd r7, r1d +movsxdifnidn r1, r1d lear8, [r7*3] lear6, [r0-4] lear5, [r0-4+r8] @@ -782,6 +788,7 @@ DEBLOCK_LUMA_INTRA v8 INIT_MMX mmxext %macro CHROMA_V_START 0 +movsxdifnidn r1, r1d decr2d ; alpha-1 decr3d ; beta-1 movt5, r0 @@ -790,6 +797,7 @@ INIT_MMX mmxext %endmacro %macro CHROMA_H_START 0 +movsxdifnidn r1, r1d decr2d decr3d subr0, 2 diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm index d049c62bf2..1a424b7f43 100644 --- a/libavcodec/x86/h264_deblock_10bit.asm +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -162,6 +162,7 @@ cglobal deblock_v_luma_10, 5,5,8*(mmsize/16) %define ms2 
[rsp+mmsize*2] %define am [rsp+mmsize*3] %define bm [rsp+mmsize*4] +movsxdifnidn r1, r1d SUBrsp, pad shlr2d, 2 shlr3d, 2 @@ -219,6 +220,7 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16) %define p2m [rsp+mmsize*4] %define am [rsp+mmsize*5] %define bm [rsp+mmsize*6] +movsxdifnidn r1, r1d SUBrsp, pad shlr2d, 2 shlr3d, 2 @@ -349,6 +351,7 @@ cglobal deblock_v_luma_10, 5,5,15 %define mask0 m7 %define mask1 m10 %define mask2 m11 +movsxdifnidn r1, r1d shlr2d, 2 shlr3d, 2 LOAD_ABm12, m13, r2d, r3d @@ -377,6 +380,7 @@ cglobal deblock_v_luma_10, 5,5,15 REP_RET cglobal deblock_h_luma_10, 5,7,15 +movsxdifnidn r1, r1d shlr2d, 2 shlr3d, 2 LOAD_ABm12, m13, r2d, r3d @@ -492,6 +496,7 @@ DEBLOCK_LUMA_64 CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] %assign i i+1 %endrep +movsxdifnidn r1, r1d SUBrsp, pad %endmacro @@ -615,6 +620,7 @@ cglobal deblock_v_luma_intra_10, 4,7,16 %define q2 m13 %define aa m5 %define bb m14 +movsxdifnidn r1, r1d lea r4, [r1*4] lea r5, [r1*3] ; 3*stride neg r4 @@ -668,6 +674,7 @@ cglobal deblock_h_luma_intra_10, 4,7,16 %define p3 m4 %define spill [rsp] %assign pad 24-(stack_offset&15) +movsxdifnidn r1, r1d SUB rsp, pad lea r4, [r1*4] lea r5, [r1*3] ; 3*stride @@ -852,6 +859,7 @@ DEBLOCK_LUMA_INTRA ; int8_t *tc0) ;- cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) +movsxdifnidn r1, r1d mov r5, r0 sub r0, r1 sub r0, r1 @@ -887,6 +895,7 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) ; int beta) ;- cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) +movsxdifnidn r1, r1d mov r4, r0 sub r0, r1 sub r0, r1 -- 2.20.1 ___ libav-devel mailing list libav-devel@libav
Re: [libav-devel] [PATCH 2/4] checkasm/h264: add loop filter tests
On 2019-01-26 23:22:42 +0200, Martin Storsjö wrote: > On Tue, 1 Jan 2019, Janne Grunau wrote: > > > --- > > tests/checkasm/h264dsp.c | 124 +++ > > 1 file changed, 124 insertions(+) > > This newly added test seems to fail on macOS. I haven't debugged through it > properly yet, but disabling the use of checkasm_checked_call seems to make > it pass. The stride for the deblock functions is still int and the x86 asm doesn't sign extend it. That is exactly what checkasm_checked_call is supposed to catch. I'll fix it. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] libopenh264dec: Use a newer decoding entry point function
On 2019-01-25 10:39:13 +0200, Martin Storsjö wrote: > The "new" entry point actually has existed since OpenH264 1.4 in > 2015, but with B-frames, this entry point is essential for actually > getting the right frames returned and reordered. > > The name of this function, DecodeFrameNoDelay, is rather backwards > considering that it doesn't return the latest decoded frame immediately, > but actually does proper delaying and reordering of frames, but > it's the recommended decoding entry point. The commit message is hard to parse. Something along the lines of the text below is imho easier to understand: | The "new" entry point actually has existed since OpenH264 1.4 in | 2015 and is the recommended decoding entry point. | | The name of this function, DecodeFrameNoDelay, is rather backwards | considering that it doesn't return the latest decoded frame immediately, | but actually does proper delaying and reordering of frames. The patch itself is ok, Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH] Name read-only data sections .rdata, convert both .rdata and .rodata in the same way
On 2019-01-11 15:24:56 +0200, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 4131c46..0137718 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1198,7 +1198,7 @@ sub handle_serialized_line { > $line =~ s/\.arm/ARM/x; > # The alignment in AREA is the power of two, just as .align in gas > $line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=4, CODEALIGN/; > -$line =~ s/(\s*)(.*)\.rodata/$1AREA |.rodata|, DATA, READONLY, > ALIGN=5/; > +$line =~ s/(\s*)(.*)\.ro?data/$1AREA |.rdata|, DATA, READONLY, > ALIGN=5/; > $line =~ s/\.data/AREA |.data|, DATA, ALIGN=5/; > } > if ($as_type eq "armasm" and $arch eq "arm") { ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/4] h264/aarch64: sign extend int stride in loop filter asm
--- libavcodec/aarch64/h264dsp_neon.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index 9b4610a4d4..60ffa24500 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -130,6 +130,7 @@ endfunc function ff_h264_h_loop_filter_luma_neon, export=1 h264_loop_filter_start +sxtwx1, w1 sub x0, x0, #4 ld1 {v6.8B}, [x0], x1 @@ -210,6 +211,7 @@ endfunc function ff_h264_v_loop_filter_chroma_neon, export=1 h264_loop_filter_start +sxtwx1, w1 sub x0, x0, x1, lsl #1 ld1 {v18.8B}, [x0], x1 @@ -228,6 +230,7 @@ endfunc function ff_h264_h_loop_filter_chroma_neon, export=1 h264_loop_filter_start +sxtwx1, w1 sub x0, x0, #2 ld1 {v18.S}[0], [x0], x1 -- 2.20.1 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 2/4] checkasm/h264: add loop filter tests
--- tests/checkasm/h264dsp.c | 124 +++ 1 file changed, 124 insertions(+) diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c index f355a72a74..706fc79397 100644 --- a/tests/checkasm/h264dsp.c +++ b/tests/checkasm/h264dsp.c @@ -28,6 +28,7 @@ #include "libavutil/intreadwrite.h" static const uint32_t pixel_mask[3] = { 0x, 0x01ff01ff, 0x03ff03ff }; +static const uint32_t pixel_mask_lf[3] = { 0xff0fff0f, 0x01ff000f, 0x03ff000f }; #define SIZEOF_PIXEL ((bit_depth + 7) / 8) #define SIZEOF_COEF (2 * ((bit_depth + 7) / 8)) @@ -312,9 +313,132 @@ static void check_idct_multiple(void) } } + +static void check_loop_filter(void) +{ +LOCAL_ALIGNED_16(uint8_t, dst, [32 * 16 * 2]); +LOCAL_ALIGNED_16(uint8_t, dst0, [32 * 16 * 2]); +LOCAL_ALIGNED_16(uint8_t, dst1, [32 * 16 * 2]); +H264DSPContext h; +int bit_depth; +int alphas[36], betas[36]; +int8_t tc0[36][4]; + +declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, int stride, + int alpha, int beta, int8_t *tc0); + +for (bit_depth = 8; bit_depth <= 10; bit_depth++) { +int i, j, a, c; +uint32_t mask = pixel_mask_lf[bit_depth - 8]; +ff_h264dsp_init(&h, bit_depth, 1); +for (i = 35, a = 255, c = 250; i >= 0; i--) { +alphas[i] = a << (bit_depth - 8); +betas[i] = (i + 1) / 2 << (bit_depth - 8); +tc0[i][0] = tc0[i][3] = (c + 6) / 10; +tc0[i][1] = (c + 7) / 15; +tc0[i][2] = (c + 9) / 20; +a = a*9/10; +c = c*9/10; +} + +#define CHECK_LOOP_FILTER(name, align, ...) 
\ +do {\ +if (check_func(h.name, #name "_%dbpp", bit_depth)) {\ +for (j = 0; j < 36; j++) { \ +intptr_t off = 8 * 32 + (j & 15) * 4 * !align; \ +for (i = 0; i < 1024; i+=4) { \ +AV_WN32A(dst + i, rnd() & mask);\ +} \ +memcpy(dst0, dst, 32 * 16 * 2); \ +memcpy(dst1, dst, 32 * 16 * 2); \ +\ +call_ref(dst0 + off, 32, alphas[j], betas[j], tc0[j]); \ +call_new(dst1 + off, 32, alphas[j], betas[j], tc0[j]); \ +if (memcmp(dst0, dst1, 32 * 16 * SIZEOF_PIXEL)) { \ +fprintf(stderr, #name ": j:%d, alpha:%d beta:%d " \ +"tc0:{%d,%d,%d,%d}\n", j, alphas[j], betas[j], \ +tc0[j][0], tc0[j][1], tc0[j][2], tc0[j][3]); \ +fail(); \ +} \ +bench_new(dst1, 32, alphas[j], betas[j], tc0[j]); \ +} \ +} \ +} while (0) + +CHECK_LOOP_FILTER(h264_v_loop_filter_luma, 1); +CHECK_LOOP_FILTER(h264_h_loop_filter_luma, 0); +CHECK_LOOP_FILTER(h264_h_loop_filter_luma_mbaff, 0); +CHECK_LOOP_FILTER(h264_v_loop_filter_chroma, 1); +CHECK_LOOP_FILTER(h264_h_loop_filter_chroma, 0); +CHECK_LOOP_FILTER(h264_h_loop_filter_chroma_mbaff, 0); +#undef CHECK_LOOP_FILTER +} +} + +static void check_loop_filter_intra(void) +{ +LOCAL_ALIGNED_16(uint8_t, dst, [32 * 16 * 2]); +LOCAL_ALIGNED_16(uint8_t, dst0, [32 * 16 * 2]); +LOCAL_ALIGNED_16(uint8_t, dst1, [32 * 16 * 2]); +H264DSPContext h; +int bit_depth; +int alphas[36], betas[36]; + +declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, int stride, + int alpha, int beta); + +for (bit_depth = 8; bit_depth <= 10; bit_depth++) { +int i, j, a; +uint32_t mask = pixel_mask_lf[bit_depth - 8]; +ff_h264dsp_init(&h, bit_depth, 1); +for (i = 35, a = 255; i >= 0; i--) { +alphas[i] = a << (bit_depth - 8); +betas[i] = (i + 1) / 2 << (bit_depth - 8); +a = a*9/10; +} + +#define CHECK_LOOP_FILTER(name, align) \ +do {\ +if (check_func(h.name, #name "_%dbpp", bit_depth)) {\ +for (j = 0; j < 36; j++) { \ +intptr_t off = 8 * 32 + (j & 15) * 4 * !align; \ +for (i = 0; i < 1024; i+=4) { \ +AV_WN32A(dst + i, rnd() & mask);
[libav-devel] [PATCH 4/4] h264/aarch64: add intra loop filter neon asm
Add my neon asm from x264 relicensed under the LGPL 2.1 or later. Ported (x264 uses nv12 chroma) and optimized. Cycle count for checkasm --bench on a Snapdragon 820e: h264_h_loop_filter_luma_intra_8bpp_c: 60.0 h264_h_loop_filter_luma_intra_8bpp_neon: 54.2 h264_v_loop_filter_luma_intra_8bpp_c: 148.3 h264_v_loop_filter_luma_intra_8bpp_neon: 73.8 h264_h_loop_filter_chroma_intra_8bpp_c: 27.8 h264_h_loop_filter_chroma_intra_8bpp_neon: 21.4 h264_h_loop_filter_chroma_mbaff_intra_8bpp_c: 15.8 h264_h_loop_filter_chroma_mbaff_intra_8bpp_neon: 15.7 h264_v_loop_filter_chroma_intra_8bpp_c: 45.8 h264_v_loop_filter_chroma_intra_8bpp_neon: 17.3 --- libavcodec/aarch64/h264dsp_init_aarch64.c | 16 ++ libavcodec/aarch64/h264dsp_neon.S | 297 ++ 2 files changed, 313 insertions(+) diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c index b106f11134..07bda2ff07 100644 --- a/libavcodec/aarch64/h264dsp_init_aarch64.c +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -29,10 +29,20 @@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); +void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha, + int beta); +void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha, + int beta); void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, int stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride, + int alpha, int beta); void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, int log2_den, int weight, 
int offset); @@ -77,8 +87,14 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, if (have_neon(cpu_flags) && bit_depth == 8) { c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; +c->h264_v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; +c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; +c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; +c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; +c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon; c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index b649f1d018..448e575b8c 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -1,6 +1,7 @@ /* * Copyright (c) 2008 Mans Rullgard * Copyright (c) 2013 Janne Grunau + * Copyright (c) 2014 Janne Grunau * * This file is part of Libav. * @@ -181,6 +182,203 @@ function ff_h264_h_loop_filter_luma_neon, export=1 ret endfunc + +.macro h264_loop_filter_start_intra +orr w4, w2, w3 +cbnzw4, 1f +ret +1: +sxtwx1, w1 +dup v30.16b, w2// alpha +dup v31.16b, w3// beta +.endm + +.macro h264_loop_filter_luma_intra +uabdv16.16b, v7.16b, v0.16b// abs(p0 - q0) +uabdv17.16b, v6.16b, v7.16b// abs(p1 - p0) +uabdv18.16b, v1.16b, v0.16b// abs(q1 - q0) +cmhiv19.16b, v30.16b, v16.16b // < alpha +cmhiv17.16b, v31.16b, v17.16b // < beta +cmhiv18.16b, v31.16b, v18.16b // < beta + +moviv29.16b, #2 +ushrv30.16b, v30.16b, #2// alpha >> 2 +add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 +cmhiv16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2 + +
[libav-devel] [PATCH 3/4] h264/aarch64: optimize neon loop filter
Exit as soon as possible if no filtering will be done. Improves the checkasm --bench cycle count on a Snapdragon 820e: h264_h_loop_filter_luma_8bpp_c: 72.4 -> 72.5 h264_h_loop_filter_luma_8bpp_neon: 97.1 -> 56.3 h264_v_loop_filter_luma_8bpp_c: 174.0 -> 173.5 h264_v_loop_filter_luma_8bpp_neon: 62.9 -> 60.9 h264_h_loop_filter_chroma_8bpp_c:30.2 -> 30.3 h264_h_loop_filter_chroma_8bpp_neon: 51.6 -> 25.7 h264_v_loop_filter_chroma_8bpp_c:57.3 -> 57.3 h264_v_loop_filter_chroma_8bpp_neon: 28.0 -> 24.0 --- libavcodec/aarch64/h264dsp_neon.S | 33 ++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index 60ffa24500..b649f1d018 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -54,9 +54,12 @@ uabdv17.16B, v20.16B, v16.16B // abs(p2 - p0) and v21.16B, v21.16B, v28.16B uabdv19.16B, v4.16B, v0.16B // abs(q2 - q0) +and v21.16B, v21.16B, v30.16B // < beta +shrnv30.8b, v21.8h, #4 +mov x7, v30.d[0] cmhiv17.16B, v22.16B, v17.16B // < beta -and v21.16B, v21.16B, v30.16B cmhiv19.16B, v22.16B, v19.16B // < beta +cbz x7, 9f and v17.16B, v17.16B, v21.16B and v19.16B, v19.16B, v21.16B and v24.16B, v24.16B, v21.16B @@ -124,7 +127,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1 st1 {v16.16B}, [x0], x1 st1 {v0.16B}, [x0], x1 st1 {v19.16B}, [x0] - +9: ret endfunc @@ -174,32 +177,34 @@ function ff_h264_h_loop_filter_luma_neon, export=1 st1 {v16.S}[3], [x0], x1 st1 {v0.S}[3], [x0], x1 st1 {v19.S}[3], [x0], x1 - +9: ret endfunc .macro h264_loop_filter_chroma dup v22.8B, w2 // alpha +dup v23.8B, w3 // beta uxtlv24.8H, v24.8B uabdv26.8B, v16.8B, v0.8B // abs(p0 - q0) -uxtlv4.8H, v0.8B uabdv28.8B, v18.8B, v16.8B // abs(p1 - p0) +uabdv30.8B, v2.8B, v0.8B // abs(q1 - q0) +cmhiv26.8B, v22.8B, v26.8B // < alpha +cmhiv28.8B, v23.8B, v28.8B // < beta +cmhiv30.8B, v23.8B, v30.8B // < beta +uxtlv4.8H, v0.8B +and v26.8B, v26.8B, v28.8B usubw v4.8H, v4.8H, v16.8B -sli v24.8H, 
v24.8H, #8 +and v26.8B, v26.8B, v30.8B shl v4.8H, v4.8H, #2 -uabdv30.8B, v2.8B, v0.8B // abs(q1 - q0) +mov x2, v26.d[0] +sli v24.8H, v24.8H, #8 uaddw v4.8H, v4.8H, v18.8B -cmhiv26.8B, v22.8B, v26.8B // < alpha +cbz x2, 9f usubw v4.8H, v4.8H, v2.8B -dup v22.8B, w3 // beta rshrn v4.8B, v4.8H, #3 -cmhiv28.8B, v22.8B, v28.8B // < beta -cmhiv30.8B, v22.8B, v30.8B // < beta sminv4.8B, v4.8B, v24.8B neg v25.8B, v24.8B -and v26.8B, v26.8B, v28.8B smaxv4.8B, v4.8B, v25.8B -and v26.8B, v26.8B, v30.8B uxtlv22.8H, v0.8B and v4.8B, v4.8B, v26.8B uxtlv28.8H, v16.8B @@ -224,7 +229,7 @@ function ff_h264_v_loop_filter_chroma_neon, export=1 sub x0, x0, x1, lsl #1 st1 {v16.8B}, [x0], x1 st1 {v0.8B}, [x0], x1 - +9: ret endfunc @@ -257,7 +262,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1 st1 {v16.S}[1], [x0], x1 st1 {v0.S}[1], [x0], x1 st1 {v2.S}[1], [x0], x1 - +9: ret endfunc -- 2.20.1 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH] Use the correct variable $line instead of the implicit variable
On 2018-10-22 23:24:12 +0300, Martin Storsjö wrote: > This fixes cases if the input parameter is something else than > the currently iterated variable. > --- > gas-preprocessor.pl | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 41d7b69..39ad08d 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -383,12 +383,12 @@ sub parse_line { > return if (parse_if_line($line)); > > if (scalar(@rept_lines) == 0) { > -if (/\.macro/) { > +if ($line =~ /\.macro/) { > $macro_level++; > if ($macro_level > 1 && !$current_macro) { > die "nested macros but we don't have master macro"; > } > -} elsif (/\.endm/) { > +} elsif ($line =~ /\.endm/) { > $macro_level--; > if ($macro_level < 0) { > die "unmatched .endm"; ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH] Add a -verbose option for printing all executed commands
On 2018-10-22 23:23:38 +0300, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 9 + > 1 file changed, 9 insertions(+) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index fd9aac8..41d7b69 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -27,6 +27,7 @@ my $as_type = "apple-gas"; > > my $fix_unreq = $^O eq "darwin"; > my $force_thumb = 0; > +my $verbose = 0; > > my $arm_cond_codes = "eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le|al|hs|lo"; > > @@ -48,6 +49,7 @@ command. Following options are currently supported: > -force-thumb - assemble as thumb regardless of the input source > (note, this is incomplete and only works for sources > it explicitly was tested with) > +-verbose - print executed commands > "; > > sub usage() { > @@ -61,6 +63,8 @@ while (@ARGV) { > $fix_unreq = $1 ne "no-"; > } elsif ($opt eq "-force-thumb") { > $force_thumb = 1; > +} elsif ($opt eq "-verbose") { > +$verbose = 1; > } elsif ($opt eq "-arch") { > $arch = shift; > die "unknown arch: '$arch'\n" if not exists $canonical_arch{$arch}; > @@ -90,6 +94,7 @@ if (grep /\.c$/, @gcc_cmd) { > # pass -v/--version along, used during probing. Matching '-v' might have > # uninteded results but it doesn't matter much if gas-preprocessor or > # the compiler fails. 
> +print STDERR join(" ", @gcc_cmd)."\n" if $verbose; > exec(@gcc_cmd); > } else { > die "Unrecognized input filetype"; > @@ -115,6 +120,7 @@ if ($as_type eq "armasm") { > $index++; > } > if (grep /^-MM$/, @preprocess_c_cmd) { > +print STDERR join(" ", @preprocess_c_cmd)."\n" if $verbose; > system(@preprocess_c_cmd) == 0 or die "Error running preprocessor"; > exit 0; > } > @@ -206,12 +212,14 @@ $comm = ";" if $as_type =~ /armasm/; > my %ppc_spr = (ctr=> 9, > vrsave => 256); > > +print STDERR join(" ", @preprocess_c_cmd)."\n" if $verbose; > open(INPUT, "-|", @preprocess_c_cmd) || die "Error running preprocessor"; > > if ($ENV{GASPP_DEBUG}) { > open(ASMFILE, ">&STDOUT"); > } else { > if ($as_type ne "armasm") { > +print STDERR join(" ", @gcc_cmd)."\n" if $verbose; > open(ASMFILE, "|-", @gcc_cmd) or die "Error running assembler"; > } else { > open(ASMFILE, ">", $tempfile); > @@ -1192,6 +1200,7 @@ if ($as_type ne "armasm") { > close(INPUT) or exit 1; > close(ASMFILE) or exit 1; > if ($as_type eq "armasm" and ! defined $ENV{GASPP_DEBUG}) { > +print STDERR join(" ", @gcc_cmd)."\n" if $verbose; > system(@gcc_cmd) == 0 or die "Error running assembler"; > } > ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 2/2] Don't match whitespace as branch condition codes
On 2018-10-20 00:18:27 +0300, Martin Storsjö wrote: > For cases like "b1b", this could previously be matched as > $cond = " ". > > This fixes preprocessing with a preprocessor that preserves multiple > consecutive spaces, like cl.exe does. > --- > Better fix, which also works in a number of cases where the previous > version failed. > --- > gas-preprocessor.pl | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index b22ee8a..c42412f 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -879,7 +879,7 @@ sub handle_serialized_line { > > > # Check branch instructions > -if ($line =~ > /(?:^|\n)\s*(\w+\s*:\s*)?(bl?x?\.?(..)?(\.w)?)\s+(\w+)/) { > +if ($line =~ > /(?:^|\n)\s*(\w+\s*:\s*)?(bl?x?\.?([^\s]{2})?(\.w)?)\s+(\w+)/) { > my $instr = $2; > my $cond = $3; > my $width = $4; both ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH] Extend armasm64 workaround for uxtw/sxtw to uxth/sxth and uxtb/sxtb as well
On 2018-10-22 12:51:47 +0300, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 7efe3b9..669d435 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1011,7 +1011,7 @@ sub handle_serialized_line { > > # Convert e.g. "add x0, x0, w0, uxtw" into "add x0, x0, w0, uxtw > #0", > # or "ldr x0, [x0, w0, uxtw]" into "ldr x0, [x0, w0, uxtw #0]". > -$line =~ s/(uxtw|sxtw)(\s*\]?\s*)$/\1 #0\2/i; > +$line =~ s/(uxt[whb]|sxt[whb])(\s*\]?\s*)$/\1 #0\2/i; > > # Convert "mov x0, v0.d[0]" into "umov x0, v0.d[0]" > $line =~ s/\bmov\s+[xw]\d+\s*,\s*v\d+\.[ds]/u$&/i; ok, Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] arm: Emit .thumb_func directives
On 2018-10-12 14:43:56 +0300, Martin Storsjö wrote: > Prior to Xcode 9.3, the clang built-in assembler didn't support > altmacro, and gas-preprocessor was used for assembling for arm/darwin. > > For thumb functions, gas-preprocessor took care of adding the .thumb_func > directives, but when now being able to assemble without gas-preprocessor, > we need to add these directives ourselves. > --- > libavutil/arm/asm.S | 8 > 1 file changed, 8 insertions(+) > > diff --git a/libavutil/arm/asm.S b/libavutil/arm/asm.S > index e7eea0271f..5207a1a2b8 100644 > --- a/libavutil/arm/asm.S > +++ b/libavutil/arm/asm.S > @@ -75,6 +75,12 @@ T .thumb > ELF .eabi_attribute 25, 1 @ Tag_ABI_align_preserved > ELF .section .note.GNU-stack,"",%progbits @ Mark stack as non-executable > > +.macro func_mode name > +#if CONFIG_THUMB && defined(__APPLE__) > +.thumb_func \name > +#endif > +.endm > + > .macro function name, export=0, align=2 > .set.Lpic_idx, 0 > .set.Lpic_gp, 0 > @@ -98,10 +104,12 @@ FUNC.endfunc > .global EXTERN_ASM\name > ELF .type EXTERN_ASM\name, %function > FUNC.func EXTERN_ASM\name > +func_mode EXTERN_ASM\name > EXTERN_ASM\name: > .else > ELF .type \name, %function > FUNC.func \name > +func_mode \name > \name: > .endif > .endm > -- patch ok either in this form or as discussed on irc Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] avcodec: rename the AV1 profiles
On 2018-03-29 13:10:49 -0300, James Almer wrote: > Use the proper names instead of numbers > > Signed-off-by: James Almer > --- > libavcodec/avcodec.h | 6 +++--- > libavcodec/libaomenc.c | 6 +++--- > 2 files changed, 6 insertions(+), 6 deletions(-) > > diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h > index ac0915328..eb234a40d 100644 > --- a/libavcodec/avcodec.h > +++ b/libavcodec/avcodec.h > @@ -2551,9 +2551,9 @@ typedef struct AVCodecContext { > #define FF_PROFILE_HEVC_MAIN_STILL_PICTURE 3 > #define FF_PROFILE_HEVC_REXT 4 > > -#define FF_PROFILE_AV1_0 0 > -#define FF_PROFILE_AV1_1 1 > -#define FF_PROFILE_AV1_2 2 > +#define FF_PROFILE_AV1_MAIN 0 > +#define FF_PROFILE_AV1_HIGH 1 > +#define FF_PROFILE_AV1_PROFESSIONAL 2 > > /** > * level > diff --git a/libavcodec/libaomenc.c b/libavcodec/libaomenc.c > index 94b3ddd32..a2a2c3994 100644 > --- a/libavcodec/libaomenc.c > +++ b/libavcodec/libaomenc.c > @@ -302,13 +302,13 @@ static av_cold int aom_init(AVCodecContext *avctx) > if (avctx->profile != FF_PROFILE_UNKNOWN) > enccfg.g_profile = avctx->profile; > else if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) > -avctx->profile = enccfg.g_profile = FF_PROFILE_AV1_0; > +avctx->profile = enccfg.g_profile = FF_PROFILE_AV1_MAIN; > else { > const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt); > if (desc->comp[0].depth < 12) > -avctx->profile = enccfg.g_profile = FF_PROFILE_AV1_1; > +avctx->profile = enccfg.g_profile = FF_PROFILE_AV1_HIGH; > else > -avctx->profile = enccfg.g_profile = FF_PROFILE_AV1_2; > +avctx->profile = enccfg.g_profile = FF_PROFILE_AV1_PROFESSIONAL; > } both patches look good to j-b in irc and I agree Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 2/2] Convert {v0.8b-v3.8b} into {v0.8b, v1.8b, v2.8b, v3.8b} for armasm64
On 2018-03-08 15:26:14 +0200, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 18 ++ > 1 file changed, 18 insertions(+) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 9a7f6d8..5158cc7 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1051,6 +1051,24 @@ sub handle_serialized_line { > $line =~ s/#$scale/#$inverted_scale/; > } > } > + > +# Convert "ld1 {v0.4h-v3.4h}" into "ld1 > {v0.4h,v1.4h,v2.4h,v3.4h}" > +if ($line =~ > /(?:ld|st)\d\s+({\s*v(\d+)\.(\d[bhsdBHSD])\s*-\s*v(\d+)\.(\d[bhsdBHSD])\s*})/) > { > +my $regspec = $1; > +my $reg1 = $2; > +my $layout1 = $3; > +my $reg2 = $4; > +my $layout2 = $5; > +if ($layout1 eq $layout2) { > +my $new_regspec = "{"; > +foreach my $i ($reg1 .. $reg2) { > +$new_regspec .= "," if ($i > $reg1); > +$new_regspec .= "v$i.$layout1"; > +} > +$new_regspec .= "}"; > +$line =~ s/$regspec/$new_regspec/; > +} > +} > } > # armasm is unable to parse &0x - add spacing > $line =~ s/&0x/& 0x/g; ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 1/2] Convert .extern into IMPORT for armasm
On 2018-03-08 15:26:13 +0200, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index b0c909c..9a7f6d8 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1119,6 +1119,7 @@ sub handle_serialized_line { > } > if ($as_type eq "armasm") { > $line =~ s/\.global/EXPORT/x; > +$line =~ s/\.extern/IMPORT/x; > $line =~ s/\.int/dcd/x; > $line =~ s/\.long/dcd/x; > $line =~ s/\.float/dcfs/x; ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 3/3] Don't skip negative offsets for ldr by default for armasm64
On 2018-03-06 10:58:32 +0200, Martin Storsjö wrote: > The version of armasm64 in Visual Studio 2017 15.6 can assemble > these just fine. > --- > gas-preprocessor.pl | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index cb2f912..b0c909c 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -973,8 +973,8 @@ sub handle_serialized_line { > my $reg = $1; > my $sym = $2; > my $offset = eval_expr($3); > -if ($offset < 0) { > -# armasm64 is buggy with ldr x0, =sym+offset where the > +if ($offset < 0 and $ENV{GASPP_ARMASM64_SKIP_NEG_OFFSET}) { > +# armasm64 in VS < 15.6 is buggy with ldr x0, > =sym+offset where the > # offset is a negative value; it does write a negative > # offset into the literal pool as it should, but the > # negative offset only covers the lower 32 bit of the 64 ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 2/3] Don't skip prfum instructions by default for armasm64
On 2018-03-06 10:58:31 +0200, Martin Storsjö wrote: > The version of armasm64 in Visual Studio 2017 15.5 can assemble > these just fine. > --- > gas-preprocessor.pl | 10 ++ > 1 file changed, 6 insertions(+), 4 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 9ff47a9..cb2f912 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1020,10 +1020,12 @@ sub handle_serialized_line { > # Convert "cset w0, lo" into "csetlo w0" > $line =~ s/(cset)\s+([xw]\w+)\s*,\s*($arm_cond_codes)/\1\3 \2/; > > -# Strip out prfum; armasm64 fails to assemble any > -# variant/combination of prfum tested so far, but it can be > -# left out without any > -$line =~ s/prfum.*\]//; > +if ($ENV{GASPP_ARMASM64_SKIP_PRFUM}) { > +# Strip out prfum; armasm64 (VS < 15.5) fails to assemble any > +# variant/combination of prfum tested so far, but it can be > +# left out without any Maybe replace the last incomplete sentence after the comma with "Since it is a prefetch instruction it can be skipped without changing results." > +$line =~ s/prfum.*\]//; > +} > > # Convert "ldrb w0, [x0, #-1]" into "ldurb w0, [x0, #-1]". > # Don't do this for forms with writeback though. otherwise ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 1/3] Document what versions were buggy and required GASPP_ARMASM64_INVERT_SCALE
On 2018-03-06 10:58:30 +0200, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 3787756..9ff47a9 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1041,7 +1041,7 @@ sub handle_serialized_line { > if ($ENV{GASPP_ARMASM64_INVERT_SCALE}) { > # Instructions like fcvtzs and scvtf store the scale value > # inverted in the opcode (stored as 64 - scale), but armasm64 > -# in early versions stores it as-is. Thus convert from > +# in VS < 15.5 stores it as-is. Thus convert from > # "fcvtzs w0, s0, #8" into "fcvtzs w0, s0, #56". > if ($line =~ > /(?:fcvtzs|scvtf)\s+(\w+)\s*,\s*(\w+)\s*,\s*#(\d+)/) { > my $scale = $3; ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 6/8] Define _WIN32 while preprocessing for armasm
On 2017-10-18 09:43:11 +0300, Martin Storsjö wrote: > On Wed, 18 Oct 2017, Janne Grunau wrote: > > >On 2017-10-14 23:35:20 +0300, Martin Storsjö wrote: > >>--- > >> gas-preprocessor.pl | 1 + > >> 1 file changed, 1 insertion(+) > >> > >>diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > >>index 456ee24..63b0ab3 100755 > >>--- a/gas-preprocessor.pl > >>+++ b/gas-preprocessor.pl > >>@@ -98,6 +98,7 @@ if ($as_type eq "armasm") { > >> > >> $preprocess_c_cmd[0] = "cpp"; > >> push(@preprocess_c_cmd, "-undef"); > >>+push(@preprocess_c_cmd, "-D_WIN32"); > >> > >> @preprocess_c_cmd = grep ! /^-nologo$/, @preprocess_c_cmd; > >> # Remove -ignore XX parameter pairs from preprocess_c_cmd > > > >this looks a little suspicious. Some code expect _WIN32 to be defined but > >msvc apparently doesn't define it. > > MSVC does define it normally, but we're using the plain "cpp" binary here, > which in cross building setups is the local compiler's preprocessor - hence > the -undef above to get rid of whatever other definitions that preprocessor > sets. Please add this explanation either to the commit message or as a comment. Patch ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 6/6] Work around an armasm64 bug in the scale operand to fcvtzs/scvtf
On 2017-10-16 22:38:19 +0300, Martin Storsjö wrote: > The operand shouldn't be stored as is, but stored as 64-scale, in > the opcode, but armasm64 misses to do this. > > This might be a big enough bug to report and try to get fixed, but > that requires removing this workaround at that point. Please report this as a bug. I'd propose an environment variable or version check for this fixup. > --- > gas-preprocessor.pl | 10 ++ > 1 file changed, 10 insertions(+) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index d9eaf1d..182b684 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1032,6 +1032,16 @@ sub handle_serialized_line { > $line =~ s/$instr$suffix/${instr}u$suffix/; >} > } > + > +# Instructions like fcvtzs and scvtf store the scale value > +# inverted in the opcode (stored as 64 - scale), but armasm64 > +# stores it as-is. Thus convert from "fcvtzs w0, s0, #8" > +# into "fcvtzs w0, s0, #56". > +if ($line =~ > /(?:fcvtzs|scvtf)\s+(\w+)\s*,\s*(\w+)\s*,\s*#(\d+)/) { > +my $scale = $3; > +my $inverted_scale = 64 - $3; > +$line =~ s/#$scale/#$inverted_scale/; > +} > } > # armasm is unable to parse &0x - add spacing > $line =~ s/&0x/& 0x/g; The fixup itself is ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 5/6] Convert local labels in tbz instructions for armasm
On 2017-10-16 22:38:18 +0300, Martin Storsjö wrote: > Also convert the register from wX into xX, since armasm fails to > assemble it when referring to the register as wX. > --- > gas-preprocessor.pl | 11 +-- > 1 file changed, 9 insertions(+), 2 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index b650c39..d9eaf1d 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -891,16 +891,23 @@ sub handle_serialized_line { > ($arch eq "aarch64" and !is_aarch64_register($target))) > { > $call_targets{$target}++; > } > -} elsif ($line =~ > /(?:^|\n)\s*(\w+\s*:\s*)?(cbn?z|adr)\s+(\w+)\s*,\s*(\w+)/) { > +} elsif ($line =~ > /(?:^|\n)\s*(\w+\s*:\s*)?(cbn?z|adr|tbz)\s+(\w+)\s*,(\s*#\d+\s*,)?\s*(\w+)/) { > my $instr = $2; > my $reg = $3; > -my $target = $4; > +my $bit = $4; > +my $target = $5; > if ($target =~ /^(\d+)([bf])$/) { > # The target is a local label > $line = handle_local_label($line, $1, $2); > } else { > $call_targets{$target}++; > } > +# Convert tbz with a wX register into an xX register. > +if ($instr eq "tbz" and $reg =~ /w\d+/) { > +my $xreg = $reg; > +$xreg =~ s/w/x/; > +$line =~ s/\b$reg\b/$xreg/; > +} > } elsif ($line =~ /^\s*.h?word.*\b\d+[bf]\b/) { > while ($line =~ /\b(\d+)([bf])\b/g) { > $line = handle_local_label($line, $1, $2); please mention that this is an armasm bug, the fixup is ok though Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 4/6] Convert ldr/str/ldrb/strb etc into ldurb, when the offset is negative
On 2017-10-16 22:38:17 +0300, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 13 + > 1 file changed, 13 insertions(+) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 552ed0c..b650c39 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1012,6 +1012,19 @@ sub handle_serialized_line { > # variant/combination of prfum tested so far, but it can be > # left out without any > $line =~ s/prfum.*\]//; > + > +# Convert "ldrb w0, [x0, #-1]" into "ldurb w0, [x0, #-1]". > +# Don't do this for forms with writeback though. > +if ($line =~ > /(ld|st)(r[bh]?)\s+(\w+)\s*,\s*\[\s*(\w+)\s*,\s*#([^\]]+)\s*\][^!]/) { > + my $instr = $1; > + my $suffix = $2; > + my $target = $3; > + my $base = $4; > + my $offset = eval_expr($5); > + if ($offset < 0) { > +$line =~ s/$instr$suffix/${instr}u$suffix/; > + } > +} > } > # armasm is unable to parse &0x - add spacing > $line =~ s/&0x/& 0x/g; patch ok-ish but we should fix the offending code too. ldr? without writeback supports only unsigned offsets and gas seems to fix it for us. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 3/6] Handle cinc just like ccmp/csel
On 2017-10-16 22:38:16 +0300, Martin Storsjö wrote: > This can be squashed into "Add support for MS armasm64"; this > was found while trying to build x264. > --- > gas-preprocessor.pl | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 2add3dd..552ed0c 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1002,6 +1002,9 @@ sub handle_serialized_line { > # and "csel w0, w0, w0, ne" into "cselne w0, w0, w0". > $line =~ > s/(ccmp|csel)\s+([xw]\w+)\s*,\s*([xw#]\w+)\s*,\s*([xw#]\w+)\s*,\s*($arm_cond_codes)/\1\5 > \2, \3, \4/; > > +# Convert "cinc w0, w0, ne" into "cincne w0, w0". > +$line =~ > s/(cinc)\s+([xw]\w+)\s*,\s*([xw]\w+)\s*,\s*($arm_cond_codes)/\1\4 \2, \3/; > + > # Convert "cset w0, lo" into "csetlo w0" > $line =~ s/(cset)\s+([xw]\w+)\s*,\s*($arm_cond_codes)/\1\3 \2/; > ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH 2/6] Allow register names such as xzr instead of the pattern [xw]\d+ in ccmp/csel
On 2017-10-16 22:38:15 +0300, Martin Storsjö wrote: > Also update the csel pattern similarly. > > This is required for building x264. > --- > gas-preprocessor.pl | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 4c91ee0..2add3dd 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -1000,10 +1000,10 @@ sub handle_serialized_line { > > # Convert "ccmp w0, #0, #0, ne" into "ccmpne w0, #0, #0", > # and "csel w0, w0, w0, ne" into "cselne w0, w0, w0". > -$line =~ > s/(ccmp|csel)\s+([xw]\d+)\s*,\s*([xw#]\d+)\s*,\s*([xw#]\d+)\s*,\s*($arm_cond_codes)/\1\5 > \2, \3, \4/; > +$line =~ > s/(ccmp|csel)\s+([xw]\w+)\s*,\s*([xw#]\w+)\s*,\s*([xw#]\w+)\s*,\s*($arm_cond_codes)/\1\5 > \2, \3, \4/; > > # Convert "cset w0, lo" into "csetlo w0" > -$line =~ s/(cset)\s+([xw]\d+)\s*,\s*($arm_cond_codes)/\1\3 \2/; > +$line =~ s/(cset)\s+([xw]\w+)\s*,\s*($arm_cond_codes)/\1\3 \2/; > > # Strip out prfum; armasm64 fails to assemble any > # variant/combination of prfum tested so far, but it can be ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [GASPP PATCH] Support converting uxtl into ushll on a line that starts with a local label
On 2017-10-16 12:36:13 +0300, Martin Storsjö wrote: > Also make a note that this conversion is necessary for armasm64. > > For consistency, allow local labels in all similar full-line > conversions as well. > --- > gas-preprocessor.pl | 19 ++- > 1 file changed, 10 insertions(+), 9 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index a8cf1e0..6353a07 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -793,24 +793,25 @@ sub handle_serialized_line { > if ($arch eq "aarch64") { > # fix missing aarch64 instructions in Xcode 5.1 (beta3) > # mov with vector arguments is not supported, use alias orr instead > -if ($line =~ > /^\s*mov\s+(v\d[\.{}\[\]\w]+),\s*(v\d[\.{}\[\]\w]+)\b\s*$/) { > -$line = "orr $1, $2, $2\n"; > +if ($line =~ > /^(\d+:)?\s*mov\s+(v\d[\.{}\[\]\w]+),\s*(v\d[\.{}\[\]\w]+)\b\s*$/) { > +$line = "$1orr $2, $3, $3\n"; > } > # movi 16, 32 bit shifted variant, shift is optional > -if ($line =~ > /^\s*movi\s+(v[0-3]?\d\.(?:2|4|8)[hsHS])\s*,\s*(#\w+)\b\s*$/) { > -$line = "movi $1, $2, lsl #0\n"; > +if ($line =~ > /^(\d+:)?\s*movi\s+(v[0-3]?\d\.(?:2|4|8)[hsHS])\s*,\s*(#\w+)\b\s*$/) { > +$line = "$1movi $2, $3, lsl #0\n"; > } > # Xcode 5 misses the alias uxtl. Replace it with the more general > ushll. > # Clang 3.4 misses the alias sxtl too. Replace it with the more > general sshll. > -if ($line =~ > /^\s*(s|u)xtl(2)?\s+(v[0-3]?\d\.[248][hsdHSD])\s*,\s*(v[0-3]?\d\.(?:2|4|8|16)[bhsBHS])\b\s*$/) > { > -$line = "$1shll$2 $3, $4, #0\n"; > +# armasm64 also misses these instructions. > +if ($line =~ > /^(\d+:)?\s*(s|u)xtl(2)?\s+(v[0-3]?\d\.[248][hsdHSD])\s*,\s*(v[0-3]?\d\.(?:2|4|8|16)[bhsBHS])\b\s*$/) > { > +$line = "$1$2shll$3 $4, $5, #0\n"; > } > # clang 3.4 and armasm64 do not automatically use shifted immediates > in add/sub > if (($as_type eq "clang" or $as_type eq "armasm") and > -$line =~ /^(\s*(?:add|sub)s?) ([^#l]+)#([\d\+\-\*\/ <>]+)\s*$/) { > -my $imm = eval $3; > +$line =~ /^(\d+:)?(\s*(?:add|sub)s?) 
([^#l]+)#([\d\+\-\*\/ > <>]+)\s*$/) { > +my $imm = eval $4; > if ($imm > 4095 and not ($imm & 4095)) { > -$line = "$1 $2#" . ($imm >> 12) . ", lsl #12\n"; > +$line = "$1 $2 $3#" . ($imm >> 12) . ", lsl #12\n"; > } > } > if ($ENV{GASPP_FIX_XCODE5}) { ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/3] aarch64: Remove a dot from a label
On 2017-10-14 23:35:32 +0300, Martin Storsjö wrote: > This fixes building with armasm64 (when run through gas-preprocessor). > --- > libavcodec/aarch64/mpegaudiodsp_neon.S | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S > b/libavcodec/aarch64/mpegaudiodsp_neon.S > index 34181d9..6dbf142 100644 > --- a/libavcodec/aarch64/mpegaudiodsp_neon.S > +++ b/libavcodec/aarch64/mpegaudiodsp_neon.S > @@ -24,7 +24,7 @@ > #define WFRAC_BITS 16 // fractional bits for window > #define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15) > > -const tbl_rev128.s, align=4 > +const tbl_rev128, align=4 > .byte 12, 13, 14, 15 > .byte8, 9, 10, 11 > .byte4, 5, 6, 7 > @@ -39,7 +39,7 @@ function ff_mpadsp_apply_window_\type\()_neon, export=1 > ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x7], #64 > st1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x8], #64 > st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x8], #64 > -movrel x15, tbl_rev128.s > +movrel x15, tbl_rev128 > ld1 {v27.4s}, [x15] > .ifc \type, fixed > lsl x4, x4, #1 please keep the 's' or replace it with another indicator for the element size. tbl_rev128_s or tbl_rev128_32 would be ok for me. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 8/8] Add support for MS armasm64
On 2017-10-14 23:35:22 +0300, Martin Storsjö wrote: > --- > I haven't been able to assemble prfum instructions with armasm64 yet; > dumpbin -disasm does disassemble the instruction correctly (e.g. from > an object file assembled with llvm), but armasm64 doesn't support > assembling it, either in that form or with a few variations I've tried. > > In the simplest form, it fails like this: > > test.asm(8) : error A2502: operand 1: Expected constant > prfum pldl1keep, [x0, #0] > > The prfm instruction seems to be handled correctly though. Since > this prefetch instruction isn't essential, it can be worked around > by simply skipping the instruction. > --- > gas-preprocessor.pl | 125 > +++- > 1 file changed, 104 insertions(+), 21 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 9bcdbac..a8cf1e0 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl ... > @@ -1013,11 +1091,16 @@ sub handle_serialized_line { > $line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=4, CODEALIGN/; > $line =~ s/(\s*)(.*)\.rodata/$1AREA |.rodata|, DATA, READONLY, > ALIGN=5/; > $line =~ s/\.data/AREA |.data|, DATA, ALIGN=5/; > - > +} > +if ($as_type eq "armasm" and $arch eq "arm") { > $line =~ s/fmxr/vmsr/; > $line =~ s/fmrx/vmrs/; > $line =~ s/fadds/vadd.f32/; > } > +if ($as_type eq "armasm" and $arch eq "aarch64") { > +# Convert "b.eq" into "beq" > +$line =~ s/\bb\.($arm_cond_codes)\b/b\1/; > +} wtf! > > # catch unknown section names that aren't mach-o style (with a comma) > if ($as_type =~ /apple-/ and $line =~ /.section ([^,]*)$/) { > @@ -1038,7 +1121,7 @@ if ($as_type ne "armasm") { > grep exists $thumb_labels{$_}, keys %call_targets; > } else { > map print(ASMFILE "\tIMPORT $_\n"), > -grep ! exists $labels_seen{$_}, (keys %call_targets, keys > %mov32_targets); > +grep ! 
exists $labels_seen{$_}, (keys %call_targets, keys > %import_symbols); > > print ASMFILE "\tEND\n"; > } patch ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 6/8] Define _WIN32 while preprocessing for armasm
On 2017-10-14 23:35:20 +0300, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 456ee24..63b0ab3 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -98,6 +98,7 @@ if ($as_type eq "armasm") { > > $preprocess_c_cmd[0] = "cpp"; > push(@preprocess_c_cmd, "-undef"); > +push(@preprocess_c_cmd, "-D_WIN32"); > > @preprocess_c_cmd = grep ! /^-nologo$/, @preprocess_c_cmd; > # Remove -ignore XX parameter pairs from preprocess_c_cmd this looks a little suspicious. Some code expects _WIN32 to be defined but msvc apparently doesn't define it. Unless there's a reason to expect this predefined in the preprocessor I'd prefer it fixed in the code instead. I could live with this if it's expected to be required for more than just libav's asm. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 5/8] Operate on the right variable instead of the implicit variable
On 2017-10-14 23:35:19 +0300, Martin Storsjö wrote: > Apparently, this hasn't caused any issues in practice. > --- > gas-preprocessor.pl | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index fe9c746..456ee24 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -943,7 +943,7 @@ sub handle_serialized_line { > # Convert "mov pc, lr" into "bx lr", since the former only works > # for switching from arm to thumb (and only in armv7), but not > # from thumb to arm. > -s/mov\s*pc\s*,\s*lr/bx lr/g; > +$line =~ s/mov\s*pc\s*,\s*lr/bx lr/g; > > # Convert stmdb/ldmia/stmfd/ldmfd/ldm with only one register into a > plain str/ldr with post-increment/decrement. > # Wide thumb2 encoding requires at least two registers in register > list while all other encodings support one register too. ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 4/8] Require boundaries around local labels in handle_local_label
On 2017-10-14 23:35:18 +0300, Martin Storsjö wrote: > Since we're doing a replace of a string that looks like e.g "1b" > over a full line, such a string could concievably be a substring of > another identifier as well. > > This doesn't fix any known issue, but attempts to make this > less fragile. > --- > gas-preprocessor.pl | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 09fcf0e..fe9c746 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -631,12 +631,12 @@ sub handle_local_label { > my $dir = $_[2]; > my $target = "$num$dir"; > if ($dir eq "b") { > -$line =~ s/$target/$last_temp_labels{$num}/g; > +$line =~ s/\b$target\b/$last_temp_labels{$num}/g; > } else { > my $name = "temp_label_$temp_label_next"; > $temp_label_next++; > push(@{$next_temp_labels{$num}}, $name); > -$line =~ s/$target/$name/g; > +$line =~ s/\b$target\b/$name/g; > } > return $line; > } ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/8] Correctly check for arm condition codes when trying to filter out 'bic'
On 2017-10-14 23:35:17 +0300, Martin Storsjö wrote: > Since an empty condition code also is valid, this also matched for > any other string, since it matched the empty string. By making sure > the pattern matches the full string, we avoid that issue. > > Thanks to the later is_arm_register check, this slipped through > earlier. > --- > gas-preprocessor.pl | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index 2c9cd07..09fcf0e 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -704,7 +704,7 @@ sub handle_serialized_line { > my $cond = $3; > my $label = $4; > # Don't interpret e.g. bic as b with ic as conditional code > -if ($cond =~ /|$arm_cond_codes/) { > +if ($cond =~ /^(|$arm_cond_codes)$/) { > if (exists $thumb_labels{$label}) { > print ASMFILE ".thumb_func $label\n"; > } else { > @@ -871,7 +871,7 @@ sub handle_serialized_line { > my $width = $4; > my $target = $5; > # Don't interpret e.g. bic as b with ic as conditional code > -if ($cond !~ /|$arm_cond_codes/) { > +if ($cond !~ /^(|$arm_cond_codes)$/) { > # Not actually a branch > } elsif ($target =~ /^(\d+)([bf])$/) { > # The target is a local label ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/8] Pass -undef to cpp instead of undefining __ELF__ and __MACH__
On 2017-10-14 23:35:15 +0300, Martin Storsjö wrote: > --- > gas-preprocessor.pl | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/gas-preprocessor.pl b/gas-preprocessor.pl > index afdfc9e..6aae65d 100755 > --- a/gas-preprocessor.pl > +++ b/gas-preprocessor.pl > @@ -97,8 +97,7 @@ if (grep /\.c$/, @gcc_cmd) { > if ($as_type eq "armasm") { > > $preprocess_c_cmd[0] = "cpp"; > -push(@preprocess_c_cmd, "-U__ELF__"); > -push(@preprocess_c_cmd, "-U__MACH__"); > +push(@preprocess_c_cmd, "-undef"); > > @preprocess_c_cmd = grep ! /^-nologo$/, @preprocess_c_cmd; > # Remove -ignore XX parameter pairs from preprocess_c_cmd ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] arm: Fix SIGBUS on ARM when compiled with binutils 2.29
On 2017-08-31 12:10:56 +0300, Martin Storsjö wrote: > In binutils 2.29, the behavior of the ADR instruction changed so that 1 is > added to the address of a Thumb function (previously nothing was added). This > allows the loaded address to be passed to a BLX instruction and the correct > mode change will occur. > > See: https://sourceware.org/bugzilla/show_bug.cgi?id=21458 > > By using adr with a label that isn't annotated as a thumb function, > we avoid the new behaviour in binutils 2.29 and get the same behaviour > as in prior releases, and as in other assemblers (ms armasm.exe, > clang's built in assembler). > --- > libavcodec/arm/h264idct_neon.S | 20 > 1 file changed, 12 insertions(+), 8 deletions(-) > > diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S > index f588f3e..b078cf2 100644 > --- a/libavcodec/arm/h264idct_neon.S > +++ b/libavcodec/arm/h264idct_neon.S > @@ -21,6 +21,7 @@ > #include "libavutil/arm/asm.S" > > function ff_h264_idct_add_neon, export=1 > +h264_idct_add_neon_nothumb: I thought of adding the symbol to the function macro but given that it's just two functions in a single file adding the macros manually might be better. I don't expect it to be needed in many more places if any.
> vld1.64 {d0-d3}, [r1,:128] > vmov.i16q15, #0 > > @@ -73,6 +74,7 @@ function ff_h264_idct_add_neon, export=1 > endfunc > > function ff_h264_idct_dc_add_neon, export=1 > +h264_idct_dc_add_neon_nothumb: > mov r3, #0 > vld1.16 {d2[],d3[]}, [r1,:16] > strhr3, [r1] > @@ -113,8 +115,8 @@ function ff_h264_idct_add16_neon, export=1 > movne lr, #0 > cmp lr, #0 > ite ne > -adrne lr, X(ff_h264_idct_dc_add_neon) + CONFIG_THUMB > -adreq lr, X(ff_h264_idct_add_neon)+ CONFIG_THUMB > +adrne lr, h264_idct_dc_add_neon_nothumb + CONFIG_THUMB > +adreq lr, h264_idct_add_neon_nothumb+ CONFIG_THUMB > blx lr > 2: subsip, ip, #1 > add r1, r1, #32 > @@ -138,8 +140,8 @@ function ff_h264_idct_add16intra_neon, export=1 > cmp r8, #0 > ldrsh r8, [r1] > iteet ne > -adrne lr, X(ff_h264_idct_add_neon)+ CONFIG_THUMB > -adreq lr, X(ff_h264_idct_dc_add_neon) + CONFIG_THUMB > +adrne lr, h264_idct_add_neon_nothumb+ CONFIG_THUMB > +adreq lr, h264_idct_dc_add_neon_nothumb + CONFIG_THUMB > cmpeq r8, #0 > blxne lr > subsip, ip, #1 > @@ -166,8 +168,8 @@ function ff_h264_idct_add8_neon, export=1 > cmp r8, #0 > ldrsh r8, [r1] > iteet ne > -adrne lr, X(ff_h264_idct_add_neon)+ CONFIG_THUMB > -adreq lr, X(ff_h264_idct_dc_add_neon) + CONFIG_THUMB > +adrne lr, h264_idct_add_neon_nothumb+ CONFIG_THUMB > +adreq lr, h264_idct_dc_add_neon_nothumb + CONFIG_THUMB > cmpeq r8, #0 > blxne lr > add r12, r12, #1 > @@ -267,6 +269,7 @@ endfunc > .endm > > function ff_h264_idct8_add_neon, export=1 > +h264_idct8_add_neon_nothumb: > vmov.i16q3, #0 > vld1.16 {q8-q9}, [r1,:128] > vst1.16 {q3}, [r1,:128]! 
> @@ -328,6 +331,7 @@ function ff_h264_idct8_add_neon, export=1 > endfunc > > function ff_h264_idct8_dc_add_neon, export=1 > +h264_idct8_dc_add_neon_nothumb: > mov r3, #0 > vld1.16 {d30[],d31[]},[r1,:16] > strhr3, [r1] > @@ -388,8 +392,8 @@ function ff_h264_idct8_add4_neon, export=1 > movne lr, #0 > cmp lr, #0 > ite ne > -adrne lr, X(ff_h264_idct8_dc_add_neon) + CONFIG_THUMB > -adreq lr, X(ff_h264_idct8_add_neon)+ CONFIG_THUMB > +adrne lr, h264_idct8_dc_add_neon_nothumb + CONFIG_THUMB > +adreq lr, h264_idct8_add_neon_nothumb+ CONFIG_THUMB > blx lr > 2: subsr12, r12, #4 > add r1, r1, #128 patch ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/3] configure: Simplify dlopen check
On 2017-02-21 18:26:25 +0100, Diego Biurrun wrote: > --- > > This was previously approved. > > configure | 26 +- > 1 file changed, 9 insertions(+), 17 deletions(-) > > diff --git a/configure b/configure > index 6f1be32..ef6a8e0 100755 > --- a/configure > +++ b/configure > @@ -1608,7 +1608,6 @@ SYSTEM_FUNCS=" > CommandLineToArgvW > CoTaskMemFree > CryptGenRandom > -dlopen > fcntl > flt_lim > fork > @@ -2218,10 +2217,8 @@ wmv3_vaapi_hwaccel_select="vc1_vaapi_hwaccel" > wmv3_vdpau_hwaccel_select="vc1_vdpau_hwaccel" > > # hardware-accelerated codecs > -nvenc_deps_any="dlopen LoadLibrary" > -nvenc_extralibs='$ldl' > -omx_deps="dlopen pthreads" > -omx_extralibs='$ldl' > +nvenc_deps_any="libdl LoadLibrary" > +omx_deps="libdl pthreads" > omx_rpi_select="omx" > qsvdec_select="qsv" > qsvenc_select="qsv" > @@ -2280,7 +2277,7 @@ mjpeg2jpeg_bsf_select="jpegtables" > > # external libraries > avisynth_deps="LoadLibrary" > -avxsynth_deps="dlopen" > +avxsynth_deps="libdl" > avisynth_demuxer_deps_any="avisynth avxsynth" > avisynth_demuxer_select="riffdec" > libdcadec_decoder_deps="libdcadec" > @@ -2472,10 +2469,8 @@ deinterlace_qsv_filter_deps="libmfx" > deinterlace_vaapi_filter_deps="vaapi" > delogo_filter_deps="gpl" > drawtext_filter_deps="libfreetype" > -frei0r_filter_deps="frei0r dlopen" > -frei0r_filter_extralibs='$ldl' > -frei0r_src_filter_deps="frei0r dlopen" > -frei0r_src_filter_extralibs='$ldl' > +frei0r_filter_deps="frei0r libdl" > +frei0r_src_filter_deps="frei0r libdl" > hdcd_filter_deps="libhdcd" > hqdn3d_filter_deps="gpl" > interlace_filter_deps="gpl" > @@ -4461,12 +4456,6 @@ check_code cc arm_neon.h "int16x8_t test = > vdupq_n_s16(0)" && enable intrinsics_ > > check_ldflags -Wl,--as-needed > > -if check_func dlopen; then > -ldl= > -elif check_func dlopen -ldl; then > -ldl=-ldl > -fi > - > if ! 
disabled network; then > check_func getaddrinfo $network_extralibs > check_func inet_aton $network_extralibs > @@ -4638,6 +4627,9 @@ enabled pthreads && > disabled zlib || check_lib zlib zlib.h zlibVersion -lz > disabled bzlib || check_lib bzlib bzlib.h BZ2_bzlibVersion -lbz2 > > +# On some systems dynamic loading requires no extra linker flags > +check_lib libdl dlfcn.h dlopen || check_lib libdl dlfcn.h dlopen -ldl > + > check_lib libm math.h sin -lm && LIBM="-lm" > > atan2f_args=2 > @@ -4650,7 +4642,7 @@ done > > # these are off by default, so fail if requested and not available > enabled avisynth && require_header avisynth/avisynth_c.h > -enabled avxsynth && require avxsynth "avxsynth/avxsynth_c.h > dlfcn.h" dlopen -ldl > +enabled avxsynth && require_header avxsynth/avxsynth_c.h > enabled cuda && require cuda cuda.h cuInit -lcuda > enabled frei0r&& require_header frei0r.h > enabled gnutls&& require_pkg_config gnutls gnutls > gnutls/gnutls.h gnutls_global_init ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/3] Revert "configure: Add proper weak dependency of drawtext filter on libfontconfig"
On 2017-02-21 18:26:24 +0100, Diego Biurrun wrote: > External dependencies cannot be handled as weak dependencies since they need > to be explicitly enabled. If a weak dependency is set, the variable > corresponding > to the weak dependency can be enabled without the rest of the build system > settings, resulting in a failing build. > > This reverts commit 66988320794a107f2a460eaa71dbd9fab8056842. > --- > configure | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/configure b/configure > index 24e9fc3..6f1be32 100755 > --- a/configure > +++ b/configure > @@ -2472,7 +2472,6 @@ deinterlace_qsv_filter_deps="libmfx" > deinterlace_vaapi_filter_deps="vaapi" > delogo_filter_deps="gpl" > drawtext_filter_deps="libfreetype" > -drawtext_filter_suggest="libfontconfig" > frei0r_filter_deps="frei0r dlopen" > frei0r_filter_extralibs='$ldl' > frei0r_src_filter_deps="frei0r dlopen" ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/4] arm: vp9itxfm: Reorder iadst16 coeffs
On 2017-02-09 14:33:55 +0200, Martin Storsjö wrote: > This matches the order they are in the 16 bpp version. > > There they are in this order, to make sure we access them in the > same order they are declared, easing loading only half of the > coefficients at a time. > > This makes the 8 bpp version match the 16 bpp version better. > --- > libavcodec/arm/vp9itxfm_neon.S | 12 ++-- > 1 file changed, 6 insertions(+), 6 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index f74d542..c8eeb76 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -37,8 +37,8 @@ idct_coeffs: > endconst > > const iadst16_coeffs, align=4 > -.short 16364, 804, 15893, 3981, 14811, 7005, 13160, 9760 > -.short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207 > +.short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053 > +.short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207 > endconst > > @ Do four 4x4 transposes, using q registers for the subtransposes that don't > @@ -672,19 +672,19 @@ function iadst16 > vld1.16 {q0-q1}, [r12,:128] > > mbutterfly_lq3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = > t0 > -mbutterfly_lq5, q4, d23, d24, d2[1], d2[0] @ q5 = t9, q4 = > t8 > +mbutterfly_lq5, q4, d23, d24, d1[1], d1[0] @ q5 = t9, q4 = > t8 > butterfly_n d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = > t9a > mbutterfly_lq7, q6, d29, d18, d0[3], d0[2] @ q7 = t3, q6 = > t2 > butterfly_n d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = > t8a > > -mbutterfly_lq3, q2, d21, d26, d2[3], d2[2] @ q3 = t11, q2 = > t10 > +mbutterfly_lq3, q2, d21, d26, d1[3], d1[2] @ q3 = t11, q2 = > t10 > butterfly_n d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = > t11a > -mbutterfly_lq5, q4, d27, d20, d1[1], d1[0] @ q5 = t5, q4 = > t4 > +mbutterfly_lq5, q4, d27, d20, d2[1], d2[0] @ q5 = t5, q4 = > t4 > butterfly_n d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = > t10a > > mbutterfly_lq7, q6, d19, d28, d3[1], d3[0] @ q7 = t13, q6 = > t12 > butterfly_n d20, 
d28, q5, q7, q2, q7 @ d20 = t5a, d28 = > t13a > -mbutterfly_lq3, q2, d25, d22, d1[3], d1[2] @ q3 = t7, q2 = > t6 > +mbutterfly_lq3, q2, d25, d22, d2[3], d2[2] @ q3 = t7, q2 = > t6 > butterfly_n d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = > t12a > > mbutterfly_lq5, q4, d17, d30, d3[3], d3[2] @ q5 = t15, q4 = > t14 ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 4/4] aarch64: vp9itxfm: Reorder iadst16 coeffs
On 2017-02-09 14:33:56 +0200, Martin Storsjö wrote: > This matches the order they are in the 16 bpp version. > > There they are in this order, to make sure we access them in the > same order they are declared, easing loading only half of the > coefficients at a time. > > This makes the 8 bpp version match the 16 bpp version better. > --- > libavcodec/aarch64/vp9itxfm_neon.S | 12 ++-- > 1 file changed, 6 insertions(+), 6 deletions(-) > > diff --git a/libavcodec/aarch64/vp9itxfm_neon.S > b/libavcodec/aarch64/vp9itxfm_neon.S > index f87f6bd..7b7dbd4 100644 > --- a/libavcodec/aarch64/vp9itxfm_neon.S > +++ b/libavcodec/aarch64/vp9itxfm_neon.S > @@ -37,8 +37,8 @@ idct_coeffs: > endconst > > const iadst16_coeffs, align=4 > -.short 16364, 804, 15893, 3981, 14811, 7005, 13160, 9760 > -.short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207 > +.short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053 > +.short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207 > endconst > > // out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 > @@ -622,19 +622,19 @@ function iadst16 > ld1 {v0.8h,v1.8h}, [x11] > > dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // > v6,v7 = t1, v4,v5 = t0 > -dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0] // > v10,v11 = t9, v8,v9 = t8 > +dmbutterfly_l v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // > v10,v11 = t9, v8,v9 = t8 > dbutterfly_nv31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // > v31 = t1a, v24 = t9a > dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // > v14,v15 = t3, v12,v13 = t2 > dbutterfly_nv16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // > v16 = t0a, v23 = t8a > > -dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2] // > v6,v7 = t11, v4,v5 = t10 > +dmbutterfly_l v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // > v6,v7 = t11, v4,v5 = t10 > dbutterfly_nv29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // > v29 = t3a, v26 = t11a > -dmbutterfly_l v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4] // > v10,v11 = t5, v8,v9 = t4 > 
+dmbutterfly_l v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // > v10,v11 = t5, v8,v9 = t4 > dbutterfly_nv18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // > v18 = t2a, v21 = t10a > > dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // > v14,v15 = t13, v12,v13 = t12 > dbutterfly_nv20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // > v20 = t5a, v28 = t13a > -dmbutterfly_l v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6] // > v6,v7 = t7, v4,v5 = t6 > +dmbutterfly_l v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // > v6,v7 = t7, v4,v5 = t6 > dbutterfly_nv27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // > v27 = t4a, v19 = t12a > > dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // > v10,v11 = t15, v8,v9 = t14 ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/4] aarch64: vp9itxfm: Reorder the idct coefficients for better pairing
On 2017-02-09 14:33:54 +0200, Martin Storsjö wrote: > All elements are used pairwise, except for the first one. > Previously, the 16th element was unused. Move the unused element > to the second slot, to make the later element pairs not split > across registers. > > This simplifies loading only parts of the coefficients, > reducing the difference to the 16 bpp version. > --- > libavcodec/aarch64/vp9itxfm_neon.S | 124 > ++--- > 1 file changed, 62 insertions(+), 62 deletions(-) ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/4] arm: vp9itxfm: Reorder the idct coefficients for better pairing
On 2017-02-09 14:33:53 +0200, Martin Storsjö wrote: > All elements are used pairwise, except for the first one. > Previously, the 16th element was unused. Move the unused element > to the second slot, to make the later element pairs not split > across registers. > > This simplifies loading only parts of the coefficients, > reducing the difference to the 16 bpp version. > --- > The 16 bpp version is only in ffmpeg for now, since libav's vp9 > decoder doesn't support the high bitdepth profiles. This change > in itself still makes sense to do though. > --- > libavcodec/arm/vp9itxfm_neon.S | 124 > - > 1 file changed, 62 insertions(+), 62 deletions(-) ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] arm: vp9itxfm: Avoid reloading the idct32 coefficients
On 2017-02-09 13:39:55 +0200, Martin Storsjö wrote: > The idct32x32 function actually backed up and restored q4-q7 even > though it didn't clobber them; there are plenty of registers that > can be used to allow keeping all the idct coefficients in registers > without having to reload different subsets of them at different > stages in the transform. > > Since the idct16 core transform avoids clobbering q4-q7 (but clobbers > q2-q3 instead, to avoid needing to back up and restore q4-q7 at all > in the idct16 function), and the lanewise vmul needs a register in > the q0-q3 range, we move the stored coefficients from q2-q3 into q4-q5 > while doing idct16. > > While keeping these coefficients in registers, we still can skip backing > up and restoring q7. > > Before: Cortex A7 A8 A9 A53 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18553.8 17182.7 14303.3 12089.7 > After: > vp9_inv_dct_dct_32x32_sub32_add_neon: 18470.3 16717.7 14173.6 11860.8 > --- > libavcodec/arm/vp9itxfm_neon.S | 246 > - > 1 file changed, 120 insertions(+), 126 deletions(-) ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 6/6] arm: vp9lpf: Implement the mix2_44 function with one single filter pass
On 2017-02-11 23:42:05 +0200, Martin Storsjö wrote: > On Sat, 11 Feb 2017, Martin Storsjö wrote: > > >On Fri, 10 Feb 2017, Janne Grunau wrote: > > > >>On 2017-01-15 22:55:52 +0200, Martin Storsjö wrote: > >>>For this case, with 8 inputs but only changing 4 of them, we can fit > >>>all 16 input pixels into a q register, and still have enough temporary > >>>registers for doing the loop filter. > >>> > >>>The wd=8 filters would require too many temporary registers for > >>>processing all 16 pixels at once though. > >>> > >>>Before: Cortex A7 A8 A9 A53 > >>>vp9_loop_filter_mix2_v_44_16_neon: 289.7 256.2 237.5 181.2 > >>>After: > >>>vp9_loop_filter_mix2_v_44_16_neon: 221.2 150.5 177.7 138.0 > >>>--- > >>> libavcodec/arm/vp9dsp_init_arm.c | 7 +- > >>> libavcodec/arm/vp9lpf_neon.S | 191 > >+++ > >>> 2 files changed, 195 insertions(+), 3 deletions(-) > >>> > >>>diff --git a/libavcodec/arm/vp9dsp_init_arm.c > >b/libavcodec/arm/vp9dsp_init_arm.c > >>>index e99d931..1ede170 100644 > >>>--- a/libavcodec/arm/vp9dsp_init_arm.c > >>>+++ b/libavcodec/arm/vp9dsp_init_arm.c > >>>@@ -194,6 +194,8 @@ define_loop_filters(8, 8); > >>> define_loop_filters(16, 8); > >>> define_loop_filters(16, 16); > >>> > >>>+define_loop_filters(44, 16); > >>>+ > >>> #define lf_mix_fn(dir, wd1, wd2, stridea) > >\ > >>> static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, > >\ > >>> ptrdiff_t > >>>stride, > >\ > >>>@@ -207,7 +209,6 @@ static void > >loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, > >>> lf_mix_fn(h, wd1, wd2, stride) \ > >>> lf_mix_fn(v, wd1, wd2, sizeof(uint8_t)) > >>> > >>>-lf_mix_fns(4, 4) > >>> lf_mix_fns(4, 8) > >>> lf_mix_fns(8, 4) > >>> lf_mix_fns(8, 8) > >>>@@ -227,8 +228,8 @@ static av_cold void > >vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp) > >>> dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon; > >>> dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon; > >>> > >>>-dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_neon; > 
>>>-dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_neon; > >>>+dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon; > >>>+dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon; > >>> dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon; > >>> dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon; > >>> dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon; > >>>diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S > >>>index e31c807..12984a9 100644 > >>>--- a/libavcodec/arm/vp9lpf_neon.S > >>>+++ b/libavcodec/arm/vp9lpf_neon.S > >>>@@ -44,6 +44,109 @@ > >>> vtrn.8 \r2, \r3 > >>> .endm > >>> > >>>+@ The input to and output from this macro is in the registers q8-q15, > >>>+@ and q0-q7 are used as scratch registers. > >>>+@ p3 = q8, p0 = q11, q0 = q12, q3 = q15 > >>>+.macro loop_filter_q > >>>+vdup.u8 d0, r2 @ E > >>>+lsr r2, r2, #8 > >>>+vdup.u8 d2, r3 @ I > >>>+lsr r3, r3, #8 > >>>+vdup.u8 d1, r2 @ E > >>>+vdup.u8 d3, r3 @ I > > > >I tried implementing your suggestion with uzp here, but it ended up being > >slower actually. With the version of the patch I posted here: > > > >vp9_loop_filter_mix2_v_44_16_neon: 221.2 150.5 185.0 139.0 > > > >With this block replaced with this: > > > >vdup.u16q0, r2 @ E > >vdup.u16q1, r3 @ I > >vuzp.u8 d0, d1 @ E > >vuzp.u8 d2, d3 @ I > > > >I get the following: > > > >vp9_loop_filter_mix2_v_44_16_neon: 223.2 150.5 186.1 142.0 > > > >I.e. 1-3 cycles slower on A7, A9 and A53, identical on A8. > > If I move the two vuzp further down, I get the following: > > vp9_loop_filter_mix2_v_44_16_neon: 223.2 148.5 185.1 141.0 > > I.e. +2 on A7, -2 on A8, 0 on A9, +2 on A53. So on average it's still worse, > even though it codewise is neater. leave it as it was then Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/6] arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit
On 2017-02-11 22:19:02 +0200, Martin Storsjö wrote: > On Fri, 10 Feb 2017, Janne Grunau wrote: > > >On 2017-01-15 22:55:48 +0200, Martin Storsjö wrote: > >>The theoretical maximum value of E is 193, so we can just > >>saturate the addition to 255. > >> > >>Before: Cortex A7 A8 A9 A53 A53/AArch64 > >>vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.888.0 87.7 > >>vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0136.7 > >>vp9_loop_filter_v_16_8_neon:497.0 419.5 379.7 293.0275.7 > >>vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0452.0 > >>After: > >>vp9_loop_filter_v_4_8_neon: 136.0 125.7 112.684.0 83.0 > >>vp9_loop_filter_v_8_8_neon: 234.0 195.5 171.5 136.0133.7 > >>vp9_loop_filter_v_16_8_neon:490.0 417.5 377.7 289.0271.0 > >>vp9_loop_filter_v_16_16_neon: 951.2 814.7 732.3 571.0446.7 > >>--- > >> libavcodec/aarch64/vp9lpf_neon.S | 40 > >> +--- > >> libavcodec/arm/vp9lpf_neon.S | 11 +-- > >> 2 files changed, 14 insertions(+), 37 deletions(-) > >> > >>diff --git a/libavcodec/aarch64/vp9lpf_neon.S > >>b/libavcodec/aarch64/vp9lpf_neon.S > >>index 3b8e6eb..4553173 100644 > >>--- a/libavcodec/aarch64/vp9lpf_neon.S > >>+++ b/libavcodec/aarch64/vp9lpf_neon.S > >>@@ -51,13 +51,6 @@ > >> // see the arm version instead. 
> >> > >> > >>-.macro uabdl_sz dst1, dst2, in1, in2, sz > >>-uabdl \dst1, \in1\().8b, \in2\().8b > >>-.ifc \sz, .16b > >>-uabdl2 \dst2, \in1\().16b, \in2\().16b > >>-.endif > >>-.endm > >>- > >> .macro add_sz dst1, dst2, in1, in2, in3, in4, sz > >> add \dst1, \in1, \in3 > >> .ifc \sz, .16b > >>@@ -86,20 +79,6 @@ > >> .endif > >> .endm > >> > >>-.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz > >>-cmhs\dst1, \in1, \in3 > >>-.ifc \sz, .16b > >>-cmhs\dst2, \in2, \in4 > >>-.endif > >>-.endm > >>- > >>-.macro xtn_sz dst, in1, in2, sz > >>-xtn \dst\().8b, \in1 > >>-.ifc \sz, .16b > >>-xtn2\dst\().16b, \in2 > >>-.endif > >>-.endm > >>- > >> .macro usubl_sz dst1, dst2, in1, in2, sz > >> usubl \dst1, \in1\().8b, \in2\().8b > >> .ifc \sz, .16b > >>@@ -179,20 +158,20 @@ > >> // tmpq2 == tmp3 + tmp4, etc. > >> .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, > >> tmp8 > >> .if \mix == 0 > >>-dup v0.8h, w2// E > >>-dup v1.8h, w2// E > >>+dup v0\sz, w2// E > >> dup v2\sz, w3// I > >> dup v3\sz, w4// H > >> .else > >>-dup v0.8h, w2// E > >>+dup v0.8b, w2// E > >> dup v2.8b, w3// I > >> dup v3.8b, w4// H > >>+lsr w5, w2, #8 > >> lsr w6, w3, #8 > >> lsr w7, w4, #8 > >>-ushrv1.8h, v0.8h, #8 // E > >>+dup v1.8b, w5// E > >> dup v4.8b, w6// I > >>-bic v0.8h, #255, lsl 8 // E > >> dup v5.8b, w7// H > >>+trn1v0.2d, v0.2d, v1.2d > > > >isn't this equivalent to > > > >dup v0.8h, w2 > >uzp1 v0.16b, v0.16b, v0.16b > > > >on little endian? > > Nice idea, but it isn't quite as straightforward on aarch64 - on arm it > would have been. gah, yes. > All the even values will be output in the output registers of uzp1, so > you need uzp2 as well. 
> > So instead of this as we have now: > > dup v0.8b, w2 > lsr w5, w2, #8 > dup v1.8b, w5 > trn1 v0.2d, v0.2d, v1.2d > > We could do: > > dup v0.8h, w2 > uzp2 v1.16b, v0.16b, v0.16b > uzp1 v0.16b, v0.16b, v0.16b > trn1 v0.2d, v0.2d, v1.2d rev16 v1.16b, v0.16b // or ext ..x or any other instruction uzp1 v0.16b, v0.16b, v1.16b is one instruction less but also not straightforward. ok as is Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] aarch64: vp9itxfm: Avoid reloading the idct32 coefficients
On 2017-02-09 13:27:04 +0200, Martin Storsjö wrote: > The idct32x32 function actually backed up and restored d8-d15 even ... pushed onto the stack ... is imo clearer even though there are no explicit push/pop instructions > though it didn't clobber them; there are plenty of registers that > can be used to allow keeping all the idct coefficients in registers > without having to reload different subsets of them at different > stages in the transform. > > After this, we still can skip backing up and restoring d12-d15. same > > Before: > vp9_inv_dct_dct_32x32_sub32_add_neon: 8128.3 > After: > vp9_inv_dct_dct_32x32_sub32_add_neon: 8053.3 > --- > libavcodec/aarch64/vp9itxfm_neon.S | 110 > +++-- > 1 file changed, 43 insertions(+), 67 deletions(-) > > diff --git a/libavcodec/aarch64/vp9itxfm_neon.S > b/libavcodec/aarch64/vp9itxfm_neon.S > index c954d1a..64286df 100644 > --- a/libavcodec/aarch64/vp9itxfm_neon.S > +++ b/libavcodec/aarch64/vp9itxfm_neon.S > @@ -1106,18 +1106,14 @@ endfunc > .endm > > function idct32_odd > -ld1 {v0.8h,v1.8h}, [x11] > - > -dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = > t16a, v31 = t31a > -dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = > t17a, v23 = t30a > -dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = > t18a, v27 = t29a > -dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = > t19a, v19 = t28a > -dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = > t20a, v29 = t27a > -dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = > t21a, v21 = t26a > -dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = > t22a, v25 = t25a > -dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = > t23a, v17 = t24a > - > -ld1 {v0.8h}, [x10] > +dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = > t16a, v31 = t31a > +dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = > t17a, v23 = t30a > +dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, 
v7 // v20 = > t18a, v27 = t29a > +dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = > t19a, v19 = t28a > +dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = > t20a, v29 = t27a > +dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = > t21a, v21 = t26a > +dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = > t22a, v25 = t25a > +dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = > t23a, v17 = t24a > > butterfly_8hv4, v24, v16, v24 // v4 = t16, v24 = t17 > butterfly_8hv5, v20, v28, v20 // v5 = t19, v20 = t18 > @@ -1136,18 +1132,14 @@ function idct32_odd > endfunc > > function idct32_odd_half > -ld1 {v0.8h,v1.8h}, [x11] > - > -dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = > t16a, v31 = t31a > -dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = > t17a, v23 = t30a > -dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = > t18a, v27 = t29a > -dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = > t19a, v19 = t28a > -dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = > t20a, v29 = t27a > -dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = > t21a, v21 = t26a > -dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = > t22a, v25 = t25a > -dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = > t23a, v17 = t24a > - > -ld1 {v0.8h}, [x10] > +dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = > t16a, v31 = t31a > +dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = > t17a, v23 = t30a > +dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = > t18a, v27 = t29a > +dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = > t19a, v19 = t28a > +dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = > t20a, v29 = t27a > +dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = > t21a, v21 = t26a > +dmbutterfly_h1 v22, v25, v9.h[4], 
v9.h[5], v4, v5, v6, v7 // v22 = > t22a, v25 = t25a > +dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = > t23a, v17 = t24a > > butterfly_8hv4, v24, v16, v24 // v4 = t16, v24 = t17 > butterfly_8hv5, v20, v28, v20 // v5 = t19, v20 = t18 > @@ -1166,18 +1158,14 @@ function idct32_odd_half > endfunc > > function idct32_odd_quarter > -ld1 {v0.8h,v1.8h}, [x11] > - > -dsmull_hv4, v5,
Re: [libav-devel] [PATCH 6/6] arm: vp9lpf: Implement the mix2_44 function with one single filter pass
On 2017-01-15 22:55:52 +0200, Martin Storsjö wrote: > For this case, with 8 inputs but only changing 4 of them, we can fit > all 16 input pixels into a q register, and still have enough temporary > registers for doing the loop filter. > > The wd=8 filters would require too many temporary registers for > processing all 16 pixels at once though. > > Before: Cortex A7 A8 A9 A53 > vp9_loop_filter_mix2_v_44_16_neon: 289.7 256.2 237.5 181.2 > After: > vp9_loop_filter_mix2_v_44_16_neon: 221.2 150.5 177.7 138.0 > --- > libavcodec/arm/vp9dsp_init_arm.c | 7 +- > libavcodec/arm/vp9lpf_neon.S | 191 > +++ > 2 files changed, 195 insertions(+), 3 deletions(-) > > diff --git a/libavcodec/arm/vp9dsp_init_arm.c > b/libavcodec/arm/vp9dsp_init_arm.c > index e99d931..1ede170 100644 > --- a/libavcodec/arm/vp9dsp_init_arm.c > +++ b/libavcodec/arm/vp9dsp_init_arm.c > @@ -194,6 +194,8 @@ define_loop_filters(8, 8); > define_loop_filters(16, 8); > define_loop_filters(16, 16); > > +define_loop_filters(44, 16); > + > #define lf_mix_fn(dir, wd1, wd2, stridea) > \ > static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, > \ > ptrdiff_t stride, > \ > @@ -207,7 +209,6 @@ static void > loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, > lf_mix_fn(h, wd1, wd2, stride) \ > lf_mix_fn(v, wd1, wd2, sizeof(uint8_t)) > > -lf_mix_fns(4, 4) > lf_mix_fns(4, 8) > lf_mix_fns(8, 4) > lf_mix_fns(8, 8) > @@ -227,8 +228,8 @@ static av_cold void > vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp) > dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon; > dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon; > > -dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_neon; > -dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_neon; > +dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon; > +dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon; > dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon; > dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon; > 
dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon; > diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S > index e31c807..12984a9 100644 > --- a/libavcodec/arm/vp9lpf_neon.S > +++ b/libavcodec/arm/vp9lpf_neon.S > @@ -44,6 +44,109 @@ > vtrn.8 \r2, \r3 > .endm > > +@ The input to and output from this macro is in the registers q8-q15, > +@ and q0-q7 are used as scratch registers. > +@ p3 = q8, p0 = q11, q0 = q12, q3 = q15 > +.macro loop_filter_q > +vdup.u8 d0, r2 @ E > +lsr r2, r2, #8 > +vdup.u8 d2, r3 @ I > +lsr r3, r3, #8 > +vdup.u8 d1, r2 @ E > +vdup.u8 d3, r3 @ I > + > +vabd.u8 q2, q8, q9 @ abs(p3 - p2) > +vabd.u8 q3, q9, q10@ abs(p2 - p1) > +vabd.u8 q4, q10, q11@ abs(p1 - p0) > +vabd.u8 q5, q12, q13@ abs(q0 - q1) > +vabd.u8 q6, q13, q14@ abs(q1 - q2) > +vabd.u8 q7, q14, q15@ abs(q2 - q3) > +vmax.u8 q2, q2, q3 > +vmax.u8 q3, q4, q5 > +vmax.u8 q4, q6, q7 > +vabd.u8 q5, q11, q12@ abs(p0 - q0) > +vmax.u8 q2, q2, q3 > +vqadd.u8q5, q5, q5 @ abs(p0 - q0) * 2 > +vabd.u8 q7, q10, q13@ abs(p1 - q1) > +vmax.u8 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - > q3)) > +vshr.u8 q7, q7, #1 > +vcle.u8 q2, q2, q1 @ max(abs()) <= I > +vqadd.u8q5, q5, q7 @ abs(p0 - q0) * 2 + abs(p1 - q1) > >> 1 > +vcle.u8 q5, q5, q0 > +vandq2, q2, q5 @ fm > + > +vshrn.u16 d10, q2, #4 > +vmovr2, r3, d10 > +orrsr2, r2, r3 > +@ If no pixels need filtering, just exit as soon as possible > +beq 9f > + > +@ Calculate the normal inner loop filter for 2 or 4 pixels > +ldr r3, [sp, #64] > +vabd.u8 q3, q10, q11@ abs(p1 - p0) > +vabd.u8 q4, q13, q12@ abs(q1 - q0) > + > +vsubl.u8q5, d20, d26@ p1 - q1 > +vsubl.u8q6, d21, d27@ p1 - q1 > +vmax.u8 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0)) > +vqmovn.s16 d10, q5 @ av_clip_int8p(p1 - q1) > +vqmovn.s16 d11, q6 @ av_clip_int8p(p1 - q1) > +
Re: [libav-devel] [PATCH 5/6] aarch64: vp9lpf: Interleave the start of flat8in into the calculation above
On 2017-01-15 22:55:51 +0200, Martin Storsjö wrote: > --- > libavcodec/aarch64/vp9lpf_neon.S | 16 +--- > 1 file changed, 13 insertions(+), 3 deletions(-) > > diff --git a/libavcodec/aarch64/vp9lpf_neon.S > b/libavcodec/aarch64/vp9lpf_neon.S > index 4553173..3894307 100644 > --- a/libavcodec/aarch64/vp9lpf_neon.S > +++ b/libavcodec/aarch64/vp9lpf_neon.S > @@ -316,20 +316,30 @@ > > uxtl_sz v0.8h, v1.8h, v22, \sz// p1 > uxtl_sz v2.8h, v3.8h, v25, \sz// q1 > +.if \wd >= 8 > +mov x5, v6.d[0] > +.endif > saddw_szv0.8h, v1.8h, v0.8h, v1.8h, \tmp3, \sz // p1 + f > ssubw_szv2.8h, v3.8h, v2.8h, v3.8h, \tmp3, \sz // q1 - f > +.if \wd >= 8 > +.ifc \sz, .16b > +mov x6, v6.d[1] > +.endif > +.endif Is it helpful to have this mov here? It would look a little less ugly if you merged this .if with the one above. > sqxtun_sz v0, v0.8h, v1.8h, \sz // out p1 > sqxtun_sz v2, v2.8h, v3.8h, \sz // out q1 > +.if \wd >= 8 > +.ifc \sz, .16b > +addsx5, x5, x6 > +.endif > +.endif > bit v22\sz, v0\sz, v5\sz // if (!hev && fm && > !flat8in) > bit v25\sz, v2\sz, v5\sz > > // If no pixels need flat8in, jump to flat8out > // (or to a writeout of the inner 4 pixels, for wd=8) > .if \wd >= 8 > -mov x5, v6.d[0] > .ifc \sz, .16b > -mov x6, v6.d[1] > -addsx5, x5, x6 > b.eq6f > .else > cbz x5, 6f otherwise ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 4/6] arm: vp9lpf: Interleave the start of flat8in into the calculation above
On 2017-01-15 22:55:50 +0200, Martin Storsjö wrote: > This adds lots of extra .ifs, but speeds it up by a couple cycles, > by avoiding stalls. > --- > libavcodec/arm/vp9lpf_neon.S | 8 ++-- > 1 file changed, 6 insertions(+), 2 deletions(-) > > diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S > index 9be4cef..e31c807 100644 > --- a/libavcodec/arm/vp9lpf_neon.S > +++ b/libavcodec/arm/vp9lpf_neon.S > @@ -181,16 +181,20 @@ > > vmovl.u8q0, d22@ p1 > vmovl.u8q1, d25@ q1 > +.if \wd >= 8 > +vmovr2, r3, d6 > +.endif > vaddw.s8q0, q0, \tmp3 @ p1 + f > vsubw.s8q1, q1, \tmp3 @ q1 - f > +.if \wd >= 8 > +orrsr2, r2, r3 > +.endif > vqmovun.s16 d0, q0 @ out p1 > vqmovun.s16 d2, q1 @ out q1 > vbitd22, d0, d5@ if (!hev && fm && !flat8in) > vbitd25, d2, d5 > > .if \wd >= 8 > -vmovr2, r3, d6 > -orrsr2, r2, r3 > @ If no pixels need flat8in, jump to flat8out > @ (or to a writeout of the inner 4 pixels, for wd=8) > beq 6f ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/6] arm: vp9lpf: Use orrs instead of orr+cmp
On 2017-01-15 22:55:49 +0200, Martin Storsjö wrote: > --- > libavcodec/arm/vp9lpf_neon.S | 12 > 1 file changed, 4 insertions(+), 8 deletions(-) > > diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S > index 5e154f6..9be4cef 100644 > --- a/libavcodec/arm/vp9lpf_neon.S > +++ b/libavcodec/arm/vp9lpf_neon.S > @@ -77,8 +77,7 @@ > > vdup.u8 d3, r3 @ H > vmovr2, r3, d4 > -orr r2, r2, r3 > -cmp r2, #0 > +orrsr2, r2, r3 > @ If no pixels need filtering, just exit as soon as possible > beq 9f > > @@ -191,8 +190,7 @@ > > .if \wd >= 8 > vmovr2, r3, d6 > -orr r2, r2, r3 > -cmp r2, #0 > +orrsr2, r2, r3 > @ If no pixels need flat8in, jump to flat8out > @ (or to a writeout of the inner 4 pixels, for wd=8) > beq 6f > @@ -247,14 +245,12 @@ > 6: > vorrd2, d6, d7 > vmovr2, r3, d2 > -orr r2, r2, r3 > -cmp r2, #0 > +orrsr2, r2, r3 > @ If no pixels needed flat8in nor flat8out, jump to a > @ writeout of the inner 4 pixels > beq 7f > vmovr2, r3, d7 > -orr r2, r2, r3 > -cmp r2, #0 > +orrsr2, r2, r3 > @ If no pixels need flat8out, jump to a writeout of the inner 6 > pixels > beq 8f ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/6] arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit
On 2017-01-15 22:55:48 +0200, Martin Storsjö wrote: > The theoretical maximum value of E is 193, so we can just > saturate the addition to 255. > > Before: Cortex A7 A8 A9 A53 A53/AArch64 > vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.888.0 87.7 > vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0136.7 > vp9_loop_filter_v_16_8_neon:497.0 419.5 379.7 293.0275.7 > vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0452.0 > After: > vp9_loop_filter_v_4_8_neon: 136.0 125.7 112.684.0 83.0 > vp9_loop_filter_v_8_8_neon: 234.0 195.5 171.5 136.0133.7 > vp9_loop_filter_v_16_8_neon:490.0 417.5 377.7 289.0271.0 > vp9_loop_filter_v_16_16_neon: 951.2 814.7 732.3 571.0446.7 > --- > libavcodec/aarch64/vp9lpf_neon.S | 40 > +--- > libavcodec/arm/vp9lpf_neon.S | 11 +-- > 2 files changed, 14 insertions(+), 37 deletions(-) > > diff --git a/libavcodec/aarch64/vp9lpf_neon.S > b/libavcodec/aarch64/vp9lpf_neon.S > index 3b8e6eb..4553173 100644 > --- a/libavcodec/aarch64/vp9lpf_neon.S > +++ b/libavcodec/aarch64/vp9lpf_neon.S > @@ -51,13 +51,6 @@ > // see the arm version instead. > > > -.macro uabdl_sz dst1, dst2, in1, in2, sz > -uabdl \dst1, \in1\().8b, \in2\().8b > -.ifc \sz, .16b > -uabdl2 \dst2, \in1\().16b, \in2\().16b > -.endif > -.endm > - > .macro add_sz dst1, dst2, in1, in2, in3, in4, sz > add \dst1, \in1, \in3 > .ifc \sz, .16b > @@ -86,20 +79,6 @@ > .endif > .endm > > -.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz > -cmhs\dst1, \in1, \in3 > -.ifc \sz, .16b > -cmhs\dst2, \in2, \in4 > -.endif > -.endm > - > -.macro xtn_sz dst, in1, in2, sz > -xtn \dst\().8b, \in1 > -.ifc \sz, .16b > -xtn2\dst\().16b, \in2 > -.endif > -.endm > - > .macro usubl_sz dst1, dst2, in1, in2, sz > usubl \dst1, \in1\().8b, \in2\().8b > .ifc \sz, .16b > @@ -179,20 +158,20 @@ > // tmpq2 == tmp3 + tmp4, etc. 
> .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, > tmp8 > .if \mix == 0 > -dup v0.8h, w2// E > -dup v1.8h, w2// E > +dup v0\sz, w2// E > dup v2\sz, w3// I > dup v3\sz, w4// H > .else > -dup v0.8h, w2// E > +dup v0.8b, w2// E > dup v2.8b, w3// I > dup v3.8b, w4// H > +lsr w5, w2, #8 > lsr w6, w3, #8 > lsr w7, w4, #8 > -ushrv1.8h, v0.8h, #8 // E > +dup v1.8b, w5// E > dup v4.8b, w6// I > -bic v0.8h, #255, lsl 8 // E > dup v5.8b, w7// H > +trn1v0.2d, v0.2d, v1.2d isn't this equivalent to dup v0.8h, w2 uzp1 v0.16b, v0.16b, v0.16b on little endian? > trn1v2.2d, v2.2d, v4.2d > trn1v3.2d, v3.2d, v5.2d > .endif > @@ -206,16 +185,15 @@ > umaxv4\sz, v4\sz, v5\sz > umaxv5\sz, v6\sz, v7\sz > umax\tmp1\sz, \tmp1\sz, \tmp2\sz > -uabdl_szv6.8h, v7.8h, v23, v24, \sz // abs(p0 - q0) > +uabdv6\sz, v23\sz, v24\sz// abs(p0 - q0) > umaxv4\sz, v4\sz, v5\sz > -add_sz v6.8h, v7.8h, v6.8h, v7.8h, v6.8h, v7.8h, \sz > // abs(p0 - q0) * 2 > +uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2 > uabdv5\sz, v22\sz, v25\sz// abs(p1 - q1) > umaxv4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), > ..., abs(q2 - q3)) > ushrv5\sz, v5\sz, #1 > cmhsv4\sz, v2\sz, v4\sz // max(abs()) <= I > -uaddw_szv6.8h, v7.8h, v6.8h, v7.8h, v5, \sz // abs(p0 - > q0) * 2 + abs(p1 - q1) >> 1 > -cmhs_sz v6.8h, v7.8h, v0.8h, v1.8h, v6.8h, v7.8h, \sz > -xtn_sz v5, v6.8h, v7.8h, \sz > +uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + > abs(p1 - q1) >> 1 > +cmhsv5\sz, v0\sz, v6\sz > and v4\sz, v4\sz, v5\sz // fm > > // If no pixels need filtering, just exit as soon as possible > diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S > index c57c0e9..5e154f6 100644 > --- a/libavcodec/arm/vp9lpf_neon.S > +++ b/libavcodec/arm/vp9lpf_neon.S > @@ -51,7 +51,7 @@ > @ and d28-d31 as
Re: [libav-devel] [PATCH 1/6] arm/aarch64: vp9lpf: Calculate !hev directly
On 2017-01-15 22:55:47 +0200, Martin Storsjö wrote: > Previously we first calculated hev, and then negated it. > > Since we were able to schedule the negation in the middle > of another calculation, we don't see any gain in all cases. > > Before: Cortex A7 A8 A9 A53 A53/AArch64 > vp9_loop_filter_v_4_8_neon: 147.0 129.0 115.889.0 88.7 > vp9_loop_filter_v_8_8_neon: 242.0 198.5 174.7 140.0136.7 > vp9_loop_filter_v_16_8_neon:500.0 419.5 382.7 293.0275.7 > vp9_loop_filter_v_16_16_neon: 971.2 825.5 731.5 579.0453.0 > After: > vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.888.0 87.7 > vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0136.7 > vp9_loop_filter_v_16_8_neon:497.0 419.5 379.7 293.0275.7 > vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0452.0 > --- > libavcodec/aarch64/vp9lpf_neon.S | 5 ++--- > libavcodec/arm/vp9lpf_neon.S | 5 ++--- > 2 files changed, 4 insertions(+), 6 deletions(-) > > diff --git a/libavcodec/aarch64/vp9lpf_neon.S > b/libavcodec/aarch64/vp9lpf_neon.S > index e9c7d9e..3b8e6eb 100644 > --- a/libavcodec/aarch64/vp9lpf_neon.S > +++ b/libavcodec/aarch64/vp9lpf_neon.S > @@ -292,7 +292,7 @@ > .if \mix != 0 > sxtlv1.8h, v1.8b > .endif > -cmhiv5\sz, v5\sz, v3\sz // hev > +cmhsv5\sz, v3\sz, v5\sz // !hev > .if \wd == 8 > // If a 4/8 or 8/4 mix is used, clear the relevant half of v6 > .if \mix != 0 > @@ -306,11 +306,10 @@ > .elseif \wd == 8 > bic v4\sz, v4\sz, v6\sz // fm && !flat8in > .endif > -mvn v5\sz, v5\sz // !hev > +and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in > .if \wd == 16 > and v7\sz, v7\sz, v6\sz // flat8out && flat8in && fm > .endif > -and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in > > mul_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, > \tmp4\().8h, \tmp5\().8h, \tmp5\().8h, \sz // 3 * (q0 - p0) > bic \tmp1\sz, \tmp1\sz, v5\sz// if (!hev) > av_clip_int8 = 0 > diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S > index fbf2901..c57c0e9 100644 > --- a/libavcodec/arm/vp9lpf_neon.S > +++ b/libavcodec/arm/vp9lpf_neon.S 
> @@ -141,7 +141,7 @@ > .if \wd == 8 > vcle.u8 d6, d6, d0@ flat8in > .endif > -vcgt.u8 d5, d5, d3@ hev > +vcle.u8 d5, d5, d3@ !hev > .if \wd == 8 > vandd6, d6, d4@ flat8in && fm > .endif > @@ -151,11 +151,10 @@ > .elseif \wd == 8 > vbicd4, d4, d6@ fm && !flat8in > .endif > -vmvnd5, d5 @ !hev > +vandd5, d5, d4@ !hev && fm && !flat8in > .if \wd == 16 > vandd7, d7, d6@ flat8out && flat8in && fm > .endif > -vandd5, d5, d4@ !hev && fm && !flat8in > > vmul.s16\tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0) > vbic\tmp1, \tmp1, d5@ if (!hev) av_clip_int8 = 0 ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] aarch64: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling
On 2017-01-05 09:35:37 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > Before: Cortex A53 > vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 > vp9_inv_dct_dct_32x32_sub1_add_neon: 555.1 > After: > vp9_inv_dct_dct_16x16_sub1_add_neon: 180.2 > vp9_inv_dct_dct_32x32_sub1_add_neon: 475.3 > --- > libavcodec/aarch64/vp9itxfm_neon.S | 54 > +- > 1 file changed, 36 insertions(+), 18 deletions(-) > > diff --git a/libavcodec/aarch64/vp9itxfm_neon.S > b/libavcodec/aarch64/vp9itxfm_neon.S > index 7ce6df0..a4284e3 100644 > --- a/libavcodec/aarch64/vp9itxfm_neon.S > +++ b/libavcodec/aarch64/vp9itxfm_neon.S > @@ -448,16 +448,23 @@ function idct16x16_dc_add_neon > > srshr v2.8h, v2.8h, #6 > > +mov x3, x0 > mov x4, #16 > 1: > // Loop to add the constant from v2 into all 16x16 outputs > -ld1 {v3.16b}, [x0] > -uaddw v4.8h, v2.8h, v3.8b > -uaddw2 v5.8h, v2.8h, v3.16b > -sqxtun v4.8b, v4.8h > -sqxtun2 v4.16b, v5.8h > -st1 {v4.16b}, [x0], x1 > -subsx4, x4, #1 > +subsx4, x4, #2 > +ld1 {v3.16b}, [x0], x1 > +ld1 {v4.16b}, [x0], x1 > +uaddw v16.8h, v2.8h, v3.8b > +uaddw2 v17.8h, v2.8h, v3.16b > +uaddw v18.8h, v2.8h, v4.8b > +uaddw2 v19.8h, v2.8h, v4.16b > +sqxtun v3.8b, v16.8h > +sqxtun2 v3.16b, v17.8h > +sqxtun v4.8b, v18.8h > +sqxtun2 v4.16b, v19.8h > +st1 {v3.16b}, [x3], x1 > +st1 {v4.16b}, [x3], x1 > b.ne1b > > ret > @@ -824,20 +831,31 @@ function idct32x32_dc_add_neon > > srshr v0.8h, v2.8h, #6 > > +mov x3, x0 > mov x4, #32 > 1: > // Loop to add the constant v0 into all 32x32 outputs > -ld1 {v1.16b,v2.16b}, [x0] > -uaddw v3.8h, v0.8h, v1.8b > -uaddw2 v4.8h, v0.8h, v1.16b > -uaddw v5.8h, v0.8h, v2.8b > -uaddw2 v6.8h, v0.8h, v2.16b > -sqxtun v3.8b, v3.8h > -sqxtun2 v3.16b, v4.8h > -sqxtun v4.8b, v5.8h > -sqxtun2 v4.16b, v6.8h > -st1 {v3.16b,v4.16b}, [x0], x1 > -subsx4, x4, #1 > +subsx4, x4, #2 > +ld1 {v1.16b,v2.16b}, [x0], x1 > +uaddw v16.8h, v0.8h, v1.8b > +uaddw2 v17.8h, v0.8h, v1.16b > +ld1 {v3.16b,v4.16b}, [x0], x1 > +uaddw v18.8h, v0.8h, v2.8b > +uaddw2 
v19.8h, v0.8h, v2.16b > +uaddw v20.8h, v0.8h, v3.8b > +uaddw2 v21.8h, v0.8h, v3.16b > +uaddw v22.8h, v0.8h, v4.8b > +uaddw2 v23.8h, v0.8h, v4.16b > +sqxtun v1.8b, v16.8h > +sqxtun2 v1.16b, v17.8h > +sqxtun v2.8b, v18.8h > +sqxtun2 v2.16b, v19.8h > +sqxtun v3.8b, v20.8h > +sqxtun2 v3.16b, v21.8h > +st1 {v1.16b,v2.16b}, [x3], x1 > +sqxtun v4.8b, v22.8h > +sqxtun2 v4.16b, v23.8h > +st1 {v3.16b,v4.16b}, [x3], x1 > b.ne1b > > ret ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] arm: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling
On 2017-01-05 09:35:36 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > Before:Cortex A7 A8 A9 A53 > vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.5 211.7 235.8 > vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0 459.2 862.2 553.9 > After: > vp9_inv_dct_dct_16x16_sub1_add_neon: 226.5 145.0 225.1 171.8 > vp9_inv_dct_dct_32x32_sub1_add_neon: 721.2 415.7 727.6 475.0 > --- > libavcodec/arm/vp9itxfm_neon.S | 54 > -- > 1 file changed, 36 insertions(+), 18 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 5abe435..a81240b 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -518,16 +518,23 @@ function idct16x16_dc_add_neon > > vrshr.s16 q8, q8, #6 > > +mov r3, r0 > mov r12, #16 > 1: > @ Loop to add the constant from q8 into all 16x16 outputs > -vld1.8 {q3}, [r0,:128] > -vaddw.u8q10, q8, d6 > -vaddw.u8q11, q8, d7 > -vqmovun.s16 d6, q10 > -vqmovun.s16 d7, q11 > -vst1.8 {q3}, [r0,:128], r1 > -subsr12, r12, #1 > +subsr12, r12, #2 > +vld1.8 {q2}, [r0,:128], r1 > +vaddw.u8q10, q8, d4 > +vld1.8 {q3}, [r0,:128], r1 > +vaddw.u8q11, q8, d5 > +vaddw.u8q12, q8, d6 > +vaddw.u8q13, q8, d7 > +vqmovun.s16 d4, q10 > +vqmovun.s16 d5, q11 > +vqmovun.s16 d6, q12 > +vst1.8 {q2}, [r3,:128], r1 > +vqmovun.s16 d7, q13 > +vst1.8 {q3}, [r3,:128], r1 > bne 1b > > bx lr > @@ -889,20 +896,31 @@ function idct32x32_dc_add_neon > > vrshr.s16 q8, q8, #6 > > +mov r3, r0 > mov r12, #32 > 1: > @ Loop to add the constant from q8 into all 32x32 outputs > -vld1.8 {q2-q3}, [r0,:128] > -vaddw.u8q10, q8, d4 > -vaddw.u8q11, q8, d5 > -vaddw.u8q12, q8, d6 > -vaddw.u8q13, q8, d7 > -vqmovun.s16 d4, q10 > -vqmovun.s16 d5, q11 > -vqmovun.s16 d6, q12 > -vqmovun.s16 d7, q13 > -vst1.8 {q2-q3}, [r0,:128], r1 > -subsr12, r12, #1 > +subsr12, r12, #2 > +vld1.8 {q0-q1}, [r0,:128], r1 > +vaddw.u8q9, q8, d0 > +vaddw.u8q10, q8, d1 > +vld1.8 {q2-q3}, [r0,:128], r1 > +vaddw.u8q11, q8, d2 > +vaddw.u8q12, q8, d3 > 
+vaddw.u8q13, q8, d4 > +vaddw.u8q14, q8, d5 > +vaddw.u8q15, q8, d6 > +vqmovun.s16 d0, q9 > +vaddw.u8q9, q8, d7 > +vqmovun.s16 d1, q10 > +vqmovun.s16 d2, q11 > +vqmovun.s16 d3, q12 > +vqmovun.s16 d4, q13 > +vqmovun.s16 d5, q14 > +vst1.8 {q0-q1}, [r3,:128], r1 > +vqmovun.s16 d6, q15 > +vqmovun.s16 d7, q9 > +vst1.8 {q2-q3}, [r3,:128], r1 > bne 1b > > bx lr ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 6/6] aarch64: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter
On 2017-01-02 14:17:56 +0200, Martin Storsjö wrote: > No measured speedup on an Cortex A53, but other cores might benefit. A little surprised that it didn't make a difference on the Cortex-A53, since certain sites reported the NEON unit isn't fully 128-bit wide. So it is unlikely that it makes a difference on other cores. > --- > libavcodec/aarch64/vp9mc_neon.S | 15 +-- > 1 file changed, 13 insertions(+), 2 deletions(-) > > diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S > index 99f1809..95ed26c 100644 > --- a/libavcodec/aarch64/vp9mc_neon.S > +++ b/libavcodec/aarch64/vp9mc_neon.S > @@ -202,9 +202,12 @@ endfunc > ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) > mla \dst2\().8h, v21.8h, v0.h[\offset] > mla \dst4\().8h, v23.8h, v0.h[\offset] > -.else > +.elseif \size == 8 > mla \dst1\().8h, v20.8h, v0.h[\offset] > mla \dst3\().8h, v22.8h, v0.h[\offset] > +.else > +mla \dst1\().4h, v20.4h, v0.h[\offset] > +mla \dst3\().4h, v22.4h, v0.h[\offset] > .endif > .endm > // The same as above, but don't accumulate straight into the > @@ -219,16 +222,24 @@ endfunc > ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) > mul v21.8h, v21.8h, v0.h[\offset] > mul v23.8h, v23.8h, v0.h[\offset] > -.else > +.elseif \size == 8 > mul v20.8h, v20.8h, v0.h[\offset] > mul v22.8h, v22.8h, v0.h[\offset] > +.else > +mul v20.4h, v20.4h, v0.h[\offset] > +mul v22.4h, v22.4h, v0.h[\offset] > .endif > +.if \size == 4 > +sqadd \dst1\().4h, \dst1\().4h, v20.4h > +sqadd \dst3\().4h, \dst3\().4h, v22.4h > +.else > sqadd \dst1\().8h, \dst1\().8h, v20.8h > sqadd \dst3\().8h, \dst3\().8h, v22.8h > .if \size >= 16 > sqadd \dst2\().8h, \dst2\().8h, v21.8h > sqadd \dst4\().8h, \dst4\().8h, v23.8h > .endif > +.endif > .endm patch ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 4/6] aarch64: vp9mc: Simplify the extmla macro parameters
On 2017-01-02 14:17:54 +0200, Martin Storsjö wrote: > Fold the field lengths into the macro. > > This makes the macro invocations much more readable, when the > lines are shorter. > > This also makes it easier to use only half the registers within > the macro. > --- > libavcodec/aarch64/vp9mc_neon.S | 50 > - > 1 file changed, 25 insertions(+), 25 deletions(-) > > diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S > index c1f1876..99f1809 100644 > --- a/libavcodec/aarch64/vp9mc_neon.S > +++ b/libavcodec/aarch64/vp9mc_neon.S > @@ -193,41 +193,41 @@ endfunc > // for size >= 16), and multiply-accumulate into dst1 and dst3 (or > // dst1-dst2 and dst3-dst4 for size >= 16) > .macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, > offset, size > -ext v20.16b, \src1, \src2, #(2*\offset) > -ext v22.16b, \src4, \src5, #(2*\offset) > +ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) > +ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) > .if \size >= 16 > -mla \dst1, v20.8h, v0.h[\offset] > -ext v21.16b, \src2, \src3, #(2*\offset) > -mla \dst3, v22.8h, v0.h[\offset] > -ext v23.16b, \src5, \src6, #(2*\offset) > -mla \dst2, v21.8h, v0.h[\offset] > -mla \dst4, v23.8h, v0.h[\offset] > +mla \dst1\().8h, v20.8h, v0.h[\offset] > +ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) > +mla \dst3\().8h, v22.8h, v0.h[\offset] > +ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) > +mla \dst2\().8h, v21.8h, v0.h[\offset] > +mla \dst4\().8h, v23.8h, v0.h[\offset] > .else > -mla \dst1, v20.8h, v0.h[\offset] > -mla \dst3, v22.8h, v0.h[\offset] > +mla \dst1\().8h, v20.8h, v0.h[\offset] > +mla \dst3\().8h, v22.8h, v0.h[\offset] > .endif > .endm > // The same as above, but don't accumulate straight into the > // destination, but use a temp register and accumulate with saturation. 
> .macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, > src6, offset, size > -ext v20.16b, \src1, \src2, #(2*\offset) > -ext v22.16b, \src4, \src5, #(2*\offset) > +ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) > +ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) > .if \size >= 16 > mul v20.8h, v20.8h, v0.h[\offset] > -ext v21.16b, \src2, \src3, #(2*\offset) > +ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) > mul v22.8h, v22.8h, v0.h[\offset] > -ext v23.16b, \src5, \src6, #(2*\offset) > +ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) > mul v21.8h, v21.8h, v0.h[\offset] > mul v23.8h, v23.8h, v0.h[\offset] > .else > mul v20.8h, v20.8h, v0.h[\offset] > mul v22.8h, v22.8h, v0.h[\offset] > .endif > -sqadd \dst1, \dst1, v20.8h > -sqadd \dst3, \dst3, v22.8h > +sqadd \dst1\().8h, \dst1\().8h, v20.8h > +sqadd \dst3\().8h, \dst3\().8h, v22.8h > .if \size >= 16 > -sqadd \dst2, \dst2, v21.8h > -sqadd \dst4, \dst4, v23.8h > +sqadd \dst2\().8h, \dst2\().8h, v21.8h > +sqadd \dst4\().8h, \dst4\().8h, v23.8h > .endif > .endm > > @@ -292,13 +292,13 @@ function \type\()_8tap_\size\()h_\idx1\idx2 > mul v2.8h, v5.8h, v0.h[0] > mul v25.8h, v17.8h, v0.h[0] > .endif > -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, > v6.16b, v16.16b, v17.16b, v18.16b, 1, \size > -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, > v6.16b, v16.16b, v17.16b, v18.16b, 2, \size > -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, > v6.16b, v16.16b, v17.16b, v18.16b, \idx1, \size > -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, > v6.16b, v16.16b, v17.16b, v18.16b, 5, \size > -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, > v6.16b, v16.16b, v17.16b, v18.16b, 6, \size > -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, > v6.16b, v16.16b, v17.16b, v18.16b, 7, \size > -extmulqadd v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, > v6.16b, v16.16b, v17.16b, v18.16b, \idx2, \size > +extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, > \size > 
+ext
Re: [libav-devel] [PATCH 5/6] arm: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter
On 2017-01-02 14:17:55 +0200, Martin Storsjö wrote: > Before:Cortex A7 A8 A9 A53 > vp9_put_8tap_smooth_4h_neon: 378.1 273.2 340.7 229.5 > After: > vp9_put_8tap_smooth_4h_neon: 352.1 222.2 290.5 229.5 > --- > libavcodec/arm/vp9mc_neon.S | 33 ++--- > 1 file changed, 22 insertions(+), 11 deletions(-) > > diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S > index a5413a3..8d43ff1 100644 > --- a/libavcodec/arm/vp9mc_neon.S > +++ b/libavcodec/arm/vp9mc_neon.S > @@ -209,7 +209,7 @@ endfunc > @ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6 > @ for size >= 16), and multiply-accumulate into dst1 and dst3 (or > @ dst1-dst2 and dst3-dst4 for size >= 16) > -.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, > offset, size > +.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, > src5, src6, offset, size > vext.8 q14, \src1, \src2, #(2*\offset) > vext.8 q15, \src4, \src5, #(2*\offset) > .if \size >= 16 > @@ -219,14 +219,17 @@ endfunc > vext.8 q6, \src5, \src6, #(2*\offset) > vmla_lane \dst2, q5, \offset > vmla_lane \dst4, q6, \offset > -.else > +.elseif \size == 8 > vmla_lane \dst1, q14, \offset > vmla_lane \dst3, q15, \offset > +.else > +vmla_lane \dst1d, d28, \offset > +vmla_lane \dst3d, d30, \offset > .endif > .endm > @ The same as above, but don't accumulate straight into the > @ destination, but use a temp register and accumulate with saturation. 
> -.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, > src6, offset, size > +.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, > src4, src5, src6, offset, size > vext.8 q14, \src1, \src2, #(2*\offset) > vext.8 q15, \src4, \src5, #(2*\offset) > .if \size >= 16 > @@ -236,16 +239,24 @@ endfunc > vext.8 q6, \src5, \src6, #(2*\offset) > vmul_lane q5, q5, \offset > vmul_lane q6, q6, \offset > -.else > +.elseif \size == 8 > vmul_lane q14, q14, \offset > vmul_lane q15, q15, \offset > +.else > +vmul_lane d28, d28, \offset > +vmul_lane d30, d30, \offset > .endif > +.if \size == 4 > +vqadd.s16 \dst1d, \dst1d, d28 > +vqadd.s16 \dst3d, \dst3d, d30 > +.else > vqadd.s16 \dst1, \dst1, q14 > vqadd.s16 \dst3, \dst3, q15 > .if \size >= 16 > vqadd.s16 \dst2, \dst2, q5 > vqadd.s16 \dst4, \dst4, q6 > .endif > +.endif > .endm > > > @@ -309,13 +320,13 @@ function \type\()_8tap_\size\()h_\idx1\idx2 > vmul.s16q2, q9, d0[0] > vmul.s16q4, q12, d0[0] > .endif > -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, > 1, \size > -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, > 2, \size > -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, > \idx1, \size > -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, > 5, \size > -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, > 6, \size > -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, > 7, \size > -extmulqadd q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, > \idx2, \size > +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, > q12, q13, 1, \size > +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, > q12, q13, 2, \size > +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, > q12, q13, \idx1, \size > +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, > q12, q13, 5, \size > +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, > q12, q13, 6, \size > +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, > q12, q13, 7, \size > +extmulqadd q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, > q12, q13, \idx2, \size > > @ Round, shift 
and saturate > vqrshrun.s16d2, q1, #7 ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/6] arm: vp9itxfm: Share instructions for loading idct coeffs in the 8x8 function
On 2017-02-09 14:29:56 +0200, Martin Storsjö wrote: > --- > libavcodec/arm/vp9itxfm_neon.S | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 167d517..3d0b0fa 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -412,13 +412,12 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, > export=1 > .ifc \txfm1\()_\txfm2,idct_idct > movrel r12, idct_coeffs > vpush {q4-q5} > -vld1.16 {q0}, [r12,:128] > .else > movrel r12, iadst8_coeffs > vld1.16 {q1}, [r12,:128]! > vpush {q4-q7} > -vld1.16 {q0}, [r12,:128] > .endif > +vld1.16 {q0}, [r12,:128] > > vmov.i16q2, #0 > vmov.i16q3, #0 the whole set is ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)
On 2017-02-09 09:50:48 +0200, Martin Storsjö wrote: > On Thu, 9 Feb 2017, Janne Grunau wrote: > > >On 2017-02-05 14:05:49 +0200, Martin Storsjö wrote: > >>On Sun, 5 Feb 2017, Janne Grunau wrote: > >> > >>>> // out1 = in1 + in2 > >>>> // out2 = in1 - in2 > >>>> .macro butterfly_8h out1, out2, in1, in2 > >>>>@@ -463,7 +510,7 @@ function idct16x16_dc_add_neon > >>>> ret > >>>> endfunc > >>>> > >>>>-function idct16 > >>>>+.macro idct16_full > >>>> dmbutterfly0v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // > >>>> v16 = t0a, v24 = t1a > >>>> dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // > >>>> v20 = t2a, v28 = t3a > >>>> dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // > >>>> v18 = t4a, v30 = t7a > >>>>@@ -485,7 +532,10 @@ function idct16 > >>>> dmbutterfly0v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 > >>>> // v22 = t6a, v26 = t5a > >>>> dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 > >>>> // v23 = t9a, v25 = t14a > >>>> dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, > >>>> neg=1 // v27 = t13a, v21 = t10a > >>>>+idct16_end > >>> > >>>I think it would be clearer if idct16_end is used directly from the macro. > >>>it would probably also make sense to move idct16_end and avoid the > >>>idct16_full macro. The patch might be smaller and it is immediately > >>>obvious that there is no code change but the resulting code is more > >>>comlicated than it needs to be. same applies to arm if we go with > >>>alternative 1. > >> > >>Ok, so you mean like this? > >> > >>function idct16 > >>dmbutterfly... > >> > >>idct16_end > >>endfunc > > > >that would be one option, the other would be to move the idct_end > >instructions as a macro out of the the existing idct16 function and use it > >as macro. That would make the full idct structural identical to the half > >and quarter version and avoid a macro only used once. > > I'm not really following what you're suggesting here - can you outline it > with a code sample like mine above? 
Sorry, it seems I wasn't fully awake. I misread your code snippet. To avoid any confusion, here is what I meant, outlined as a pseudo patch: @@ +.macro idct16_end +[code from the existing idct16 function] +.endm + function idct16 @@ ... +idct16_end -[code moved to the idct16_end macro] endfunc Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do separate functions for half/quarter idct16 and idct32 (alternative 2)
On 2017-02-06 00:16:41 +0200, Martin Storsjö wrote: > > Ok, so after running a slightly shorter clip (which seems to have about as > large percentage of runtime doing IDCT as the previous one) with a bit more > iterations, I've got the following results (the 'user' part from 'time > avconv -threads 1 -i foo -f null -'): > > 32 orig 32 alt1 32 alt2 64 orig 64 alt1 64 alt2 > 40.436s 40.148s 40.008s 37.428s 37.356s 37.192s > 40.596s 40.140s 40.216s 37.572s 37.524s 37.384s > 40.512s 40.228s 40.188s 37.740s 37.588s 37.368s > 40.584s 40.136s 40.216s 37.880s 37.492s 37.348s > 40.572s 40.292s 40.232s 37.756s 37.556s 37.676s > 40.764s 40.312s 40.232s 37.876s 37.640s 37.468s > 40.688s 40.284s 40.368s 37.972s 37.608s 37.460s > > So while alt2 is faster in most runs, the margin is not quite as big as in > the previous benchmark. (The benchmarks were done on a practically unloaded > system so it shouldn't vary too much from run to run, but in practice, the > first few runs seem to be slightly faster than the later ones.) > > I.e. around 400 ms gain out of 40 s for alt1, and then another -50 - +150 ms > speedup on top of that for alt2. > > What do you think? At least it looks like the difference between alt1 and alt2 is quite similar on 32- and 64-bit. So we should use the same variant on both archs. I favor alternative 2. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)
On 2017-02-05 14:05:49 +0200, Martin Storsjö wrote: > On Sun, 5 Feb 2017, Janne Grunau wrote: > > >> // out1 = in1 + in2 > >> // out2 = in1 - in2 > >> .macro butterfly_8h out1, out2, in1, in2 > >>@@ -463,7 +510,7 @@ function idct16x16_dc_add_neon > >> ret > >> endfunc > >> > >>-function idct16 > >>+.macro idct16_full > >> dmbutterfly0v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 > >> = t0a, v24 = t1a > >> dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 > >> = t2a, v28 = t3a > >> dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 > >> = t4a, v30 = t7a > >>@@ -485,7 +532,10 @@ function idct16 > >> dmbutterfly0v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 > >>// v22 = t6a, v26 = t5a > >> dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 > >>// v23 = t9a, v25 = t14a > >> dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, > >> neg=1 // v27 = t13a, v21 = t10a > >>+idct16_end > > > >I think it would be clearer if idct16_end is used directly from the macro. > >it would probably also make sense to move idct16_end and avoid the > >idct16_full macro. The patch might be smaller and it is immediately > >obvious that there is no code change but the resulting code is more > >comlicated than it needs to be. same applies to arm if we go with > >alternative 1. > > Ok, so you mean like this? > > function idct16 > dmbutterfly... > > idct16_end > endfunc That would be one option; the other would be to move the idct_end instructions as a macro out of the existing idct16 function and use it as a macro. That would make the full idct structurally identical to the half and quarter versions and avoid a macro that is only used once. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] configure: Rework dependency handling for conflicting components
On 2017-02-06 17:22:06 +0100, Diego Biurrun wrote: > This makes the feature more visible and obvious. > --- > > Changed to use _conflict instead of _not as Janne suggested. > > configure | 22 +- > 1 file changed, 13 insertions(+), 9 deletions(-) ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] configure: Add name parameter to require_pkg_config() helper function
On 2017-02-06 18:08:00 +0100, Diego Biurrun wrote: > This allows distinguishing between the internal variable name for > external libraries and the pkg-config package name. Having both > names available avoids special-casing outside the helper function > when the two identifiers do not match. > --- > > Moved the shift according to Janne's suggestion. > > configure | 55 +-- > 1 file changed, 29 insertions(+), 26 deletions(-) ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 07/12] configure: Add name parameter to require_pkg_config() helper function
On 2017-01-24 18:12:47 +0100, Diego Biurrun wrote: > This allows distinguishing between the internal variable name for > external libraries and the pkg-config package name. Having both > names available avoids special-casing outside the helper function > when the two identifiers do not match. > --- > configure | 59 +++ > 1 file changed, 31 insertions(+), 28 deletions(-) > > diff --git a/configure b/configure > index 747ce17..d3fd489 100755 > --- a/configure > +++ b/configure > @@ -1134,11 +1134,14 @@ require_cpp_condition(){ > > require_pkg_config(){ > log require_pkg_config "$@" > -pkg_version="$1" > -pkg="${1%% *}" > +name="$1" shift here, and pkg and pkg_version do not need to be modified > +pkg_version="$2" > +pkg="${2%% *}" > +shift > +test "$name" = "" && name=$pkg > check_pkg_config "$@" || die "ERROR: $pkg_version not found" > add_cflags$(get_safe "${pkg}_cflags") > -add_extralibs $(get_safe "${pkg}_extralibs") > +add_extralibs $(get_safe "${name}_extralibs") > } > > hostcc_e(){ > @@ -4501,7 +4504,7 @@ case "$custom_allocator" in > require libjemalloc jemalloc/jemalloc.h malloc -ljemalloc > ;; > tcmalloc) > -require_pkg_config libtcmalloc gperftools/tcmalloc.h tc_malloc > +require_pkg_config "" libtcmalloc gperftools/tcmalloc.h tc_malloc > malloc_prefix=tc_ > ;; > esac > @@ -4635,41 +4638,41 @@ enabled avisynth && require_header > avisynth/avisynth_c.h > enabled avxsynth && require avxsynth "avxsynth/avxsynth_c.h > dlfcn.h" dlopen -ldl > enabled cuda && require cuda cuda.h cuInit -lcuda > enabled frei0r&& require_header frei0r.h > -enabled gnutls&& require_pkg_config gnutls gnutls/gnutls.h > gnutls_global_init > -enabled libbs2b && require_pkg_config libbs2b bs2b.h bs2b_open > -enabled libdc1394 && require_pkg_config libdc1394-2 dc1394/dc1394.h > dc1394_new > +enabled gnutls&& require_pkg_config "" gnutls gnutls/gnutls.h > gnutls_global_init > +enabled libbs2b && require_pkg_config "" libbs2b bs2b.h bs2b_open > +enabled libdc1394 && require_pkg_config libdc1394 
libdc1394-2 > dc1394/dc1394.h dc1394_new > enabled libdcadec && require libdcadec libdcadec/dca_context.h > dcadec_context_create -ldcadec > enabled libfaac && require libfaac "stdint.h faac.h" > faacEncGetVersion -lfaac > -enabled libfdk_aac&& require_pkg_config fdk-aac > "fdk-aac/aacenc_lib.h" aacEncOpen > -enabled libfontconfig && require_pkg_config fontconfig > "fontconfig/fontconfig.h" FcInit > -enabled libfreetype && require_pkg_config freetype2 "ft2build.h > FT_FREETYPE_H" FT_Init_FreeType > +enabled libfdk_aac&& require_pkg_config libfdk_aac fdk-aac > "fdk-aac/aacenc_lib.h" aacEncOpen > +enabled libfontconfig && require_pkg_config libfontconfig fontconfig > "fontconfig/fontconfig.h" FcInit > +enabled libfreetype && require_pkg_config libfreetype freetype2 > "ft2build.h FT_FREETYPE_H" FT_Init_FreeType > enabled libgsm&& { for gsm_hdr in "gsm.h" "gsm/gsm.h"; do > check_lib libgsm "${gsm_hdr}" gsm_create > -lgsm && break; > done || die "ERROR: libgsm not found"; } > -enabled libhdcd && require_pkg_config libhdcd "hdcd/hdcd_simple.h" > hdcd_new > +enabled libhdcd && require_pkg_config "" libhdcd > "hdcd/hdcd_simple.h" hdcd_new > enabled libilbc && require libilbc ilbc.h WebRtcIlbcfix_InitDecode > -lilbc > -enabled libkvazaar&& require_pkg_config "kvazaar >= 0.8.1" kvazaar.h > kvz_api_get > -enabled libmfx&& require_pkg_config libmfx "mfx/mfxvideo.h" > MFXInit > +enabled libkvazaar&& require_pkg_config libkvazaar "kvazaar >= > 0.8.1" kvazaar.h kvz_api_get > +enabled libmfx&& require_pkg_config "" libmfx "mfx/mfxvideo.h" > MFXInit > enabled libmp3lame&& require "libmp3lame >= 3.98.3" lame/lame.h > lame_set_VBR_quality -lmp3lame > enabled libnpp&& require libnpp npp.h nppGetLibVersion -lnppi > -lnppc > enabled libopencore_amrnb && require libopencore_amrnb > opencore-amrnb/interf_dec.h Decoder_Interface_init -lopencore-amrnb > enabled libopencore_amrwb && require libopencore_amrwb > opencore-amrwb/dec_if.h D_IF_init -lopencore-amrwb > -enabled libopencv && 
require_pkg_config opencv opencv/cv.h > cvCreateImageHeader > -enabled libopenh264 && require_pkg_config openh264 wels/codec_api.h > WelsGetCodecVersion > +enabled libopencv && require_pkg_config libopencv opencv opencv/cv.h > cvCreateImageHeader > +enabled libopenh264 && require_pkg_config libopenh264 openh264 > wels/codec_api.h WelsGetCodecVersion > enabled libopenjpeg && { check_lib libopenjpeg openjpeg.h opj_version > -lope
Re: [libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do separate functions for half/quarter idct16 and idct32 (alternative 2)
On 2016-12-01 11:27:02 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > This makes it easier to avoid filling the temp buffer with zeros for the > skipped slices, and leads to slightly more straightforward code for these > cases (for the 16x16 case, where the special case pass functions are > written out instead of templated from the same macro), instead of riddling > the common code with special case branches or macro .ifs. > > The code size increases from 14740 bytes to 24472 bytes. > > Before: > vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 > vp9_inv_dct_dct_16x16_sub2_add_neon:1051.0 > vp9_inv_dct_dct_16x16_sub4_add_neon:1051.0 > vp9_inv_dct_dct_16x16_sub8_add_neon:1051.0 > vp9_inv_dct_dct_16x16_sub12_add_neon: 1390.3 > vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.1 > vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5 > vp9_inv_dct_dct_32x32_sub2_add_neon:5199.1 > vp9_inv_dct_dct_32x32_sub4_add_neon:5199.9 > vp9_inv_dct_dct_32x32_sub8_add_neon:5196.9 > vp9_inv_dct_dct_32x32_sub12_add_neon: 6171.6 > vp9_inv_dct_dct_32x32_sub16_add_neon: 6170.9 > vp9_inv_dct_dct_32x32_sub20_add_neon: 7147.1 > vp9_inv_dct_dct_32x32_sub24_add_neon: 7147.0 > vp9_inv_dct_dct_32x32_sub28_add_neon: 8118.8 > vp9_inv_dct_dct_32x32_sub32_add_neon: 8125.8 > > After: > vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 > vp9_inv_dct_dct_16x16_sub2_add_neon: 639.0 > vp9_inv_dct_dct_16x16_sub4_add_neon: 639.0 > vp9_inv_dct_dct_16x16_sub8_add_neon: 845.0 > vp9_inv_dct_dct_16x16_sub12_add_neon: 1389.4 > vp9_inv_dct_dct_16x16_sub16_add_neon: 1389.3 > vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5 > vp9_inv_dct_dct_32x32_sub2_add_neon:3684.1 > vp9_inv_dct_dct_32x32_sub4_add_neon:3682.6 > vp9_inv_dct_dct_32x32_sub8_add_neon:3684.1 > vp9_inv_dct_dct_32x32_sub12_add_neon: 5319.0 > vp9_inv_dct_dct_32x32_sub16_add_neon: 5323.5 > vp9_inv_dct_dct_32x32_sub20_add_neon: 7149.8 > vp9_inv_dct_dct_32x32_sub24_add_neon: 7148.2 > vp9_inv_dct_dct_32x32_sub28_add_neon: 8124.5 > 
vp9_inv_dct_dct_32x32_sub32_add_neon: 8122.1 > > --- > If we wouldn't have made the core transforms standalone functions, > the code size would end up at around 34 KB. > > The binary output is 6 KB larger than in the other alternative, > but is more straightforward and gives better opportunities to > special case them further. > > In the arm version, there was a significant speedup compared to the > other alternative (having cmps within the functions), skipping > zeroing of the temp buffer. Here there's much less difference. And the relative binary size difference is even larger. It would be a little strange to choose different alternatives for 32- and 64-bit, but it sounds like alternative 1 might be better for arm64. Please run a full decoding benchmark for arm64 too. > --- > libavcodec/aarch64/vp9itxfm_neon.S | 628 > + > 1 file changed, 566 insertions(+), 62 deletions(-) > > diff --git a/libavcodec/aarch64/vp9itxfm_neon.S > b/libavcodec/aarch64/vp9itxfm_neon.S > index be9643e..9910170 100644 > --- a/libavcodec/aarch64/vp9itxfm_neon.S > +++ b/libavcodec/aarch64/vp9itxfm_neon.S > @@ -75,6 +75,16 @@ endconst > .endif > .endm > > +// Same as dmbutterfly0 above, but treating the input in in2 as zero, > +// writing the same output into both out1 and out2. 
> +.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, > tmp6 > +smull \tmp1\().4s, \in1\().4h, v0.h[0] > +smull2 \tmp2\().4s, \in1\().8h, v0.h[0] > +rshrn \out1\().4h, \tmp1\().4s, #14 > +rshrn2 \out1\().8h, \tmp2\().4s, #14 > +mov \out2\().16b, \out1\().16b > +.endm > + > // out1,out2 = in1 * coef1 - in2 * coef2 > // out3,out4 = in1 * coef2 + in2 * coef1 > // out are 4 x .4s registers, in are 2 x .8h registers > @@ -104,6 +114,43 @@ endconst > rshrn2 \inout2\().8h, \tmp4\().4s, #14 > .endm > > +// Same as dmbutterfly above, but treating the input in inout2 as zero > +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 > +smull \tmp1\().4s, \inout1\().4h, \coef1 > +smull2 \tmp2\().4s, \inout1\().8h, \coef1 > +smull \tmp3\().4s, \inout1\().4h, \coef2 > +smull2 \tmp4\().4s, \inout1\().8h, \coef2 > +rshrn \inout1\().4h, \tmp1\().4s, #14 > +rshrn2 \inout1\().8h, \tmp2\().4s, #14 > +rshrn \inout2\().4h, \tmp3\().4s, #14 > +rshrn2 \inout2\().8h, \tmp4\().4s, #14 > +.endm > + > +// Same as dmbutterfly above, but treating the input in inout1 as zero > +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 > +smull \tmp1\().4s, \inout2\().4h, \coef2 > +smull2 \tmp2\().4s, \inout2\().8h, \coef2 > +smull \tmp3\
Re: [libav-devel] [PATCH] arm: vp9itxfm: Avoid .irp when it doesn't save any lines
On 2017-02-04 23:37:37 +0200, Martin Storsjö wrote: > This makes it more readable. > --- > This was suggested by Janne in a review of a patch that added a > modified copy of this function; similar code already exists as well. > --- > libavcodec/arm/vp9itxfm_neon.S | 24 > 1 file changed, 12 insertions(+), 12 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 5abe435..49b993f 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -690,21 +690,21 @@ function \txfm\()16_1d_4x16_pass1_neon > @ for the first slice of the second pass (where it is the > @ last 4x4 block). > add r0, r0, #8 > -.irp i, 20, 24, 28 > -vst1.16 {d\i}, [r0,:64]! > -.endr > +vst1.16 {d20}, [r0,:64]! > +vst1.16 {d24}, [r0,:64]! > +vst1.16 {d28}, [r0,:64]! > add r0, r0, #8 > -.irp i, 21, 25, 29 > -vst1.16 {d\i}, [r0,:64]! > -.endr > +vst1.16 {d21}, [r0,:64]! > +vst1.16 {d25}, [r0,:64]! > +vst1.16 {d29}, [r0,:64]! > add r0, r0, #8 > -.irp i, 22, 26, 30 > -vst1.16 {d\i}, [r0,:64]! > -.endr > +vst1.16 {d22}, [r0,:64]! > +vst1.16 {d26}, [r0,:64]! > +vst1.16 {d30}, [r0,:64]! > add r0, r0, #8 > -.irp i, 23, 27, 31 > -vst1.16 {d\i}, [r0,:64]! > -.endr > +vst1.16 {d23}, [r0,:64]! > +vst1.16 {d27}, [r0,:64]! > +vst1.16 {d31}, [r0,:64]! > vmovd28, d16 > vmovd29, d17 > vmovd30, d18 ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)
On 2016-12-01 11:27:01 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > This increases the code size of libavcodec/aarch64/vp9itxfm_neon.o > from 14740 to 18504 bytes. > > Before: > vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 > vp9_inv_dct_dct_16x16_sub2_add_neon:1051.0 > vp9_inv_dct_dct_16x16_sub4_add_neon:1051.0 > vp9_inv_dct_dct_16x16_sub8_add_neon:1051.0 > vp9_inv_dct_dct_16x16_sub12_add_neon: 1390.3 > vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.1 > vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5 > vp9_inv_dct_dct_32x32_sub2_add_neon:5199.1 > vp9_inv_dct_dct_32x32_sub4_add_neon:5199.9 > vp9_inv_dct_dct_32x32_sub8_add_neon:5196.9 > vp9_inv_dct_dct_32x32_sub12_add_neon: 6171.6 > vp9_inv_dct_dct_32x32_sub16_add_neon: 6170.9 > vp9_inv_dct_dct_32x32_sub20_add_neon: 7147.1 > vp9_inv_dct_dct_32x32_sub24_add_neon: 7147.0 > vp9_inv_dct_dct_32x32_sub28_add_neon: 8118.8 > vp9_inv_dct_dct_32x32_sub32_add_neon: 8125.8 > > After: > vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 > vp9_inv_dct_dct_16x16_sub2_add_neon: 697.0 > vp9_inv_dct_dct_16x16_sub4_add_neon: 697.0 > vp9_inv_dct_dct_16x16_sub8_add_neon: 908.0 > vp9_inv_dct_dct_16x16_sub12_add_neon: 1399.6 > vp9_inv_dct_dct_16x16_sub16_add_neon: 1403.3 > vp9_inv_dct_dct_32x32_sub1_add_neon: 554.1 > vp9_inv_dct_dct_32x32_sub2_add_neon:3879.7 > vp9_inv_dct_dct_32x32_sub4_add_neon:3952.2 > vp9_inv_dct_dct_32x32_sub8_add_neon:3948.4 > vp9_inv_dct_dct_32x32_sub12_add_neon: 5462.1 > vp9_inv_dct_dct_32x32_sub16_add_neon: 5461.7 > vp9_inv_dct_dct_32x32_sub20_add_neon: 7169.2 > vp9_inv_dct_dct_32x32_sub24_add_neon: 7162.4 > vp9_inv_dct_dct_32x32_sub28_add_neon: 8137.4 > vp9_inv_dct_dct_32x32_sub32_add_neon: 8136.7 > > I.e. in general a very minor overhead for the full subpartition case due > to the additional cmps, but a significant speedup for the cases when we > only need to process a small part of the actual input data. 
> --- > If we wouldn't have made the core transforms standalone functions, > the code size would end up at around 28 KB. > --- > libavcodec/aarch64/vp9itxfm_neon.S | 367 > +++-- > 1 file changed, 347 insertions(+), 20 deletions(-) > > diff --git a/libavcodec/aarch64/vp9itxfm_neon.S > b/libavcodec/aarch64/vp9itxfm_neon.S > index be9643e..bb79348 100644 > --- a/libavcodec/aarch64/vp9itxfm_neon.S > +++ b/libavcodec/aarch64/vp9itxfm_neon.S > @@ -75,6 +75,16 @@ endconst > .endif > .endm > > +// Same as dmbutterfly0 above, but treating the input in in2 as zero, > +// writing the same output into both out1 and out2. > +.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, > tmp6 > +smull \tmp1\().4s, \in1\().4h, v0.h[0] > +smull2 \tmp2\().4s, \in1\().8h, v0.h[0] > +rshrn \out1\().4h, \tmp1\().4s, #14 > +rshrn2 \out1\().8h, \tmp2\().4s, #14 > +mov \out2\().16b, \out1\().16b > +.endm > + > // out1,out2 = in1 * coef1 - in2 * coef2 > // out3,out4 = in1 * coef2 + in2 * coef1 > // out are 4 x .4s registers, in are 2 x .8h registers > @@ -104,6 +114,43 @@ endconst > rshrn2 \inout2\().8h, \tmp4\().4s, #14 > .endm > > +// Same as dmbutterfly above, but treating the input in inout2 as zero > +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 > +smull \tmp1\().4s, \inout1\().4h, \coef1 > +smull2 \tmp2\().4s, \inout1\().8h, \coef1 > +smull \tmp3\().4s, \inout1\().4h, \coef2 > +smull2 \tmp4\().4s, \inout1\().8h, \coef2 > +rshrn \inout1\().4h, \tmp1\().4s, #14 > +rshrn2 \inout1\().8h, \tmp2\().4s, #14 > +rshrn \inout2\().4h, \tmp3\().4s, #14 > +rshrn2 \inout2\().8h, \tmp4\().4s, #14 > +.endm > + > +// Same as dmbutterfly above, but treating the input in inout1 as zero > +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 > +smull \tmp1\().4s, \inout2\().4h, \coef2 > +smull2 \tmp2\().4s, \inout2\().8h, \coef2 > +smull \tmp3\().4s, \inout2\().4h, \coef1 > +smull2 \tmp4\().4s, \inout2\().8h, \coef1 > +neg \tmp1\().4s, 
\tmp1\().4s > +neg \tmp2\().4s, \tmp2\().4s > +rshrn \inout2\().4h, \tmp3\().4s, #14 > +rshrn2 \inout2\().8h, \tmp4\().4s, #14 > +rshrn \inout1\().4h, \tmp1\().4s, #14 > +rshrn2 \inout1\().8h, \tmp2\().4s, #14 > +.endm > + > +.macro dsmull_h out1, out2, in, coef > +smull \out1\().4s, \in\().4h, \coef > +smull2 \out2\().4s, \in\().8h, \coef > +.endm > + > +.macro drshrn_h out, in1, in2, shift > +rshrn \out\().4h, \in1\().4s, \shift > +rshrn2
Re: [libav-devel] [PATCH 2/5] arm: vp9itxfm: Do separate functions for half/quarter idct16 and idct32 (alternative 2)
On 2017-02-05 00:34:16 +0200, Martin Storsjö wrote: > On Sat, 4 Feb 2017, Janne Grunau wrote: > > >I'm not really sure which variant I prefer. Is the speed difference > >measurable for idct heavy real world samples? If you have a preference for one > >or the other variant I trust your judgement. > > It's measurable, but it's not much. For one sample, I originally got a full > decode time like this (fastest time out of 2 runs) with the current master: > user 2m53.980s > Alternative 1: > user 2m53.448s > Alternative 2: > user 2m52.952s What is the approximate share of the idct in the whole decoding time? > So alternative 2 is better, but produces a couple KB bigger binaries, and > more duplicated code. (OTOH also allowing more exact special casing of minor > details.) > > I originally clearly preferred alt 2, but with your suggestions for alt 1, > the diff for that one ends up very small and neat. I think the numbers look pretty compelling for alternative 2. 1s vs. 0.5s overall decoding speedup. The difference is larger than I expected and imo justifies the code duplication and increased binary size. While the patch for alternative 1 looks small and nice that's not really an argument. The patch for alternative 2 would also look nicer if you did the macro move in a separate patch. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 4/5] aarch64: vp9itxfm: Restructure the idct32 store macros
On 2016-12-01 11:27:00 +0200, Martin Storsjö wrote: > This avoids concatenation, which can't be used if the whole macro > is wrapped within another macro. > --- > libavcodec/aarch64/vp9itxfm_neon.S | 80 > +++--- > 1 file changed, 40 insertions(+), 40 deletions(-) ok, I think it's also more readable. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/5] aarch64: vp9itxfm: Make the larger core transforms standalone functions
On 2016-12-01 11:26:59 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > This reduces the code size of libavcodec/aarch64/vp9itxfm_neon.o from > 19496 to 14740 bytes. > > This gives a small slowdown of a couple of tens of cycles, but makes > it more feasible to add more optimized versions of these transforms. > > Before: > vp9_inv_dct_dct_16x16_sub4_add_neon:1036.7 > vp9_inv_dct_dct_16x16_sub16_add_neon: 1372.2 > vp9_inv_dct_dct_32x32_sub4_add_neon:5180.0 > vp9_inv_dct_dct_32x32_sub32_add_neon: 8095.7 > > After: > vp9_inv_dct_dct_16x16_sub4_add_neon:1051.0 > vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.1 > vp9_inv_dct_dct_32x32_sub4_add_neon:5199.9 > vp9_inv_dct_dct_32x32_sub32_add_neon: 8125.8 > --- > libavcodec/aarch64/vp9itxfm_neon.S | 42 > +++--- > 1 file changed, 25 insertions(+), 17 deletions(-) ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/5] arm: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)
On 2017-02-03 23:44:51 +0200, Martin Storsjö wrote: > On Fri, 3 Feb 2017, Janne Grunau wrote: > > >On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote: > >>This work is sponsored by, and copyright, Google. > >> > > >>@@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon > >> > >> mov r12, #32 > >> vmov.s16q2, #0 > >>+ > >>+.ifc \txfm,idct > >>+cmp r3, #10 > >>+ble 3f > >>+cmp r3, #38 > >>+ble 4f > >>+.endif > > > >I'd test only for less than or equal to 38 here > > > >>+ > >> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > >> vld1.16 {d\i}, [r2,:64] > >> vst1.16 {d4}, [r2,:64], r12 > >> .endr > >> > >> bl \txfm\()16 > >>+.ifc \txfm,idct > >>+b 5f > > > >cmp r3, #10 > > > >>+ > >>+3: > >>+.irp i, 16, 17, 18, 19 > >>+vld1.16 {d\i}, [r2,:64] > >>+vst1.16 {d4}, [r2,:64], r12 > >>+.endr > >>+bl idct16_quarter > >>+b 5f > > > >remove this > > > >>+ > >>+4: > >>+.irp i, 16, 17, 18, 19, 20, 21, 22, 23 > >>+vld1.16 {d\i}, [r2,:64] > >>+vst1.16 {d4}, [r2,:64], r12 > > > >.if \i == 19 > >blle idct16_half > >ble 5f > >.endif > > > >saves a little binary space, not sure if it's worth it. > > Hmm, that looks pretty neat. > > I folded in this change into the aarch64 version (and the rshrn instead of > mov) as well, using a b.gt instead of conditional bl, like this: > > .if \i == 19 > b.gt4f > bl idct16_quarter > b 5f > 4: > .endif > > In principle I guess one could interleave the same in the full loop as well, > having only one loop, with special case checks for i == 19 and i == 23. Then > we'd end up with two comparisons instead of one when doing the full case - > not sure if it's preferable or not. I doubt the comparisons are noticeable, so folding it into the main loop should be fine. > The main question though is whether you prefer this or alternative 2. See my other mail. I have no strong opinion. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/5] arm: vp9itxfm: Do separate functions for half/quarter idct16 and idct32 (alternative 2)
On 2016-12-01 11:26:58 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > This makes it easier to avoid filling the temp buffer with zeros for the > skipped slices, and leads to slightly more straightforward code for these > cases (for the 16x16 case, where the special case pass functions are written > out instead of templated from the same macro), instead of riddling the common > code with special case branches or macro .ifs. > > The code size increases from 12388 bytes to 19932 bytes. > > Before: Cortex A7 A8 A9 A53 > vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0189.7211.9235.8 > vp9_inv_dct_dct_16x16_sub2_add_neon:2056.7 1521.2 1734.8 1262.0 > vp9_inv_dct_dct_16x16_sub4_add_neon:2060.8 1608.5 1735.7 1262.0 > vp9_inv_dct_dct_16x16_sub8_add_neon:2444.9 1801.6 2007.8 1508.5 > vp9_inv_dct_dct_16x16_sub12_add_neon: 2902.1 2116.7 2285.1 1751.7 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 > vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0456.7866.0553.9 > vp9_inv_dct_dct_32x32_sub2_add_neon: 11042.7 8127.5 8582.7 6822.8 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 > vp9_inv_dct_dct_32x32_sub8_add_neon: 11908.0 9281.8 9381.9 7562.4 > vp9_inv_dct_dct_32x32_sub12_add_neon: 13015.2 10791.1 10220.3 8318.9 > vp9_inv_dct_dct_32x32_sub16_add_neon: 14150.3 11886.2 11032.6 9064.8 > vp9_inv_dct_dct_32x32_sub20_add_neon: 15165.7 12993.8 11847.0 9816.7 > vp9_inv_dct_dct_32x32_sub24_add_neon: 16280.8 15111.2 12658.6 10576.8 > vp9_inv_dct_dct_32x32_sub28_add_neon: 17412.6 15549.4 13462.7 11325.6 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 > > After: > vp9_inv_dct_dct_16x16_sub1_add_neon: 274.4189.5211.7235.8 > vp9_inv_dct_dct_16x16_sub2_add_neon:1214.2962.0 1034.4764.0 > vp9_inv_dct_dct_16x16_sub4_add_neon:1214.5911.0 1034.7763.9 > vp9_inv_dct_dct_16x16_sub8_add_neon:2000.6 1601.9 1729.0 1286.4 > vp9_inv_dct_dct_16x16_sub12_add_neon: 2854.3 2122.2 2292.9 1757.6 > 
vp9_inv_dct_dct_16x16_sub16_add_neon: 3231.1 2477.9 2544.6 2005.7 > vp9_inv_dct_dct_32x32_sub1_add_neon: 756.1460.3865.3553.9 > vp9_inv_dct_dct_32x32_sub2_add_neon:7603.7 5469.8 6046.2 4922.6 > vp9_inv_dct_dct_32x32_sub4_add_neon:7586.9 5740.2 6061.5 4921.5 > vp9_inv_dct_dct_32x32_sub8_add_neon:8380.7 6554.4 6600.4 5476.3 > vp9_inv_dct_dct_32x32_sub12_add_neon: 11005.8 9856.2 9242.4 7462.3 > vp9_inv_dct_dct_32x32_sub16_add_neon: 11959.7 10698.5 9998.0 8134.5 > vp9_inv_dct_dct_32x32_sub20_add_neon: 15250.8 13175.6 11854.4 9825.7 > vp9_inv_dct_dct_32x32_sub24_add_neon: 16382.3 14501.4 12671.7 10579.5 > vp9_inv_dct_dct_32x32_sub28_add_neon: 17521.2 16403.8 13486.3 11331.2 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18630.8 17398.7 14383.2 12089.4 > > --- > If we wouldn't have made the core transforms standalone functions, > the code size would end up at around 26 KB. > > The binary output is 4 KB larger than in the other alternative, > but is more straightforward and gives better opportunities to > special case them, and is a couple hundred cycles faster for the > small subpartitions. > --- > libavcodec/arm/vp9itxfm_neon.S | 645 > ++--- > 1 file changed, 601 insertions(+), 44 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 22e63e5..4bba4b9 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -74,6 +74,14 @@ endconst > vrshrn.s32 \out2, \tmpq4, #14 > .endm > > +@ Same as mbutterfly0 above, but treating the input in in2 as zero, > +@ writing the same output into both out1 and out2. 
> +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 > +vmull.s16 \tmpq3, \in1, d0[0] > +vrshrn.s32 \out1, \tmpq3, #14 > +vmov\out2, \out1 > +.endm Same as the other variant > + > @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 > @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 > @ Same as mbutterfly0, but with input being 2 q registers, output > @@ -137,6 +145,23 @@ endconst > vrshrn.s32 \inout2, \tmp2, #14 > .endm > > +@ Same as mbutterfly above, but treating the input in inout2 as zero > +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2 > +vmull.s16 \tmp1, \inout1, \coef1 > +vmull.s16 \tmp2, \inout1, \coef2 > +vrshrn.s32 \inout1, \tmp1, #14 > +vrshrn.s32 \inout2, \tmp2, #14 > +.endm > + > +@ Same as mbutterfly above, but treating the input in inout1 as zero > +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2 > +vmull.s16 \tmp1,
Re: [libav-devel] [PATCH 2/5] arm: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)
On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > This increases the code size of libavcodec/arm/vp9itxfm_neon.o > from 12388 to 15064 bytes. > > Before: Cortex A7 A8 A9 A53 > vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0189.7211.9235.8 > vp9_inv_dct_dct_16x16_sub2_add_neon:2056.7 1521.2 1734.8 1262.0 > vp9_inv_dct_dct_16x16_sub4_add_neon:2060.8 1608.5 1735.7 1262.0 > vp9_inv_dct_dct_16x16_sub8_add_neon:2444.9 1801.6 2007.8 1508.5 > vp9_inv_dct_dct_16x16_sub12_add_neon: 2902.1 2116.7 2285.1 1751.7 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 > vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0456.7866.0553.9 > vp9_inv_dct_dct_32x32_sub2_add_neon: 11042.7 8127.5 8582.7 6822.8 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 > vp9_inv_dct_dct_32x32_sub8_add_neon: 11908.0 9281.8 9381.9 7562.4 > vp9_inv_dct_dct_32x32_sub12_add_neon: 13015.2 10791.1 10220.3 8318.9 > vp9_inv_dct_dct_32x32_sub16_add_neon: 14150.3 11886.2 11032.6 9064.8 > vp9_inv_dct_dct_32x32_sub20_add_neon: 15165.7 12993.8 11847.0 9816.7 > vp9_inv_dct_dct_32x32_sub24_add_neon: 16280.8 15111.2 12658.6 10576.8 > vp9_inv_dct_dct_32x32_sub28_add_neon: 17412.6 15549.4 13462.7 11325.6 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 > > After: > vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0189.5211.5236.1 > vp9_inv_dct_dct_16x16_sub2_add_neon:1448.2994.0 1191.3836.0 > vp9_inv_dct_dct_16x16_sub4_add_neon:1437.0991.0 1191.6836.0 > vp9_inv_dct_dct_16x16_sub8_add_neon:2114.5 1757.9 1855.3 1335.3 > vp9_inv_dct_dct_16x16_sub12_add_neon: 2862.7 2141.5 2293.3 1772.7 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3299.6 2419.1 2552.7 2033.0 > vp9_inv_dct_dct_32x32_sub1_add_neon: 753.0457.5864.3554.8 > vp9_inv_dct_dct_32x32_sub2_add_neon:7867.8 5978.6 6594.6 5109.9 > vp9_inv_dct_dct_32x32_sub4_add_neon:7871.0 5772.5 6582.2 5108.5 > vp9_inv_dct_dct_32x32_sub8_add_neon:8694.8 6925.7 7125.7 5671.4 > 
vp9_inv_dct_dct_32x32_sub12_add_neon: 11250.3 9654.7 9557.6 7540.5 > vp9_inv_dct_dct_32x32_sub16_add_neon: 12129.5 11061.1 10295.0 8220.7 > vp9_inv_dct_dct_32x32_sub20_add_neon: 15218.4 13580.8 11841.3 9739.9 > vp9_inv_dct_dct_32x32_sub24_add_neon: 16343.5 15097.0 12629.2 10496.6 > vp9_inv_dct_dct_32x32_sub28_add_neon: 17482.2 15516.4 13476.0 11261.0 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18586.7 16817.5 14289.3 12019.0 > > --- > If we wouldn't have made the core transforms standalone functions > in the previous patch, the code size would increase to around 21 KB (which > isn't too bad), but the idct32 pass1/2 functions would bloat up so much > that they would require literal pools within the functions themselves. > --- > libavcodec/arm/vp9itxfm_neon.S | 351 > ++--- > 1 file changed, 331 insertions(+), 20 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 22e63e5..bd3f678 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -74,6 +74,14 @@ endconst > vrshrn.s32 \out2, \tmpq4, #14 > .endm > > +@ Same as mbutterfly0 above, but treating the input in in2 as zero, > +@ writing the same output into both out1 and out2. 
> +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 > +vmull.s16 \tmpq3, \in1, d0[0] > +vrshrn.s32 \out1, \tmpq3, #14 > +vmov\out2, \out1 If you haven't already tried it: doing the vrshrn twice could be faster since it has fewer dependencies > +.endm > + > @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 > @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 > @ Same as mbutterfly0, but with input being 2 q registers, output > @@ -137,6 +145,23 @@ endconst > vrshrn.s32 \inout2, \tmp2, #14 > .endm > > +@ Same as mbutterfly above, but treating the input in inout2 as zero > +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2 > +vmull.s16 \tmp1, \inout1, \coef1 > +vmull.s16 \tmp2, \inout1, \coef2 > +vrshrn.s32 \inout1, \tmp1, #14 > +vrshrn.s32 \inout2, \tmp2, #14 > +.endm > + > +@ Same as mbutterfly above, but treating the input in inout1 as zero > +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2 > +vmull.s16 \tmp1, \inout2, \coef2 > +vmull.s16 \tmp2, \inout2, \coef1 > +vneg.s32\tmp1, \tmp1 > +vrshrn.s32 \inout2, \tmp2, #14 > +vrshrn.s32 \inout1, \tmp1, #14 > +.endm > + > @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << > 
Re: [libav-devel] [PATCH 1/5] arm: vp9itxfm: Make the larger core transforms standalone functions
On 2016-12-01 11:26:56 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from > 15324 to 12388 bytes. > > This gives a small slowdown of a couple tens of cycles, up to around > 150 cycles for the full case of the largest transform, but makes > it more feasible to add more optimized versions of these transforms. > > Before: Cortex A7 A8 A9 A53 > vp9_inv_dct_dct_16x16_sub4_add_neon:2063.4 1516.0 1719.5 1245.1 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3279.3 2454.5 2525.2 1982.3 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10750.0 7955.4 8525.6 6754.2 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18574.0 17108.4 14216.7 12010.2 > > After: > vp9_inv_dct_dct_16x16_sub4_add_neon:2060.8 1608.5 1735.7 1262.0 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 > --- > libavcodec/arm/vp9itxfm_neon.S | 43 > +- > 1 file changed, 26 insertions(+), 17 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 5abe435..22e63e5 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -534,7 +534,7 @@ function idct16x16_dc_add_neon > endfunc > .ltorg > > -.macro idct16 > +function idct16 > mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, > d24 = t1a > mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = > t3a > mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = > t7a > @@ -580,9 +580,10 @@ endfunc > vmovd4, d21 @ d4 = t10a > butterfly d20, d27, d6, d27 @ d20 = out[4], d27 > = out[11] > butterfly d21, d26, d26, d4@ d21 = out[5], d26 > = out[10] > -.endm > +bx lr > +endfunc > > -.macro iadst16 > +function iadst16 > movrel r12, iadst16_coeffs > vld1.16 {q0-q1}, [r12,:128] > > @@ -653,7 +654,8 @@ endfunc > > vmovd16, d2 > vmovd30, d4 > -.endm > +bx 
lr > +endfunc > > .macro itxfm16_1d_funcs txfm > @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > @@ -662,6 +664,8 @@ endfunc > @ r1 = slice offset > @ r2 = src > function \txfm\()16_1d_4x16_pass1_neon > +push{lr} > + > mov r12, #32 > vmov.s16q2, #0 > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > @@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon > vst1.16 {d4}, [r2,:64], r12 > .endr > > -\txfm\()16 > +bl \txfm\()16 > > @ Do four 4x4 transposes. Originally, d16-d31 contain the > @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 > @@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon > .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 > vst1.16 {d\i}, [r0,:64]! > .endr > -bx lr > +pop {pc} > 1: > @ Special case: For the last input column (r1 == 12), > @ which would be stored as the last row in the temp buffer, > @@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon > vmovd29, d17 > vmovd30, d18 > vmovd31, d19 > -bx lr > +pop {pc} > endfunc > > @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > @@ -719,6 +723,7 @@ endfunc > @ r2 = src (temp buffer) > @ r3 = slice offset > function \txfm\()16_1d_4x16_pass2_neon > +push{lr} > mov r12, #32 > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 > vld1.16 {d\i}, [r2,:64], r12 > @@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon > > add r3, r0, r1 > lsl r1, r1, #1 > -\txfm\()16 > +bl \txfm\()16 > > .macro load_add_store coef0, coef1, coef2, coef3 > vrshr.s16 \coef0, \coef0, #6 > @@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon > load_add_store q12, q13, q14, q15 > .purgem load_add_store > > -bx lr > +pop {pc} > endfunc > .endm > > @@ -908,7 +913,7 @@ function idct32x32_dc_add_neon > bx lr > endfunc > > -.macro idct32_odd > +function idct32_odd > movrel r12, idct_coeffs > add r12, r12, #32 > vld1.16 {q0-q1}, [r12,:128
Re: [libav-devel] [PATCH] build: Move build-system-related helper files to a separate subdirectory
On 2016-12-22 13:07:14 +0100, Diego Biurrun wrote: > This unclutters the top-level directory and groups related files together. > --- > > Now with "avbuild" as directory to store files in instead of "build". > > .gitignore | 3 ++- > Makefile | 19 ++- > avbuild/.gitignore | 4 > arch.mak => avbuild/arch.mak | 0 > common.mak => avbuild/common.mak | 2 +- > library.mak => avbuild/library.mak | 2 +- > version.sh => avbuild/version.sh | 0 > configure | 23 --- > tests/fate.sh | 4 ++-- > 9 files changed, 32 insertions(+), 25 deletions(-) > create mode 100644 avbuild/.gitignore > rename arch.mak => avbuild/arch.mak (100%) > rename common.mak => avbuild/common.mak (98%) > rename library.mak => avbuild/library.mak (98%) > rename version.sh => avbuild/version.sh (100%) ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264dec: make sure to only end a field if it has been started
On 2016-12-18 11:36:30 +0100, Anton Khirnov wrote: > Calling ff_h264_field_end() when the per-field state is not properly > initialized leads to all kinds of undefined behaviour. > > CC: libav-sta...@libav.org > Bug-Id: 977 978 992 > --- > libavcodec/h264_picture.c | 1 + > libavcodec/h264_slice.c | 4 ++-- > libavcodec/h264dec.c | 3 ++- > libavcodec/h264dec.h | 5 + > 4 files changed, 10 insertions(+), 3 deletions(-) > > diff --git a/libavcodec/h264_picture.c b/libavcodec/h264_picture.c > index e22852a..24ba79d 100644 > --- a/libavcodec/h264_picture.c > +++ b/libavcodec/h264_picture.c > @@ -194,6 +194,7 @@ int ff_h264_field_end(H264Context *h, H264SliceContext > *sl, int in_setup) > emms_c(); > > h->current_slice = 0; > +h->field_started = 0; > > return err; > } > diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c > index 1b91088..db7628c 100644 > --- a/libavcodec/h264_slice.c > +++ b/libavcodec/h264_slice.c > @@ -1884,9 +1884,8 @@ int ff_h264_queue_decode_slice(H264Context *h, const > H2645NAL *nal) > sl = h->slice_ctx; > } > > -if (h->current_slice && h->cur_pic_ptr && FIELD_PICTURE(h)) { > +if (h->field_started) > ff_h264_field_end(h, sl, 1); > -} > > h->current_slice = 0; > if (!h->first_field) { > @@ -1902,6 +1901,7 @@ int ff_h264_queue_decode_slice(H264Context *h, const > H2645NAL *nal) > ret = h264_field_start(h, sl, nal); > if (ret < 0) > return ret; > +h->field_started = 1; > } > } > > diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c > index 330a74d..62bb036 100644 > --- a/libavcodec/h264dec.c > +++ b/libavcodec/h264dec.c > @@ -757,7 +757,8 @@ out: > > if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS) || > (h->mb_y >= h->mb_height && h->mb_height)) { > -ff_h264_field_end(h, &h->slice_ctx[0], 0); > +if (h->field_started) > +ff_h264_field_end(h, &h->slice_ctx[0], 0); > > *got_frame = 0; > if (h->output_frame->buf[0]) { > diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h > index f934fc4..2ffe4de 100644 > --- a/libavcodec/h264dec.h > +++ 
b/libavcodec/h264dec.h > @@ -509,6 +509,11 @@ typedef struct H264Context { > * slices) anymore */ > int setup_finished; > > +/* This is set to 1 if h264_field_start() has been called successfully, > + * so all per-field state is properly initialized and we can decode > + * the slice data */ > +int field_started; > + > AVFrame *output_frame; > > int enable_er; looks ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] pthread_frame: do not run hwaccel decoding asynchronously unless it's safe
On 2016-12-14 09:56:40 +0100, Anton Khirnov wrote: > Certain hardware decoding APIs are not guaranteed to be thread-safe, so > having the user access decoded hardware surfaces while the decoder is > running in another thread can cause failures (this is mainly known to > happen with DXVA2). > > For such hwaccels, only allow the decoding thread to run while the user > is inside a lavc decode call (avcodec_send_packet/receive_frame). > --- > libavcodec/avcodec.h | 5 + > libavcodec/hwaccel.h | 24 + > libavcodec/pthread_frame.c | 52 > -- > libavcodec/vaapi_h264.c| 2 ++ > libavcodec/vaapi_mpeg2.c | 2 ++ > libavcodec/vaapi_mpeg4.c | 3 +++ > libavcodec/vaapi_vc1.c | 3 +++ > libavcodec/vaapi_vp8.c | 2 ++ > libavcodec/vdpau_h264.c| 2 ++ > libavcodec/vdpau_hevc.c| 2 ++ > libavcodec/vdpau_mpeg12.c | 3 +++ > libavcodec/vdpau_mpeg4.c | 2 ++ > libavcodec/vdpau_vc1.c | 3 +++ > 13 files changed, 99 insertions(+), 6 deletions(-) > create mode 100644 libavcodec/hwaccel.h ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] pthread_frame: ensure the threads don't run simultaneously with hwaccel
On 2016-12-14 09:56:23 +0100, Anton Khirnov wrote: > --- > libavcodec/h263dec.c | 2 +- > libavcodec/h264dec.c | 2 +- > libavcodec/pthread_frame.c | 35 +++ > 3 files changed, 37 insertions(+), 2 deletions(-) > > diff --git a/libavcodec/h263dec.c b/libavcodec/h263dec.c > index e4a7227..921ff5f 100644 > --- a/libavcodec/h263dec.c > +++ b/libavcodec/h263dec.c > @@ -558,7 +558,7 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void > *data, int *got_frame, > if ((ret = ff_mpv_frame_start(s, avctx)) < 0) > return ret; > > -if (!s->divx_packed && !avctx->hwaccel) > +if (!s->divx_packed) > ff_thread_finish_setup(avctx); > > if (avctx->hwaccel) { > diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c > index 330a74d..83b3ab3 100644 > --- a/libavcodec/h264dec.c > +++ b/libavcodec/h264dec.c > @@ -573,7 +573,7 @@ static int decode_nal_units(H264Context *h, const uint8_t > *buf, int buf_size) > if ((err = ff_h264_queue_decode_slice(h, nal))) > break; > > -if (avctx->active_thread_type & FF_THREAD_FRAME && > !h->avctx->hwaccel && > +if (avctx->active_thread_type & FF_THREAD_FRAME && > i >= nals_needed && !h->setup_finished && h->cur_pic_ptr) { > ff_thread_finish_setup(avctx); > h->setup_finished = 1; > diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c > index 75f4ff4..364a998 100644 > --- a/libavcodec/pthread_frame.c > +++ b/libavcodec/pthread_frame.c > @@ -99,6 +99,8 @@ typedef struct PerThreadContext { > int requested_flags; ///< flags passed to get_buffer() for > requested_frame > > int die; ///< Set when the thread should exit. > + > +int hwaccel_serializing; > } PerThreadContext; > > /** > @@ -109,6 +111,11 @@ typedef struct FrameThreadContext { > PerThreadContext *prev_thread; ///< The last thread submit_packet() was > called on. > > pthread_mutex_t buffer_mutex; ///< Mutex used to protect > get/release_buffer(). > +/** > + * This lock is used for ensuring threads run in serial when hwaccel > + * is used. 
> + */ > +pthread_mutex_t hwaccel_mutex; > > int next_decoding; ///< The next context to submit a packet > to. > int next_finished; ///< The next context to return output > from. > @@ -149,6 +156,22 @@ static attribute_align_arg void > *frame_worker_thread(void *arg) > ff_thread_finish_setup(avctx); > > pthread_mutex_lock(&p->mutex); > + > +/* If a decoder supports hwaccel, then it must call ff_get_format(). > + * Since that call must happen before ff_thread_finish_setup(), the > + * decoder is required to implement update_thread_context() and call > + * ff_thread_finish_setup() manually. Therefore the above > + * ff_thread_finish_setup() call did not happen and > hwaccel_serializing > + * cannot be true here. */ > +av_assert0(!p->hwaccel_serializing); > + > +/* if the previous thread uses hwaccel then we take the lock to > ensure > + * the threads don't run concurrently */ > +if (avctx->hwaccel) { > +pthread_mutex_lock(&p->parent->hwaccel_mutex); > +p->hwaccel_serializing = 1; > +} > + > av_frame_unref(p->frame); > p->got_frame = 0; > p->result = codec->decode(avctx, p->frame, &p->got_frame, &p->avpkt); > @@ -163,6 +186,11 @@ static attribute_align_arg void > *frame_worker_thread(void *arg) > if (atomic_load(&p->state) == STATE_SETTING_UP) > ff_thread_finish_setup(avctx); > > +if (p->hwaccel_serializing) { > +p->hwaccel_serializing = 0; > +pthread_mutex_unlock(&p->parent->hwaccel_mutex); > +} > + > atomic_store(&p->state, STATE_INPUT_READY); > > pthread_mutex_lock(&p->progress_mutex); > @@ -499,6 +527,11 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { > > if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; > > +if (avctx->hwaccel && !p->hwaccel_serializing) { > +pthread_mutex_lock(&p->parent->hwaccel_mutex); > +p->hwaccel_serializing = 1; > +} > + > pthread_mutex_lock(&p->progress_mutex); > > atomic_store(&p->state, STATE_SETUP_FINISHED); > @@ -579,6 +612,7 @@ void ff_frame_thread_free(AVCodecContext *avctx, int > thread_count) > > 
av_freep(&fctx->threads); > pthread_mutex_destroy(&fctx->buffer_mutex); > +pthread_mutex_destroy(&fctx->hwaccel_mutex); > av_freep(&avctx->internal->thread_ctx); > } > > @@ -620,6 +654,7 @@ int ff_frame_thread_init(AVCodecContext *avctx) > } > > pthread_mutex_init(&fctx->buffer_mutex, NULL); > +pthread_mutex_init(&fctx->hwaccel_mutex, NULL); > fctx->delaying = 1; > > for (i = 0; i <
[libav-devel] [PATCH 1/1] arm64: replace 'bic' with immediate with 'and' with inverted immediate
The former is not an official pseudo instruction although gas and llvm's internal assembler support it. Fixes a build error with xcode 6.2 reported by Memphiz on github. --- libavcodec/aarch64/synth_filter_neon.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S index 9551bff8e3..b001c737da 100644 --- a/libavcodec/aarch64/synth_filter_neon.S +++ b/libavcodec/aarch64/synth_filter_neon.S @@ -50,7 +50,7 @@ function ff_synth_filter_float_neon, export=1 add x1, x1, x7, lsl #2 // synth_buf sub w8, w7, #32 stp x5, x1, [sp, #16] -bic x7, x7, #63 +and x7, x7, #~63 and w8, w8, #511 stp x7, x30, [sp, #32] str w8, [x2] -- 2.11.0 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/3] pthread_frame: do not run hwaccel decoding asynchronously unless it's safe
On 2016-12-03 17:34:34 +0100, Anton Khirnov wrote: > Certain hardware decoding APIs are often not thread-safe, so having the user > access decoded hardware surfaces while the decoder is running in another > thread can cause failures (this is mainly known to happen with DXVA2). > > For such hwaccels, only allow the decoding thread to run while the user > is inside a lavc decode call (avcodec_send_packet/receive_frame). > --- > libavcodec/avcodec.h | 5 + > libavcodec/hwaccel.h | 24 ++ > libavcodec/pthread_frame.c | 51 > -- > libavcodec/vaapi_h264.c| 2 ++ > libavcodec/vaapi_mpeg2.c | 2 ++ > libavcodec/vaapi_mpeg4.c | 3 +++ > libavcodec/vaapi_vc1.c | 3 +++ > libavcodec/vaapi_vp8.c | 2 ++ > 8 files changed, 86 insertions(+), 6 deletions(-) > create mode 100644 libavcodec/hwaccel.h > > diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h > index e75d300..96149ac 100644 > --- a/libavcodec/avcodec.h > +++ b/libavcodec/avcodec.h > @@ -3361,6 +3361,11 @@ typedef struct AVHWAccel { > * AVCodecInternal.hwaccel_priv_data. > */ > int priv_data_size; > + > +/** > + * Internal hwaccel capabilities. > + */ > +int caps_internal; > } AVHWAccel; > > /** > diff --git a/libavcodec/hwaccel.h b/libavcodec/hwaccel.h > new file mode 100644 > index 000..60dbe81 > --- /dev/null > +++ b/libavcodec/hwaccel.h > @@ -0,0 +1,24 @@ > +/* > + * This file is part of Libav. > + * > + * Libav is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * Libav is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with Libav; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#ifndef AVCODEC_HWACCEL_H > +#define AVCODEC_HWACCEL_H > + > +#define HWACCEL_CAP_ASYNC_SAFE (1 << 0) > + > +#endif /* AVCODEC_HWACCEL_H */ > diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c > index 9fdfb93..83f528e 100644 > --- a/libavcodec/pthread_frame.c > +++ b/libavcodec/pthread_frame.c > @@ -34,6 +34,7 @@ > #endif > > #include "avcodec.h" > +#include "hwaccel.h" > #include "internal.h" > #include "pthread_internal.h" > #include "thread.h" > @@ -101,6 +102,7 @@ typedef struct PerThreadContext { > int die; ///< Set when the thread should exit. > > int hwaccel_serializing; > +int async_serializing; > } PerThreadContext; > > /** > @@ -116,6 +118,7 @@ typedef struct FrameThreadContext { > * is used. > */ > pthread_mutex_t hwaccel_mutex; > +pthread_mutex_t async_mutex; > > int next_decoding; ///< The next context to submit a packet > to. > int next_finished; ///< The next context to return output > from. > @@ -178,6 +181,11 @@ static attribute_align_arg void > *frame_worker_thread(void *arg) > if (atomic_load(&p->state) == STATE_SETTING_UP) > ff_thread_finish_setup(avctx); > > +if (p->async_serializing) { > +p->async_serializing = 0; > +pthread_mutex_unlock(&p->parent->async_mutex); > +} > + > if (p->hwaccel_serializing) { > p->hwaccel_serializing = 0; > pthread_mutex_unlock(&p->parent->hwaccel_mutex); Is it a conscious decision to release the async_mutex before the hwaccel mutex? That should make it more likely that only 1 frame per decode call is decoded. If it is a conscious and/or tested decision it warrants a comment here. 
> @@ -406,7 +414,11 @@ int ff_thread_decode_frame(AVCodecContext *avctx, > FrameThreadContext *fctx = avctx->internal->thread_ctx; > int finished = fctx->next_finished; > PerThreadContext *p; > -int err; > +int err, ret; > + > +/* release the hwaccel lock, permitting blocked hwaccel threadsto s/hwaccel\( lock\)/async\1/ > + * go forward while we are in this function */ > +pthread_mutex_unlock(&fctx->async_mutex); > > /* > * Submit a packet to the next decoding thread. > @@ -414,9 +426,11 @@ int ff_thread_decode_frame(AVCodecContext *avctx, > > p = &fctx->threads[fctx->next_decoding]; > err = update_context_from_user(p->avctx, avctx); > -if (err) return err; > +if (err) > +goto finish; > err = submit_packet(p, avpkt); > -if (err) return err; > +if (err) > +goto finish; > > /* > * If we're sti
Re: [libav-devel] [PATCH 2/3] pthread_frame: ensure the threads don't run simultaneously with hwaccel
On 2016-12-03 17:34:33 +0100, Anton Khirnov wrote: > --- > libavcodec/h263dec.c | 2 +- > libavcodec/h264dec.c | 2 +- > libavcodec/pthread_frame.c | 27 +++ > 3 files changed, 29 insertions(+), 2 deletions(-) > > diff --git a/libavcodec/h263dec.c b/libavcodec/h263dec.c > index e4a7227..921ff5f 100644 > --- a/libavcodec/h263dec.c > +++ b/libavcodec/h263dec.c > @@ -558,7 +558,7 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void > *data, int *got_frame, > if ((ret = ff_mpv_frame_start(s, avctx)) < 0) > return ret; > > -if (!s->divx_packed && !avctx->hwaccel) > +if (!s->divx_packed) > ff_thread_finish_setup(avctx); > > if (avctx->hwaccel) { > diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c > index 330a74d..83b3ab3 100644 > --- a/libavcodec/h264dec.c > +++ b/libavcodec/h264dec.c > @@ -573,7 +573,7 @@ static int decode_nal_units(H264Context *h, const uint8_t > *buf, int buf_size) > if ((err = ff_h264_queue_decode_slice(h, nal))) > break; > > -if (avctx->active_thread_type & FF_THREAD_FRAME && > !h->avctx->hwaccel && > +if (avctx->active_thread_type & FF_THREAD_FRAME && > i >= nals_needed && !h->setup_finished && h->cur_pic_ptr) { > ff_thread_finish_setup(avctx); > h->setup_finished = 1; > diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c > index 2736a81..9fdfb93 100644 > --- a/libavcodec/pthread_frame.c > +++ b/libavcodec/pthread_frame.c > @@ -99,6 +99,8 @@ typedef struct PerThreadContext { > int requested_flags; ///< flags passed to get_buffer() for > requested_frame > > int die; ///< Set when the thread should exit. > + > +int hwaccel_serializing; > } PerThreadContext; > > /** > @@ -109,6 +111,11 @@ typedef struct FrameThreadContext { > PerThreadContext *prev_thread; ///< The last thread submit_packet() was > called on. > > pthread_mutex_t buffer_mutex; ///< Mutex used to protect > get/release_buffer(). > +/** > + * This lock is used for ensuring threads run in serial when hwaccel > + * is used. 
> + */ > +pthread_mutex_t hwaccel_mutex; > > int next_decoding; ///< The next context to submit a packet > to. > int next_finished; ///< The next context to return output > from. > @@ -149,6 +156,14 @@ static attribute_align_arg void > *frame_worker_thread(void *arg) > ff_thread_finish_setup(avctx); > > pthread_mutex_lock(&p->mutex); > + > +/* if the previous thread uses hwaccel then we take the lock to > ensure > + * the threads don't run concurrently */ > +if (avctx->hwaccel) { please either add a comment why hwaccel needs an update_thread_context function pointer or add '!p->hwaccel_serializing' to the if. See the ff_thread_finish_setup() call at the beginning of the hunk > +pthread_mutex_lock(&p->parent->hwaccel_mutex); > +p->hwaccel_serializing = 1; > +} > + > av_frame_unref(p->frame); > p->got_frame = 0; > p->result = codec->decode(avctx, p->frame, &p->got_frame, &p->avpkt); > @@ -163,6 +178,11 @@ static attribute_align_arg void > *frame_worker_thread(void *arg) > if (atomic_load(&p->state) == STATE_SETTING_UP) > ff_thread_finish_setup(avctx); > > +if (p->hwaccel_serializing) { > +p->hwaccel_serializing = 0; > +pthread_mutex_unlock(&p->parent->hwaccel_mutex); > +} > + > atomic_store(&p->state, STATE_INPUT_READY); > > pthread_mutex_lock(&p->progress_mutex); > @@ -499,6 +519,11 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { > > if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; > > +if (avctx->hwaccel && !p->hwaccel_serializing) { > +pthread_mutex_lock(&p->parent->hwaccel_mutex); > +p->hwaccel_serializing = 1; > +} > + > pthread_mutex_lock(&p->progress_mutex); > > atomic_store(&p->state, STATE_SETUP_FINISHED); > @@ -579,6 +604,7 @@ void ff_frame_thread_free(AVCodecContext *avctx, int > thread_count) > > av_freep(&fctx->threads); > pthread_mutex_destroy(&fctx->buffer_mutex); > +pthread_mutex_destroy(&fctx->hwaccel_mutex); > av_freep(&avctx->internal->thread_ctx); > } > > @@ -620,6 +646,7 @@ int ff_frame_thread_init(AVCodecContext *avctx) > } > > 
pthread_mutex_init(&fctx->buffer_mutex, NULL); > +pthread_mutex_init(&fctx->hwaccel_mutex, NULL); > fctx->delaying = 1; > > for (i = 0; i < thread_count; i++) { otherwise ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/3] hevc: decouple calling get_format() from exporting the SPS parameters
On 2016-12-03 17:34:32 +0100, Anton Khirnov wrote: > This makes sure ff_get_format() does not get called unnecessarily from > update_thread_context(). > --- > libavcodec/hevcdec.c | 49 ++--- > 1 file changed, 30 insertions(+), 19 deletions(-) > > diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c > index 9dd86c2..27fd683 100644 > --- a/libavcodec/hevcdec.c > +++ b/libavcodec/hevcdec.c > @@ -380,24 +380,10 @@ static void export_stream_params(AVCodecContext *avctx, > const HEVCParamSets *ps, >num, den, 1 << 30); > } > > -static int set_sps(HEVCContext *s, const HEVCSPS *sps) > +static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) > { > #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + > CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) > enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; > -int ret; > - > -pic_arrays_free(s); > -s->ps.sps = NULL; > -s->ps.vps = NULL; > - > -if (!sps) > -return 0; > - > -ret = pic_arrays_init(s, sps); > -if (ret < 0) > -goto fail; > - > -export_stream_params(s->avctx, &s->ps, sps); > > if (sps->pix_fmt == AV_PIX_FMT_YUV420P || sps->pix_fmt == > AV_PIX_FMT_YUVJ420P || > sps->pix_fmt == AV_PIX_FMT_YUV420P10) { > @@ -417,10 +403,28 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps) > *fmt++ = sps->pix_fmt; > *fmt = AV_PIX_FMT_NONE; > > -ret = ff_get_format(s->avctx, pix_fmts); > +return ff_get_format(s->avctx, pix_fmts); > +} > + > +static int set_sps(HEVCContext *s, const HEVCSPS *sps, > + enum AVPixelFormat pix_fmt) > +{ > +int ret; > + > +pic_arrays_free(s); > +s->ps.sps = NULL; > +s->ps.vps = NULL; > + > +if (!sps) > +return 0; > + > +ret = pic_arrays_init(s, sps); > if (ret < 0) > goto fail; > -s->avctx->pix_fmt = ret; > + > +export_stream_params(s->avctx, &s->ps, sps); > + > +s->avctx->pix_fmt = pix_fmt; > > ff_hevc_pred_init(&s->hpc, sps->bit_depth); > ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth); > @@ -475,10 +479,17 @@ static int hls_slice_header(HEVCContext *s) > s->ps.pps = 
(HEVCPPS*)s->ps.pps_list[sh->pps_id]->data; > > if (s->ps.sps != (HEVCSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { > +enum AVPixelFormat pix_fmt; > + > s->ps.sps = (HEVCSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; > > ff_hevc_clear_refs(s); > -ret = set_sps(s, s->ps.sps); > + > +pix_fmt = get_format(s, s->ps.sps); > +if (pix_fmt < 0) > +return pix_fmt; > + > +ret = set_sps(s, s->ps.sps, pix_fmt); > if (ret < 0) > return ret; > > @@ -2985,7 +2996,7 @@ static int hevc_update_thread_context(AVCodecContext > *dst, > } > > if (s->ps.sps != s0->ps.sps) > -ret = set_sps(s, s0->ps.sps); > +ret = set_sps(s, s0->ps.sps, src->pix_fmt); > > s->seq_decode = s0->seq_decode; > s->seq_output = s0->seq_output; ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/3] aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
On 2016-11-28 11:26:02 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > Previously all subpartitions except the eob=1 (DC) case ran with > the same runtime: > > vp9_inv_dct_dct_16x16_sub16_add_neon: 1373.2 > vp9_inv_dct_dct_32x32_sub32_add_neon: 8089.0 > > By skipping individual 8x16 or 8x32 pixel slices in the first pass, > we reduce the runtime of these functions like this: > > vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 > vp9_inv_dct_dct_16x16_sub2_add_neon:1043.7 > vp9_inv_dct_dct_16x16_sub4_add_neon:1045.3 > vp9_inv_dct_dct_16x16_sub8_add_neon:1043.7 > vp9_inv_dct_dct_16x16_sub12_add_neon: 1374.0 > vp9_inv_dct_dct_16x16_sub16_add_neon: 1368.7 > vp9_inv_dct_dct_32x32_sub1_add_neon: 555.6 > vp9_inv_dct_dct_32x32_sub2_add_neon:5180.0 > vp9_inv_dct_dct_32x32_sub4_add_neon:5175.1 > vp9_inv_dct_dct_32x32_sub8_add_neon:5186.6 > vp9_inv_dct_dct_32x32_sub12_add_neon: 6159.5 > vp9_inv_dct_dct_32x32_sub16_add_neon: 6162.7 > vp9_inv_dct_dct_32x32_sub20_add_neon: 7129.0 > vp9_inv_dct_dct_32x32_sub24_add_neon: 7133.1 > vp9_inv_dct_dct_32x32_sub28_add_neon: 8107.1 > vp9_inv_dct_dct_32x32_sub32_add_neon: 8105.6 > > I.e. in general a very minor overhead for the full subpartition case due > to the additional cmps, but a significant speedup for the cases when we > only need to process a small part of the actual input data. > --- > Updated based on Janne's review of the arm version. 
> --- > libavcodec/aarch64/vp9itxfm_neon.S | 60 > ++ > 1 file changed, 55 insertions(+), 5 deletions(-) > > diff --git a/libavcodec/aarch64/vp9itxfm_neon.S > b/libavcodec/aarch64/vp9itxfm_neon.S > index f4194a6..9d2ba11 100644 > --- a/libavcodec/aarch64/vp9itxfm_neon.S > +++ b/libavcodec/aarch64/vp9itxfm_neon.S > @@ -588,6 +588,9 @@ endfunc > .macro store i, dst, inc > st1 {v\i\().8h}, [\dst], \inc > .endm > +.macro movi_v i, size, imm > +moviv\i\()\size, \imm > +.endm > .macro load_clear i, src, inc > ld1 {v\i\().8h}, [\src] > st1 {v2.8h}, [\src], \inc > @@ -596,9 +599,8 @@ endfunc > // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, > // transpose into a horizontal 16x8 slice and store. > // x0 = dst (temp buffer) > -// x1 = unused > +// x1 = slice offset > // x2 = src > -// x3 = slice offset > // x9 = input stride > .macro itxfm16_1d_funcs txfm > function \txfm\()16_1d_8x16_pass1_neon > @@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon > transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 > > // Store the transposed 8x8 blocks horizontally. 
> -cmp x3, #8 > +cmp x1, #8 > b.eq1f > .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 > store \i, x0, #16 > .endr > ret > 1: > -// Special case: For the last input column (x3 == 8), > +// Special case: For the last input column (x1 == 8), > // which would be stored as the last row in the temp buffer, > // don't store the first 8x8 block, but keep it in registers > // for the first slice of the second pass (where it is the > @@ -751,13 +753,35 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, > export=1 > > .irp i, 0, 8 > add x0, sp, #(\i*32) > +.ifc \txfm1\()_\txfm2,idct_idct > +.if \i == 8 > +cmp w3, #38 > +b.le1f > +.endif > +.endif > +mov x1, #\i > add x2, x6, #(\i*2) > -mov x3, #\i > bl \txfm1\()16_1d_8x16_pass1_neon > .endr > .ifc \txfm1\()_\txfm2,iadst_idct > ld1 {v0.8h,v1.8h}, [x10] > .endif > + > +.ifc \txfm1\()_\txfm2,idct_idct > +b 3f > +1: > +// Set v24-v31 to zero, for the in-register passthrough of > +// coefficients to pass 2. Since we only do two slices, this can > +// only ever happen for the second slice. So we only need to store > +// zeros to the temp buffer for the second half of the buffer. > +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 > +add x0, x0, #16 > +movi_v \i, .16b, #0 > +store 24, x0, #16 > +.endr not really pretty, unfortunately I don't see much room for improvement. iirc we should have a gpr which holds #32. move the add out of the .irp and use w\that register as writeback > +3: > +.endif > + > .irp i, 0, 8 > add x0, x4, #(\i) > mov x1, x5 > @@ -1073,12 +1097,17 @@ function idct32_1d_8x32_pass2_neon > ret > endfunc > > +const min_eob_idct_idct_32, align=4 > +.short 0, 34, 135, 336 > +endconst > + > function ff_vp9_idct_idct_32x32_add_neon, export=1 > cmp
Re: [libav-devel] [PATCH 2/3] arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
On 2016-11-28 11:26:01 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > Previously all subpartitions except the eob=1 (DC) case ran with > the same runtime: > > vp9_inv_dct_dct_16x16_sub16_add_neon: 3188.1 2435.4 2499.0 1969.0 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18531.7 16582.3 14207.6 12000.3 > > By skipping individual 4x16 or 4x32 pixel slices in the first pass, > we reduce the runtime of these functions like this: > > vp9_inv_dct_dct_16x16_sub1_add_neon: 274.6189.5211.7235.8 > vp9_inv_dct_dct_16x16_sub2_add_neon:2064.0 1534.8 1719.4 1248.7 > vp9_inv_dct_dct_16x16_sub4_add_neon:2135.0 1477.2 1736.3 1249.5 > vp9_inv_dct_dct_16x16_sub8_add_neon:2446.7 1828.7 1993.6 1494.7 > vp9_inv_dct_dct_16x16_sub12_add_neon: 2832.4 2118.3 2266.5 1735.1 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.7 2475.3 2523.5 1983.1 > vp9_inv_dct_dct_32x32_sub1_add_neon: 756.2456.7862.0553.9 > vp9_inv_dct_dct_32x32_sub2_add_neon: 10682.2 8190.4 8539.2 6762.5 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10813.5 8014.9 8518.3 6762.8 > vp9_inv_dct_dct_32x32_sub8_add_neon: 11859.6 9313.0 9347.4 7514.5 > vp9_inv_dct_dct_32x32_sub12_add_neon: 12946.6 10752.4 10192.2 8280.2 > vp9_inv_dct_dct_32x32_sub16_add_neon: 14074.6 11946.5 11001.4 9008.6 > vp9_inv_dct_dct_32x32_sub20_add_neon: 15269.9 13662.7 11816.1 9762.6 > vp9_inv_dct_dct_32x32_sub24_add_neon: 16327.9 14940.1 12626.7 10516.0 > vp9_inv_dct_dct_32x32_sub28_add_neon: 17462.7 15776.1 13446.2 11264.7 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18575.5 17157.0 14249.3 12015.1 > > I.e. in general a very minor overhead for the full subpartition case due > to the additional loads and cmps, but a significant speedup for the cases > when we only need to process a small part of the actual input data. > > In common VP9 content in a few inspected clips, 70-90% of the non-dc-only > 16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left > 8x8 or 16x16 subpartitions respectively. 
> --- > Updated with Janne's suggestions. The weird speedup for > vp9_inv_dct_dct_16x16_sub16_add_neon on the Cortex A8 in the previous > iteration of the patch seems to be mostly within noise for that test; it > does still appear occasionally when testing. > --- > libavcodec/arm/vp9itxfm_neon.S | 75 > +- > tests/checkasm/vp9dsp.c| 6 ++-- > 2 files changed, 70 insertions(+), 11 deletions(-) ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/3] arm: vp9itxfm: Only reload the idct coeffs for the iadst_idct combination
On 2016-11-28 11:26:00 +0200, Martin Storsjö wrote: > This avoids reloading them if they haven't been clobbered, if the > first pass also was idct. > > This is similar to what was done in the aarch64 version. > --- > libavcodec/arm/vp9itxfm_neon.S | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 01944bd..2049241 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -814,7 +814,7 @@ A and r7, sp, #15 > mov r3, #\i > bl \txfm1\()16_1d_4x16_pass1_neon > .endr > -.ifc \txfm2,idct > +.ifc \txfm1\()_\txfm2,iadst_idct > movrel r12, idct_coeffs > vld1.16 {q0-q1}, [r12,:128] > .endif ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] vp9dsp: add DC only versions for idct/idct.
On 2016-11-29 14:55:41 +0200, Martin Storsjö wrote: > From: Clément Bœsch > > before: > > time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null - > real0m11.125s > user0m11.059s > sys 0m0.050s > > time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null - > real0m10.944s > user0m10.819s > sys 0m0.064s > > after: > > time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null - > real0m8.153s > user0m8.034s > sys 0m0.050s > > time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null - > real0m8.038s > user0m7.980s > sys 0m0.039s > --- > libavcodec/vp9dsp.c | 32 > 1 file changed, 24 insertions(+), 8 deletions(-) > > diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c > index 73006fa..ead2f88 100644 > --- a/libavcodec/vp9dsp.c > +++ b/libavcodec/vp9dsp.c > @@ -944,7 +944,7 @@ static av_cold void vp9dsp_intrapred_init(VP9DSPContext > *dsp) > #undef init_intra_pred > } > > -#define itxfm_wrapper(type_a, type_b, sz, bits) \ > +#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \ > static void \ > type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst, \ >ptrdiff_t stride, \ > @@ -953,6 +953,22 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## > _add_c(uint8_t *dst, \ > { \ > int i, j; \ > int16_t tmp[sz * sz], out[sz]; \ > +\ > +if (has_dconly && eob == 1) { \ > +const int t = (((block[0] * 11585 + (1 << 13)) >> 14) \ > + * 11585 + (1 << 13)) >> 14; \ > +block[0] = 0; \ > +for (i = 0; i < sz; i++) { \ > +for (j = 0; j < sz; j++)\ > +dst[j * stride] = av_clip_uint8(dst[j * stride] + \ > +(bits ? 
\ > + (t + (1 << (bits - 1))) >> > bits : \ > + t)); \ > +dst++; \ > +} \ > +return; \ > +} \ > +\ > for (i = 0; i < sz; i++)\ > type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0);\ > memset(block, 0, sz * sz * sizeof(*block)); \ > @@ -967,11 +983,11 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## > _add_c(uint8_t *dst, \ > } \ > } > > -#define itxfm_wrap(sz, bits) \ > -itxfm_wrapper(idct, idct, sz, bits) \ > -itxfm_wrapper(iadst, idct, sz, bits) \ > -itxfm_wrapper(idct, iadst, sz, bits) \ > -itxfm_wrapper(iadst, iadst, sz, bits) > +#define itxfm_wrap(sz, bits) \ > +itxfm_wrapper(idct, idct, sz, bits, 1) \ > +itxfm_wrapper(iadst, idct, sz, bits, 0) \ > +itxfm_wrapper(idct, iadst, sz, bits, 0) \ > +itxfm_wrapper(iadst, iadst, sz, bits, 0) > > #define IN(x) in[x * stride] > > @@ -1490,7 +1506,7 @@ static av_always_inline void idct32_1d(int16_t *out, > const int16_t *in, > out[31] = t0 - t31; > } > > -itxfm_wrapper(idct, idct, 32, 6) > +itxfm_wrapper(idct, idct, 32, 6, 1) > > static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in, >ptrdiff_t stride, int pass) > @@ -1523,7 +1539,7 @@ static av_always_inline void iwht4_1d(int16_t *out, > const int16_t *in, > out[3] = t3; > } > > -itxfm_wrapper(iwht, iwht, 4, 0) > +itxfm_wrapper(iwht, iwht, 4, 0, 0) > > #undef IN > #undef itxfm_wrapper ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] pthread_frame: do not run hwaccel decoding asynchronously
On 2016-11-24 19:19:59 +0100, Anton Khirnov wrote: > Only allow the decoding thread to run while the user is inside a lavc > decode call (avcodec_send_packet/receive_frame). > Hardware decoding APIs are often not thread-safe, so having the user > access decoded hardware surfaces while the decoder is running in another > thread can cause failures (this is mainly known to happen with DXVA2). This looks a little extreme. How painful would it be to provide an option to share the mutex with the calling process? > --- > libavcodec/pthread_frame.c | 58 > +- > 1 file changed, 52 insertions(+), 6 deletions(-) > > diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c > index 2736a81..430854f 100644 > --- a/libavcodec/pthread_frame.c > +++ b/libavcodec/pthread_frame.c > @@ -99,6 +99,8 @@ typedef struct PerThreadContext { > int requested_flags; ///< flags passed to get_buffer() for > requested_frame > > int die; ///< Set when the thread should exit. > + > +int hwaccel_serializing; > } PerThreadContext; > > /** > @@ -110,6 +112,14 @@ typedef struct FrameThreadContext { > > pthread_mutex_t buffer_mutex; ///< Mutex used to protect > get/release_buffer(). > > +/** > + * This lock is used for making sure hwaccel decoding does not run > + * concurrently with the calling thread. It is held by the calling thread > + * most of the time and released only while inside > ff_thread_decode_frame(), > + * at which time the worker threads are allowed to progress past > finish_setup(). > + */ > +pthread_mutex_t hwaccel_mutex; > + > int next_decoding; ///< The next context to submit a packet > to. > int next_finished; ///< The next context to return output > from. 
> > @@ -163,6 +173,11 @@ static attribute_align_arg void > *frame_worker_thread(void *arg) > if (atomic_load(&p->state) == STATE_SETTING_UP) > ff_thread_finish_setup(avctx); > > +if (p->hwaccel_serializing) { > +pthread_mutex_unlock(&p->parent->hwaccel_mutex); > +p->hwaccel_serializing = 0; although protected by other mutexes it would make sense to protect hwaccel_serializing explicitly by hwaccel_mutex > +} > + > atomic_store(&p->state, STATE_INPUT_READY); > > pthread_mutex_lock(&p->progress_mutex); > @@ -386,7 +401,11 @@ int ff_thread_decode_frame(AVCodecContext *avctx, > FrameThreadContext *fctx = avctx->internal->thread_ctx; > int finished = fctx->next_finished; > PerThreadContext *p; > -int err; > +int err, ret; > + > +/* release the hwaccel lock, permitting blocked hwaccel threads to go > + * forward while we are inside this function */ > +pthread_mutex_unlock(&fctx->hwaccel_mutex); > > /* > * Submit a packet to the next decoding thread. > @@ -394,9 +413,11 @@ int ff_thread_decode_frame(AVCodecContext *avctx, > > p = &fctx->threads[fctx->next_decoding]; > err = update_context_from_user(p->avctx, avctx); > -if (err) return err; > +if (err) > +goto finish; > err = submit_packet(p, avpkt); > -if (err) return err; > +if (err) > +goto finish; > > /* > * If we're still receiving the initial packets, don't return a frame. > @@ -406,8 +427,10 @@ int ff_thread_decode_frame(AVCodecContext *avctx, > if (fctx->next_decoding >= (avctx->thread_count-1)) fctx->delaying = > 0; > > *got_picture_ptr=0; > -if (avpkt->size) > -return avpkt->size; > +if (avpkt->size) { > +ret = avpkt->size; > +goto finish; > +} > } > > /* > @@ -448,8 +471,14 @@ int ff_thread_decode_frame(AVCodecContext *avctx, > > fctx->next_finished = finished; > > +ret = (p->result >= 0) ? avpkt->size : p->result; > +finish: > +pthread_mutex_lock(&fctx->hwaccel_mutex); > +if (err < 0) > +return err; > + > /* return the size of the consumed packet if no error occurred */ > -return (p->result >= 0) ? 
avpkt->size : p->result; > +return ret; > } > > void ff_thread_report_progress(ThreadFrame *f, int n, int field) > @@ -505,6 +534,11 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { > > pthread_cond_broadcast(&p->progress_cond); > pthread_mutex_unlock(&p->progress_mutex); > + > +if (avctx->hwaccel) { > +pthread_mutex_lock(&p->parent->hwaccel_mutex); > +p->hwaccel_serializing = 1; > +} I would have expected the lock before the codec's decode call. Since we don't call finish_setup until after the frame is decoded, this can't guarantee serialized access to the hw decoding library. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 14/15] configure: Do not add newlines in filter()/filter_out() functions
On 2016-11-24 17:24:00 +0100, Diego Biurrun wrote: > --- > configure | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/configure b/configure > index a5295bf..27fb6ea 100755 > --- a/configure > +++ b/configure > @@ -430,7 +430,7 @@ filter(){ > pat=$1 > shift > for v; do > -eval "case $v in $pat) echo $v ;; esac" > +eval "case $v in $pat) printf '%s ' $v ;; esac" > done > } > > @@ -438,7 +438,7 @@ filter_out(){ > pat=$1 > shift > for v; do > -eval "case $v in $pat) ;; *) echo $v ;; esac" > +eval "case $v in $pat) ;; *) printf '%s ' $v ;; esac" > done > } ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 03/15] configure: Add missing asyncts filter, movie filter, and output example deps
On 2016-11-24 19:32:54 +0100, Diego Biurrun wrote: > On Thu, Nov 24, 2016 at 06:44:35PM +0100, Janne Grunau wrote: > > On 2016-11-24 17:23:49 +0100, Diego Biurrun wrote: > > > --- a/configure > > > +++ b/configure > > > @@ -2440,6 +2441,7 @@ frei0r_src_filter_extralibs='$ldl' > > > hqdn3d_filter_deps="gpl" > > > interlace_filter_deps="gpl" > > > +movie_filter_deps="avcodec avformat" > > > ocv_filter_deps="libopencv" > > > resample_filter_deps="avresample" > > > --- a/libavfilter/vsrc_movie.c > > > +++ b/libavfilter/vsrc_movie.c > > > @@ -35,7 +35,11 @@ > > > #include "libavutil/opt.h" > > > #include "libavutil/imgutils.h" > > > + > > > +#include "libavcodec/avcodec.h" > > > + > > > #include "libavformat/avformat.h" > > > + > > > #include "avfilter.h" > > > #include "formats.h" > > > > unrelated? At least a little surprising given the commit message > > Sort of related. I looked into the file and noticed that it was missing > the avcodec.h #include. Moving the addition of that #include to a separate > patch felt a bit silly. I can drop this hunk if you prefer... Just mention it in the commit message. Patch ok with that. Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 15/15] [RFC] configure: more fine-grained link-time dependency settings
On 2016-11-24 17:24:01 +0100, Diego Biurrun wrote: > --- > > This works as advertised. > > Issues: > > - Maybe keeping _extralibs as suffix is better than _lbs, dunno. > - Possibly I should investigate Janne's idea of using the function > name as variable name instead of adding a library name parameter > to things like check_lib(). > - The case statement in check_deps that adds the flags to the right > component is ugly. I don't have a better idea yet, but I was also > busy getting the rest of the patch to work at all. > Janne? > > configure | 305 > -- > 1 file changed, 237 insertions(+), 68 deletions(-) > > diff --git a/configure b/configure > index 27fb6ea..233bad4 100755 > --- a/configure > +++ b/configure > @@ -650,8 +650,31 @@ check_deps(){ > > for cfg in $allopts; do > enabled $cfg || continue > -eval dep_extralibs="\$${cfg}_extralibs" > -test -n "$dep_extralibs" && add_extralibs $dep_extralibs > +eval dep_lbs="\$${cfg}_lbs" > +for lib in $dep_lbs; do > +eval append dep_libs "\$${lib}" > +done > +if test -n "$dep_libs"; then > +case $cfg in > +*coder|*parser|*bsf|*hwaccel) > +add_extralibs_lib avcodec $dep_libs ;; > +*muxer|*protocol) > +add_extralibs_lib avformat $dep_libs ;; > +*filter) > +add_extralibs_lib avfilter $dep_libs ;; > +*indev|*outdev) > +add_extralibs_lib avdevice $dep_libs ;; > +avutil) > +add_extralibs_lib avutil $dep_libs ;; > +avconv) > +add_extralibs_lib avconv $dep_libs ;; > +avplay) > +add_extralibs_lib avplay $dep_libs ;; > +avprobe) > +add_extralibs_lib avprobe $dep_libs ;; > +esac > +unset dep_libs > +fi Since we already have nice separated lists for all this it's probably nicer to add the extralibs separately after check_deps() set_component_extralibs(){ linkunit=$1 shift 1 for cfg in $@; do enabled $cfg || continue eval dep_lbs="\$${cfg}_lbs" for lib in $dep_lbs; do eval append dep_libs "\$${lib}" done if test -n "$dep_libs"; then add_extralibs_lib $linkunit $dep_libs fi done } set_component_extralibs avcodec $BSF_LIST $DECODER_LIST 
... This is a proof of concept; see disable_components() for how to get all component lists for a given library, and integrate it into the loop below: for linkunit in $LIBRARY_LIST $PROGRAM_LIST; do set_component_extralibs $linkunit $linkunit done Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 04/15] configure: Use correct libm linker flag during math function checks
On 2016-11-24 17:23:50 +0100, Diego Biurrun wrote: > --- > > I suspect very many missing math functions were actually spurious test > failures related to this ... > > configure | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/configure b/configure > index 8fa2f46..39882e7 100755 > --- a/configure > +++ b/configure > @@ -4621,7 +4621,7 @@ ldexpf_args=2 > powf_args=2 > > for func in $MATH_FUNCS; do > -eval check_mathfunc $func \${${func}_args:-1} > +eval check_mathfunc $func \${${func}_args:-1} $LIBM > done > > # these are off by default, so fail if requested and not available ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 03/15] configure: Add missing asyncts filter, movie filter, and output example deps
On 2016-11-24 17:23:49 +0100, Diego Biurrun wrote: > --- > configure| 8 ++-- > libavfilter/vsrc_movie.c | 4 > 2 files changed, 10 insertions(+), 2 deletions(-) > > diff --git a/configure b/configure > index f204dc2..8fa2f46 100755 > --- a/configure > +++ b/configure > @@ -2426,6 +2426,7 @@ unix_protocol_deps="sys_un_h" > unix_protocol_select="network" > > # filters > +asyncts_filter_deps="avresample" > blackframe_filter_deps="gpl" > boxblur_filter_deps="gpl" > bs2b_filter_deps="libbs2b" > @@ -2440,6 +2441,7 @@ frei0r_src_filter_extralibs='$ldl' > hdcd_filter_deps="libhdcd" > hqdn3d_filter_deps="gpl" > interlace_filter_deps="gpl" > +movie_filter_deps="avcodec avformat" > ocv_filter_deps="libopencv" > resample_filter_deps="avresample" > scale_filter_deps="swscale" > @@ -2453,8 +2455,9 @@ encode_audio_example_deps="avcodec avutil" > encode_video_example_deps="avcodec avutil" > filter_audio_example_deps="avfilter avutil" > metadata_example_deps="avformat avutil" > -output_example_deps="avcodec avformat avutil swscale" > +output_example_deps="avcodec avformat avresample avutil swscale" > qsvdec_example_deps="avcodec avutil libmfx h264_qsv_decoder" > +qsvdec_example_deps="avcodec avutil libmfx h264_qsv_decoder vaapi_x11" > transcode_aac_example_deps="avcodec avformat avresample" > > # libraries, in linking order > @@ -5109,7 +5112,8 @@ enabled zlib && add_cppflags -DZLIB_CONST > > # conditional library dependencies, in linking order > enabled movie_filter&& prepend avfilter_deps "avformat avcodec" > -enabled resample_filter && prepend avfilter_deps "avresample" > +enabled_any asyncts_filter resample_filter && > + prepend avfilter_deps "avresample" > enabled scale_filter&& prepend avfilter_deps "swscale" > > enabled opus_decoder&& prepend avcodec_deps "avresample" ok > diff --git a/libavfilter/vsrc_movie.c b/libavfilter/vsrc_movie.c > index 5989a59..7fc9925 100644 > --- a/libavfilter/vsrc_movie.c > +++ b/libavfilter/vsrc_movie.c > @@ -35,7 +35,11 @@ > #include 
"libavutil/avstring.h" > #include "libavutil/opt.h" > #include "libavutil/imgutils.h" > + > +#include "libavcodec/avcodec.h" > + > #include "libavformat/avformat.h" > + > #include "avfilter.h" > #include "formats.h" > #include "internal.h" unrelated? At least a little surprising given the commit message Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 01/15] configure: Remove old avisynth support leftover
On 2016-11-24 17:23:47 +0100, Diego Biurrun wrote: > --- > configure | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/configure b/configure > index 42c1848..78f1cac 100755 > --- a/configure > +++ b/configure > @@ -3039,7 +3039,6 @@ msvc_common_flags(){ > -mthumb) ;; > -march=*) ;; > -lz) echo zlib.lib ;; > --lavifil32) echo vfw32.lib ;; > -lavicap32) echo vfw32.lib user32.lib ;; > -lx264) echo libx264.lib ;; > -l*) echo ${flag#-l}.lib ;; ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCHv2] aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it
On 2016-11-24 00:09:35 +0200, Martin Storsjö wrote: > --- > libavcodec/aarch64/vp9itxfm_neon.S | 26 +++--- > 1 file changed, 15 insertions(+), 11 deletions(-) > > diff --git a/libavcodec/aarch64/vp9itxfm_neon.S > b/libavcodec/aarch64/vp9itxfm_neon.S > index 2dc6b75..f4194a6 100644 > --- a/libavcodec/aarch64/vp9itxfm_neon.S > +++ b/libavcodec/aarch64/vp9itxfm_neon.S > @@ -599,9 +599,9 @@ endfunc > // x1 = unused > // x2 = src > // x3 = slice offset > +// x9 = input stride > .macro itxfm16_1d_funcs txfm > function \txfm\()16_1d_8x16_pass1_neon > -mov x9, #32 > moviv2.8h, #0 > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > load_clear \i, x2, x9 > @@ -649,8 +649,8 @@ endfunc > // x1 = dst stride > // x2 = src (temp buffer) > // x3 = slice offset > +// x9 = temp buffer stride > function \txfm\()16_1d_8x16_pass2_neon > -mov x9, #32 > .irp i, 16, 17, 18, 19, 20, 21, 22, 23 > load\i, x2, x9 > .endr > @@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, > export=1 > .ifc \txfm1,idct > ld1 {v0.8h,v1.8h}, [x10] > .endif > +mov x9, #32 > > .irp i, 0, 8 > add x0, sp, #(\i*32) > @@ -882,13 +883,12 @@ endfunc > // x0 = dst (temp buffer) > // x1 = unused > // x2 = src > +// x9 = double input stride > // x10 = idct_coeffs > // x11 = idct_coeffs + 32 > function idct32_1d_8x32_pass1_neon > ld1 {v0.8h,v1.8h}, [x10] > > -// Double stride of the input, since we only read every other line > -mov x9, #128 > moviv4.8h, #0 > > // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) > @@ -987,12 +987,13 @@ endfunc > // x0 = dst > // x1 = dst stride > // x2 = src (temp buffer) > +// x7 = negative double temp buffer stride > +// x9 = double temp buffer stride > // x10 = idct_coeffs > // x11 = idct_coeffs + 32 > function idct32_1d_8x32_pass2_neon > ld1 {v0.8h,v1.8h}, [x10] > > -mov x9, #128 > // v16 = IN(0), v17 = IN(2) ... 
v31 = IN(30) > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > ld1 {v\i\().8h}, [x2], x9 > @@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon > > idct16 > > -mov x9, #128 > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > st1 {v\i\().8h}, [x2], x9 > .endr > @@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon > > idct32_odd > > -mov x9, #128 > .macro load_acc_store a, b, c, d, neg=0 > +.if \neg == 0 > ld1 {v4.8h}, [x2], x9 > ld1 {v5.8h}, [x2], x9 > -.if \neg == 0 > add v4.8h, v4.8h, v\a\().8h > ld1 {v6.8h}, [x2], x9 > add v5.8h, v5.8h, v\b\().8h > @@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon > add v6.8h, v6.8h, v\c\().8h > add v7.8h, v7.8h, v\d\().8h > .else > +ld1 {v4.8h}, [x2], x7 > +ld1 {v5.8h}, [x2], x7 > sub v4.8h, v4.8h, v\a\().8h > -ld1 {v6.8h}, [x2], x9 > +ld1 {v6.8h}, [x2], x7 > sub v5.8h, v5.8h, v\b\().8h > -ld1 {v7.8h}, [x2], x9 > +ld1 {v7.8h}, [x2], x7 > sub v6.8h, v6.8h, v\c\().8h > sub v7.8h, v7.8h, v\d\().8h > .endif > @@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon > load_acc_store 23, 22, 21, 20 > load_acc_store 19, 18, 17, 16 > sub x2, x2, x9 > -neg x9, x9 > load_acc_store 16, 17, 18, 19, 1 > load_acc_store 20, 21, 22, 23, 1 > load_acc_store 24, 25, 26, 27, 1 > @@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 > mov x5, x1 > mov x6, x2 > > +// Double stride of the input, since we only read every other line > +mov x9, #128 > +neg x7, x9 > + > .irp i, 0, 8, 16, 24 > add x0, sp, #(\i*64) > add x2, x6, #(\i*2) ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] checkasm: vp9dsp: benchmark all sub-IDCTs (but not WHT or ADST).
On 2016-11-18 13:57:05 +0200, Martin Storsjö wrote: > From: "Ronald S. Bultje" > > --- > tests/checkasm/vp9dsp.c | 21 ++--- > 1 file changed, 14 insertions(+), 7 deletions(-) > > diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c > index 690e0cf..25f9dd1 100644 > --- a/tests/checkasm/vp9dsp.c > +++ b/tests/checkasm/vp9dsp.c > @@ -269,14 +269,20 @@ static void check_itxfm(void) > int n_txtps = tx < TX_32X32 ? N_TXFM_TYPES : 1; > > for (txtp = 0; txtp < n_txtps; txtp++) { > -if (check_func(dsp.itxfm_add[tx][txtp], "vp9_inv_%s_%dx%d_add", > - tx == 4 ? "wht_wht" : txtp_types[txtp], sz, sz)) { > -randomize_buffers(); > -ftx(coef, tx, txtp, sz, BIT_DEPTH); > - > -for (sub = (txtp == 0) ? 1 : 2; sub <= sz; sub <<= 1) { > +// skip testing sub-IDCTs for WHT or ADST since they don't > +// implement it in any of the SIMD functions. If they do, > +// consider changing this to ensure we have complete test > +// coverage > +for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; sub <<= 1) > { > +if (check_func(dsp.itxfm_add[tx][txtp], > + "vp9_inv_%s_%dx%d_sub%d_add", > + tx == 4 ? "wht_wht" : txtp_types[txtp], > + sz, sz, sub)) { > int eob; > > +randomize_buffers(); > +ftx(coef, tx, txtp, sz, BIT_DEPTH); > + > if (sub < sz) { > eob = copy_subcoefs(subcoef0, coef, tx, txtp, > sz, sub, BIT_DEPTH); > @@ -294,8 +300,9 @@ static void check_itxfm(void) > !iszero(subcoef0, sz * sz * SIZEOF_COEF) || > !iszero(subcoef1, sz * sz * SIZEOF_COEF)) > fail(); > + > +bench_new(dst, sz * SIZEOF_PIXEL, coef, eob); > } > -bench_new(dst, sz * SIZEOF_PIXEL, coef, sz * sz); > } > } > } ok Janne ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 04/11] arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
On 2016-11-23 15:00:51 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > Previously all subpartitions except the eob=1 (DC) case ran with > the same runtime: > > vp9_inv_dct_dct_16x16_sub16_add_neon: 3189.0 2486.8 2509.9 1964.1 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18448.1 16682.0 14235.4 11993.4 > > By skipping individual 4x16 or 4x32 pixel slices in the first pass, > we reduce the runtime of these functions like this: > > vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5188.7211.6235.1 > vp9_inv_dct_dct_16x16_sub4_add_neon:2079.7 1606.3 1772.1 1264.8 > vp9_inv_dct_dct_16x16_sub8_add_neon:2449.2 1834.3 2046.5 1499.7 > vp9_inv_dct_dct_16x16_sub12_add_neon: 2826.2 2109.2 2295.9 1758.2 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3224.1 2476.5 2533.1 1985.7 > vp9_inv_dct_dct_32x32_sub1_add_neon: 752.5457.5863.7554.7 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10689.2 8013.4 8592.9 6785.9 > vp9_inv_dct_dct_32x32_sub8_add_neon: 12217.8 9068.1 9420.4 7518.3 > vp9_inv_dct_dct_32x32_sub12_add_neon: 12967.3 10455.5 10223.9 8275.7 > vp9_inv_dct_dct_32x32_sub16_add_neon: 14084.1 11933.7 10998.9 9012.5 > vp9_inv_dct_dct_32x32_sub20_add_neon: 15171.4 13335.0 11820.6 9757.2 > vp9_inv_dct_dct_32x32_sub24_add_neon: 16229.6 15185.7 12614.4 10504.9 > vp9_inv_dct_dct_32x32_sub28_add_neon: 17338.1 15955.3 13445.0 11248.4 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18465.7 16974.6 14239.2 11999.1 > > I.e. in general a very minor overhead for the full subpartition case due > to the additional cmps, but a significant speedup for the cases when we > only need to process a small part of the actual input data. > > In common VP9 content in a few inspected clips, 70-90% of the non-dc-only > 16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left > 8x8 or 16x16 subpartitions respectively. > --- > This goes on top of the checkasm vp9dsp patch that adds benchmarking > of generic subpartitions in the itxfm. 
> --- > libavcodec/arm/vp9itxfm_neon.S | 70 > -- > tests/checkasm/vp9dsp.c| 6 ++-- > 2 files changed, 64 insertions(+), 12 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 01944bd..769579a 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -659,10 +659,17 @@ endfunc > @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > @ transpose into a horizontal 16x4 slice and store. > @ r0 = dst (temp buffer) > -@ r1 = unused > +@ r1 = slice offset > @ r2 = src > -@ r3 = slice offset > +@ r3 = eob > +@ r9 = min eob > function \txfm\()16_1d_4x16_pass1_neon > +.ifc \txfm,idct > +@ Check if this whole input slice is zero > +cmp r3, r9 > +ble 2f once this check is true it is true for all remaining slices so we should move it out to the main function. > +.endif > + > mov r12, #32 > vmov.s16q2, #0 > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > @@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon > transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, > d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 > > @ Store the transposed 4x4 blocks horizontally. > -cmp r3, #12 > +cmp r1, #12 > beq 1f > .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 > vst1.16 {d\i}, [r0,:64]! 
> .endr > bx lr > 1: > -@ Special case: For the last input column (r3 == 12), > +@ Special case: For the last input column (r1 == 12), > @ which would be stored as the last row in the temp buffer, > @ don't store the first 4x4 block, but keep it in registers > @ for the first slice of the second pass (where it is the > @@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon > vmovd30, d18 > vmovd31, d19 > bx lr > + > +.ifc \txfm,idct > +2: > +@ Set d28-d31 to zero, for the in-register passthrough of > coefficients to pass 2 > +vmov.i16q14, #0 > +vmov.i16q15, #0 > +@ Write zeros to the temp buffer for pass 2 > +.rept 4 > +vst1.16 {q14-q15}, [r0,:128]! > +.endr > +bx lr > +.endif > endfunc > > @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > @@ -781,15 +800,23 @@ endfunc > itxfm16_1d_funcs idct > itxfm16_1d_funcs iadst > > +@ This is the minimum eob value for each subpartition, in increments of 4 > +const min_eob_idct_idct_16, align=4 > +.short 0, 10, 38, 89 > +endconst > + > .macro itxfm_func16x16 txfm1, txfm2 > functi