Re: [libav-devel] [PATCH] avformat: Free the internal codec context at the end

2017-04-12 Thread Sean McGovern
On Apr 12, 2017 8:00 AM, "Vittorio Giovara" 
wrote:

On Tue, Apr 11, 2017 at 7:54 PM, Sean McGovern  wrote:
> On Apr 11, 2017 7:48 PM, "Luca Barbato"  wrote:
>
> Avoid a use after free in avformat_find_stream_info.
>
> CC: libav-sta...@libav.org
> ---
>  libavformat/utils.c | 6 +-
>  1 file changed, 1 insertion(+), 5 deletions(-)
>
> diff --git a/libavformat/utils.c b/libavformat/utils.c
> index 8fa89eb..eaba473 100644
> --- a/libavformat/utils.c
> +++ b/libavformat/utils.c
> @@ -2466,11 +2466,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
>  count++;
>  }
>
> -// close codecs which were opened in try_decode_frame()
> -for (i = 0; i < ic->nb_streams; i++) {
> -st = ic->streams[i];
> -avcodec_close(st->internal->avctx);
> -}
>  for (i = 0; i < ic->nb_streams; i++) {
>  st = ic->streams[i];
>  avctx = st->internal->avctx;
> @@ -2570,6 +2565,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
>
>  find_stream_info_err:
>  for (i = 0; i < ic->nb_streams; i++) {
> +avcodec_close(ic->streams[i]->internal->avctx);
> av_freep(&ic->streams[i]->info);
> av_bsf_free(&ic->streams[i]->internal->extract_extradata.bsf);
> av_packet_free(&ic->streams[i]->internal->extract_extradata.pkt);
> --
>
>
>
> Does this by any chance close any Bugzilla tickets?

I'm afraid not, this is related to a pesky issue with the new channel
layout api.

patch ok with me.
--


Same.

Was just making sure.

-- Sean McG.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] hevc: Add NEON 16x16 IDCT

2017-04-12 Thread Martin Storsjö

On Fri, 7 Apr 2017, Martin Storsjö wrote:

These 8 macro invocations add 8 copies of almost identical code
though - the size of this object file is inflated a great deal due to
this.


Before this, the code size ends up at 2280 bytes, after this patch it ends 
up at 15976 bytes. This also makes the builds fail on e.g. raspberry pi, 
due to "invalid literal constant: pool needs to be closer". In other 
words, the function is too large.


The pushed version ends up with a code size of 5104 bytes, which is more 
reasonable.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] hevc: Add NEON 16x16 IDCT

2017-04-12 Thread Martin Storsjö

On Wed, 12 Apr 2017, Alexandra Hájková wrote:


The speedup vs C code is around 6-13x.
---

Use irp to avoid the repetition.

libavcodec/arm/hevc_idct.S| 196 ++
libavcodec/arm/hevcdsp_init_arm.c |   4 +
2 files changed, 200 insertions(+)

diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4124fc8..3608f3a 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -222,7 +222,203 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
endfunc
.endm

+.macro butterfly e, o, tmp_p, tmp_m
+vadd.s32\tmp_p, \e, \o
+vsub.s32\tmp_m, \e, \o
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7
+tr_4x4_8\in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, 
q14, q15
+
+vmull.s16   q12, \in1, \in0[0]
+vmull.s16   q13, \in1, \in0[1]
+vmull.s16   q14, \in1, \in0[2]
+vmull.s16   q15, \in1, \in0[3]
+sum_sub q12, \in3, \in0[1], +
+sum_sub q13, \in3, \in0[3], -
+sum_sub q14, \in3, \in0[0], -
+sum_sub q15, \in3, \in0[2], -
+
+sum_sub q12, \in5, \in0[2], +
+sum_sub q13, \in5, \in0[0], -
+sum_sub q14, \in5, \in0[3], +
+sum_sub q15, \in5, \in0[1], +
+
+sum_sub q12, \in7, \in0[3], +
+sum_sub q13, \in7, \in0[2], -
+sum_sub q14, \in7, \in0[1], +
+sum_sub q15, \in7, \in0[0], -
+
+butterfly   q8,  q12, q0, q7
+butterfly   q9,  q13, q1, q6
+butterfly   q10, q14, q2, q5
+butterfly   q11, q15, q3, q4
+add r4,  sp,  #512
+vst1.s16{q0-q1}, [r4, :128]!
+vst1.s16{q2-q3}, [r4, :128]!
+vst1.s16{q4-q5}, [r4, :128]!
+vst1.s16{q6-q7}, [r4, :128]
+.endm
+
+.macro load16 in0, in1, in2, in3, in4, in5, in6, in7
+vld1.s16{\in0}, [r1, :64], r2
+vld1.s16{\in1}, [r3, :64], r2
+vld1.s16{\in2}, [r1, :64], r2
+vld1.s16{\in3}, [r3, :64], r2
+vld1.s16{\in4}, [r1, :64], r2
+vld1.s16{\in5}, [r3, :64], r2
+vld1.s16{\in6}, [r1, :64], r2
+vld1.s16{\in7}, [r3, :64], r2
+.endm
+
+.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, 
op5, op6, op7
+sum_sub q5, \in, \t0, \op0
+sum_sub q6, \in, \t1, \op1
+sum_sub q7, \in, \t2, \op2
+sum_sub q8, \in, \t3, \op3
+sum_sub q9, \in, \t4, \op4
+sum_sub q10,\in, \t5, \op5
+sum_sub q11,\in, \t6, \op6
+sum_sub q12,\in, \t7, \op7
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+vadd.s32q4, \in0, \in1
+vsub.s32\in0, \in0, \in1
+vadd.s32\in1, \in2, \in3
+vsub.s32\in2, \in2, \in3
+vadd.s32\in3, \in4, \in5
+vsub.s32\in4, \in4, \in5
+vadd.s32\in5, \in6, \in7
+vsub.s32\in6, \in6, \in7
+.endm
+
+.macro store16 in0, in1, in2, in3, in4, in5, in6, in7
+vst1.s16\in0, [r1, :64], r2
+vst1.s16\in1, [r3, :64], r4
+vst1.s16\in2, [r1, :64], r2
+vst1.s16\in3, [r3, :64], r4
+vst1.s16\in4, [r1, :64], r2
+vst1.s16\in5, [r3, :64], r4
+vst1.s16\in6, [r1, :64], r2
+vst1.s16\in7, [r3, :64], r4
+.endm
+
+.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2, 
in3, in4, in5, in6, in7, shift
+vqrshrn.s32 \out0, \in0, \shift
+vqrshrn.s32 \out1, \in1, \shift
+vqrshrn.s32 \out2, \in2, \shift
+vqrshrn.s32 \out3, \in3, \shift
+vqrshrn.s32 \out4, \in4, \shift
+vqrshrn.s32 \out5, \in5, \shift
+vqrshrn.s32 \out6, \in6, \shift
+vqrshrn.s32 \out7, \in7, \shift
+.endm
+
+.macro tr_16x4 name, shift
+function func_tr_16x4_\name
+mov r1,  r5
+add r3,  r5, #64
+mov r2,  #128
+load16  d0, d1, d2, d3, d4, d5, d6, d7
+movrel  r1, trans
+
+tr16_8x4d0, d1, d2, d3, d4, d5, d6, d7
+
+add r1,  r5, #32
+add r3,  r5, #(64 + 32)
+mov r2,  #128
+load16  d8, d9, d2, d3, d4, d5, d6, d7
+movrel  r1, trans + 16
+vld1.s16{q0}, [r1, :128]
+vmull.s16   q5, d8, d0[0]
+vmull.s16   q6, d8, d0[1]
+vmull.s16   q7, d8, d0[2]
+vmull.s16   q8, d8, d0[3]
+vmull.s16   q9, d8, d1[0]
+vmull.s16   q10, d8, d1[1]
+vmull.s16   q11, d8, d1[2]
+vmull.s16   q12, 

Re: [libav-devel] [PATCH] hevc: Optimize NEON 8x8 IDCT using col_limit

2017-04-12 Thread Martin Storsjö

On Wed, 12 Apr 2017, Alexandra Hájková wrote:


---
libavcodec/arm/hevc_idct.S | 26 +++---
1 file changed, 15 insertions(+), 11 deletions(-)


Given that the whole col_limit parameter seems to not be clearly defined, 
I'd skip this patch for now until it has been clarified. (Sorry, I didn't 
know it was that messy when I told you to try to do it.)


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] checkasm: Test sub-idcts for hevc

2017-04-12 Thread Martin Storsjö

On Wed, 12 Apr 2017, Martin Storsjö wrote:


This allows testing and benchmarking use of the col_limit parameter
in the idct implementations.
---
I can't say I really understand the logic for setting the col_limit
parameter based on the range on where non-zero coefficients are set,
but this set up at least works with the reference C version of the
idct. (I.e., it gives the same result both if passing the real
col_limit value or block_size, i.e. it doesn't miss anything.)
---
tests/checkasm/hevc_idct.c | 46 +++---
1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/tests/checkasm/hevc_idct.c b/tests/checkasm/hevc_idct.c
index dd4dc0d064..f8cf4de9f1 100644
--- a/tests/checkasm/hevc_idct.c
+++ b/tests/checkasm/hevc_idct.c
@@ -26,35 +26,51 @@

#include "checkasm.h"

-#define randomize_buffers(buf, size)\
+#define randomize_buffers(buf, size, sub)   \
do {\
-int j;  \
+int j, k;   \
for (j = 0; j < size; j++) {\
-int16_t r = rnd();  \
-AV_WN16A(buf + j, r);   \
+for (k = 0; k < size; k++) {\
+int16_t r = 0;  \
+if (j < sub && k < sub) \
+r = rnd();  \
+AV_WN16A(buf + j * size + k, r);\
+}   \
}   \
} while (0)

static void check_idct(HEVCDSPContext h, int bit_depth)
{
-int i;
+int i, sub;
LOCAL_ALIGNED(32, int16_t, coeffs0, [32 * 32]);
LOCAL_ALIGNED(32, int16_t, coeffs1, [32 * 32]);

for (i = 2; i <= 5; i++) {
int block_size = 1 << i;
int size = block_size * block_size;
-int col_limit = block_size;
+int col_limit;
declare_func(void, int16_t *coeffs, int col_limit);

-randomize_buffers(coeffs0, size);
-memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
-if (check_func(h.idct[i - 2], "hevc_idct_%dx%d_%d", block_size, 
block_size, bit_depth)) {
-call_ref(coeffs0, col_limit);
-call_new(coeffs1, col_limit);
-if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size))
-fail();
-bench_new(coeffs1, col_limit);
+for (sub = 1; sub <= block_size; sub < 4 ? (sub <<= 1) : (sub += 4)) {
+int max_xy = sub - 1;
+randomize_buffers(coeffs0, block_size, sub);
+memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
+
+col_limit = max_xy + max_xy + 4; // last_significant_coeff_x + 
last_significant_coeff_y + 4


This logic doesn't really match what the real decoder does, so I'd rather
postpone and skip this patch for now, until someone can actually clarify
the exact definition of the col_limit parameter.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] mem: uninline av_malloc(z)_array()

2017-04-12 Thread Luca Barbato
On 12/04/2017 15:27, Anton Khirnov wrote:
> Inlining public functions hardcodes their implementation into the ABI,
> so it should be avoided unless there is a very good reason for it. No
> such reason exists in this case.
> ---
>  libavutil/mem.c | 14 ++
>  libavutil/mem.h | 14 ++
>  2 files changed, 16 insertions(+), 12 deletions(-)
> 

Ok.

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] golomb: Simplify get_ur_golomb_jpegls

2017-04-12 Thread Luca Barbato
On 11/04/2017 17:41, Diego Biurrun wrote:
> Not necessary IMO.

the line is extra long so with just the ; you get something quite weird
looking at 80 cols.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH] mem: uninline av_malloc(z)_array()

2017-04-12 Thread Anton Khirnov
Inlining public functions hardcodes their implementation into the ABI,
so it should be avoided unless there is a very good reason for it. No
such reason exists in this case.
---
 libavutil/mem.c | 14 ++
 libavutil/mem.h | 14 ++
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/libavutil/mem.c b/libavutil/mem.c
index 0f506d3..fd0ffd9 100644
--- a/libavutil/mem.c
+++ b/libavutil/mem.c
@@ -138,6 +138,20 @@ int av_reallocp(void *ptr, size_t size)
 return 0;
 }
 
+void *av_malloc_array(size_t nmemb, size_t size)
+{
+if (!size || nmemb >= INT_MAX / size)
+return NULL;
+return av_malloc(nmemb * size);
+}
+
+void *av_mallocz_array(size_t nmemb, size_t size)
+{
+if (!size || nmemb >= INT_MAX / size)
+return NULL;
+return av_mallocz(nmemb * size);
+}
+
 void *av_realloc_array(void *ptr, size_t nmemb, size_t size)
 {
 if (!size || nmemb >= INT_MAX / size)
diff --git a/libavutil/mem.h b/libavutil/mem.h
index f3cf56c..a03ba2f 100644
--- a/libavutil/mem.h
+++ b/libavutil/mem.h
@@ -89,12 +89,7 @@ void *av_malloc(size_t size) av_malloc_attrib 
av_alloc_size(1);
  * be allocated.
  * @see av_malloc()
  */
-av_alloc_size(1, 2) static inline void *av_malloc_array(size_t nmemb, size_t 
size)
-{
-if (!size || nmemb >= INT_MAX / size)
-return NULL;
-return av_malloc(nmemb * size);
-}
+av_alloc_size(1, 2) void *av_malloc_array(size_t nmemb, size_t size);
 
 /**
  * Allocate or reallocate a block of memory.
@@ -202,12 +197,7 @@ void *av_mallocz(size_t size) av_malloc_attrib 
av_alloc_size(1);
  * @see av_mallocz()
  * @see av_malloc_array()
  */
-av_alloc_size(1, 2) static inline void *av_mallocz_array(size_t nmemb, size_t 
size)
-{
-if (!size || nmemb >= INT_MAX / size)
-return NULL;
-return av_mallocz(nmemb * size);
-}
+av_alloc_size(1, 2) void *av_mallocz_array(size_t nmemb, size_t size);
 
 /**
  * Duplicate the string s.
-- 
2.0.0

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] avutil: av_frame_ref: copy source frame flags

2017-04-12 Thread Francois Cartegnie
Le 12/04/2017 à 12:16, Hendrik Leppkes a écrit :
>>
>>  dst->format = src->format;
>> +dst->flags  = src->flags;
>>  dst->width  = src->width;
>>  dst->height = src->height;
>>  dst->channel_layout = src->channel_layout;
> 
> You must be encountering something else.
> av_frame_copy_props already copies the flags field, and it's invoked by
> av_frame_ref just below the block you added it to.

Seems so.
Probably because I didn't test using the same starting point (see below).

I've also noticed the corrupted flag isn't set at all if the first decoded
frame is a non-IDR I-frame, because it is derived from the recovery
state, which is set in the I-frame case when no long- or short-term
reference pictures are stored.

Francois
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 2/7] utvideodec: Support UQY2

2017-04-12 Thread Luca Barbato
On 11/04/2017 10:11, Diego Biurrun wrote:
> extradata is uint8_t*, so this should be PRIu8. Maybe it could be done
> in a separate patch though.

It should since just above there is the same thing.

I'd push this one now.

lu
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] checkasm: Test sub-idcts for hevc

2017-04-12 Thread Luca Barbato
On 12/04/2017 13:53, Martin Storsjö wrote:
> This allows testing and benchmarking use of the col_limit parameter
> in the idct implementations.
> ---
> I can't say I really understand the logic for setting the col_limit
> parameter based on the range on where non-zero coefficients are set,
> but this set up at least works with the reference C version of the
> idct. (I.e., it gives the same result both if passing the real
> col_limit value or block_size, i.e. it doesn't miss anything.)
> ---
>  tests/checkasm/hevc_idct.c | 46 
> +++---
>  1 file changed, 31 insertions(+), 15 deletions(-)
> 

Probably fine.

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] hevc: Optimize NEON 8x8 IDCT using col_limit

2017-04-12 Thread Martin Storsjö

On Wed, 12 Apr 2017, Alexandra Hájková wrote:


---
libavcodec/arm/hevc_idct.S | 26 +++---
1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4124fc8..29135ad 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -58,7 +58,7 @@ endconst

.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, 
tmp3
 vshll.s16  \tmp0, \in0, #6
- vld1.s16   {\in0}, [r1, :64]!
+ vld1.s16   {\in0}, [r4, :64]!
 vmov   \tmp1, \tmp0
 vmull.s16  \tmp2, \in1, \in0[1]
 vmull.s16  \tmp3, \in1, \in0[3]
@@ -67,14 +67,14 @@ endconst
 vmlal.s16  \tmp2, \in3, \in0[3] @o0
 vmlsl.s16  \tmp3, \in3, \in0[1] @o1

- vld1.s16   {\in0}, [r1, :64]
+ vld1.s16   {\in0}, [r4, :64]

 vadd.s32   \out0, \tmp0, \tmp2
 vadd.s32   \out1, \tmp1, \tmp3
 vsub.s32   \out2, \tmp1, \tmp3
 vsub.s32   \out3, \tmp0, \tmp2

- subr1,  r1,  #8
+ subr4,  r4,  #8
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@@ -166,21 +166,25 @@ endfunc
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
@r0 - coeffs
+push{r4, lr}


If you actually only need one spare register, you don't need to push both 
r4 and lr - it would be enough to just push lr and use that instead of r4 
below. (Since you don't do any subroutine calls in this function, it's ok 
to use lr as scratch register.)



vpush   {q4-q7}

-mov r1,  r0
+mov r4,  r0
mov r2,  #64
add r3,  r0,  #32
-vld1.s16{q0-q1}, [r1,:128], r2
+vld1.s16{q0-q1}, [r4,:128], r2
vld1.s16{q2-q3}, [r3,:128], r2
-vld1.s16{q4-q5}, [r1,:128], r2
+vld1.s16{q4-q5}, [r4,:128], r2
vld1.s16{q6-q7}, [r3,:128], r2

-movrel  r1, trans
+movrel  r4, trans

tr_8x4  7, d0, d2, d4, d6, d8, d10, d12, d14
+cmp r1, #4
+blt 1f
tr_8x4  7, d1, d3, d5, d7, d9, d11, d13, d15



As far as I can see the code in libavcodec/hevcdec.c right now, it's 
impossible to get col_limit < 4. So this won't actually ever have any 
effect, it seems.


I'm not sure what the logic of the current col_limit is, or whether it should
be changed so that it behaves like this (which would make sense).


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] avformat: Free the internal codec context at the end

2017-04-12 Thread Vittorio Giovara
On Tue, Apr 11, 2017 at 7:54 PM, Sean McGovern  wrote:
> On Apr 11, 2017 7:48 PM, "Luca Barbato"  wrote:
>
> Avoid a use after free in avformat_find_stream_info.
>
> CC: libav-sta...@libav.org
> ---
>  libavformat/utils.c | 6 +-
>  1 file changed, 1 insertion(+), 5 deletions(-)
>
> diff --git a/libavformat/utils.c b/libavformat/utils.c
> index 8fa89eb..eaba473 100644
> --- a/libavformat/utils.c
> +++ b/libavformat/utils.c
> @@ -2466,11 +2466,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
>  count++;
>  }
>
> -// close codecs which were opened in try_decode_frame()
> -for (i = 0; i < ic->nb_streams; i++) {
> -st = ic->streams[i];
> -avcodec_close(st->internal->avctx);
> -}
>  for (i = 0; i < ic->nb_streams; i++) {
>  st = ic->streams[i];
>  avctx = st->internal->avctx;
> @@ -2570,6 +2565,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
>
>  find_stream_info_err:
>  for (i = 0; i < ic->nb_streams; i++) {
> +avcodec_close(ic->streams[i]->internal->avctx);
> av_freep(&ic->streams[i]->info);
> av_bsf_free(&ic->streams[i]->internal->extract_extradata.bsf);
> av_packet_free(&ic->streams[i]->internal->extract_extradata.pkt);
> --
>
>
>
> Does this by any chance close any Bugzilla tickets?

I'm afraid not, this is related to a pesky issue with the new channel
layout api.

patch ok with me.
-- 
Vittorio
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH] hevc: Optimize NEON 8x8 IDCT using col_limit

2017-04-12 Thread Alexandra Hájková
---
 libavcodec/arm/hevc_idct.S | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4124fc8..29135ad 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -58,7 +58,7 @@ endconst
 
 .macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, 
tmp3
  vshll.s16  \tmp0, \in0, #6
- vld1.s16   {\in0}, [r1, :64]!
+ vld1.s16   {\in0}, [r4, :64]!
  vmov   \tmp1, \tmp0
  vmull.s16  \tmp2, \in1, \in0[1]
  vmull.s16  \tmp3, \in1, \in0[3]
@@ -67,14 +67,14 @@ endconst
  vmlal.s16  \tmp2, \in3, \in0[3] @o0
  vmlsl.s16  \tmp3, \in3, \in0[1] @o1
 
- vld1.s16   {\in0}, [r1, :64]
+ vld1.s16   {\in0}, [r4, :64]
 
  vadd.s32   \out0, \tmp0, \tmp2
  vadd.s32   \out1, \tmp1, \tmp3
  vsub.s32   \out2, \tmp1, \tmp3
  vsub.s32   \out3, \tmp0, \tmp2
 
- subr1,  r1,  #8
+ subr4,  r4,  #8
 .endm
 
 @ Do a 4x4 transpose, using q registers for the subtransposes that don't
@@ -166,21 +166,25 @@ endfunc
 .macro idct_8x8 bitdepth
 function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
 @r0 - coeffs
+push{r4, lr}
 vpush   {q4-q7}
 
-mov r1,  r0
+mov r4,  r0
 mov r2,  #64
 add r3,  r0,  #32
-vld1.s16{q0-q1}, [r1,:128], r2
+vld1.s16{q0-q1}, [r4,:128], r2
 vld1.s16{q2-q3}, [r3,:128], r2
-vld1.s16{q4-q5}, [r1,:128], r2
+vld1.s16{q4-q5}, [r4,:128], r2
 vld1.s16{q6-q7}, [r3,:128], r2
 
-movrel  r1, trans
+movrel  r4, trans
 
 tr_8x4  7, d0, d2, d4, d6, d8, d10, d12, d14
+cmp r1, #4
+blt 1f
 tr_8x4  7, d1, d3, d5, d7, d9, d11, d13, d15
 
+1:
 @ Transpose each 4x4 block, and swap how d4-d7 and d8-d11 are used.
 @ Layout before:
 @ d0  d1
@@ -209,16 +213,16 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
 
 transpose_8x8   d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, 
d11, d13, d15
 
-mov r1,  r0
+mov r4,  r0
 mov r2,  #64
 add r3,  r0,  #32
-vst1.s16{q0-q1}, [r1,:128], r2
+vst1.s16{q0-q1}, [r4,:128], r2
 vst1.s16{q2-q3}, [r3,:128], r2
-vst1.s16{q4-q5}, [r1,:128], r2
+vst1.s16{q4-q5}, [r4,:128], r2
 vst1.s16{q6-q7}, [r3,:128], r2
 
 vpop{q4-q7}
-bx  lr
+pop {r4, pc}
 endfunc
 .endm
 
-- 
2.10.2

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH] checkasm: Test sub-idcts for hevc

2017-04-12 Thread Martin Storsjö
This allows testing and benchmarking use of the col_limit parameter
in the idct implementations.
---
I can't say I really understand the logic for setting the col_limit
parameter based on the range on where non-zero coefficients are set,
but this set up at least works with the reference C version of the
idct. (I.e., it gives the same result both if passing the real
col_limit value or block_size, i.e. it doesn't miss anything.)
---
 tests/checkasm/hevc_idct.c | 46 +++---
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/tests/checkasm/hevc_idct.c b/tests/checkasm/hevc_idct.c
index dd4dc0d064..f8cf4de9f1 100644
--- a/tests/checkasm/hevc_idct.c
+++ b/tests/checkasm/hevc_idct.c
@@ -26,35 +26,51 @@
 
 #include "checkasm.h"
 
-#define randomize_buffers(buf, size)\
+#define randomize_buffers(buf, size, sub)   \
 do {\
-int j;  \
+int j, k;   \
 for (j = 0; j < size; j++) {\
-int16_t r = rnd();  \
-AV_WN16A(buf + j, r);   \
+for (k = 0; k < size; k++) {\
+int16_t r = 0;  \
+if (j < sub && k < sub) \
+r = rnd();  \
+AV_WN16A(buf + j * size + k, r);\
+}   \
 }   \
 } while (0)
 
 static void check_idct(HEVCDSPContext h, int bit_depth)
 {
-int i;
+int i, sub;
 LOCAL_ALIGNED(32, int16_t, coeffs0, [32 * 32]);
 LOCAL_ALIGNED(32, int16_t, coeffs1, [32 * 32]);
 
 for (i = 2; i <= 5; i++) {
 int block_size = 1 << i;
 int size = block_size * block_size;
-int col_limit = block_size;
+int col_limit;
 declare_func(void, int16_t *coeffs, int col_limit);
 
-randomize_buffers(coeffs0, size);
-memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
-if (check_func(h.idct[i - 2], "hevc_idct_%dx%d_%d", block_size, 
block_size, bit_depth)) {
-call_ref(coeffs0, col_limit);
-call_new(coeffs1, col_limit);
-if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size))
-fail();
-bench_new(coeffs1, col_limit);
+for (sub = 1; sub <= block_size; sub < 4 ? (sub <<= 1) : (sub += 4)) {
+int max_xy = sub - 1;
+randomize_buffers(coeffs0, block_size, sub);
+memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
+
+col_limit = max_xy + max_xy + 4; // last_significant_coeff_x + 
last_significant_coeff_y + 4
+if (max_xy < 4)
+col_limit = FFMIN(4, col_limit);
+else if (max_xy < 8)
+col_limit = FFMIN(8, col_limit);
+else if (max_xy < 12)
+col_limit = FFMIN(24, col_limit);
+
+if (check_func(h.idct[i - 2], "hevc_idct_%dx%d_sub%d_%d", 
block_size, block_size, sub, bit_depth)) {
+call_ref(coeffs0, col_limit);
+call_new(coeffs1, col_limit);
+if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size))
+fail();
+bench_new(coeffs1, col_limit);
+}
 }
 }
 }
@@ -70,7 +86,7 @@ static void check_idct_dc(HEVCDSPContext h, int bit_depth)
 int size = block_size * block_size;
 declare_func_emms(AV_CPU_FLAG_MMXEXT, void, int16_t *coeffs);
 
-randomize_buffers(coeffs0, size);
+randomize_buffers(coeffs0, block_size, 1);
 memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
 
 if (check_func(h.idct_dc[i - 2], "hevc_idct_%dx%d_dc_%d", block_size, 
block_size, bit_depth)) {
-- 
2.11.0 (Apple Git-81)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] avutil: av_frame_ref: copy source frame flags

2017-04-12 Thread Hendrik Leppkes
On Wed, Apr 12, 2017 at 12:06 PM, Francois Cartegnie  wrote:
> This happens at least with h264 (I have not tested other codecs).
>
> When corrupted frames output is set, any corrupted
> frame is flagged but that information is lost when
> av_frame_ref is used on output.
>
> The other local only h264 fix would be to copy flags
> in h264dec output_frame().
>
> Francois
> ---
>  libavutil/frame.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/libavutil/frame.c b/libavutil/frame.c
> index 9cd5f9a..e75236c 100644
> --- a/libavutil/frame.c
> +++ b/libavutil/frame.c
> @@ -205,6 +205,7 @@ int av_frame_ref(AVFrame *dst, const AVFrame *src)
>  int i, ret = 0;
>
>  dst->format = src->format;
> +dst->flags  = src->flags;
>  dst->width  = src->width;
>  dst->height = src->height;
>  dst->channel_layout = src->channel_layout;

You must be encountering something else.
av_frame_copy_props already copies the flags field, and it's invoked by
av_frame_ref just below the block you added it to.

- Hendrik
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH] avutil: av_frame_ref: copy source frame flags

2017-04-12 Thread Francois Cartegnie
This happens at least with h264 (I have not tested other codecs).

When corrupted frames output is set, any corrupted
frame is flagged but that information is lost when
av_frame_ref is used on output.

The other local only h264 fix would be to copy flags
in h264dec output_frame().

Francois
---
 libavutil/frame.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/frame.c b/libavutil/frame.c
index 9cd5f9a..e75236c 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
@@ -205,6 +205,7 @@ int av_frame_ref(AVFrame *dst, const AVFrame *src)
 int i, ret = 0;
 
 dst->format = src->format;
+dst->flags  = src->flags;
 dst->width  = src->width;
 dst->height = src->height;
 dst->channel_layout = src->channel_layout;
-- 
2.9.3

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] thread: Define ff_mutex_* macros as stub functions when threads are disabled

2017-04-12 Thread Luca Barbato
On 31/03/2017 19:35, Diego Biurrun wrote:
> Silences a bunch of "statement with no effect" warnings with threads disabled.
> ---
> 
> Now with stub functions, should be more correct.
> 
>  libavutil/thread.h | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/libavutil/thread.h b/libavutil/thread.h
> index cf0fbdd..ac76bb8 100644
> --- a/libavutil/thread.h
> +++ b/libavutil/thread.h
> @@ -48,10 +48,10 @@
>  
>  #define AVMutex char
>  
> -#define ff_mutex_init(mutex, attr) (0)
> -#define ff_mutex_lock(mutex) (0)
> -#define ff_mutex_unlock(mutex) (0)
> -#define ff_mutex_destroy(mutex) (0)
> +static inline int ff_mutex_init(AVMutex *mutex, const void *attr){ return 0; 
> }
> +static inline int ff_mutex_lock(AVMutex *mutex){ return 0; }
> +static inline int ff_mutex_unlock(AVMutex *mutex){ return 0; }
> +static inline int ff_mutex_destroy(AVMutex *mutex){ return 0; }
>  
>  #define AVOnce char
>  #define AV_ONCE_INIT 0
> 

Ok.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] thread: Define ff_mutex_* macros as stub functions when threads are disabled

2017-04-12 Thread Diego Biurrun
On Fri, Mar 31, 2017 at 07:35:35PM +0200, Diego Biurrun wrote:
> Silences a bunch of "statement with no effect" warnings with threads disabled.
> ---
> 
> Now with stub functions, should be more correct.
> 
>  libavutil/thread.h | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)

ping

Diego
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH] hevc: Add NEON 16x16 IDCT

2017-04-12 Thread Alexandra Hájková
The speedup vs C code is around 6-13x.
---

Use irp to avoid the repetition.

 libavcodec/arm/hevc_idct.S| 196 ++
 libavcodec/arm/hevcdsp_init_arm.c |   4 +
 2 files changed, 200 insertions(+)

diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4124fc8..3608f3a 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -222,7 +222,203 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
 endfunc
 .endm
 
+.macro butterfly e, o, tmp_p, tmp_m
+vadd.s32\tmp_p, \e, \o
+vsub.s32\tmp_m, \e, \o
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7
+tr_4x4_8\in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, 
q14, q15
+
+vmull.s16   q12, \in1, \in0[0]
+vmull.s16   q13, \in1, \in0[1]
+vmull.s16   q14, \in1, \in0[2]
+vmull.s16   q15, \in1, \in0[3]
+sum_sub q12, \in3, \in0[1], +
+sum_sub q13, \in3, \in0[3], -
+sum_sub q14, \in3, \in0[0], -
+sum_sub q15, \in3, \in0[2], -
+
+sum_sub q12, \in5, \in0[2], +
+sum_sub q13, \in5, \in0[0], -
+sum_sub q14, \in5, \in0[3], +
+sum_sub q15, \in5, \in0[1], +
+
+sum_sub q12, \in7, \in0[3], +
+sum_sub q13, \in7, \in0[2], -
+sum_sub q14, \in7, \in0[1], +
+sum_sub q15, \in7, \in0[0], -
+
+butterfly   q8,  q12, q0, q7
+butterfly   q9,  q13, q1, q6
+butterfly   q10, q14, q2, q5
+butterfly   q11, q15, q3, q4
+add r4,  sp,  #512
+vst1.s16{q0-q1}, [r4, :128]!
+vst1.s16{q2-q3}, [r4, :128]!
+vst1.s16{q4-q5}, [r4, :128]!
+vst1.s16{q6-q7}, [r4, :128]
+.endm
+
+.macro load16 in0, in1, in2, in3, in4, in5, in6, in7
+vld1.s16{\in0}, [r1, :64], r2
+vld1.s16{\in1}, [r3, :64], r2
+vld1.s16{\in2}, [r1, :64], r2
+vld1.s16{\in3}, [r3, :64], r2
+vld1.s16{\in4}, [r1, :64], r2
+vld1.s16{\in5}, [r3, :64], r2
+vld1.s16{\in6}, [r1, :64], r2
+vld1.s16{\in7}, [r3, :64], r2
+.endm
+
+.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, 
op5, op6, op7
+sum_sub q5, \in, \t0, \op0
+sum_sub q6, \in, \t1, \op1
+sum_sub q7, \in, \t2, \op2
+sum_sub q8, \in, \t3, \op3
+sum_sub q9, \in, \t4, \op4
+sum_sub q10,\in, \t5, \op5
+sum_sub q11,\in, \t6, \op6
+sum_sub q12,\in, \t7, \op7
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+vadd.s32q4, \in0, \in1
+vsub.s32\in0, \in0, \in1
+vadd.s32\in1, \in2, \in3
+vsub.s32\in2, \in2, \in3
+vadd.s32\in3, \in4, \in5
+vsub.s32\in4, \in4, \in5
+vadd.s32\in5, \in6, \in7
+vsub.s32\in6, \in6, \in7
+.endm
+
+.macro store16 in0, in1, in2, in3, in4, in5, in6, in7
+vst1.s16\in0, [r1, :64], r2
+vst1.s16\in1, [r3, :64], r4
+vst1.s16\in2, [r1, :64], r2
+vst1.s16\in3, [r3, :64], r4
+vst1.s16\in4, [r1, :64], r2
+vst1.s16\in5, [r3, :64], r4
+vst1.s16\in6, [r1, :64], r2
+vst1.s16\in7, [r3, :64], r4
+.endm
+
+.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2, 
in3, in4, in5, in6, in7, shift
+vqrshrn.s32 \out0, \in0, \shift
+vqrshrn.s32 \out1, \in1, \shift
+vqrshrn.s32 \out2, \in2, \shift
+vqrshrn.s32 \out3, \in3, \shift
+vqrshrn.s32 \out4, \in4, \shift
+vqrshrn.s32 \out5, \in5, \shift
+vqrshrn.s32 \out6, \in6, \shift
+vqrshrn.s32 \out7, \in7, \shift
+.endm
+
+.macro tr_16x4 name, shift
+function func_tr_16x4_\name
+mov r1,  r5
+add r3,  r5, #64
+mov r2,  #128
+load16  d0, d1, d2, d3, d4, d5, d6, d7
+movrel  r1, trans
+
+tr16_8x4d0, d1, d2, d3, d4, d5, d6, d7
+
+add r1,  r5, #32
+add r3,  r5, #(64 + 32)
+mov r2,  #128
+load16  d8, d9, d2, d3, d4, d5, d6, d7
+movrel  r1, trans + 16
+vld1.s16{q0}, [r1, :128]
+vmull.s16   q5, d8, d0[0]
+vmull.s16   q6, d8, d0[1]
+vmull.s16   q7, d8, d0[2]
+vmull.s16   q8, d8, d0[3]
+vmull.s16   q9, d8, d1[0]
+vmull.s16   q10, d8, d1[1]
+vmull.s16   q11, d8, d1[2]
+vmull.s16   q12, d8, d1[3]
+
+add_member  d9, 

Re: [libav-devel] [PATCH 1/8] hevc: properly handle no_rasl_output_flag when removing pictures from the DPB

2017-04-12 Thread Steve Lhomme
Yes, I probably messed up. Merging code from very different 2 year old
code is not exactly easy. I don't know how you people do this...

On Tue, Apr 11, 2017 at 3:33 PM, Hendrik Leppkes  wrote:
> On Tue, Apr 11, 2017 at 3:29 PM, Anton Khirnov  wrote:
>> Quoting Steve Lhomme (2017-04-07 14:27:39)
>>> From: Hendrik Leppkes 
>>>
>>> Fixes ticket #4185.
>>>
>>> Reviewed-By: Mickael Raulet 
>>> Signed-off-by: Hendrik Leppkes 
>>> ---
>>>  libavcodec/hevcdec.c | 3 +++
>>>  libavcodec/hevcdec.h | 1 +
>>>  2 files changed, 4 insertions(+)
>>>
>>> diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
>>> index 6a04858587..6fe05d1b01 100644
>>> --- a/libavcodec/hevcdec.c
>>> +++ b/libavcodec/hevcdec.c
>>> @@ -2421,6 +2421,8 @@ static int hevc_frame_start(HEVCContext *s)
>>>  s->is_decoded= 0;
>>>  s->first_nal_type= s->nal_unit_type;
>>>
>>> +s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s);
>>> +
>>>  if (s->ps.pps->tiles_enabled_flag)
>>>  lc->end_of_tiles_x = s->ps.pps->column_width[0] << 
>>> s->ps.sps->log2_ctb_size;
>>>
>>> @@ -3007,6 +3009,7 @@ static int hevc_update_thread_context(AVCodecContext 
>>> *dst,
>>>  s->seq_output = s0->seq_output;
>>>  s->pocTid0= s0->pocTid0;
>>>  s->max_ra = s0->max_ra;
>>> +s->no_rasl_output_flag = s0->no_rasl_output_flag;
>>>
>>>  s->is_nalff= s0->is_nalff;
>>>  s->nal_length_size = s0->nal_length_size;
>>> diff --git a/libavcodec/hevcdec.h b/libavcodec/hevcdec.h
>>> index ff192f67ae..64089bde75 100644
>>> --- a/libavcodec/hevcdec.h
>>> +++ b/libavcodec/hevcdec.h
>>> @@ -484,6 +484,7 @@ typedef struct HEVCContext {
>>>  int bs_height;
>>>
>>>  int is_decoded;
>>> +int no_rasl_output_flag;
>>>
>>>  HEVCPredContext hpc;
>>>  HEVCDSPContext hevcdsp;
>>> --
>>> 2.11.1
>>
>> Eh? This seems to be write-only.
>>
>
> For some reason he skipped one of the hunks (and also modified the
> actual setting of the flag?)
> http://git.videolan.org/?p=ffmpeg.git;a=commitdiff;h=0118158efa8e45761f9f65a3bb74f33907bd2aec
>
> Also, a follow up for tsan happyness:
> http://git.videolan.org/?p=ffmpeg.git;a=commitdiff;h=bddabfaab65808e40605181d579ffcd85bfe4c26
>
> - Hendrik
> ___
> libav-devel mailing list
> libav-devel@libav.org
> https://lists.libav.org/mailman/listinfo/libav-devel
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel