[FFmpeg-devel] [PATCH 3/3] libavutil/log.c: only include valgrind header when used.

2023-10-29 Thread Reimar . Doeffinger
From: Reimar Döffinger 

This is cleaner, but it is also a workaround for when
the header exists, but cannot be compiled.
This will happen when the compiler has no inline asm
support.
Possibly the configure check should be improved as well.
---
 libavutil/log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/log.c b/libavutil/log.c
index 5948e50467..2d358b7ab9 100644
--- a/libavutil/log.c
+++ b/libavutil/log.c
@@ -47,7 +47,7 @@ static AVMutex mutex = AV_MUTEX_INITIALIZER;
 
 #define LINE_SZ 1024
 
-#if HAVE_VALGRIND_VALGRIND_H
+#if HAVE_VALGRIND_VALGRIND_H && CONFIG_VALGRIND_BACKTRACE
 #include <valgrind/valgrind.h>
 /* this is the log level at which valgrind will output a full backtrace */
 #define BACKTRACE_LOGLEVEL AV_LOG_ERROR
-- 
2.39.3 (Apple Git-145)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/3] libavutil/aarch64/cpu.c: HWCAPS requires inline asm support.

2023-10-29 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Fixes compilation with tcc, which does not have aarch64
inline asm support.
---
 libavutil/aarch64/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/aarch64/cpu.c b/libavutil/aarch64/cpu.c
index bd780e8591..0d7c1e268d 100644
--- a/libavutil/aarch64/cpu.c
+++ b/libavutil/aarch64/cpu.c
@@ -34,7 +34,7 @@ static int detect_flags(void)
 
 hwcap = getauxval(AT_HWCAP);
 
-#if defined(HWCAP_CPUID)
+#if defined(HWCAP_CPUID) && HAVE_INLINE_ASM
 // We can check for DOTPROD and I8MM using HWCAP_ASIMDDP and
 // HWCAP2_I8MM too, avoiding to read the CPUID registers (which triggers
 // a trap, handled by the kernel). However the HWCAP_* defines for these
-- 
2.39.3 (Apple Git-145)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/3] configure: fix _Pragma check.

2023-10-29 Thread Reimar . Doeffinger
From: Reimar Döffinger 

The test can currently pass when _Pragma is not supported, since
_Pragma might be treated as an implicitly declared function.
This happens e.g. with tinycc.
Extending the check to 2 pragmas both matches the actual use
better and avoids this misdetection.
---
 configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index ff3ac9f4de..e2bcf4e1c2 100755
--- a/configure
+++ b/configure
@@ -5987,7 +5987,7 @@ for restrict_keyword in restrict __restrict__ __restrict 
""; do
 test_code cc "" "char * $restrict_keyword p" && break
 done
 
-check_cc pragma_deprecated "" '_Pragma("GCC diagnostic ignored 
\"-Wdeprecated-declarations\"")'
+check_cc pragma_deprecated "" '_Pragma("GCC diagnostic push") _Pragma("GCC 
diagnostic ignored \"-Wdeprecated-declarations\"")'
 
 # The global variable ensures the bits appear unchanged in the object file.
 test_cc 

[FFmpeg-devel] [PATCH] [RFC] tools/patcheck: portability fixes.

2023-07-27 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Enough to make it run on macOS.
In particular:
- fix "empty subexpression" errors caused by constructs like (smth|),
  use ? instead to make them optional
- no -d option for xargs, use the more standard -0 and use tr to
  replace newlines with 0.

Not sure if these cause issues somewhere else, not even completely
sure they all work, but quick testing suggests they work.
On the other hand I remember issues with '?' where I resorted to {0,1}
instead, but I do not remember details.
Ignore if fixing these seems not worth the risk.
---
 tools/patcheck | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/patcheck b/tools/patcheck
index fe52938f29..ee993c60fc 100755
--- a/tools/patcheck
+++ b/tools/patcheck
@@ -21,7 +21,7 @@ echo may or may not be bad. When you use it and it misses 
something or detects
 echo something wrong, fix it and send a patch to the ffmpeg-devel mailing list.
 echo License: GPL, Author: Michael Niedermayer
 
-ERE_PRITYP='(unsigned *|)(char|short|long|int|long *int|short 
*int|void|float|double|(u|)int(8|16|32|64)_t)'
+ERE_PRITYP='(unsigned *)?(char|short|long|int|long *int|short 
*int|void|float|double|u?int(8|16|32|64)_t)'
 ERE_TYPES='(const|static|av_cold|inline| 
*)*('$ERE_PRITYP'|[a-zA-Z][a-zA-Z0-9_]*)[* ]{1,}[a-zA-Z][a-zA-Z0-9_]*'
 ERE_FUNCS="$ERE_TYPES"' *\('
 
@@ -63,7 +63,7 @@ hiegrep '\+= *1 *;' 'can be simplified to ++' $*
 hiegrep '-= *1 *;'  'can be simplified to --' $*
 hiegrep '((!|=)= *(0|NULL)[^0-9a-z]|[^0-9a-z](0|NULL) *(!|=)=)' 'x==0 / x!=0 
can be simplified to !x / x' $*
 
-$EGREP $OPT '^\+ *(const *|)static' $*| $EGREP --color=always '[^=]= 
*(0|NULL)[^0-9a-zA-Z]'> $TMP && printf '\nuseless 0 init\n'
+$EGREP $OPT '^\+ *(const *)?static' $*| $EGREP --color=always '[^=]= 
*(0|NULL)[^0-9a-zA-Z]'> $TMP && printf '\nuseless 0 init\n'
 cat $TMP
 hiegrep '# *ifdef * (HAVE|CONFIG)_' 'ifdefs that should be #if' $*
 
@@ -77,7 +77,7 @@ hiegrep ':\+ *'"$ERE_PRITYP"' *inline' 'non static inline or 
strangely ordered i
 hiegrep "$ERE_FUNCS"' *\)' 'missing void' $*
 hiegrep '(sprintf|strcat|strcpy)' 'Possible security issue, make sure this is 
safe or use snprintf/av_strl*' $*
 hiegrep '/ 
*(2|4|8|16|32|64|128|256|512|1024|2048|4096|8192|16384|32768|65536)[^0-9]' 
'divide by 2^x could use >> maybe' $*
-hiegrep '#(el|)if *(0|1)' 'useless #if' $*
+hiegrep '#(el)?if *(0|1)' 'useless #if' $*
 hiegrep 'if *\( *(0|1) *\)' 'useless if()' $*
 hiegrep '& *[a-zA-Z0-9_]* *\[ *0 *\]' 'useless & [0]' $*
 hiegrep '(\( *[0-9] *(&&|\|\|)|(&&|\|\|) *[0-9] *\))' 'overriding condition' $*
@@ -118,22 +118,22 @@ if test -e $TMP ; then
 cat $TMP
 fi
 
-$EGREP -B2 $OPT '^(\+|) *('"$ERE_TYPES"'|# *define)' $* | $EGREP -A2 
--color=always '(:|-)\+[^/]*/(\*([^*]|$)|/([^/]|$))' > $TMP && printf "\n Non 
doxy comments\n"
+$EGREP -B2 $OPT '^\+? *('"$ERE_TYPES"'|# *define)' $* | $EGREP -A2 
--color=always '(:|-)\+[^/]*/(\*([^*]|$)|/([^/]|$))' > $TMP && printf "\n Non 
doxy comments\n"
 cat $TMP
 
 rm $TMP
 for i in \
 $($EGREP -H '^\+ *'"$ERE_TYPES" $*  |\
 $GREP -v '(' | $EGREP -v '\Wgoto\W' |\
-xargs -d '\n' -n 1 |\
+tr '\n' '\0' | xargs -0 -n 1 |\
 $GREP -o '[* ][* ]*[a-zA-Z][0-9a-zA-Z_]* *[,;=]' |\
 sed 's/.[* ]*\([a-zA-Z][0-9a-zA-Z_]*\) *[,;=]/\1/') \
 ; do
 echo $i | $GREP '^NULL$' && continue
-$EGREP $i' *(\+|-|\*|/|\||&|%|)=[^=]' $* >/dev/null || echo "possibly 
never written:"$i >> $TMP
+$EGREP $i' *(\+|-|\*|/|\||&|%)?=[^=]' $* >/dev/null || echo "possibly 
never written:"$i >> $TMP
 $EGREP '(=|\(|return).*'$i'(==|[^=])*$'$* >/dev/null || echo "possibly 
never read   :"$i >> $TMP
-$EGREP -o $i' *((\+|-|\*|/|\||&|%|)=[^=]|\+\+|--) *(0x|)[0-9]*(;|)'   $* |\
-   $EGREP -v $i' *= *(0x|)[0-9]{1,};'>/dev/null || echo "possibly 
constant :"$i >> $TMP
+$EGREP -o $i' *((\+|-|\*|/|\||&|%)?=[^=]|\+\+|--) *(0x)?[0-9]*;?'   $* |\
+   $EGREP -v $i' *= *(0x)?[0-9]{1,};'>/dev/null || echo "possibly 
constant :"$i >> $TMP
 done
 if test -e $TMP ; then
 printf '\npossibly unused variables\n'
@@ -151,7 +151,7 @@ cat $TMP | tr '@' '\n'
 cat $* | tr '\n' '@' | $EGREP --color=always -o '\+ *if *\( *([A-Za-z0-9_]*) 
*[<>]=? *([A-Za-z0-9_]*) *\)[ @\\+]*(\1|\2) *= *(\1|\2) *;'  >$TMP && printf 
"\nFFMIN/FFMAX\n"
 cat $TMP | tr '@' '\n'
 
-cat $* | tr '\n' '@' | $EGREP --color=always -o '\+ *if *\( *([A-Za-z0-9_]*) 
*\)[ @\\+]*av_free(p|) *\( *(&|) *\1[^-.]'  >$TMP && printf "\nav_free(NULL) is 
safe\n"
+cat $* | tr '\n' '@' | $EGREP --color=always -o '\+ *if *\( *([A-Za-z0-9_]*) 
*\)[ @\\+]*av_freep? *\( *&? *\1[^-.]'  >$TMP && printf "\nav_free(NULL) is 
safe\n"
 cat $TMP | tr '@' '\n'
 
 cat $* | tr '\n' '@' | $EGREP --color=always -o '[^a-zA-Z0-9_]([a-zA-Z0-9_]*) 
*= *av_malloc *\([^)]*\)[ @;\\+]*memset *\( *\1'  >$TMP && printf 
"\nav_mallocz()\n"
-- 
2.37.1 (Apple Git-137.1)

___
ffmpeg-devel mailing list

[FFmpeg-devel] [PATCH] libavformat: fix incorrect handling of incomplete AVBPrint.

2023-07-27 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Change some internal APIs a bit to make it harder to make
such mistakes.
In particular, have the read chunk functions return an error
when the result is incomplete.
This might be less flexible, but since there has been no
use-case for that so far, avoiding coding mistakes seems better.
Add a function to queue an AVBPrint directly (ff_subtitles_queue_insert_bprint).
Also fixes a leak in lrcdec when ff_subtitles_queue_insert fails.

Signed-off-by: Reimar Döffinger 
---
 libavformat/assdec.c |  4 +++-
 libavformat/lrcdec.c |  7 ++-
 libavformat/mpsubdec.c   |  5 +++--
 libavformat/realtextdec.c|  6 +-
 libavformat/samidec.c|  6 +-
 libavformat/srtdec.c |  4 +++-
 libavformat/subtitles.c  | 19 +++
 libavformat/subtitles.h  | 14 --
 libavformat/tedcaptionsdec.c |  2 +-
 libavformat/webvttdec.c  |  4 +++-
 10 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/libavformat/assdec.c b/libavformat/assdec.c
index 0915f6fafd..bf7b8a73a2 100644
--- a/libavformat/assdec.c
+++ b/libavformat/assdec.c
@@ -73,6 +73,8 @@ static int read_dialogue(ASSContext *ass, AVBPrint *dst, 
const uint8_t *p,
 
 av_bprint_clear(dst);
 av_bprintf(dst, "%u,%d,%s", ass->readorder++, layer, p + pos);
+if (!av_bprint_is_complete(dst))
+return AVERROR(ENOMEM);
 
 /* right strip the buffer */
 while (dst->len > 0 &&
@@ -135,7 +137,7 @@ static int ass_read_header(AVFormatContext *s)
 av_bprintf(, "%s", line.str);
 continue;
 }
-sub = ff_subtitles_queue_insert(>q, rline.str, rline.len, 0);
+sub = ff_subtitles_queue_insert_bprint(>q, , 0);
 if (!sub) {
 res = AVERROR(ENOMEM);
 goto end;
diff --git a/libavformat/lrcdec.c b/libavformat/lrcdec.c
index fff39495f8..83bb4a4b75 100644
--- a/libavformat/lrcdec.c
+++ b/libavformat/lrcdec.c
@@ -171,6 +171,8 @@ static int lrc_read_header(AVFormatContext *s)
 
 while(!avio_feof(s->pb)) {
 int64_t pos = read_line(, s->pb);
+if (!av_bprint_is_complete())
+goto err_nomem_out;
 int64_t header_offset = find_header(line.str);
 if(header_offset >= 0) {
 char *comma_offset = strchr(line.str, ':');
@@ -205,7 +207,7 @@ static int lrc_read_header(AVFormatContext *s)
 sub = ff_subtitles_queue_insert(>q, line.str + 
ts_strlength,
 line.len - ts_strlength, 0);
 if (!sub)
-return AVERROR(ENOMEM);
+goto err_nomem_out;
 sub->pos = pos;
 sub->pts = ts_start - lrc->ts_offset;
 sub->duration = -1;
@@ -216,6 +218,9 @@ static int lrc_read_header(AVFormatContext *s)
 ff_metadata_conv_ctx(s, NULL, ff_lrc_metadata_conv);
 av_bprint_finalize(, NULL);
 return 0;
+err_nomem_out:
+av_bprint_finalize(, NULL);
+return AVERROR(ENOMEM);
 }
 
 const AVInputFormat ff_lrc_demuxer = {
diff --git a/libavformat/mpsubdec.c b/libavformat/mpsubdec.c
index d290a41fb9..0374563575 100644
--- a/libavformat/mpsubdec.c
+++ b/libavformat/mpsubdec.c
@@ -116,9 +116,10 @@ static int mpsub_read_header(AVFormatContext *s)
 AVPacket *sub;
 const int64_t pos = avio_tell(s->pb);
 
-ff_subtitles_read_chunk(s->pb, );
+res = ff_subtitles_read_chunk(s->pb, );
+if (res < 0) goto end;
 if (buf.len) {
-sub = ff_subtitles_queue_insert(>q, buf.str, buf.len, 
0);
+sub = ff_subtitles_queue_insert_bprint(>q, , 0);
 if (!sub) {
 res = AVERROR(ENOMEM);
 goto end;
diff --git a/libavformat/realtextdec.c b/libavformat/realtextdec.c
index c281dec346..7992a5b7fc 100644
--- a/libavformat/realtextdec.c
+++ b/libavformat/realtextdec.c
@@ -80,6 +80,10 @@ static int realtext_read_header(AVFormatContext *s)
 const int64_t pos = ff_text_pos() - (c != 0);
 int n = ff_smil_extract_next_text_chunk(, , );
 
+if (n < 0) {
+res = n;
+goto end;
+}
 if (n == 0)
 break;
 
@@ -103,7 +107,7 @@ static int realtext_read_header(AVFormatContext *s)
 /* if we just read a  tag, introduce a new event, otherwise 
merge
  * with the previous one */
 int merge = !av_strncasecmp(buf.str, "q, buf.str, buf.len, merge);
+sub = ff_subtitles_queue_insert_bprint(>q, , merge);
 if (!sub) {
 res = AVERROR(ENOMEM);
 goto end;
diff --git a/libavformat/samidec.c b/libavformat/samidec.c
index 0da299343d..070b623ebf 100644
--- a/libavformat/samidec.c
+++ b/libavformat/samidec.c
@@ -68,6 +68,10 @@ static int sami_read_header(AVFormatContext *s)
 const int64_t pos = ff_text_pos() - (c != 0);

[FFmpeg-devel] [PATCH] hevcdsp_idct_neon.S: Avoid unnecessary mov.

2023-07-26 Thread Reimar . Doeffinger
From: Reimar Döffinger 

ret can be given an argument instead.
This is also consistent with how other assembler code
in FFmpeg does it.
---
 libavcodec/aarch64/hevcdsp_idct_neon.S | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index b7f23386a4..f7142c939c 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -617,8 +617,7 @@ function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
 
 add  sp,  sp,  #640
 
-mov x30, x15
-ret
+ret x15
 endfunc
 .endm
 
@@ -814,8 +813,7 @@ function ff_hevc_idct_32x32_\bitdepth\()_neon, export=1
 .endr
 
 add sp,  sp,  #2432
-mov x30, x15
-ret
+ret x15
 endfunc
 .endm
 
-- 
2.37.1 (Apple Git-137.1)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] libavformat: fix incorrect handling of incomplete AVBPrint.

2023-07-23 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Change some internal APIs a bit to make it harder to make
such mistakes.
In particular, have the read chunk functions return an error
when the result is incomplete.
This might be less flexible, but since there has been no
use-case for that so far, avoiding coding mistakes seems better.
Add a function to queue an AVBPrint directly (ff_subtitles_queue_insert_bprint).
Also fixes a leak in lrcdec when ff_subtitles_queue_insert fails.
---
 libavformat/assdec.c |  4 +++-
 libavformat/lrcdec.c |  7 ++-
 libavformat/mpsubdec.c   |  5 +++--
 libavformat/realtextdec.c|  6 +-
 libavformat/samidec.c|  6 +-
 libavformat/srtdec.c |  4 +++-
 libavformat/subtitles.c  | 17 +
 libavformat/subtitles.h  | 14 --
 libavformat/tedcaptionsdec.c |  2 +-
 libavformat/webvttdec.c  |  4 +++-
 10 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/libavformat/assdec.c b/libavformat/assdec.c
index 0915f6fafd..bf7b8a73a2 100644
--- a/libavformat/assdec.c
+++ b/libavformat/assdec.c
@@ -73,6 +73,8 @@ static int read_dialogue(ASSContext *ass, AVBPrint *dst, 
const uint8_t *p,
 
 av_bprint_clear(dst);
 av_bprintf(dst, "%u,%d,%s", ass->readorder++, layer, p + pos);
+if (!av_bprint_is_complete(dst))
+return AVERROR(ENOMEM);
 
 /* right strip the buffer */
 while (dst->len > 0 &&
@@ -135,7 +137,7 @@ static int ass_read_header(AVFormatContext *s)
 av_bprintf(, "%s", line.str);
 continue;
 }
-sub = ff_subtitles_queue_insert(>q, rline.str, rline.len, 0);
+sub = ff_subtitles_queue_insert_bprint(>q, , 0);
 if (!sub) {
 res = AVERROR(ENOMEM);
 goto end;
diff --git a/libavformat/lrcdec.c b/libavformat/lrcdec.c
index fff39495f8..83bb4a4b75 100644
--- a/libavformat/lrcdec.c
+++ b/libavformat/lrcdec.c
@@ -171,6 +171,8 @@ static int lrc_read_header(AVFormatContext *s)
 
 while(!avio_feof(s->pb)) {
 int64_t pos = read_line(, s->pb);
+if (!av_bprint_is_complete())
+goto err_nomem_out;
 int64_t header_offset = find_header(line.str);
 if(header_offset >= 0) {
 char *comma_offset = strchr(line.str, ':');
@@ -205,7 +207,7 @@ static int lrc_read_header(AVFormatContext *s)
 sub = ff_subtitles_queue_insert(>q, line.str + 
ts_strlength,
 line.len - ts_strlength, 0);
 if (!sub)
-return AVERROR(ENOMEM);
+goto err_nomem_out;
 sub->pos = pos;
 sub->pts = ts_start - lrc->ts_offset;
 sub->duration = -1;
@@ -216,6 +218,9 @@ static int lrc_read_header(AVFormatContext *s)
 ff_metadata_conv_ctx(s, NULL, ff_lrc_metadata_conv);
 av_bprint_finalize(, NULL);
 return 0;
+err_nomem_out:
+av_bprint_finalize(, NULL);
+return AVERROR(ENOMEM);
 }
 
 const AVInputFormat ff_lrc_demuxer = {
diff --git a/libavformat/mpsubdec.c b/libavformat/mpsubdec.c
index d290a41fb9..0374563575 100644
--- a/libavformat/mpsubdec.c
+++ b/libavformat/mpsubdec.c
@@ -116,9 +116,10 @@ static int mpsub_read_header(AVFormatContext *s)
 AVPacket *sub;
 const int64_t pos = avio_tell(s->pb);
 
-ff_subtitles_read_chunk(s->pb, );
+res = ff_subtitles_read_chunk(s->pb, );
+if (res < 0) goto end;
 if (buf.len) {
-sub = ff_subtitles_queue_insert(>q, buf.str, buf.len, 
0);
+sub = ff_subtitles_queue_insert_bprint(>q, , 0);
 if (!sub) {
 res = AVERROR(ENOMEM);
 goto end;
diff --git a/libavformat/realtextdec.c b/libavformat/realtextdec.c
index c281dec346..7992a5b7fc 100644
--- a/libavformat/realtextdec.c
+++ b/libavformat/realtextdec.c
@@ -80,6 +80,10 @@ static int realtext_read_header(AVFormatContext *s)
 const int64_t pos = ff_text_pos() - (c != 0);
 int n = ff_smil_extract_next_text_chunk(, , );
 
+if (n < 0) {
+res = n;
+goto end;
+}
 if (n == 0)
 break;
 
@@ -103,7 +107,7 @@ static int realtext_read_header(AVFormatContext *s)
 /* if we just read a  tag, introduce a new event, otherwise 
merge
  * with the previous one */
 int merge = !av_strncasecmp(buf.str, "q, buf.str, buf.len, merge);
+sub = ff_subtitles_queue_insert_bprint(>q, , merge);
 if (!sub) {
 res = AVERROR(ENOMEM);
 goto end;
diff --git a/libavformat/samidec.c b/libavformat/samidec.c
index 0da299343d..070b623ebf 100644
--- a/libavformat/samidec.c
+++ b/libavformat/samidec.c
@@ -68,6 +68,10 @@ static int sami_read_header(AVFormatContext *s)
 const int64_t pos = ff_text_pos() - (c != 0);
 int is_sync, is_body, n = 

[FFmpeg-devel] [PATCH] libavformat: fix incorrect handling of incomplete AVBPrint.

2023-06-22 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Change some internal APIs a bit to make it harder to make
such mistakes.
In particular, have the read chunk functions return an error
when the result is incomplete.
This might be less flexible, but since there has been no
use-case for that so far, avoiding coding mistakes seems better.
Add a function to queue an AVBPrint directly (ff_subtitles_queue_insert_bprint).
Also fixes a leak in lrcdec when ff_subtitles_queue_insert fails.
---
Note that this combines a few different things, but they all
are meant to address the same issue.
Happy to split if that's wanted, first priority is getting an
idea if some part of this seems like a bad idea generally.

 libavformat/assdec.c |  4 +++-
 libavformat/lrcdec.c |  7 ++-
 libavformat/mpsubdec.c   |  5 +++--
 libavformat/realtextdec.c|  7 ++-
 libavformat/samidec.c|  7 ++-
 libavformat/srtdec.c |  4 +++-
 libavformat/subtitles.c  | 17 +
 libavformat/subtitles.h  | 14 --
 libavformat/tedcaptionsdec.c |  2 +-
 libavformat/webvttdec.c  |  4 +++-
 10 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/libavformat/assdec.c b/libavformat/assdec.c
index 0915f6fafd..bf7b8a73a2 100644
--- a/libavformat/assdec.c
+++ b/libavformat/assdec.c
@@ -73,6 +73,8 @@ static int read_dialogue(ASSContext *ass, AVBPrint *dst, 
const uint8_t *p,
 
 av_bprint_clear(dst);
 av_bprintf(dst, "%u,%d,%s", ass->readorder++, layer, p + pos);
+if (!av_bprint_is_complete(dst))
+return AVERROR(ENOMEM);
 
 /* right strip the buffer */
 while (dst->len > 0 &&
@@ -135,7 +137,7 @@ static int ass_read_header(AVFormatContext *s)
 av_bprintf(, "%s", line.str);
 continue;
 }
-sub = ff_subtitles_queue_insert(>q, rline.str, rline.len, 0);
+sub = ff_subtitles_queue_insert_bprint(>q, , 0);
 if (!sub) {
 res = AVERROR(ENOMEM);
 goto end;
diff --git a/libavformat/lrcdec.c b/libavformat/lrcdec.c
index fff39495f8..83bb4a4b75 100644
--- a/libavformat/lrcdec.c
+++ b/libavformat/lrcdec.c
@@ -171,6 +171,8 @@ static int lrc_read_header(AVFormatContext *s)
 
 while(!avio_feof(s->pb)) {
 int64_t pos = read_line(, s->pb);
+if (!av_bprint_is_complete())
+goto err_nomem_out;
 int64_t header_offset = find_header(line.str);
 if(header_offset >= 0) {
 char *comma_offset = strchr(line.str, ':');
@@ -205,7 +207,7 @@ static int lrc_read_header(AVFormatContext *s)
 sub = ff_subtitles_queue_insert(>q, line.str + 
ts_strlength,
 line.len - ts_strlength, 0);
 if (!sub)
-return AVERROR(ENOMEM);
+goto err_nomem_out;
 sub->pos = pos;
 sub->pts = ts_start - lrc->ts_offset;
 sub->duration = -1;
@@ -216,6 +218,9 @@ static int lrc_read_header(AVFormatContext *s)
 ff_metadata_conv_ctx(s, NULL, ff_lrc_metadata_conv);
 av_bprint_finalize(, NULL);
 return 0;
+err_nomem_out:
+av_bprint_finalize(, NULL);
+return AVERROR(ENOMEM);
 }
 
 const AVInputFormat ff_lrc_demuxer = {
diff --git a/libavformat/mpsubdec.c b/libavformat/mpsubdec.c
index d290a41fb9..0374563575 100644
--- a/libavformat/mpsubdec.c
+++ b/libavformat/mpsubdec.c
@@ -116,9 +116,10 @@ static int mpsub_read_header(AVFormatContext *s)
 AVPacket *sub;
 const int64_t pos = avio_tell(s->pb);
 
-ff_subtitles_read_chunk(s->pb, );
+res = ff_subtitles_read_chunk(s->pb, );
+if (res < 0) goto end;
 if (buf.len) {
-sub = ff_subtitles_queue_insert(>q, buf.str, buf.len, 
0);
+sub = ff_subtitles_queue_insert_bprint(>q, , 0);
 if (!sub) {
 res = AVERROR(ENOMEM);
 goto end;
diff --git a/libavformat/realtextdec.c b/libavformat/realtextdec.c
index c281dec346..9f6aab789e 100644
--- a/libavformat/realtextdec.c
+++ b/libavformat/realtextdec.c
@@ -80,6 +80,11 @@ static int realtext_read_header(AVFormatContext *s)
 const int64_t pos = ff_text_pos() - (c != 0);
 int n = ff_smil_extract_next_text_chunk(, , );
 
+if (n < 0)
+{
+res = n;
+goto end;
+}
 if (n == 0)
 break;
 
@@ -103,7 +108,7 @@ static int realtext_read_header(AVFormatContext *s)
 /* if we just read a  tag, introduce a new event, otherwise 
merge
  * with the previous one */
 int merge = !av_strncasecmp(buf.str, "q, buf.str, buf.len, merge);
+sub = ff_subtitles_queue_insert_bprint(>q, , merge);
 if (!sub) {
 res = AVERROR(ENOMEM);
 goto end;
diff --git a/libavformat/samidec.c b/libavformat/samidec.c
index 

[FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way.

2022-10-09 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Currently it is done in several different ways, which
might cause needless dependencies or, in the case of
tx_float_neon.S, is incorrect.

Signed-off-by: Reimar Döffinger 
---
 libavcodec/aarch64/fft_neon.S  |  3 +-
 libavcodec/aarch64/h264idct_neon.S |  6 +-
 libavcodec/aarch64/hevcdsp_sao_neon.S  |  3 +-
 libavcodec/aarch64/mdct_neon.S | 18 ++
 libavcodec/aarch64/me_cmp_neon.S   |  6 +-
 libavcodec/aarch64/synth_filter_neon.S |  3 +-
 libavcodec/aarch64/vp9itxfm_neon.S | 28 -
 libavcodec/aarch64/vp9lpf_16bpp_neon.S | 32 +--
 libavcodec/aarch64/vp9lpf_neon.S   | 80 +-
 libavutil/aarch64/tx_float_neon.S  | 52 -
 10 files changed, 109 insertions(+), 122 deletions(-)

diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index 9ff3f9c526..d7225511dd 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -342,8 +342,7 @@ endfunc
 function fft\n\()_neon, align=6
 AARCH64_VALID_JUMP_TARGET
 AARCH64_SIGN_LINK_REGISTER
-sub sp,  sp,  #16
-stp x28, x30, [sp]
+stp x28, x30, [sp, #-16]!
 add x28, x0,  #\n4*2*8
 bl  fft\n2\()_neon
 mov x0,  x28
diff --git a/libavcodec/aarch64/h264idct_neon.S 
b/libavcodec/aarch64/h264idct_neon.S
index 7d2879b0ce..375da31d65 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -157,8 +157,7 @@ function ff_h264_idct_add16intra_neon, export=1
 endfunc
 
 function ff_h264_idct_add8_neon, export=1
-sub sp,  sp, #0x40
-stp x19, x20, [sp]
+stp x19, x20, [sp, #-0x40]!
 mov x12, x30
 ldp x6,  x15, [x0]  // dest[0], dest[1]
 add x5,  x1,  #16*4 // block_offset
@@ -187,8 +186,7 @@ function ff_h264_idct_add8_neon, export=1
 cselx6,  x15, x6,  eq
 cmp x10, #20
 b.lt1b
-ldp x19, x20, [sp]
-add sp,  sp,  #0x40
+ldp x19, x20, [sp], #0x40
 ret x12
 endfunc
 
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index d4decfde3b..30e83dda5d 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -33,8 +33,7 @@
 //  int16_t *sao_offset_val, int sao_left_class,
 //  int width, int height)
 function ff_hevc_sao_band_filter_8x8_8_neon, export=1
-sub sp,  sp, #64
-stp xzr, xzr, [sp]
+stp xzr, xzr, [sp, #-64]!
 stp xzr, xzr, [sp, #16]
 stp xzr, xzr, [sp, #32]
 stp xzr, xzr, [sp, #48]
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
index 6091e72022..98b09bf1ab 100644
--- a/libavcodec/aarch64/mdct_neon.S
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -23,8 +23,7 @@
 #include "libavutil/aarch64/asm.S"
 
 function ff_imdct_half_neon, export=1
-sub sp,  sp,  #32
-stp x19, x20, [sp]
+stp x19, x20, [sp, #-32]!
 AARCH64_SIGN_LINK_REGISTER
 str x30, [sp, #16]
 mov x12, #1
@@ -120,17 +119,15 @@ function ff_imdct_half_neon, export=1
 st2 {v4.2s,v5.2s},  [x0]
 st2 {v6.2s,v7.2s},  [x8]
 
-ldp x19, x20, [sp]
 ldr x30, [sp, #16]
 AARCH64_VALIDATE_LINK_REGISTER
-add sp,  sp,  #32
+ldp x19, x20, [sp], #32
 
 ret
 endfunc
 
 function ff_imdct_calc_neon, export=1
-sub sp,  sp,  #32
-stp x19, x20, [sp]
+stp x19, x20, [sp, #-32]!
 AARCH64_SIGN_LINK_REGISTER
 str x30, [sp, #16]
 ldr w3,  [x0, #28]  // mdct_bits
@@ -163,18 +160,16 @@ function ff_imdct_calc_neon, export=1
 subsx19, x19,  #16
 b.gt1b
 
-ldp x19, x20, [sp]
 ldr x30, [sp, #16]
 AARCH64_VALIDATE_LINK_REGISTER
-add sp,  sp,  #32
+ldp x19, x20, [sp], #32
 
 ret
 endfunc
 
 
 function ff_mdct_calc_neon, export=1
-sub sp,  sp,  #32
-stp x19, x20, [sp]
+stp x19, x20, [sp, #-32]!
 AARCH64_SIGN_LINK_REGISTER
 str x30, [sp, #16]
 
@@ -323,10 +318,9 @@ function ff_mdct_calc_neon, export=1
 st2 {v4.2s,v5.2s},  [x0]
 st2 {v6.2s,v7.2s},  [x8]
 
-ldp x19, x20, [sp]
 ldr 

[FFmpeg-devel] [PATCH] libavcodec/hevcdsp: port SIMD idct functions from 32-bit.

2021-01-15 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Makes SIMD-optimized 8x8 and 16x16 idcts for 8 and 10 bit depth
available on aarch64.
For a UHD HDR (10 bit) sample video these were consuming the most time
and this optimization reduced overall decode time from 19.4s to 16.4s,
approximately 15% speedup.
Test sample was the first 300 frames of "LG 4K HDR Demo - New York.ts",
running on Apple M1.
---
 libavcodec/aarch64/Makefile   |   2 +
 libavcodec/aarch64/hevcdsp_idct_neon.S| 380 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  45 +++
 libavcodec/hevcdsp.c  |   2 +
 libavcodec/hevcdsp.h  |   1 +
 5 files changed, 430 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S
 create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index f6434e40da..2ea1d74a38 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -61,3 +61,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9lpf_neon.o   
\
aarch64/vp9mc_16bpp_neon.o  
\
aarch64/vp9mc_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
+   aarch64/hevcdsp_init_aarch64.o
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
new file mode 100644
index 00..4aac205e22
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -0,0 +1,380 @@
+/*
+ * ARM NEON optimised IDCT functions for HEVC decoding
+ * Copyright (c) 2014 Seppo Tomperi 
+ * Copyright (c) 2017 Alexandra Hájková
+ *
+ * Ported from arm/hevcdsp_idct_neon.S by
+ * Copyright (c) 2020 Reimar Döffinger
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const trans, align=4
+.short 64, 83, 64, 36
+.short 89, 75, 50, 18
+.short 90, 87, 80, 70
+.short 57, 43, 25, 9
+.short 90, 90, 88, 85
+.short 82, 78, 73, 67
+.short 61, 54, 46, 38
+.short 31, 22, 13, 4
+endconst
+
+.macro sum_sub out, in, c, op, p
+  .ifc \op, +
+smlal\p \out, \in, \c
+  .else
+smlsl\p \out, \in, \c
+  .endif
+.endm
+
+.macro fixsqrshrn d, dt, n, m
+  .ifc \dt, .8H
+sqrshrn2\d\dt, \n\().4S, \m
+  .else
+sqrshrn \n\().4H, \n\().4S, \m
+mov \d\().D[0], \n\().D[0]
+  .endif
+.endm
+
+// uses and clobbers v28-v31 as temp registers
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
+ sshll\p1   v28.4S, \in0, #6
+ movv29.16B, v28.16B
+ smull\p1   v30.4S, \in1, v0.H[1]
+ smull\p1   v31.4S, \in1, v0.H[3]
+ smlal\p2   v28.4S, \in2, v0.H[0] //e0
+ smlsl\p2   v29.4S, \in2, v0.H[0] //e1
+ smlal\p2   v30.4S, \in3, v0.H[3] //o0
+ smlsl\p2   v31.4S, \in3, v0.H[1] //o1
+
+ add\out0, v28.4S, v30.4S
+ add\out1, v29.4S, v31.4S
+ sub\out2, v29.4S, v31.4S
+ sub\out3, v28.4S, v30.4S
+.endm
+
+.macro transpose8_4x4 r0, r1, r2, r3
+trn1v2.8H, \r0\().8H, \r1\().8H
+trn2v3.8H, \r0\().8H, \r1\().8H
+trn1v4.8H, \r2\().8H, \r3\().8H
+trn2v5.8H, \r2\().8H, \r3\().8H
+trn1\r0\().4S, v2.4S, v4.4S
+trn2\r2\().4S, v2.4S, v4.4S
+trn1\r1\().4S, v3.4S, v5.4S
+trn2\r3\().4S, v3.4S, v5.4S
+.endm
+
+.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
+transpose8_4x4  \r0, \r1, \r2, \r3
+transpose8_4x4  \r4, \r5, \r6, \r7
+.endm
+
+.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, 
in5,in5t, in6,in6t, in7,in7t, p1, p2
+tr_4x4_8\in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4S, 
v25.4S, v26.4S, v27.4S, \p1, \p2
+
+smull\p1v30.4S, \in1\in1t, v0.H[6]
+smull\p1 

[FFmpeg-devel] [PATCH] configure: Set MSVC as_default later.

2021-01-15 Thread Reimar . Doeffinger
From: Reimar Döffinger 

It would get immediately overridden to $cc, which, if
gas-preprocessor is missing, would result in it trying
to use cl.exe for asm files instead of erroring out.
This is because cl.exe does not fail but just prints a warning
when it is given a file it does not know what to do with.
---
 configure | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/configure b/configure
index 12b41cde1c..d3b665f6f9 100755
--- a/configure
+++ b/configure
@@ -4271,14 +4271,6 @@ case "$toolchain" in
 ld_default="$source_path/compat/windows/mslink"
 nm_default="dumpbin.exe -symbols"
 ar_default="lib.exe"
-case "$arch" in
-aarch64|arm64)
-as_default="armasm64.exe"
-;;
-arm*)
-as_default="armasm.exe"
-;;
-esac
 target_os_default="win32"
 # Use a relative path for TMPDIR. This makes sure all the
 # ffconf temp files are written with a relative path, avoiding
@@ -4720,6 +4712,14 @@ probe_cc(){
 _ld_path='-libpath:'
 elif $_cc -nologo- 2>&1 | grep -q Microsoft || { $_cc -v 2>&1 | grep -q 
clang && $_cc -? > /dev/null 2>&1; }; then
 _type=msvc
+case "$arch" in
+aarch64|arm64)
+as_default="armasm64.exe"
+;;
+arm*)
+as_default="armasm.exe"
+;;
+esac
 if $_cc -nologo- 2>&1 | grep -q Microsoft; then
 _ident=$($_cc 2>&1 | head -n1 | tr -d '\r')
 else
-- 
2.24.3 (Apple Git-128)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] libavcodec/hevcdsp: port SIMD idct functions from 32-bit.

2021-01-12 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Makes SIMD-optimized 8x8 and 16x16 idcts for 8 and 10 bit depth
available on aarch64.
For a UHD HDR (10 bit) sample video these were consuming the most time
and this optimization reduced overall decode time from 19.4s to 16.4s,
approximately 15% speedup.
Test sample was the first 300 frames of "LG 4K HDR Demo - New York.ts",
running on Apple M1.
---
 libavcodec/aarch64/Makefile   |   2 +
 libavcodec/aarch64/hevcdsp_idct_neon.S| 423 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  45 +++
 libavcodec/hevcdsp.c  |   2 +
 libavcodec/hevcdsp.h  |   1 +
 5 files changed, 473 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S
 create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index f6434e40da..2ea1d74a38 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -61,3 +61,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9lpf_neon.o   
\
aarch64/vp9mc_16bpp_neon.o  
\
aarch64/vp9mc_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
+   aarch64/hevcdsp_init_aarch64.o
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
new file mode 100644
index 00..6b42f6ca3a
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -0,0 +1,423 @@
+/*
+ * ARM NEON optimised IDCT functions for HEVC decoding
+ * Copyright (c) 2014 Seppo Tomperi 
+ * Copyright (c) 2017 Alexandra Hájková
+ *
+ * Ported from arm/hevcdsp_idct_neon.S by
+ * Copyright (c) 2020 Reimar Döffinger
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const trans, align=4
+.short 64, 83, 64, 36
+.short 89, 75, 50, 18
+.short 90, 87, 80, 70
+.short 57, 43, 25, 9
+.short 90, 90, 88, 85
+.short 82, 78, 73, 67
+.short 61, 54, 46, 38
+.short 31, 22, 13, 4
+endconst
+
+.macro sum_sub out, in, c, op, p
+  .ifc \op, +
+smlal\p \out, \in, \c
+  .else
+smlsl\p \out, \in, \c
+  .endif
+.endm
+
+.macro fixsqrshrn d, dt, n, m
+  .ifc \dt, .8H
+sqrshrn2\d\dt, \n\().4S, \m
+  .else
+sqrshrn \n\().4H, \n\().4S, \m
+mov \d\().D[0], \n\().D[0]
+  .endif
+.endm
+
+// uses and clobbers v28-v31 as temp registers
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
+ sshll\p1   v28.4S, \in0, #6
+ movv29.16B, v28.16B
+ smull\p1   v30.4S, \in1, v0.H[1]
+ smull\p1   v31.4S, \in1, v0.H[3]
+ smlal\p2   v28.4S, \in2, v0.H[0] //e0
+ smlsl\p2   v29.4S, \in2, v0.H[0] //e1
+ smlal\p2   v30.4S, \in3, v0.H[3] //o0
+ smlsl\p2   v31.4S, \in3, v0.H[1] //o1
+
+ add\out0, v28.4S, v30.4S
+ add\out1, v29.4S, v31.4S
+ sub\out2, v29.4S, v31.4S
+ sub\out3, v28.4S, v30.4S
+.endm
+
+.macro transpose8_4x4 r0, r1, r2, r3
+trn1v2.8H, \r0\().8H, \r1\().8H
+trn2v3.8H, \r0\().8H, \r1\().8H
+trn1v4.8H, \r2\().8H, \r3\().8H
+trn2v5.8H, \r2\().8H, \r3\().8H
+trn1\r0\().4S, v2.4S, v4.4S
+trn2\r2\().4S, v2.4S, v4.4S
+trn1\r1\().4S, v3.4S, v5.4S
+trn2\r3\().4S, v3.4S, v5.4S
+.endm
+
+.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
+transpose8_4x4  \r0, \r1, \r2, \r3
+transpose8_4x4  \r4, \r5, \r6, \r7
+.endm
+
+.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, 
in5,in5t, in6,in6t, in7,in7t, p1, p2
+tr_4x4_8\in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4S, 
v25.4S, v26.4S, v27.4S, \p1, \p2
+
+smull\p1v30.4S, \in1\in1t, v0.H[6]
+smull\p1 

[FFmpeg-devel] [PATCH] libswscale/aarch64/hscale.S: Support more bit-depth variants.

2021-01-10 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Trivially expand hscale assembler to support > 8 bit formats
both for input and output.
16-bit input is not supported as I am not certain how to
get sufficient test coverage.
---
 libswscale/aarch64/hscale.S  | 53 ++--
 libswscale/aarch64/swscale.c | 49 +++--
 2 files changed, 85 insertions(+), 17 deletions(-)

diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
index af55ffe2b7..3b42d39dac 100644
--- a/libswscale/aarch64/hscale.S
+++ b/libswscale/aarch64/hscale.S
@@ -20,7 +20,11 @@
 
 #include "libavutil/aarch64/asm.S"
 
-function ff_hscale_8_to_15_neon, export=1
+.macro hscale srcbits, dstbits, ldt, lds, c
+function ff_hscale_\srcbits\()_to_\dstbits\()_neon, export=1
+.if \dstbits >= 16
+moviv20.4S, #(0x1 << (\dstbits - 16)), msl #16
+.endif
 sbfiz   x7, x6, #1, #32 // filterSize*2 (*2 
because int16)
 1:  ldr w8, [x5], #4// filterPos[idx]
 ldr w0, [x5], #4// filterPos[idx + 1]
@@ -34,30 +38,30 @@ function ff_hscale_8_to_15_neon, export=1
 moviv1.2D, #0   // val sum part 2 (for 
dst[1])
 moviv2.2D, #0   // val sum part 3 (for 
dst[2])
 moviv3.2D, #0   // val sum part 4 (for 
dst[3])
-add x17, x3, w8, UXTW   // srcp + filterPos[0]
-add x8,  x3, w0, UXTW   // srcp + filterPos[1]
-add x0, x3, w11, UXTW   // srcp + filterPos[2]
-add x11, x3, w9, UXTW   // srcp + filterPos[3]
+add x17, x3, w8, UXTW #!!(\srcbits > 8) // srcp + 
filterPos[0]
+add x8,  x3, w0, UXTW #!!(\srcbits > 8) // srcp + 
filterPos[1]
+add x0, x3, w11, UXTW #!!(\srcbits > 8) // srcp + 
filterPos[2]
+add x11, x3, w9, UXTW #!!(\srcbits > 8) // srcp + 
filterPos[3]
 mov w15, w6 // filterSize counter
-2:  ld1 {v4.8B}, [x17], #8  // srcp[filterPos[0] + 
{0..7}]
+2:  ld1 {v4.\ldt}, [x17], \lds  // srcp[filterPos[0] + 
{0..7}]
 ld1 {v5.8H}, [x16], #16 // load 8x16-bit 
filter values, part 1
-ld1 {v6.8B}, [x8], #8   // srcp[filterPos[1] + 
{0..7}]
+ld1 {v6.\ldt}, [x8], \lds   // srcp[filterPos[1] + 
{0..7}]
 ld1 {v7.8H}, [x12], #16 // load 8x16-bit at 
filter+filterSize
-uxtlv4.8H, v4.8B// unpack part 1 to 
16-bit
+\c\cuxtlv4.8H, v4.8B// unpack part 1 to 
16-bit
 smlal   v0.4S, v4.4H, v5.4H // v0 accumulates 
srcp[filterPos[0] + {0..3}] * filter[{0..3}]
 smlal2  v0.4S, v4.8H, v5.8H // v0 accumulates 
srcp[filterPos[0] + {4..7}] * filter[{4..7}]
-ld1 {v16.8B}, [x0], #8  // srcp[filterPos[2] + 
{0..7}]
+ld1 {v16.\ldt}, [x0], \lds  // srcp[filterPos[2] + 
{0..7}]
 ld1 {v17.8H}, [x13], #16// load 8x16-bit at 
filter+2*filterSize
-uxtlv6.8H, v6.8B// unpack part 2 to 
16-bit
+\c\cuxtlv6.8H, v6.8B// unpack part 2 to 
16-bit
 smlal   v1.4S, v6.4H, v7.4H // v1 accumulates 
srcp[filterPos[1] + {0..3}] * filter[{0..3}]
-uxtlv16.8H, v16.8B  // unpack part 3 to 
16-bit
+\c\cuxtlv16.8H, v16.8B  // unpack part 3 to 
16-bit
 smlal   v2.4S, v16.4H, v17.4H   // v2 accumulates 
srcp[filterPos[2] + {0..3}] * filter[{0..3}]
 smlal2  v2.4S, v16.8H, v17.8H   // v2 accumulates 
srcp[filterPos[2] + {4..7}] * filter[{4..7}]
-ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + 
{0..7}]
+ld1 {v18.\ldt}, [x11], \lds // srcp[filterPos[3] + 
{0..7}]
 smlal2  v1.4S, v6.8H, v7.8H // v1 accumulates 
srcp[filterPos[1] + {4..7}] * filter[{4..7}]
 ld1 {v19.8H}, [x4], #16 // load 8x16-bit at 
filter+3*filterSize
 subsw15, w15, #8// j -= 8: processed 
8/filterSize
-uxtlv18.8H, v18.8B  // unpack part 4 to 
16-bit
+\c\cuxtlv18.8H, v18.8B  // unpack part 4 to 
16-bit
 smlal   v3.4S, v18.4H, v19.4H   // v3 accumulates 
srcp[filterPos[3] + {0..3}] * filter[{0..3}]
 smlal2  v3.4S, v18.8H, v19.8H   // v3 

[FFmpeg-devel] [PATCH] Add support for "omp simd" pragma.

2021-01-10 Thread Reimar . Doeffinger
From: Reimar Döffinger 

This requests loops to be vectorized using SIMD
instructions.
The performance increase is far from hand-optimized
assembly but still significant over the plain C version.
Typical values are a 2-4x speedup where a hand-written
version would achieve 4x-10x.
So it is far from a replacement, however some architectures
will get hand-written assembler quite late or not at all,
and this is a good improvement for a trivial amount of work.
The cause, besides the compiler being a compiler, is
usually that it does not manage to use saturating instructions
and thus has to use 32-bit operations where actually
saturating 16-bit operations would be sufficient.
Other causes are for example the av_clip functions that
are not ideal for vectorization (and even as scalar code
not optimal for any modern CPU that has either CSEL or
MAX/MIN instructions).
And of course this only works for relatively simple
loops, the IDCT functions for example seemed not possible
to optimize that way.
Also note that while clang may accept the code and sometimes
produces warnings, it does not seem to do anything actually
useful at all.
Here are example measurements using gcc 10 under Linux (in a VM unfortunately)
on AArch64 on Apple M1:
Command:
time ./ffplay_g LG\ 4K\ HDR\ Demo\ -\ New\ York.ts -t 10 -autoexit -threads 1 
-noframedrop

Original code:
real    0m19.572s
user    0m23.386s
sys     0m0.213s

Changing all put_hevc:
real    0m15.648s
user    0m19.503s (83.4% of original)
sys     0m0.186s

In addition changing add_residual:
real    0m15.424s
user    0m19.278s (82.4% of original)
sys     0m0.133s

In addition changing planar copy dither:
real    0m15.040s
user    0m18.874s (80.7% of original)
sys     0m0.168s

Signed-off-by: Reimar Döffinger 
---
 configure | 23 +
 libavcodec/hevcdsp_template.c | 47 +++
 libavutil/internal.h  |  6 +
 libswscale/swscale_unscaled.c |  3 +++
 4 files changed, 79 insertions(+)

diff --git a/configure b/configure
index 900505756b..73b7c3daeb 100755
--- a/configure
+++ b/configure
@@ -406,6 +406,7 @@ Toolchain options:
   --enable-pic build position-independent code
   --enable-thumb   compile for Thumb instruction set
   --enable-lto use link-time optimization
+  --enable-openmp-simd use the "omp simd" pragma to optimize code
   --env="ENV=override" override the environment variables
 
 Advanced options (experts only):
@@ -2335,6 +2336,7 @@ HAVE_LIST="
 opencl_dxva2
 opencl_vaapi_beignet
 opencl_vaapi_intel_media
+openmp_simd
 perl
 pod2man
 texi2html
@@ -2446,6 +2448,7 @@ CMDLINE_SELECT="
 extra_warnings
 logging
 lto
+openmp_simd
 optimizations
 rpath
 stripping
@@ -6926,6 +6929,26 @@ if enabled lto; then
 disable inline_asm_direct_symbol_refs
 fi
 
+if enabled openmp_simd; then
+ompopt="-fopenmp"
+if ! test_cflags $ompopt ; then
+test_cflags -Xpreprocessor -fopenmp && ompopt="-Xpreprocessor -fopenmp"
+fi
+test_cc $ompopt <> shift);
 src  += srcstride;
@@ -568,6 +573,7 @@ static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, 
ptrdiff_t _dststride,
 
 ox = ox * (1 << (BIT_DEPTH - 8));
 for (y = 0; y < height; y++) {
+FF_OMP_SIMD
 for (x = 0; x < width; x++)
 dst[x] = av_clip_pixelsrc[x] << (14 - BIT_DEPTH)) * wx + 
offset) >> shift) + ox);
 src += srcstride;
@@ -592,6 +598,7 @@ static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, 
ptrdiff_t _dststride,
 ox0 = ox0 * (1 << (BIT_DEPTH - 8));
 ox1 = ox1 * (1 << (BIT_DEPTH - 8));
 for (y = 0; y < height; y++) {
+FF_OMP_SIMD
 for (x = 0; x < width; x++) {
 dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + 
src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
 }
@@ -623,6 +630,7 @@ static void FUNC(put_hevc_qpel_h)(int16_t *dst,
 ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 const int8_t *filter= ff_hevc_qpel_filters[mx - 1];
 for (y = 0; y < height; y++) {
+FF_OMP_SIMD
 for (x = 0; x < width; x++)
 dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 src += srcstride;
@@ -639,6 +647,7 @@ static void FUNC(put_hevc_qpel_v)(int16_t *dst,
 ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 const int8_t *filter= ff_hevc_qpel_filters[my - 1];
 for (y = 0; y < height; y++)  {
+FF_OMP_SIMD
 for (x = 0; x < width; x++)
 dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
 src += srcstride;
@@ -662,6 +671,7 @@ static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
 src   -= QPEL_EXTRA_BEFORE * srcstride;
 filter = ff_hevc_qpel_filters[mx - 1];
 for (y = 0; y < height + QPEL_EXTRA; y++) {
+FF_OMP_SIMD
 for (x = 0; x < width; x++)
 tmp[x] = 

[FFmpeg-devel] [PATCH] libavcodec/aarch64/hevcdsp_idct_neon.S: Also port add_residual functions.

2021-01-10 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Speedup is fairly small, around 1.5%, but these are fairly simple.
---
 libavcodec/aarch64/hevcdsp_idct_neon.S| 190 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  24 +++
 2 files changed, 214 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 9f67e45..edd03a0 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -36,6 +36,196 @@ const trans, align=4
 .short 31, 22, 13, 4
 endconst
 
+.macro clip10 in1, in2, c1, c2
+smax\in1, \in1, \c1
+smax\in2, \in2, \c1
+smin\in1, \in1, \c2
+smin\in2, \in2, \c2
+.endm
+
+function ff_hevc_add_residual_4x4_8_neon, export=1
+ld1 {v0.8H-v1.8H}, [x1]
+ld1 {v2.S}[0], [x0], x2
+ld1 {v2.S}[1], [x0], x2
+ld1 {v2.S}[2], [x0], x2
+ld1 {v2.S}[3], [x0], x2
+sub x0, x0, x2, lsl #2
+uxtlv8.8H, v2.8B
+uxtl2   v9.8H, v2.16B
+sqadd   v0.8H, v0.8H, v8.8H
+sqadd   v1.8H, v1.8H, v9.8H
+sqxtun  v0.8B, v0.8H
+sqxtun2 v0.16B, v1.8H
+st1 {v0.S}[0], [x0], x2
+st1 {v0.S}[1], [x0], x2
+st1 {v0.S}[2], [x0], x2
+st1 {v0.S}[3], [x0], x2
+ret
+endfunc
+
+function ff_hevc_add_residual_4x4_10_neon, export=1
+mov x12, x0
+ld1 {v0.8H-v1.8H}, [x1]
+ld1 {v2.D}[0], [x12], x2
+ld1 {v2.D}[1], [x12], x2
+ld1 {v3.D}[0], [x12], x2
+sqadd   v0.8H, v0.8H, v2.8H
+ld1 {V3.D}[1], [x12], x2
+moviv4.8H, #0
+sqadd   v1.8H, v1.8H, v3.8H
+mvniv5.8H, #0xFC, LSL #8 // movi #0x3FF
+clip10  v0.8H, v1.8H, v4.8H, v5.8H
+st1 {v0.D}[0], [x0], x2
+st1 {v0.D}[1], [x0], x2
+st1 {v1.D}[0], [x0], x2
+st1 {v1.D}[1], [x0], x2
+ret
+endfunc
+
+function ff_hevc_add_residual_8x8_8_neon, export=1
+add x12, x0, x2
+add x2,  x2, x2
+mov x3,   #8
+1:  subsx3,   x3, #2
+ld1 {v2.D}[0],   [x0]
+ld1 {v2.D}[1],   [x12]
+uxtlv3.8H,   v2.8B
+ld1 {v0.8H-v1.8H}, [x1], #32
+uxtl2   v2.8H,   v2.16B
+sqadd   v0.8H,   v0.8H,   v3.8H
+sqadd   v1.8H,   v1.8H,   v2.8H
+sqxtun  v0.8B,   v0.8H
+sqxtun2 v0.16B,  v1.8H
+st1 {v0.D}[0],   [x0], x2
+st1 {v0.D}[1],   [x12], x2
+bne 1b
+ret
+endfunc
+
+function ff_hevc_add_residual_8x8_10_neon, export=1
+add x12, x0, x2
+add x2,  x2, x2
+mov x3,  #8
+moviv4.8H, #0
+mvniv5.8H, #0xFC, LSL #8 // movi #0x3FF
+1:  subsx3,  x3, #2
+ld1 {v0.8H-v1.8H}, [x1], #32
+ld1 {v2.8H},[x0]
+sqadd   v0.8H, v0.8H, v2.8H
+ld1 {v3.8H},[x12]
+sqadd   v1.8H, v1.8H, v3.8H
+clip10  v0.8H, v1.8H, v4.8H, v5.8H
+st1 {v0.8H}, [x0], x2
+st1 {v1.8H}, [x12], x2
+bne 1b
+ret
+endfunc
+
+function ff_hevc_add_residual_16x16_8_neon, export=1
+mov x3,  #16
+add x12, x0, x2
+add x2,  x2, x2
+1:  subsx3,  x3, #2
+ld1 {v16.16B}, [x0]
+ld1 {v0.8H-v3.8H}, [x1], #64
+ld1 {v19.16B},[x12]
+uxtlv17.8H, v16.8B
+uxtl2   v18.8H, v16.16B
+uxtlv20.8H, v19.8B
+uxtl2   v21.8H, v19.16B
+sqadd   v0.8H,  v0.8H, v17.8H
+sqadd   v1.8H,  v1.8H, v18.8H
+sqadd   v2.8H,  v2.8H, v20.8H
+sqadd   v3.8H,  v3.8H, v21.8H
+sqxtun  v0.8B,  v0.8H
+sqxtun2 v0.16B, v1.8H
+sqxtun  v1.8B,  v2.8H
+sqxtun2 v1.16B, v3.8H
+st1 {v0.16B}, [x0], x2
+st1 {v1.16B}, [x12], x2
+bne 1b
+ret
+endfunc
+
+function ff_hevc_add_residual_16x16_10_neon, export=1
+mov x3,  #16
+moviv20.8H, #0
+mvniv21.8H, #0xFC, LSL #8 // movi #0x3FF
+add x12, x0, x2
+add x2,  x2, x2
+1:  subs   

[FFmpeg-devel] [PATCH] libavcodec/aarch64/hevcdsp_idct_neon.S: Also port add_residual functions.

2021-01-10 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Speedup is fairly small, around 1.5%, but these are fairly simple.
---
 libavcodec/aarch64/hevcdsp_idct_neon.S| 190 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  24 +++
 2 files changed, 214 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 9f67e45..edd03a0 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -36,6 +36,196 @@ const trans, align=4
 .short 31, 22, 13, 4
 endconst
 
+.macro clip10 in1, in2, c1, c2
+smax\in1, \in1, \c1
+smax\in2, \in2, \c1
+smin\in1, \in1, \c2
+smin\in2, \in2, \c2
+.endm
+
+function ff_hevc_add_residual_4x4_8_neon, export=1
+ld1 {v0.8H-v1.8H}, [x1]
+ld1 {v2.S}[0], [x0], x2
+ld1 {v2.S}[1], [x0], x2
+ld1 {v2.S}[2], [x0], x2
+ld1 {v2.S}[3], [x0], x2
+sub x0, x0, x2, lsl #2
+uxtlv8.8H, v2.8B
+uxtl2   v9.8H, v2.16B
+sqadd   v0.8H, v0.8H, v8.8H
+sqadd   v1.8H, v1.8H, v9.8H
+sqxtun  v0.8B, v0.8H
+sqxtun2 v0.16B, v1.8H
+st1 {v0.S}[0], [x0], x2
+st1 {v0.S}[1], [x0], x2
+st1 {v0.S}[2], [x0], x2
+st1 {v0.S}[3], [x0], x2
+ret
+endfunc
+
+function ff_hevc_add_residual_4x4_10_neon, export=1
+mov x12, x0
+ld1 {v0.8H-v1.8H}, [x1]
+ld1 {v2.D}[0], [x12], x2
+ld1 {v2.D}[1], [x12], x2
+ld1 {v3.D}[0], [x12], x2
+sqadd   v0.8H, v0.8H, v2.8H
+ld1 {V3.D}[1], [x12], x2
+moviv4.8H, #0
+sqadd   v1.8H, v1.8H, v3.8H
+mvniv5.8H, #0xFC, LSL #8 // movi #0x3FF
+clip10  v0.8H, v1.8H, v4.8H, v5.8H
+st1 {v0.D}[0], [x0], x2
+st1 {v0.D}[1], [x0], x2
+st1 {v1.D}[0], [x0], x2
+st1 {v1.D}[1], [x0], x2
+ret
+endfunc
+
+function ff_hevc_add_residual_8x8_8_neon, export=1
+add x12, x0, x2
+add x2,  x2, x2
+mov x3,   #8
+1:  subsx3,   x3, #2
+ld1 {v2.D}[0],   [x0]
+ld1 {v2.D}[1],   [x12]
+uxtlv3.8H,   v2.8B
+ld1 {v0.8H-v1.8H}, [x1], #32
+uxtl2   v2.8H,   v2.16B
+sqadd   v0.8H,   v0.8H,   v3.8H
+sqadd   v1.8H,   v1.8H,   v2.8H
+sqxtun  v0.8B,   v0.8H
+sqxtun2 v0.16B,  v1.8H
+st1 {v0.D}[0],   [x0], x2
+st1 {v0.D}[1],   [x12], x2
+bne 1b
+ret
+endfunc
+
+function ff_hevc_add_residual_8x8_10_neon, export=1
+add x12, x0, x2
+add x2,  x2, x2
+mov x3,  #8
+moviv4.8H, #0
+mvniv5.8H, #0xFC, LSL #8 // movi #0x3FF
+1:  subsx3,  x3, #2
+ld1 {v0.8H-v1.8H}, [x1], #32
+ld1 {v2.8H},[x0]
+sqadd   v0.8H, v0.8H, v2.8H
+ld1 {v3.8H},[x12]
+sqadd   v1.8H, v1.8H, v3.8H
+clip10  v0.8H, v1.8H, v4.8H, v5.8H
+st1 {v0.8H}, [x0], x2
+st1 {v1.8H}, [x12], x2
+bne 1b
+ret
+endfunc
+
+function ff_hevc_add_residual_16x16_8_neon, export=1
+mov x3,  #16
+add x12, x0, x2
+add x2,  x2, x2
+1:  subsx3,  x3, #2
+ld1 {v16.16B}, [x0]
+ld1 {v0.8H-v3.8H}, [x1], #64
+ld1 {v19.16B},[x12]
+uxtlv17.8H, v16.8B
+uxtl2   v18.8H, v16.16B
+uxtlv20.8H, v19.8B
+uxtl2   v21.8H, v19.16B
+sqadd   v0.8H,  v0.8H, v17.8H
+sqadd   v1.8H,  v1.8H, v18.8H
+sqadd   v2.8H,  v2.8H, v20.8H
+sqadd   v3.8H,  v3.8H, v21.8H
+sqxtun  v0.8B,  v0.8H
+sqxtun2 v0.16B, v1.8H
+sqxtun  v1.8B,  v2.8H
+sqxtun2 v1.16B, v3.8H
+st1 {v0.16B}, [x0], x2
+st1 {v1.16B}, [x12], x2
+bne 1b
+ret
+endfunc
+
+function ff_hevc_add_residual_16x16_10_neon, export=1
+mov x3,  #16
+moviv20.8H, #0
+mvniv21.8H, #0xFC, LSL #8 // movi #0x3FF
+add x12, x0, x2
+add x2,  x2, x2
+1:  subs   

[FFmpeg-devel] [PATCH] libavcodec/hevcdsp: port SIMD idct functions from 32-bit.

2021-01-08 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Makes SIMD-optimized 8x8 and 16x16 idcts for 8 and 10 bit depth
available on aarch64.
For a UHD HDR (10 bit) sample video these were consuming the most time
and this optimization reduced overall decode time from 19.4s to 16.4s,
approximately 15% speedup.
Test sample was the first 300 frames of "LG 4K HDR Demo - New York.ts",
running on Apple M1.
---
 libavcodec/aarch64/Makefile   |   2 +
 libavcodec/aarch64/hevcdsp_idct_neon.S| 426 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  45 +++
 libavcodec/hevcdsp.c  |   2 +
 libavcodec/hevcdsp.h  |   1 +
 5 files changed, 476 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S
 create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index f6434e4..2ea1d74 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -61,3 +61,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9lpf_neon.o   
\
aarch64/vp9mc_16bpp_neon.o  
\
aarch64/vp9mc_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
+   aarch64/hevcdsp_init_aarch64.o
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
new file mode 100644
index 000..9f67e45
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -0,0 +1,426 @@
+/*
+ * ARM NEON optimised IDCT functions for HEVC decoding
+ * Copyright (c) 2014 Seppo Tomperi 
+ * Copyright (c) 2017 Alexandra Hájková
+ *
+ * Ported from arm/hevcdsp_idct_neon.S by
+ * Copyright (c) 2020 Reimar Döffinger
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const trans, align=4
+.short 64, 83, 64, 36
+.short 89, 75, 50, 18
+.short 90, 87, 80, 70
+.short 57, 43, 25, 9
+.short 90, 90, 88, 85
+.short 82, 78, 73, 67
+.short 61, 54, 46, 38
+.short 31, 22, 13, 4
+endconst
+
+.macro sum_sub out, in, c, op, p
+  .ifc \op, +
+smlal\p \out, \in, \c
+  .else
+smlsl\p \out, \in, \c
+  .endif
+.endm
+
+.macro fixsqrshrn d, dt, n, m
+  .ifc \dt, .8H
+sqrshrn2\d\dt, \n\().4S, \m
+  .else
+sqrshrn \n\().4H, \n\().4S, \m
+mov \d\().D[0], \n\().D[0]
+  .endif
+.endm
+
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, 
tmp3, p1, p2
+ sshll\p1   \tmp0, \in0, #6
+ mov\tmp1, \tmp0
+ smull\p1   \tmp2, \in1, v0.H[1]
+ smull\p1   \tmp3, \in1, v0.H[3]
+ smlal\p2   \tmp0, \in2, v0.H[0] //e0
+ smlsl\p2   \tmp1, \in2, v0.H[0] //e1
+ smlal\p2   \tmp2, \in3, v0.H[3] //o0
+ smlsl\p2   \tmp3, \in3, v0.H[1] //o1
+
+ add\out0, \tmp0, \tmp2
+ add\out1, \tmp1, \tmp3
+ sub\out2, \tmp1, \tmp3
+ sub\out3, \tmp0, \tmp2
+.endm
+
+.macro transpose8_4x4 r0, r1, r2, r3
+trn1v2.8H, \r0\().8H, \r1\().8H
+trn2v3.8H, \r0\().8H, \r1\().8H
+trn1v4.8H, \r2\().8H, \r3\().8H
+trn2v5.8H, \r2\().8H, \r3\().8H
+trn1\r0\().4S, v2.4S, v4.4S
+trn2\r2\().4S, v2.4S, v4.4S
+trn1\r1\().4S, v3.4S, v5.4S
+trn2\r3\().4S, v3.4S, v5.4S
+.endm
+
+.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
+transpose8_4x4  \r0, \r1, \r2, \r3
+transpose8_4x4  \r4, \r5, \r6, \r7
+.endm
+
+.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, 
in5,in5t, in6,in6t, in7,in7t, p1, p2
+tr_4x4_8\in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4S, 
v25.4S, v26.4S, v27.4S, v28.4S, v29.4S, v30.4S, v31.4S, \p1, \p2
+
+smull\p1v30.4S, \in1\in1t, v0.H[6]
+smull\p1v28.4S, \in1\in1t,