[FFmpeg-devel] [PATCH 3/3] libavutil/log.c: only include valgrind header when used.
From: Reimar Döffinger This is cleaner, but it is also a workaround for when the header exists, but cannot be compiled. This will happen when the compiler has no inline asm support. Possibly the configure check should be improved as well. --- libavutil/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavutil/log.c b/libavutil/log.c index 5948e50467..2d358b7ab9 100644 --- a/libavutil/log.c +++ b/libavutil/log.c @@ -47,7 +47,7 @@ static AVMutex mutex = AV_MUTEX_INITIALIZER; #define LINE_SZ 1024 -#if HAVE_VALGRIND_VALGRIND_H +#if HAVE_VALGRIND_VALGRIND_H && CONFIG_VALGRIND_BACKTRACE #include <valgrind/valgrind.h> /* this is the log level at which valgrind will output a full backtrace */ #define BACKTRACE_LOGLEVEL AV_LOG_ERROR -- 2.39.3 (Apple Git-145) ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/3] libavutil/aarch64/cpu.c: HWCAPS requires inline asm support.
From: Reimar Döffinger Fixes compilation with tcc, which does not have aarch64 inline asm support. --- libavutil/aarch64/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavutil/aarch64/cpu.c b/libavutil/aarch64/cpu.c index bd780e8591..0d7c1e268d 100644 --- a/libavutil/aarch64/cpu.c +++ b/libavutil/aarch64/cpu.c @@ -34,7 +34,7 @@ static int detect_flags(void) hwcap = getauxval(AT_HWCAP); -#if defined(HWCAP_CPUID) +#if defined(HWCAP_CPUID) && HAVE_INLINE_ASM // We can check for DOTPROD and I8MM using HWCAP_ASIMDDP and // HWCAP2_I8MM too, avoiding to read the CPUID registers (which triggers // a trap, handled by the kernel). However the HWCAP_* defines for these -- 2.39.3 (Apple Git-145) ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/3] configure: fix _Pragma check.
From: Reimar Döffinger The test can currently pass when _Pragma is not supported, since _Pragma might be treated as an implicitly declared function. This happens e.g. with tinycc. Extending the check to 2 pragmas both matches the actual use better and avoids this misdetection. --- configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index ff3ac9f4de..e2bcf4e1c2 100755 --- a/configure +++ b/configure @@ -5987,7 +5987,7 @@ for restrict_keyword in restrict __restrict__ __restrict ""; do test_code cc "" "char * $restrict_keyword p" && break done -check_cc pragma_deprecated "" '_Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")' +check_cc pragma_deprecated "" '_Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")' # The global variable ensures the bits appear unchanged in the object file. test_cc
[FFmpeg-devel] [PATCH] [RFC] tools/patcheck: portability fixes.
From: Reimar Döffinger Enough to make it run on macOS. In particular: - fix "empty subexpression" errors caused by constructs like (smth|), use ? instead to make them optional - no -d option for xargs, use the more standard -0 and use tr to replace newlines with 0. Not sure if these cause issues somewhere else, not even completely sure they all work, but quick testing suggests they work. On the other hand I remember issues with '?' where I resorted to {0,1} instead, but I do not remember details. Ignore if fixing these seems not worth the risk. --- tools/patcheck | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/patcheck b/tools/patcheck index fe52938f29..ee993c60fc 100755 --- a/tools/patcheck +++ b/tools/patcheck @@ -21,7 +21,7 @@ echo may or may not be bad. When you use it and it misses something or detects echo something wrong, fix it and send a patch to the ffmpeg-devel mailing list. echo License: GPL, Author: Michael Niedermayer -ERE_PRITYP='(unsigned *|)(char|short|long|int|long *int|short *int|void|float|double|(u|)int(8|16|32|64)_t)' +ERE_PRITYP='(unsigned *)?(char|short|long|int|long *int|short *int|void|float|double|u?int(8|16|32|64)_t)' ERE_TYPES='(const|static|av_cold|inline| *)*('$ERE_PRITYP'|[a-zA-Z][a-zA-Z0-9_]*)[* ]{1,}[a-zA-Z][a-zA-Z0-9_]*' ERE_FUNCS="$ERE_TYPES"' *\(' @@ -63,7 +63,7 @@ hiegrep '\+= *1 *;' 'can be simplified to ++' $* hiegrep '-= *1 *;' 'can be simplified to --' $* hiegrep '((!|=)= *(0|NULL)[^0-9a-z]|[^0-9a-z](0|NULL) *(!|=)=)' 'x==0 / x!=0 can be simplified to !x / x' $* -$EGREP $OPT '^\+ *(const *|)static' $*| $EGREP --color=always '[^=]= *(0|NULL)[^0-9a-zA-Z]'> $TMP && printf '\nuseless 0 init\n' +$EGREP $OPT '^\+ *(const *)?static' $*| $EGREP --color=always '[^=]= *(0|NULL)[^0-9a-zA-Z]'> $TMP && printf '\nuseless 0 init\n' cat $TMP hiegrep '# *ifdef * (HAVE|CONFIG)_' 'ifdefs that should be #if' $* @@ -77,7 +77,7 @@ hiegrep ':\+ *'"$ERE_PRITYP"' *inline' 'non static inline or strangely ordered i hiegrep 
"$ERE_FUNCS"' *\)' 'missing void' $* hiegrep '(sprintf|strcat|strcpy)' 'Possible security issue, make sure this is safe or use snprintf/av_strl*' $* hiegrep '/ *(2|4|8|16|32|64|128|256|512|1024|2048|4096|8192|16384|32768|65536)[^0-9]' 'divide by 2^x could use >> maybe' $* -hiegrep '#(el|)if *(0|1)' 'useless #if' $* +hiegrep '#(el)?if *(0|1)' 'useless #if' $* hiegrep 'if *\( *(0|1) *\)' 'useless if()' $* hiegrep '& *[a-zA-Z0-9_]* *\[ *0 *\]' 'useless & [0]' $* hiegrep '(\( *[0-9] *(&&|\|\|)|(&&|\|\|) *[0-9] *\))' 'overriding condition' $* @@ -118,22 +118,22 @@ if test -e $TMP ; then cat $TMP fi -$EGREP -B2 $OPT '^(\+|) *('"$ERE_TYPES"'|# *define)' $* | $EGREP -A2 --color=always '(:|-)\+[^/]*/(\*([^*]|$)|/([^/]|$))' > $TMP && printf "\n Non doxy comments\n" +$EGREP -B2 $OPT '^\+? *('"$ERE_TYPES"'|# *define)' $* | $EGREP -A2 --color=always '(:|-)\+[^/]*/(\*([^*]|$)|/([^/]|$))' > $TMP && printf "\n Non doxy comments\n" cat $TMP rm $TMP for i in \ $($EGREP -H '^\+ *'"$ERE_TYPES" $* |\ $GREP -v '(' | $EGREP -v '\Wgoto\W' |\ -xargs -d '\n' -n 1 |\ +tr '\n' '\0' | xargs -0 -n 1 |\ $GREP -o '[* ][* ]*[a-zA-Z][0-9a-zA-Z_]* *[,;=]' |\ sed 's/.[* ]*\([a-zA-Z][0-9a-zA-Z_]*\) *[,;=]/\1/') \ ; do echo $i | $GREP '^NULL$' && continue -$EGREP $i' *(\+|-|\*|/|\||&|%|)=[^=]' $* >/dev/null || echo "possibly never written:"$i >> $TMP +$EGREP $i' *(\+|-|\*|/|\||&|%)?=[^=]' $* >/dev/null || echo "possibly never written:"$i >> $TMP $EGREP '(=|\(|return).*'$i'(==|[^=])*$'$* >/dev/null || echo "possibly never read :"$i >> $TMP -$EGREP -o $i' *((\+|-|\*|/|\||&|%|)=[^=]|\+\+|--) *(0x|)[0-9]*(;|)' $* |\ - $EGREP -v $i' *= *(0x|)[0-9]{1,};'>/dev/null || echo "possibly constant :"$i >> $TMP +$EGREP -o $i' *((\+|-|\*|/|\||&|%)?=[^=]|\+\+|--) *(0x)?[0-9]*;?' 
$* |\ + $EGREP -v $i' *= *(0x)?[0-9]{1,};'>/dev/null || echo "possibly constant :"$i >> $TMP done if test -e $TMP ; then printf '\npossibly unused variables\n' @@ -151,7 +151,7 @@ cat $TMP | tr '@' '\n' cat $* | tr '\n' '@' | $EGREP --color=always -o '\+ *if *\( *([A-Za-z0-9_]*) *[<>]=? *([A-Za-z0-9_]*) *\)[ @\\+]*(\1|\2) *= *(\1|\2) *;' >$TMP && printf "\nFFMIN/FFMAX\n" cat $TMP | tr '@' '\n' -cat $* | tr '\n' '@' | $EGREP --color=always -o '\+ *if *\( *([A-Za-z0-9_]*) *\)[ @\\+]*av_free(p|) *\( *(&|) *\1[^-.]' >$TMP && printf "\nav_free(NULL) is safe\n" +cat $* | tr '\n' '@' | $EGREP --color=always -o '\+ *if *\( *([A-Za-z0-9_]*) *\)[ @\\+]*av_freep? *\( *&? *\1[^-.]' >$TMP && printf "\nav_free(NULL) is safe\n" cat $TMP | tr '@' '\n' cat $* | tr '\n' '@' | $EGREP --color=always -o '[^a-zA-Z0-9_]([a-zA-Z0-9_]*) *= *av_malloc *\([^)]*\)[ @;\\+]*memset *\( *\1' >$TMP && printf "\nav_mallocz()\n" -- 2.37.1 (Apple Git-137.1) ___ ffmpeg-devel mailing list
[FFmpeg-devel] [PATCH] libavformat: fix incorrect handling of incomplete AVBPrint.
From: Reimar Döffinger Change some internal APIs a bit to make it harder to make such mistakes. In particular, have the read chunk functions return an error when the result is incomplete. This might be less flexible, but since there has been no use-case for that so far, avoiding coding mistakes seems better. Add a function to queue a AVBPrint directly (ff_subtitles_queue_insert_bprint). Also fixes a leak in lrcdec when ff_subtitles_queue_insert fails. Signed-off-by: Reimar Döffinger --- libavformat/assdec.c | 4 +++- libavformat/lrcdec.c | 7 ++- libavformat/mpsubdec.c | 5 +++-- libavformat/realtextdec.c| 6 +- libavformat/samidec.c| 6 +- libavformat/srtdec.c | 4 +++- libavformat/subtitles.c | 19 +++ libavformat/subtitles.h | 14 -- libavformat/tedcaptionsdec.c | 2 +- libavformat/webvttdec.c | 4 +++- 10 files changed, 56 insertions(+), 15 deletions(-) diff --git a/libavformat/assdec.c b/libavformat/assdec.c index 0915f6fafd..bf7b8a73a2 100644 --- a/libavformat/assdec.c +++ b/libavformat/assdec.c @@ -73,6 +73,8 @@ static int read_dialogue(ASSContext *ass, AVBPrint *dst, const uint8_t *p, av_bprint_clear(dst); av_bprintf(dst, "%u,%d,%s", ass->readorder++, layer, p + pos); +if (!av_bprint_is_complete(dst)) +return AVERROR(ENOMEM); /* right strip the buffer */ while (dst->len > 0 && @@ -135,7 +137,7 @@ static int ass_read_header(AVFormatContext *s) av_bprintf(, "%s", line.str); continue; } -sub = ff_subtitles_queue_insert(>q, rline.str, rline.len, 0); +sub = ff_subtitles_queue_insert_bprint(>q, , 0); if (!sub) { res = AVERROR(ENOMEM); goto end; diff --git a/libavformat/lrcdec.c b/libavformat/lrcdec.c index fff39495f8..83bb4a4b75 100644 --- a/libavformat/lrcdec.c +++ b/libavformat/lrcdec.c @@ -171,6 +171,8 @@ static int lrc_read_header(AVFormatContext *s) while(!avio_feof(s->pb)) { int64_t pos = read_line(, s->pb); +if (!av_bprint_is_complete()) +goto err_nomem_out; int64_t header_offset = find_header(line.str); if(header_offset >= 0) { char *comma_offset = strchr(line.str, 
':'); @@ -205,7 +207,7 @@ static int lrc_read_header(AVFormatContext *s) sub = ff_subtitles_queue_insert(>q, line.str + ts_strlength, line.len - ts_strlength, 0); if (!sub) -return AVERROR(ENOMEM); +goto err_nomem_out; sub->pos = pos; sub->pts = ts_start - lrc->ts_offset; sub->duration = -1; @@ -216,6 +218,9 @@ static int lrc_read_header(AVFormatContext *s) ff_metadata_conv_ctx(s, NULL, ff_lrc_metadata_conv); av_bprint_finalize(, NULL); return 0; +err_nomem_out: +av_bprint_finalize(, NULL); +return AVERROR(ENOMEM); } const AVInputFormat ff_lrc_demuxer = { diff --git a/libavformat/mpsubdec.c b/libavformat/mpsubdec.c index d290a41fb9..0374563575 100644 --- a/libavformat/mpsubdec.c +++ b/libavformat/mpsubdec.c @@ -116,9 +116,10 @@ static int mpsub_read_header(AVFormatContext *s) AVPacket *sub; const int64_t pos = avio_tell(s->pb); -ff_subtitles_read_chunk(s->pb, ); +res = ff_subtitles_read_chunk(s->pb, ); +if (res < 0) goto end; if (buf.len) { -sub = ff_subtitles_queue_insert(>q, buf.str, buf.len, 0); +sub = ff_subtitles_queue_insert_bprint(>q, , 0); if (!sub) { res = AVERROR(ENOMEM); goto end; diff --git a/libavformat/realtextdec.c b/libavformat/realtextdec.c index c281dec346..7992a5b7fc 100644 --- a/libavformat/realtextdec.c +++ b/libavformat/realtextdec.c @@ -80,6 +80,10 @@ static int realtext_read_header(AVFormatContext *s) const int64_t pos = ff_text_pos() - (c != 0); int n = ff_smil_extract_next_text_chunk(, , ); +if (n < 0) { +res = n; +goto end; +} if (n == 0) break; @@ -103,7 +107,7 @@ static int realtext_read_header(AVFormatContext *s) /* if we just read a tag, introduce a new event, otherwise merge * with the previous one */ int merge = !av_strncasecmp(buf.str, "q, buf.str, buf.len, merge); +sub = ff_subtitles_queue_insert_bprint(>q, , merge); if (!sub) { res = AVERROR(ENOMEM); goto end; diff --git a/libavformat/samidec.c b/libavformat/samidec.c index 0da299343d..070b623ebf 100644 --- a/libavformat/samidec.c +++ b/libavformat/samidec.c @@ -68,6 +68,10 @@ 
static int sami_read_header(AVFormatContext *s) const int64_t pos = ff_text_pos() - (c != 0);
[FFmpeg-devel] [PATCH] hevcdsp_idct_neon.S: Avoid unnecessary mov.
From: Reimar Döffinger ret can be given an argument instead. This is also consistent with how other assembler code in FFmpeg does it. --- libavcodec/aarch64/hevcdsp_idct_neon.S | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index b7f23386a4..f7142c939c 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -617,8 +617,7 @@ function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1 add sp, sp, #640 -mov x30, x15 -ret +ret x15 endfunc .endm @@ -814,8 +813,7 @@ function ff_hevc_idct_32x32_\bitdepth\()_neon, export=1 .endr add sp, sp, #2432 -mov x30, x15 -ret +ret x15 endfunc .endm -- 2.37.1 (Apple Git-137.1) ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] libavformat: fix incorrect handling of incomplete AVBPrint.
From: Reimar Döffinger Change some internal APIs a bit to make it harder to make such mistakes. In particular, have the read chunk functions return an error when the result is incomplete. This might be less flexible, but since there has been no use-case for that so far, avoiding coding mistakes seems better. Add a function to queue a AVBPrint directly (ff_subtitles_queue_insert_bprint). Also fixes a leak in lrcdec when ff_subtitles_queue_insert fails. --- libavformat/assdec.c | 4 +++- libavformat/lrcdec.c | 7 ++- libavformat/mpsubdec.c | 5 +++-- libavformat/realtextdec.c| 6 +- libavformat/samidec.c| 6 +- libavformat/srtdec.c | 4 +++- libavformat/subtitles.c | 17 + libavformat/subtitles.h | 14 -- libavformat/tedcaptionsdec.c | 2 +- libavformat/webvttdec.c | 4 +++- 10 files changed, 54 insertions(+), 15 deletions(-) diff --git a/libavformat/assdec.c b/libavformat/assdec.c index 0915f6fafd..bf7b8a73a2 100644 --- a/libavformat/assdec.c +++ b/libavformat/assdec.c @@ -73,6 +73,8 @@ static int read_dialogue(ASSContext *ass, AVBPrint *dst, const uint8_t *p, av_bprint_clear(dst); av_bprintf(dst, "%u,%d,%s", ass->readorder++, layer, p + pos); +if (!av_bprint_is_complete(dst)) +return AVERROR(ENOMEM); /* right strip the buffer */ while (dst->len > 0 && @@ -135,7 +137,7 @@ static int ass_read_header(AVFormatContext *s) av_bprintf(, "%s", line.str); continue; } -sub = ff_subtitles_queue_insert(>q, rline.str, rline.len, 0); +sub = ff_subtitles_queue_insert_bprint(>q, , 0); if (!sub) { res = AVERROR(ENOMEM); goto end; diff --git a/libavformat/lrcdec.c b/libavformat/lrcdec.c index fff39495f8..83bb4a4b75 100644 --- a/libavformat/lrcdec.c +++ b/libavformat/lrcdec.c @@ -171,6 +171,8 @@ static int lrc_read_header(AVFormatContext *s) while(!avio_feof(s->pb)) { int64_t pos = read_line(, s->pb); +if (!av_bprint_is_complete()) +goto err_nomem_out; int64_t header_offset = find_header(line.str); if(header_offset >= 0) { char *comma_offset = strchr(line.str, ':'); @@ -205,7 +207,7 @@ static 
int lrc_read_header(AVFormatContext *s) sub = ff_subtitles_queue_insert(>q, line.str + ts_strlength, line.len - ts_strlength, 0); if (!sub) -return AVERROR(ENOMEM); +goto err_nomem_out; sub->pos = pos; sub->pts = ts_start - lrc->ts_offset; sub->duration = -1; @@ -216,6 +218,9 @@ static int lrc_read_header(AVFormatContext *s) ff_metadata_conv_ctx(s, NULL, ff_lrc_metadata_conv); av_bprint_finalize(, NULL); return 0; +err_nomem_out: +av_bprint_finalize(, NULL); +return AVERROR(ENOMEM); } const AVInputFormat ff_lrc_demuxer = { diff --git a/libavformat/mpsubdec.c b/libavformat/mpsubdec.c index d290a41fb9..0374563575 100644 --- a/libavformat/mpsubdec.c +++ b/libavformat/mpsubdec.c @@ -116,9 +116,10 @@ static int mpsub_read_header(AVFormatContext *s) AVPacket *sub; const int64_t pos = avio_tell(s->pb); -ff_subtitles_read_chunk(s->pb, ); +res = ff_subtitles_read_chunk(s->pb, ); +if (res < 0) goto end; if (buf.len) { -sub = ff_subtitles_queue_insert(>q, buf.str, buf.len, 0); +sub = ff_subtitles_queue_insert_bprint(>q, , 0); if (!sub) { res = AVERROR(ENOMEM); goto end; diff --git a/libavformat/realtextdec.c b/libavformat/realtextdec.c index c281dec346..7992a5b7fc 100644 --- a/libavformat/realtextdec.c +++ b/libavformat/realtextdec.c @@ -80,6 +80,10 @@ static int realtext_read_header(AVFormatContext *s) const int64_t pos = ff_text_pos() - (c != 0); int n = ff_smil_extract_next_text_chunk(, , ); +if (n < 0) { +res = n; +goto end; +} if (n == 0) break; @@ -103,7 +107,7 @@ static int realtext_read_header(AVFormatContext *s) /* if we just read a tag, introduce a new event, otherwise merge * with the previous one */ int merge = !av_strncasecmp(buf.str, "q, buf.str, buf.len, merge); +sub = ff_subtitles_queue_insert_bprint(>q, , merge); if (!sub) { res = AVERROR(ENOMEM); goto end; diff --git a/libavformat/samidec.c b/libavformat/samidec.c index 0da299343d..070b623ebf 100644 --- a/libavformat/samidec.c +++ b/libavformat/samidec.c @@ -68,6 +68,10 @@ static int 
sami_read_header(AVFormatContext *s) const int64_t pos = ff_text_pos() - (c != 0); int is_sync, is_body, n =
[FFmpeg-devel] [PATCH] libavformat: fix incorrect handling of incomplete AVBPrint.
From: Reimar Döffinger Change some internal APIs a bit to make it harder to make such mistakes. In particular, have the read chunk functions return an error when the result is incomplete. This might be less flexible, but since there has been no use-case for that so far, avoiding coding mistakes seems better. Add a function to queue a AVBPrint directly (ff_subtitles_queue_insert_bprint). Also fixes a leak in lrcdec when ff_subtitles_queue_insert fails. --- Note that this combines a few different things, but they all are meant to address the same issue. Happy to split if that's wanted, first priority is getting an idea if some part of this seems like a bad idea generally. libavformat/assdec.c | 4 +++- libavformat/lrcdec.c | 7 ++- libavformat/mpsubdec.c | 5 +++-- libavformat/realtextdec.c| 7 ++- libavformat/samidec.c| 7 ++- libavformat/srtdec.c | 4 +++- libavformat/subtitles.c | 17 + libavformat/subtitles.h | 14 -- libavformat/tedcaptionsdec.c | 2 +- libavformat/webvttdec.c | 4 +++- 10 files changed, 56 insertions(+), 15 deletions(-) diff --git a/libavformat/assdec.c b/libavformat/assdec.c index 0915f6fafd..bf7b8a73a2 100644 --- a/libavformat/assdec.c +++ b/libavformat/assdec.c @@ -73,6 +73,8 @@ static int read_dialogue(ASSContext *ass, AVBPrint *dst, const uint8_t *p, av_bprint_clear(dst); av_bprintf(dst, "%u,%d,%s", ass->readorder++, layer, p + pos); +if (!av_bprint_is_complete(dst)) +return AVERROR(ENOMEM); /* right strip the buffer */ while (dst->len > 0 && @@ -135,7 +137,7 @@ static int ass_read_header(AVFormatContext *s) av_bprintf(, "%s", line.str); continue; } -sub = ff_subtitles_queue_insert(>q, rline.str, rline.len, 0); +sub = ff_subtitles_queue_insert_bprint(>q, , 0); if (!sub) { res = AVERROR(ENOMEM); goto end; diff --git a/libavformat/lrcdec.c b/libavformat/lrcdec.c index fff39495f8..83bb4a4b75 100644 --- a/libavformat/lrcdec.c +++ b/libavformat/lrcdec.c @@ -171,6 +171,8 @@ static int lrc_read_header(AVFormatContext *s) while(!avio_feof(s->pb)) { int64_t 
pos = read_line(, s->pb); +if (!av_bprint_is_complete()) +goto err_nomem_out; int64_t header_offset = find_header(line.str); if(header_offset >= 0) { char *comma_offset = strchr(line.str, ':'); @@ -205,7 +207,7 @@ static int lrc_read_header(AVFormatContext *s) sub = ff_subtitles_queue_insert(>q, line.str + ts_strlength, line.len - ts_strlength, 0); if (!sub) -return AVERROR(ENOMEM); +goto err_nomem_out; sub->pos = pos; sub->pts = ts_start - lrc->ts_offset; sub->duration = -1; @@ -216,6 +218,9 @@ static int lrc_read_header(AVFormatContext *s) ff_metadata_conv_ctx(s, NULL, ff_lrc_metadata_conv); av_bprint_finalize(, NULL); return 0; +err_nomem_out: +av_bprint_finalize(, NULL); +return AVERROR(ENOMEM); } const AVInputFormat ff_lrc_demuxer = { diff --git a/libavformat/mpsubdec.c b/libavformat/mpsubdec.c index d290a41fb9..0374563575 100644 --- a/libavformat/mpsubdec.c +++ b/libavformat/mpsubdec.c @@ -116,9 +116,10 @@ static int mpsub_read_header(AVFormatContext *s) AVPacket *sub; const int64_t pos = avio_tell(s->pb); -ff_subtitles_read_chunk(s->pb, ); +res = ff_subtitles_read_chunk(s->pb, ); +if (res < 0) goto end; if (buf.len) { -sub = ff_subtitles_queue_insert(>q, buf.str, buf.len, 0); +sub = ff_subtitles_queue_insert_bprint(>q, , 0); if (!sub) { res = AVERROR(ENOMEM); goto end; diff --git a/libavformat/realtextdec.c b/libavformat/realtextdec.c index c281dec346..9f6aab789e 100644 --- a/libavformat/realtextdec.c +++ b/libavformat/realtextdec.c @@ -80,6 +80,11 @@ static int realtext_read_header(AVFormatContext *s) const int64_t pos = ff_text_pos() - (c != 0); int n = ff_smil_extract_next_text_chunk(, , ); +if (n < 0) +{ +res = n; +goto end; +} if (n == 0) break; @@ -103,7 +108,7 @@ static int realtext_read_header(AVFormatContext *s) /* if we just read a tag, introduce a new event, otherwise merge * with the previous one */ int merge = !av_strncasecmp(buf.str, "q, buf.str, buf.len, merge); +sub = ff_subtitles_queue_insert_bprint(>q, , merge); if (!sub) { res = 
AVERROR(ENOMEM); goto end; diff --git a/libavformat/samidec.c b/libavformat/samidec.c index
[FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way.
From: Reimar Döffinger Currently it is done in several different ways, which might cause needless dependencies or in case of tx_float_neon.S is incorrect. Signed-off-by: Reimar Döffinger --- libavcodec/aarch64/fft_neon.S | 3 +- libavcodec/aarch64/h264idct_neon.S | 6 +- libavcodec/aarch64/hevcdsp_sao_neon.S | 3 +- libavcodec/aarch64/mdct_neon.S | 18 ++ libavcodec/aarch64/me_cmp_neon.S | 6 +- libavcodec/aarch64/synth_filter_neon.S | 3 +- libavcodec/aarch64/vp9itxfm_neon.S | 28 - libavcodec/aarch64/vp9lpf_16bpp_neon.S | 32 +-- libavcodec/aarch64/vp9lpf_neon.S | 80 +- libavutil/aarch64/tx_float_neon.S | 52 - 10 files changed, 109 insertions(+), 122 deletions(-) diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S index 9ff3f9c526..d7225511dd 100644 --- a/libavcodec/aarch64/fft_neon.S +++ b/libavcodec/aarch64/fft_neon.S @@ -342,8 +342,7 @@ endfunc function fft\n\()_neon, align=6 AARCH64_VALID_JUMP_TARGET AARCH64_SIGN_LINK_REGISTER -sub sp, sp, #16 -stp x28, x30, [sp] +stp x28, x30, [sp, #-16]! add x28, x0, #\n4*2*8 bl fft\n2\()_neon mov x0, x28 diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S index 7d2879b0ce..375da31d65 100644 --- a/libavcodec/aarch64/h264idct_neon.S +++ b/libavcodec/aarch64/h264idct_neon.S @@ -157,8 +157,7 @@ function ff_h264_idct_add16intra_neon, export=1 endfunc function ff_h264_idct_add8_neon, export=1 -sub sp, sp, #0x40 -stp x19, x20, [sp] +stp x19, x20, [sp, #-0x40]! 
mov x12, x30 ldp x6, x15, [x0] // dest[0], dest[1] add x5, x1, #16*4 // block_offset @@ -187,8 +186,7 @@ function ff_h264_idct_add8_neon, export=1 cselx6, x15, x6, eq cmp x10, #20 b.lt1b -ldp x19, x20, [sp] -add sp, sp, #0x40 +ldp x19, x20, [sp], #0x40 ret x12 endfunc diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index d4decfde3b..30e83dda5d 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -33,8 +33,7 @@ // int16_t *sao_offset_val, int sao_left_class, // int width, int height) function ff_hevc_sao_band_filter_8x8_8_neon, export=1 -sub sp, sp, #64 -stp xzr, xzr, [sp] +stp xzr, xzr, [sp, #-64]! stp xzr, xzr, [sp, #16] stp xzr, xzr, [sp, #32] stp xzr, xzr, [sp, #48] diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S index 6091e72022..98b09bf1ab 100644 --- a/libavcodec/aarch64/mdct_neon.S +++ b/libavcodec/aarch64/mdct_neon.S @@ -23,8 +23,7 @@ #include "libavutil/aarch64/asm.S" function ff_imdct_half_neon, export=1 -sub sp, sp, #32 -stp x19, x20, [sp] +stp x19, x20, [sp, #-32]! AARCH64_SIGN_LINK_REGISTER str x30, [sp, #16] mov x12, #1 @@ -120,17 +119,15 @@ function ff_imdct_half_neon, export=1 st2 {v4.2s,v5.2s}, [x0] st2 {v6.2s,v7.2s}, [x8] -ldp x19, x20, [sp] ldr x30, [sp, #16] AARCH64_VALIDATE_LINK_REGISTER -add sp, sp, #32 +ldp x19, x20, [sp], #32 ret endfunc function ff_imdct_calc_neon, export=1 -sub sp, sp, #32 -stp x19, x20, [sp] +stp x19, x20, [sp, #-32]! AARCH64_SIGN_LINK_REGISTER str x30, [sp, #16] ldr w3, [x0, #28] // mdct_bits @@ -163,18 +160,16 @@ function ff_imdct_calc_neon, export=1 subsx19, x19, #16 b.gt1b -ldp x19, x20, [sp] ldr x30, [sp, #16] AARCH64_VALIDATE_LINK_REGISTER -add sp, sp, #32 +ldp x19, x20, [sp], #32 ret endfunc function ff_mdct_calc_neon, export=1 -sub sp, sp, #32 -stp x19, x20, [sp] +stp x19, x20, [sp, #-32]! 
AARCH64_SIGN_LINK_REGISTER str x30, [sp, #16] @@ -323,10 +318,9 @@ function ff_mdct_calc_neon, export=1 st2 {v4.2s,v5.2s}, [x0] st2 {v6.2s,v7.2s}, [x8] -ldp x19, x20, [sp] ldr
[FFmpeg-devel] [PATCH] libavcodec/hevcdsp: port SIMD idct functions from 32-bit.
From: Reimar Döffinger Makes SIMD-optimized 8x8 and 16x16 idcts for 8 and 10 bit depth available on aarch64. For a UHD HDR (10 bit) sample video these were consuming the most time and this optimization reduced overall decode time from 19.4s to 16.4s, approximately 15% speedup. Test sample was the first 300 frames of "LG 4K HDR Demo - New York.ts", running on Apple M1. --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/hevcdsp_idct_neon.S| 380 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 45 +++ libavcodec/hevcdsp.c | 2 + libavcodec/hevcdsp.h | 1 + 5 files changed, 430 insertions(+) create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index f6434e40da..2ea1d74a38 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -61,3 +61,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ aarch64/vp9lpf_neon.o \ aarch64/vp9mc_16bpp_neon.o \ aarch64/vp9mc_neon.o +NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o \ + aarch64/hevcdsp_init_aarch64.o diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S new file mode 100644 index 00..4aac205e22 --- /dev/null +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -0,0 +1,380 @@ +/* + * ARM NEON optimised IDCT functions for HEVC decoding + * Copyright (c) 2014 Seppo Tomperi + * Copyright (c) 2017 Alexandra Hájková + * + * Ported from arm/hevcdsp_idct_neon.S by + * Copyright (c) 2020 Reimar Döffinger + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +const trans, align=4 +.short 64, 83, 64, 36 +.short 89, 75, 50, 18 +.short 90, 87, 80, 70 +.short 57, 43, 25, 9 +.short 90, 90, 88, 85 +.short 82, 78, 73, 67 +.short 61, 54, 46, 38 +.short 31, 22, 13, 4 +endconst + +.macro sum_sub out, in, c, op, p + .ifc \op, + +smlal\p \out, \in, \c + .else +smlsl\p \out, \in, \c + .endif +.endm + +.macro fixsqrshrn d, dt, n, m + .ifc \dt, .8H +sqrshrn2\d\dt, \n\().4S, \m + .else +sqrshrn \n\().4H, \n\().4S, \m +mov \d\().D[0], \n\().D[0] + .endif +.endm + +// uses and clobbers v28-v31 as temp registers +.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2 + sshll\p1 v28.4S, \in0, #6 + movv29.16B, v28.16B + smull\p1 v30.4S, \in1, v0.H[1] + smull\p1 v31.4S, \in1, v0.H[3] + smlal\p2 v28.4S, \in2, v0.H[0] //e0 + smlsl\p2 v29.4S, \in2, v0.H[0] //e1 + smlal\p2 v30.4S, \in3, v0.H[3] //o0 + smlsl\p2 v31.4S, \in3, v0.H[1] //o1 + + add\out0, v28.4S, v30.4S + add\out1, v29.4S, v31.4S + sub\out2, v29.4S, v31.4S + sub\out3, v28.4S, v30.4S +.endm + +.macro transpose8_4x4 r0, r1, r2, r3 +trn1v2.8H, \r0\().8H, \r1\().8H +trn2v3.8H, \r0\().8H, \r1\().8H +trn1v4.8H, \r2\().8H, \r3\().8H +trn2v5.8H, \r2\().8H, \r3\().8H +trn1\r0\().4S, v2.4S, v4.4S +trn2\r2\().4S, v2.4S, v4.4S +trn1\r1\().4S, v3.4S, v5.4S +trn2\r3\().4S, v3.4S, v5.4S +.endm + +.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7 +transpose8_4x4 \r0, \r1, \r2, \r3 +transpose8_4x4 \r4, \r5, \r6, \r7 +.endm + +.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, 
in3,in3t, in4,in4t, in5,in5t, in6,in6t, in7,in7t, p1, p2 +tr_4x4_8\in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4S, v25.4S, v26.4S, v27.4S, \p1, \p2 + +smull\p1v30.4S, \in1\in1t, v0.H[6] +smull\p1
[FFmpeg-devel] [PATCH] configure: Set MSVC as_default later.
From: Reimar Döffinger It would get immediately overridden to $cc, which in case of gas-preprocessor missing would result in it trying to use cl.exe for asm files instead of erroring out. This is because cl.exe does not fail but just prints a warning when it is given a file it does not know what to do with... --- configure | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/configure b/configure index 12b41cde1c..d3b665f6f9 100755 --- a/configure +++ b/configure @@ -4271,14 +4271,6 @@ case "$toolchain" in ld_default="$source_path/compat/windows/mslink" nm_default="dumpbin.exe -symbols" ar_default="lib.exe" -case "$arch" in -aarch64|arm64) -as_default="armasm64.exe" -;; -arm*) -as_default="armasm.exe" -;; -esac target_os_default="win32" # Use a relative path for TMPDIR. This makes sure all the # ffconf temp files are written with a relative path, avoiding @@ -4720,6 +4712,14 @@ probe_cc(){ _ld_path='-libpath:' elif $_cc -nologo- 2>&1 | grep -q Microsoft || { $_cc -v 2>&1 | grep -q clang && $_cc -? > /dev/null 2>&1; }; then _type=msvc +case "$arch" in +aarch64|arm64) +as_default="armasm64.exe" +;; +arm*) +as_default="armasm.exe" +;; +esac if $_cc -nologo- 2>&1 | grep -q Microsoft; then _ident=$($_cc 2>&1 | head -n1 | tr -d '\r') else -- 2.24.3 (Apple Git-128) ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] libavcodec/hevcdsp: port SIMD idct functions from 32-bit.
From: Reimar Döffinger Makes SIMD-optimized 8x8 and 16x16 idcts for 8 and 10 bit depth available on aarch64. For a UHD HDR (10 bit) sample video these were consuming the most time and this optimization reduced overall decode time from 19.4s to 16.4s, approximately 15% speedup. Test sample was the first 300 frames of "LG 4K HDR Demo - New York.ts", running on Apple M1. --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/hevcdsp_idct_neon.S| 423 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 45 +++ libavcodec/hevcdsp.c | 2 + libavcodec/hevcdsp.h | 1 + 5 files changed, 473 insertions(+) create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index f6434e40da..2ea1d74a38 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -61,3 +61,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ aarch64/vp9lpf_neon.o \ aarch64/vp9mc_16bpp_neon.o \ aarch64/vp9mc_neon.o +NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o \ + aarch64/hevcdsp_init_aarch64.o diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S new file mode 100644 index 00..6b42f6ca3a --- /dev/null +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -0,0 +1,423 @@ +/* + * ARM NEON optimised IDCT functions for HEVC decoding + * Copyright (c) 2014 Seppo Tomperi + * Copyright (c) 2017 Alexandra Hájková + * + * Ported from arm/hevcdsp_idct_neon.S by + * Copyright (c) 2020 Reimar Döffinger + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +const trans, align=4 +.short 64, 83, 64, 36 +.short 89, 75, 50, 18 +.short 90, 87, 80, 70 +.short 57, 43, 25, 9 +.short 90, 90, 88, 85 +.short 82, 78, 73, 67 +.short 61, 54, 46, 38 +.short 31, 22, 13, 4 +endconst + +.macro sum_sub out, in, c, op, p + .ifc \op, + +smlal\p \out, \in, \c + .else +smlsl\p \out, \in, \c + .endif +.endm + +.macro fixsqrshrn d, dt, n, m + .ifc \dt, .8H +sqrshrn2\d\dt, \n\().4S, \m + .else +sqrshrn \n\().4H, \n\().4S, \m +mov \d\().D[0], \n\().D[0] + .endif +.endm + +// uses and clobbers v28-v31 as temp registers +.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2 + sshll\p1 v28.4S, \in0, #6 + movv29.16B, v28.16B + smull\p1 v30.4S, \in1, v0.H[1] + smull\p1 v31.4S, \in1, v0.H[3] + smlal\p2 v28.4S, \in2, v0.H[0] //e0 + smlsl\p2 v29.4S, \in2, v0.H[0] //e1 + smlal\p2 v30.4S, \in3, v0.H[3] //o0 + smlsl\p2 v31.4S, \in3, v0.H[1] //o1 + + add\out0, v28.4S, v30.4S + add\out1, v29.4S, v31.4S + sub\out2, v29.4S, v31.4S + sub\out3, v28.4S, v30.4S +.endm + +.macro transpose8_4x4 r0, r1, r2, r3 +trn1v2.8H, \r0\().8H, \r1\().8H +trn2v3.8H, \r0\().8H, \r1\().8H +trn1v4.8H, \r2\().8H, \r3\().8H +trn2v5.8H, \r2\().8H, \r3\().8H +trn1\r0\().4S, v2.4S, v4.4S +trn2\r2\().4S, v2.4S, v4.4S +trn1\r1\().4S, v3.4S, v5.4S +trn2\r3\().4S, v3.4S, v5.4S +.endm + +.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7 +transpose8_4x4 \r0, \r1, \r2, \r3 +transpose8_4x4 \r4, \r5, \r6, \r7 +.endm + +.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, 
in3,in3t, in4,in4t, in5,in5t, in6,in6t, in7,in7t, p1, p2 +tr_4x4_8\in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4S, v25.4S, v26.4S, v27.4S, \p1, \p2 + +smull\p1v30.4S, \in1\in1t, v0.H[6] +smull\p1
[FFmpeg-devel] [PATCH] libswscale/aarch64/hscale.S: Support more bit-depth variants.
From: Reimar Döffinger Trivially expand hscale assembler to support > 8 bit formats both for input and output. 16-bit input is not supported as I am not certain how to get sufficient test coverage. --- libswscale/aarch64/hscale.S | 53 ++-- libswscale/aarch64/swscale.c | 49 +++-- 2 files changed, 85 insertions(+), 17 deletions(-) diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S index af55ffe2b7..3b42d39dac 100644 --- a/libswscale/aarch64/hscale.S +++ b/libswscale/aarch64/hscale.S @@ -20,7 +20,11 @@ #include "libavutil/aarch64/asm.S" -function ff_hscale_8_to_15_neon, export=1 +.macro hscale srcbits, dstbits, ldt, lds, c +function ff_hscale_\srcbits\()_to_\dstbits\()_neon, export=1 +.if \dstbits >= 16 +moviv20.4S, #(0x1 << (\dstbits - 16)), msl #16 +.endif sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) 1: ldr w8, [x5], #4// filterPos[idx] ldr w0, [x5], #4// filterPos[idx + 1] @@ -34,30 +38,30 @@ function ff_hscale_8_to_15_neon, export=1 moviv1.2D, #0 // val sum part 2 (for dst[1]) moviv2.2D, #0 // val sum part 3 (for dst[2]) moviv3.2D, #0 // val sum part 4 (for dst[3]) -add x17, x3, w8, UXTW // srcp + filterPos[0] -add x8, x3, w0, UXTW // srcp + filterPos[1] -add x0, x3, w11, UXTW // srcp + filterPos[2] -add x11, x3, w9, UXTW // srcp + filterPos[3] +add x17, x3, w8, UXTW #!!(\srcbits > 8) // srcp + filterPos[0] +add x8, x3, w0, UXTW #!!(\srcbits > 8) // srcp + filterPos[1] +add x0, x3, w11, UXTW #!!(\srcbits > 8) // srcp + filterPos[2] +add x11, x3, w9, UXTW #!!(\srcbits > 8) // srcp + filterPos[3] mov w15, w6 // filterSize counter -2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}] +2: ld1 {v4.\ldt}, [x17], \lds // srcp[filterPos[0] + {0..7}] ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 -ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}] +ld1 {v6.\ldt}, [x8], \lds // srcp[filterPos[1] + {0..7}] ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize -uxtlv4.8H, v4.8B// unpack part 1 to 16-bit 
+\c\cuxtlv4.8H, v4.8B// unpack part 1 to 16-bit smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] -ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}] +ld1 {v16.\ldt}, [x0], \lds // srcp[filterPos[2] + {0..7}] ld1 {v17.8H}, [x13], #16// load 8x16-bit at filter+2*filterSize -uxtlv6.8H, v6.8B// unpack part 2 to 16-bit +\c\cuxtlv6.8H, v6.8B// unpack part 2 to 16-bit smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] -uxtlv16.8H, v16.8B // unpack part 3 to 16-bit +\c\cuxtlv16.8H, v16.8B // unpack part 3 to 16-bit smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] -ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}] +ld1 {v18.\ldt}, [x11], \lds // srcp[filterPos[3] + {0..7}] smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize subsw15, w15, #8// j -= 8: processed 8/filterSize -uxtlv18.8H, v18.8B // unpack part 4 to 16-bit +\c\cuxtlv18.8H, v18.8B // unpack part 4 to 16-bit smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] smlal2 v3.4S, v18.8H, v19.8H // v3
[FFmpeg-devel] [PATCH] Add support for "omp simd" pragma.
From: Reimar Döffinger This requests loops to be vectorized using SIMD instructions. The performance increase is far from hand-optimized assembly but still significant over the plain C version. Typical values are a 2-4x speedup where a hand-written version would achieve 4x-10x. So it is far from a replacement; however, some architectures will get hand-written assembler quite late or not at all, and this is a good improvement for a trivial amount of work. The cause of the remaining gap, besides the compiler being a compiler, is usually that it does not manage to use saturating instructions and thus has to use 32-bit operations where actually saturating 16-bit operations would be sufficient. Other causes are for example the av_clip functions that are not ideal for vectorization (and even as scalar code not optimal for any modern CPU that has either CSEL or MAX/MIN instructions). And of course this only works for relatively simple loops, the IDCT functions for example seemed not possible to optimize that way. Also note that while clang may accept the code and sometimes produces warnings, it does not seem to do anything actually useful at all. 
Here are example measurements using gcc 10 under Linux (in a VM unfortunately) on AArch64 on Apple M1: Command: time ./ffplay_g LG\ 4K\ HDR\ Demo\ -\ New\ York.ts -t 10 -autoexit -threads 1 -noframedrop Original code: real 0m19.572s user 0m23.386s sys 0m0.213s Changing all put_hevc: real 0m15.648s user 0m19.503s (83.4% of original) sys 0m0.186s In addition changing add_residual: real 0m15.424s user 0m19.278s (82.4% of original) sys 0m0.133s In addition changing planar copy dither: real 0m15.040s user 0m18.874s (80.7% of original) sys 0m0.168s Signed-off-by: Reimar Döffinger --- configure | 23 + libavcodec/hevcdsp_template.c | 47 +++ libavutil/internal.h | 6 + libswscale/swscale_unscaled.c | 3 +++ 4 files changed, 79 insertions(+) diff --git a/configure b/configure index 900505756b..73b7c3daeb 100755 --- a/configure +++ b/configure @@ -406,6 +406,7 @@ Toolchain options: --enable-pic build position-independent code --enable-thumb compile for Thumb instruction set --enable-lto use link-time optimization + --enable-openmp-simd use the "omp simd" pragma to optimize code --env="ENV=override" override the environment variables Advanced options (experts only): @@ -2335,6 +2336,7 @@ HAVE_LIST=" opencl_dxva2 opencl_vaapi_beignet opencl_vaapi_intel_media +openmp_simd perl pod2man texi2html @@ -2446,6 +2448,7 @@ CMDLINE_SELECT=" extra_warnings logging lto +openmp_simd optimizations rpath stripping @@ -6926,6 +6929,26 @@ if enabled lto; then disable inline_asm_direct_symbol_refs fi +if enabled openmp_simd; then +ompopt="-fopenmp" +if ! 
test_cflags $ompopt ; then +test_cflags -Xpreprocessor -fopenmp && ompopt="-Xpreprocessor -fopenmp" +fi +test_cc $ompopt <> shift); src += srcstride; @@ -568,6 +573,7 @@ static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, ox = ox * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { +FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixelsrc[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); src += srcstride; @@ -592,6 +598,7 @@ static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ox1 = ox1 * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { +FF_OMP_SIMD for (x = 0; x < width; x++) { dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); } @@ -623,6 +630,7 @@ static void FUNC(put_hevc_qpel_h)(int16_t *dst, ptrdiff_t srcstride = _srcstride / sizeof(pixel); const int8_t *filter= ff_hevc_qpel_filters[mx - 1]; for (y = 0; y < height; y++) { +FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -639,6 +647,7 @@ static void FUNC(put_hevc_qpel_v)(int16_t *dst, ptrdiff_t srcstride = _srcstride / sizeof(pixel); const int8_t *filter= ff_hevc_qpel_filters[my - 1]; for (y = 0; y < height; y++) { +FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); src += srcstride; @@ -662,6 +671,7 @@ static void FUNC(put_hevc_qpel_hv)(int16_t *dst, src -= QPEL_EXTRA_BEFORE * srcstride; filter = ff_hevc_qpel_filters[mx - 1]; for (y = 0; y < height + QPEL_EXTRA; y++) { +FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] =
[FFmpeg-devel] [PATCH] libavcodec/aarch64/hevcdsp_idct_neon.S: Also port add_residual functions.
From: Reimar Döffinger Speedup is fairly small, around 1.5%, but these are fairly simple. --- libavcodec/aarch64/hevcdsp_idct_neon.S| 190 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 24 +++ 2 files changed, 214 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 9f67e45..edd03a0 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -36,6 +36,196 @@ const trans, align=4 .short 31, 22, 13, 4 endconst +.macro clip10 in1, in2, c1, c2 +smax\in1, \in1, \c1 +smax\in2, \in2, \c1 +smin\in1, \in1, \c2 +smin\in2, \in2, \c2 +.endm + +function ff_hevc_add_residual_4x4_8_neon, export=1 +ld1 {v0.8H-v1.8H}, [x1] +ld1 {v2.S}[0], [x0], x2 +ld1 {v2.S}[1], [x0], x2 +ld1 {v2.S}[2], [x0], x2 +ld1 {v2.S}[3], [x0], x2 +sub x0, x0, x2, lsl #2 +uxtlv8.8H, v2.8B +uxtl2 v9.8H, v2.16B +sqadd v0.8H, v0.8H, v8.8H +sqadd v1.8H, v1.8H, v9.8H +sqxtun v0.8B, v0.8H +sqxtun2 v0.16B, v1.8H +st1 {v0.S}[0], [x0], x2 +st1 {v0.S}[1], [x0], x2 +st1 {v0.S}[2], [x0], x2 +st1 {v0.S}[3], [x0], x2 +ret +endfunc + +function ff_hevc_add_residual_4x4_10_neon, export=1 +mov x12, x0 +ld1 {v0.8H-v1.8H}, [x1] +ld1 {v2.D}[0], [x12], x2 +ld1 {v2.D}[1], [x12], x2 +ld1 {v3.D}[0], [x12], x2 +sqadd v0.8H, v0.8H, v2.8H +ld1 {V3.D}[1], [x12], x2 +moviv4.8H, #0 +sqadd v1.8H, v1.8H, v3.8H +mvniv5.8H, #0xFC, LSL #8 // movi #0x3FF +clip10 v0.8H, v1.8H, v4.8H, v5.8H +st1 {v0.D}[0], [x0], x2 +st1 {v0.D}[1], [x0], x2 +st1 {v1.D}[0], [x0], x2 +st1 {v1.D}[1], [x0], x2 +ret +endfunc + +function ff_hevc_add_residual_8x8_8_neon, export=1 +add x12, x0, x2 +add x2, x2, x2 +mov x3, #8 +1: subsx3, x3, #2 +ld1 {v2.D}[0], [x0] +ld1 {v2.D}[1], [x12] +uxtlv3.8H, v2.8B +ld1 {v0.8H-v1.8H}, [x1], #32 +uxtl2 v2.8H, v2.16B +sqadd v0.8H, v0.8H, v3.8H +sqadd v1.8H, v1.8H, v2.8H +sqxtun v0.8B, v0.8H +sqxtun2 v0.16B, v1.8H +st1 {v0.D}[0], [x0], x2 +st1 {v0.D}[1], [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_8x8_10_neon, export=1 
+add x12, x0, x2 +add x2, x2, x2 +mov x3, #8 +moviv4.8H, #0 +mvniv5.8H, #0xFC, LSL #8 // movi #0x3FF +1: subsx3, x3, #2 +ld1 {v0.8H-v1.8H}, [x1], #32 +ld1 {v2.8H},[x0] +sqadd v0.8H, v0.8H, v2.8H +ld1 {v3.8H},[x12] +sqadd v1.8H, v1.8H, v3.8H +clip10 v0.8H, v1.8H, v4.8H, v5.8H +st1 {v0.8H}, [x0], x2 +st1 {v1.8H}, [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_16x16_8_neon, export=1 +mov x3, #16 +add x12, x0, x2 +add x2, x2, x2 +1: subsx3, x3, #2 +ld1 {v16.16B}, [x0] +ld1 {v0.8H-v3.8H}, [x1], #64 +ld1 {v19.16B},[x12] +uxtlv17.8H, v16.8B +uxtl2 v18.8H, v16.16B +uxtlv20.8H, v19.8B +uxtl2 v21.8H, v19.16B +sqadd v0.8H, v0.8H, v17.8H +sqadd v1.8H, v1.8H, v18.8H +sqadd v2.8H, v2.8H, v20.8H +sqadd v3.8H, v3.8H, v21.8H +sqxtun v0.8B, v0.8H +sqxtun2 v0.16B, v1.8H +sqxtun v1.8B, v2.8H +sqxtun2 v1.16B, v3.8H +st1 {v0.16B}, [x0], x2 +st1 {v1.16B}, [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_16x16_10_neon, export=1 +mov x3, #16 +moviv20.8H, #0 +mvniv21.8H, #0xFC, LSL #8 // movi #0x3FF +add x12, x0, x2 +add x2, x2, x2 +1: subs
[FFmpeg-devel] [PATCH] libavcodec/aarch64/hevcdsp_idct_neon.S: Also port add_residual functions.
From: Reimar Döffinger Speedup is fairly small, around 1.5%, but these are fairly simple. --- libavcodec/aarch64/hevcdsp_idct_neon.S| 190 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 24 +++ 2 files changed, 214 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 9f67e45..edd03a0 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -36,6 +36,196 @@ const trans, align=4 .short 31, 22, 13, 4 endconst +.macro clip10 in1, in2, c1, c2 +smax\in1, \in1, \c1 +smax\in2, \in2, \c1 +smin\in1, \in1, \c2 +smin\in2, \in2, \c2 +.endm + +function ff_hevc_add_residual_4x4_8_neon, export=1 +ld1 {v0.8H-v1.8H}, [x1] +ld1 {v2.S}[0], [x0], x2 +ld1 {v2.S}[1], [x0], x2 +ld1 {v2.S}[2], [x0], x2 +ld1 {v2.S}[3], [x0], x2 +sub x0, x0, x2, lsl #2 +uxtlv8.8H, v2.8B +uxtl2 v9.8H, v2.16B +sqadd v0.8H, v0.8H, v8.8H +sqadd v1.8H, v1.8H, v9.8H +sqxtun v0.8B, v0.8H +sqxtun2 v0.16B, v1.8H +st1 {v0.S}[0], [x0], x2 +st1 {v0.S}[1], [x0], x2 +st1 {v0.S}[2], [x0], x2 +st1 {v0.S}[3], [x0], x2 +ret +endfunc + +function ff_hevc_add_residual_4x4_10_neon, export=1 +mov x12, x0 +ld1 {v0.8H-v1.8H}, [x1] +ld1 {v2.D}[0], [x12], x2 +ld1 {v2.D}[1], [x12], x2 +ld1 {v3.D}[0], [x12], x2 +sqadd v0.8H, v0.8H, v2.8H +ld1 {V3.D}[1], [x12], x2 +moviv4.8H, #0 +sqadd v1.8H, v1.8H, v3.8H +mvniv5.8H, #0xFC, LSL #8 // movi #0x3FF +clip10 v0.8H, v1.8H, v4.8H, v5.8H +st1 {v0.D}[0], [x0], x2 +st1 {v0.D}[1], [x0], x2 +st1 {v1.D}[0], [x0], x2 +st1 {v1.D}[1], [x0], x2 +ret +endfunc + +function ff_hevc_add_residual_8x8_8_neon, export=1 +add x12, x0, x2 +add x2, x2, x2 +mov x3, #8 +1: subsx3, x3, #2 +ld1 {v2.D}[0], [x0] +ld1 {v2.D}[1], [x12] +uxtlv3.8H, v2.8B +ld1 {v0.8H-v1.8H}, [x1], #32 +uxtl2 v2.8H, v2.16B +sqadd v0.8H, v0.8H, v3.8H +sqadd v1.8H, v1.8H, v2.8H +sqxtun v0.8B, v0.8H +sqxtun2 v0.16B, v1.8H +st1 {v0.D}[0], [x0], x2 +st1 {v0.D}[1], [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_8x8_10_neon, export=1 
+add x12, x0, x2 +add x2, x2, x2 +mov x3, #8 +moviv4.8H, #0 +mvniv5.8H, #0xFC, LSL #8 // movi #0x3FF +1: subsx3, x3, #2 +ld1 {v0.8H-v1.8H}, [x1], #32 +ld1 {v2.8H},[x0] +sqadd v0.8H, v0.8H, v2.8H +ld1 {v3.8H},[x12] +sqadd v1.8H, v1.8H, v3.8H +clip10 v0.8H, v1.8H, v4.8H, v5.8H +st1 {v0.8H}, [x0], x2 +st1 {v1.8H}, [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_16x16_8_neon, export=1 +mov x3, #16 +add x12, x0, x2 +add x2, x2, x2 +1: subsx3, x3, #2 +ld1 {v16.16B}, [x0] +ld1 {v0.8H-v3.8H}, [x1], #64 +ld1 {v19.16B},[x12] +uxtlv17.8H, v16.8B +uxtl2 v18.8H, v16.16B +uxtlv20.8H, v19.8B +uxtl2 v21.8H, v19.16B +sqadd v0.8H, v0.8H, v17.8H +sqadd v1.8H, v1.8H, v18.8H +sqadd v2.8H, v2.8H, v20.8H +sqadd v3.8H, v3.8H, v21.8H +sqxtun v0.8B, v0.8H +sqxtun2 v0.16B, v1.8H +sqxtun v1.8B, v2.8H +sqxtun2 v1.16B, v3.8H +st1 {v0.16B}, [x0], x2 +st1 {v1.16B}, [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_16x16_10_neon, export=1 +mov x3, #16 +moviv20.8H, #0 +mvniv21.8H, #0xFC, LSL #8 // movi #0x3FF +add x12, x0, x2 +add x2, x2, x2 +1: subs
[FFmpeg-devel] [PATCH] libavcodec/hevcdsp: port SIMD idct functions from 32-bit.
From: Reimar Döffinger Makes SIMD-optimized 8x8 and 16x16 idcts for 8 and 10 bit depth available on aarch64. For a UHD HDR (10 bit) sample video these were consuming the most time and this optimization reduced overall decode time from 19.4s to 16.4s, approximately 15% speedup. Test sample was the first 300 frames of "LG 4K HDR Demo - New York.ts", running on Apple M1. --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/hevcdsp_idct_neon.S| 426 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 45 +++ libavcodec/hevcdsp.c | 2 + libavcodec/hevcdsp.h | 1 + 5 files changed, 476 insertions(+) create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index f6434e4..2ea1d74 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -61,3 +61,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ aarch64/vp9lpf_neon.o \ aarch64/vp9mc_16bpp_neon.o \ aarch64/vp9mc_neon.o +NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o \ + aarch64/hevcdsp_init_aarch64.o diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S new file mode 100644 index 000..9f67e45 --- /dev/null +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -0,0 +1,426 @@ +/* + * ARM NEON optimised IDCT functions for HEVC decoding + * Copyright (c) 2014 Seppo Tomperi + * Copyright (c) 2017 Alexandra Hájková + * + * Ported from arm/hevcdsp_idct_neon.S by + * Copyright (c) 2020 Reimar Döffinger + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +const trans, align=4 +.short 64, 83, 64, 36 +.short 89, 75, 50, 18 +.short 90, 87, 80, 70 +.short 57, 43, 25, 9 +.short 90, 90, 88, 85 +.short 82, 78, 73, 67 +.short 61, 54, 46, 38 +.short 31, 22, 13, 4 +endconst + +.macro sum_sub out, in, c, op, p + .ifc \op, + +smlal\p \out, \in, \c + .else +smlsl\p \out, \in, \c + .endif +.endm + +.macro fixsqrshrn d, dt, n, m + .ifc \dt, .8H +sqrshrn2\d\dt, \n\().4S, \m + .else +sqrshrn \n\().4H, \n\().4S, \m +mov \d\().D[0], \n\().D[0] + .endif +.endm + +.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3, p1, p2 + sshll\p1 \tmp0, \in0, #6 + mov\tmp1, \tmp0 + smull\p1 \tmp2, \in1, v0.H[1] + smull\p1 \tmp3, \in1, v0.H[3] + smlal\p2 \tmp0, \in2, v0.H[0] //e0 + smlsl\p2 \tmp1, \in2, v0.H[0] //e1 + smlal\p2 \tmp2, \in3, v0.H[3] //o0 + smlsl\p2 \tmp3, \in3, v0.H[1] //o1 + + add\out0, \tmp0, \tmp2 + add\out1, \tmp1, \tmp3 + sub\out2, \tmp1, \tmp3 + sub\out3, \tmp0, \tmp2 +.endm + +.macro transpose8_4x4 r0, r1, r2, r3 +trn1v2.8H, \r0\().8H, \r1\().8H +trn2v3.8H, \r0\().8H, \r1\().8H +trn1v4.8H, \r2\().8H, \r3\().8H +trn2v5.8H, \r2\().8H, \r3\().8H +trn1\r0\().4S, v2.4S, v4.4S +trn2\r2\().4S, v2.4S, v4.4S +trn1\r1\().4S, v3.4S, v5.4S +trn2\r3\().4S, v3.4S, v5.4S +.endm + +.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7 +transpose8_4x4 \r0, \r1, \r2, \r3 +transpose8_4x4 \r4, \r5, \r6, \r7 +.endm + +.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, in5,in5t, in6,in6t, 
in7,in7t, p1, p2 +tr_4x4_8\in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4S, v25.4S, v26.4S, v27.4S, v28.4S, v29.4S, v30.4S, v31.4S, \p1, \p2 + +smull\p1v30.4S, \in1\in1t, v0.H[6] +smull\p1v28.4S, \in1\in1t,