[FFmpeg-cvslog] libavfilter: add vf_colorrange_cuda, CUDA-accelerated color conversion filter
ffmpeg | branch: master | Roman Arzumanyan | Sat Sep 10 11:05:56 2022 +0300| [cc81ab283c72921a23f7dc149c6a2b386eaf77c6] | committer: Timo Rothenpieler libavfilter: add vf_colorrange_cuda, CUDA-accelerated color conversion filter Signed-off-by: Timo Rothenpieler > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=cc81ab283c72921a23f7dc149c6a2b386eaf77c6 --- configure | 6 +- doc/filters.texi | 32 +++ libavfilter/Makefile | 3 + libavfilter/allfilters.c | 1 + libavfilter/version.h | 2 +- libavfilter/vf_colorspace_cuda.c | 435 ++ libavfilter/vf_colorspace_cuda.cu | 94 7 files changed, 570 insertions(+), 3 deletions(-) diff --git a/configure b/configure index b7dc1d8656..240ae942d1 100755 --- a/configure +++ b/configure @@ -3149,10 +3149,12 @@ qsvvpp_select="qsv" vaapi_encode_deps="vaapi" v4l2_m2m_deps="linux_videodev2_h sem_timedwait" -chromakey_cuda_filter_deps="ffnvcodec" -chromakey_cuda_filter_deps_any="cuda_nvcc cuda_llvm" bilateral_cuda_filter_deps="ffnvcodec" bilateral_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +chromakey_cuda_filter_deps="ffnvcodec" +chromakey_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +colorspace_cuda_filter_deps="ffnvcodec" +colorspace_cuda_filter_deps_any="cuda_nvcc cuda_llvm" hwupload_cuda_filter_deps="ffnvcodec" scale_npp_filter_deps="ffnvcodec libnpp" scale2ref_npp_filter_deps="ffnvcodec libnpp" diff --git a/doc/filters.texi b/doc/filters.texi index dbc08163d8..6aa350a63c 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -9725,6 +9725,38 @@ For example to convert the input to SMPTE-240M, use the command: colorspace=smpte240m @end example +@section colorspace_cuda + +CUDA accelerated implementation of the colorspace filter. + +It is by no means feature complete compared to the software colorspace filter, +and at the current time only supports color range conversion between jpeg/full +and mpeg/limited range. + +The filter accepts the following options: + +@table @option +@item range +Specify output color range. 
+ +The accepted values are: +@table @samp +@item tv +TV (restricted) range + +@item mpeg +MPEG (restricted) range + +@item pc +PC (full) range + +@item jpeg +JPEG (full) range + +@end table + +@end table + @section colortemperature Adjust color temperature in video to simulate variations in ambient color temperature. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 841ec47141..ff2a06c262 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -230,6 +230,9 @@ OBJS-$(CONFIG_COLORLEVELS_FILTER)+= vf_colorlevels.o OBJS-$(CONFIG_COLORMAP_FILTER) += vf_colormap.o OBJS-$(CONFIG_COLORMATRIX_FILTER)+= vf_colormatrix.o OBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o colorspacedsp.o +OBJS-$(CONFIG_COLORSPACE_CUDA_FILTER)+= vf_colorspace_cuda.o \ +vf_colorspace_cuda.ptx.o \ +cuda/load_helper.o OBJS-$(CONFIG_COLORTEMPERATURE_FILTER) += vf_colortemperature.o OBJS-$(CONFIG_CONVOLUTION_FILTER)+= vf_convolution.o OBJS-$(CONFIG_CONVOLUTION_OPENCL_FILTER) += vf_convolution_opencl.o opencl.o \ diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 79e8a16bbc..119de40b25 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -213,6 +213,7 @@ extern const AVFilter ff_vf_colorlevels; extern const AVFilter ff_vf_colormap; extern const AVFilter ff_vf_colormatrix; extern const AVFilter ff_vf_colorspace; +extern const AVFilter ff_vf_colorspace_cuda; extern const AVFilter ff_vf_colortemperature; extern const AVFilter ff_vf_convolution; extern const AVFilter ff_vf_convolution_opencl; diff --git a/libavfilter/version.h b/libavfilter/version.h index fc0df70dee..5aac9c513a 100644 --- a/libavfilter/version.h +++ b/libavfilter/version.h @@ -31,7 +31,7 @@ #include "version_major.h" -#define LIBAVFILTER_VERSION_MINOR 48 +#define LIBAVFILTER_VERSION_MINOR 49 #define LIBAVFILTER_VERSION_MICRO 100 diff --git a/libavfilter/vf_colorspace_cuda.c b/libavfilter/vf_colorspace_cuda.c new file mode 100644 index 00..131c4ad72b --- /dev/null +++ 
b/libavfilter/vf_colorspace_cuda.c @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice
[FFmpeg-cvslog] lavu/riscv: add optimisations
ffmpeg | branch: master | Rémi Denis-Courmont | Mon Sep 12 18:53:20 2022 +0300| [c177108ae1144fd4e6cedb4a702260dbaa179825] | committer: James Almer lavu/riscv: add optimisations This provides some micro-optimisations for signed integer clipping, and support for bit weight with the Zbb extension. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c177108ae1144fd4e6cedb4a702260dbaa179825 --- libavutil/intmath.h | 5 ++- libavutil/riscv/intmath.h | 103 ++ 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/libavutil/intmath.h b/libavutil/intmath.h index 9573109e9d..c54d23b7bf 100644 --- a/libavutil/intmath.h +++ b/libavutil/intmath.h @@ -28,8 +28,9 @@ #if ARCH_ARM # include "arm/intmath.h" -#endif -#if ARCH_X86 +#elif ARCH_RISCV +# include "riscv/intmath.h" +#elif ARCH_X86 # include "x86/intmath.h" #endif diff --git a/libavutil/riscv/intmath.h b/libavutil/riscv/intmath.h new file mode 100644 index 00..78f7ba930a --- /dev/null +++ b/libavutil/riscv/intmath.h @@ -0,0 +1,103 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_RISCV_INTMATH_H +#define AVUTIL_RISCV_INTMATH_H + +#include + +#include "config.h" +#include "libavutil/attributes.h" + +/* + * The compiler is forced to sign-extend the result anyhow, so it is faster to + * compute it explicitly and use it. + */ +#define av_clip_int8 av_clip_int8_rvi +static av_always_inline av_const int8_t av_clip_int8_rvi(int a) +{ +union { uint8_t u; int8_t s; } u = { .u = a }; + +if (a != u.s) +a = ((a >> 31) ^ 0x7F); +return a; +} + +#define av_clip_int16 av_clip_int16_rvi +static av_always_inline av_const int16_t av_clip_int16_rvi(int a) +{ +union { uint8_t u; int8_t s; } u = { .u = a }; + +if (a != u.s) +a = ((a >> 31) ^ 0x7F); +return a; +} + +#define av_clipl_int32 av_clipl_int32_rvi +static av_always_inline av_const int32_t av_clipl_int32_rvi(int64_t a) +{ +union { uint32_t u; int32_t s; } u = { .u = a }; + +if (a != u.s) +a = ((a >> 63) ^ 0x7FFF); +return a; +} + +#define av_clip_intp2 av_clip_intp2_rvi +static av_always_inline av_const int av_clip_intp2_rvi(int a, int p) +{ +const int shift = 32 - p; +int b = (a << shift) >> shift; + +if (a != b) +b = (a >> 31) ^ ((1 << p) - 1); +return b; +} + +#if defined (__riscv_zbb) && (__riscv_zbb > 0) && HAVE_INLINE_ASM + +#define av_popcount av_popcount_rvb +static av_always_inline av_const int av_popcount_rvb(uint32_t x) +{ +int ret; + +#if (__riscv_xlen >= 64) +__asm__ ("cpopw %0, %1\n" : "=r" (ret) : "r" (x)); +#else +__asm__ ("cpop %0, %1\n" : "=r" (ret) : "r" (x)); +#endif +return ret; +} + +#if (__riscv_xlen >= 64) +#define av_popcount64 av_popcount64_rvb +static av_always_inline av_const int av_popcount64_rvb(uint64_t x) +{ +int ret; + +#if (__riscv_xlen >= 128) +__asm__ ("cpopd %0, %1\n" : "=r" (ret) : "r" (x)); +#else +__asm__ ("cpop %0, %1\n" : 
"=r" (ret) : "r" (x)); +#endif +return ret; +} +#endif /* __riscv_xlen >= 64 */ +#endif /* __riscv_zbb */ + +#endif /* AVUTIL_RISCV_INTMATH_H */ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavu/riscv: byte-swap operations
ffmpeg | branch: master | Rémi Denis-Courmont | Mon Sep 12 18:53:19 2022 +0300| [df2057041b6079bea2fc5e6b31b00756f3da7d54] | committer: James Almer lavu/riscv: byte-swap operations If the target supports the Basic bit-manipulation (Zbb) extension, then the REV8 instruction is available to reverse byte order. Note that this instruction only exists at the "XLEN" register size, so we need to right shift the result down to the data width. If Zbb is not supported, then this patchset does nothing. Support for run-time detection is left for the future. Currently, there are no bits in auxv/ELF HWCAP for Z-extensions, so there are no clean ways to do this. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=df2057041b6079bea2fc5e6b31b00756f3da7d54 --- libavutil/bswap.h | 2 ++ libavutil/riscv/bswap.h | 74 + 2 files changed, 76 insertions(+) diff --git a/libavutil/bswap.h b/libavutil/bswap.h index 91cb79538d..4840ab433f 100644 --- a/libavutil/bswap.h +++ b/libavutil/bswap.h @@ -40,6 +40,8 @@ # include "arm/bswap.h" #elif ARCH_AVR32 # include "avr32/bswap.h" +#elif ARCH_RISCV +# include "riscv/bswap.h" #elif ARCH_SH4 # include "sh4/bswap.h" #elif ARCH_X86 diff --git a/libavutil/riscv/bswap.h b/libavutil/riscv/bswap.h new file mode 100644 index 00..de1429c0f7 --- /dev/null +++ b/libavutil/riscv/bswap.h @@ -0,0 +1,74 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_RISCV_BSWAP_H +#define AVUTIL_RISCV_BSWAP_H + +#include +#include "config.h" +#include "libavutil/attributes.h" + +#if defined (__riscv_zbb) && (__riscv_zbb > 0) && HAVE_INLINE_ASM + +static av_always_inline av_const uintptr_t av_bswap_xlen(uintptr_t x) +{ +uintptr_t y; + +__asm__("rev8 %0, %1" : "=r" (y) : "r" (x)); +return y; +} + +#define av_bswap16 av_bswap16 + +static av_always_inline av_const uint_fast16_t av_bswap16(uint_fast16_t x) +{ +return av_bswap_xlen(x) >> (__riscv_xlen - 16); +} + +#if (__riscv_xlen == 32) +#define av_bswap32 av_bswap_xlen +#define av_bswap64 av_bswap64 + +static av_always_inline av_const uint64_t av_bswap64(uint64_t x) +{ +return (((uint64_t)av_bswap32(x)) << 32) | av_bswap32(x >> 32); +} + +#else +#define av_bswap32 av_bswap32 + +static av_always_inline av_const uint_fast32_t av_bswap32(uint_fast32_t x) +{ +return av_bswap_xlen(x) >> (__riscv_xlen - 32); +} + +#if (__riscv_xlen == 64) +#define av_bswap64 av_bswap_xlen + +#else +#define av_bswap64 av_bswap64 + +static av_always_inline av_const uint_fast64_t av_bswap64(uint_fast64_t x) +{ +return av_bswap_xlen(x) >> (__riscv_xlen - 64); +} + +#endif /* __riscv_xlen > 64 */ +#endif /* __riscv_xlen > 32 */ +#endif /* __riscv_zbb */ +#endif /* AVUTIL_RISCV_BSWAP_H */ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] configure/riscv: detect fast CLZ
ffmpeg | branch: master | Rémi Denis-Courmont | Mon Sep 12 18:53:18 2022 +0300| [ff14e3739393147b4596245ea511ec43a4ce6448] | committer: James Almer configure/riscv: detect fast CLZ RISC-V defines the CLZ instruction as part of the ratified Zbb subset of the (not yet ratified) bit manipulation extension (B). We can detect it from the __riscv_zbb predefined constant. At least GCC 12 already supports this correctly. Note that the macro will be non-zero if supported, zero if enabled in the compiler flags (e.g. -march=rv64gzbb) but not known to the compiler, and undefined otherwise. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ff14e3739393147b4596245ea511ec43a4ce6448 --- configure | 6 ++ 1 file changed, 6 insertions(+) diff --git a/configure b/configure index 9e51abd0d3..b7dc1d8656 100755 --- a/configure +++ b/configure @@ -5334,6 +5334,12 @@ elif enabled ppc; then ;; esac +elif enabled riscv; then + +if test_cpp_condition stddef.h "__riscv_zbb"; then +enable fast_clz +fi + elif enabled sparc; then case $cpu in ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavu/riscv: AV_READ_TIME cycle counter
ffmpeg | branch: master | Rémi Denis-Courmont | Mon Sep 12 18:53:17 2022 +0300| [d808070547a867a8f3f7b97fdff3574576213c07] | committer: James Almer lavu/riscv: AV_READ_TIME cycle counter This uses the architected RISC-V 64-bit cycle counter from the RISC-V unprivileged instruction set. In 64-bit and 128-bit, this is a straightforward CSR read. In 32-bit mode, the 64-bit value is exposed as two CSRs, which cannot be read atomically, so a loop is necessary to detect and fix up the race condition where the bottom half wraps exactly between the two reads. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d808070547a867a8f3f7b97fdff3574576213c07 --- libavutil/riscv/timer.h | 53 + libavutil/timer.h | 2 ++ 2 files changed, 55 insertions(+) diff --git a/libavutil/riscv/timer.h b/libavutil/riscv/timer.h new file mode 100644 index 00..a34157a566 --- /dev/null +++ b/libavutil/riscv/timer.h @@ -0,0 +1,53 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_RISCV_TIMER_H +#define AVUTIL_RISCV_TIMER_H + +#include "config.h" + +#if HAVE_INLINE_ASM +#include + +static inline uint64_t rdcycle64(void) +{ +#if (__riscv_xlen >= 64) +uintptr_t cycles; + +__asm__ volatile ("rdcycle %0" : "=r"(cycles)); + +#else +uint64_t cycles; +uint32_t hi, lo, check; + +__asm__ volatile ( +"1: rdcycleh %0\n" +" rdcycle %1\n" +" rdcycleh %2\n" +" bne %0, %2, 1b\n" : "=r" (hi), "=r" (lo), "=r" (check)); + +cycles = (((uint64_t)hi) << 32) | lo; + +#endif +return cycles; +} + +#define AV_READ_TIME rdcycle64 + +#endif +#endif /* AVUTIL_RISCV_TIMER_H */ diff --git a/libavutil/timer.h b/libavutil/timer.h index 48e576739f..d3db5a27ef 100644 --- a/libavutil/timer.h +++ b/libavutil/timer.h @@ -57,6 +57,8 @@ # include "arm/timer.h" #elif ARCH_PPC # include "ppc/timer.h" +#elif ARCH_RISCV +# include "riscv/timer.h" #elif ARCH_X86 # include "x86/timer.h" #endif ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] doc: reference the RISC-V specification
ffmpeg | branch: master | Rémi Denis-Courmont | Mon Sep 12 18:53:16 2022 +0300| [092ce9712f63fc2641ec831d09c8ca0731083ae4] | committer: James Almer doc: reference the RISC-V specification > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=092ce9712f63fc2641ec831d09c8ca0731083ae4 --- doc/optimization.txt | 5 + 1 file changed, 5 insertions(+) diff --git a/doc/optimization.txt b/doc/optimization.txt index 974e2f9af2..3ed29fe38c 100644 --- a/doc/optimization.txt +++ b/doc/optimization.txt @@ -267,6 +267,11 @@ CELL/SPU: http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/30B3520C93F437AB87257060006FFE5E/$file/Language_Extensions_for_CBEA_2.4.pdf http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/9F820A5FFA3ECE8C8725716A0062585F/$file/CBE_Handbook_v1.1_24APR2007_pub.pdf +RISC-V-specific: + +The RISC-V Instruction Set Manual, Volume 1, Unprivileged ISA: +https://riscv.org/technical/specifications/ + GCC asm links: -- official doc but quite ugly ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] x86/float_dsp: use three operand form for some instructions
ffmpeg | branch: master | James Almer | Tue Sep 13 13:50:09 2022 -0300| [bda3a9faf4a2f201b24fb38a04da86410c9205ae] | committer: James Almer x86/float_dsp: use three operand form for some instructions Fixes compilation with old yasm Signed-off-by: James Almer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bda3a9faf4a2f201b24fb38a04da86410c9205ae --- libavutil/x86/float_dsp.asm | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index 8f8e6dddf5..ff608f5f5a 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -443,19 +443,19 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset INIT_YMM fma3 cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset xor offsetq, offsetq -xorps m0, m0 +xorps m0, m0, m0 shl sized, 2 mov lenq, sizeq cmp lenq, 32 jl .l16 cmp lenq, 64 jl .l32 -xorpsm1, m1 +xorpsm1, m1, m1 cmp lenq, 128 jl .l64 andlenq, ~127 -xorpsm2, m2 -xorpsm3, m3 +xorpsm2, m2, m2 +xorpsm3, m3, m3 .loop128: movups m4, [v1q+offsetq] movups m5, [v1q+offsetq + 32] @@ -468,13 +468,13 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset add offsetq, 128 cmp offsetq, lenq jl .loop128 -addpsm0, m2 -addpsm1, m3 +addpsm0, m0, m2 +addpsm1, m1, m3 mov lenq, sizeq and lenq, 127 cmp lenq, 64 jge .l64 -addpsm0, m1 +addpsm0, m0, m1 cmp lenq, 32 jge .l32 vextractf128 xmm2, m0, 1 @@ -502,7 +502,7 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset add offsetq, 64 cmp offsetq, lenq jl .loop64 -addpsm0, m1 +addpsm0, m0, m1 mov lenq, sizeq and lenq, 63 cmp lenq, 32 ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] avcodec/x86/audiodsp: add scalarproduct avx2
ffmpeg | branch: master | Paul B Mahol | Mon Sep 12 18:53:31 2022 +0200| [37a503ac879ca7677beb7423c33a6c5d24dd6396] | committer: Paul B Mahol avcodec/x86/audiodsp: add scalarproduct avx2 > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=37a503ac879ca7677beb7423c33a6c5d24dd6396 --- libavcodec/x86/audiodsp.asm| 18 ++ libavcodec/x86/audiodsp_init.c | 6 ++ 2 files changed, 24 insertions(+) diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm index b604b0443c..f64077cb13 100644 --- a/libavcodec/x86/audiodsp.asm +++ b/libavcodec/x86/audiodsp.asm @@ -44,6 +44,24 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order movd eax, m2 RET +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal scalarproduct_int16, 3,3,2, v1, v2, order +add orderd, orderd +add v1q, orderq +add v2q, orderq +neg orderq +pxorm1, m1 +.loop: +movum0, [v1q + orderq] +pmaddwd m0, [v2q + orderq] +paddd m1, m0 +add orderq, mmsize +jl .loop +HADDD m1, m0 +movd eax, xm1 +RET +%endif ;- ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, diff --git a/libavcodec/x86/audiodsp_init.c b/libavcodec/x86/audiodsp_init.c index aa5e43e570..68aa3b2129 100644 --- a/libavcodec/x86/audiodsp_init.c +++ b/libavcodec/x86/audiodsp_init.c @@ -24,6 +24,9 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/audiodsp.h" +int32_t ff_scalarproduct_int16_avx2(const int16_t *v1, const int16_t *v2, +int order); + int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order); @@ -53,4 +56,7 @@ av_cold void ff_audiodsp_init_x86(AudioDSPContext *c) if (EXTERNAL_SSE4(cpu_flags)) c->vector_clip_int32 = ff_vector_clip_int32_sse4; + +if (EXTERNAL_AVX2_FAST(cpu_flags)) +c->scalarproduct_int16 = ff_scalarproduct_int16_avx2; } ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] avutil/x86/float_dsp: add fma3 for scalarproduct
ffmpeg | branch: master | Paul B Mahol | Wed Jan 20 16:58:31 2021 +0100| [72acff9f593f977944a62652fc9dd346ec53225a] | committer: Paul B Mahol avutil/x86/float_dsp: add fma3 for scalarproduct > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=72acff9f593f977944a62652fc9dd346ec53225a --- libavutil/x86/float_dsp.asm| 127 + libavutil/x86/float_dsp_init.c | 2 + 2 files changed, 129 insertions(+) diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index cca4d019c7..8f8e6dddf5 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -440,6 +440,133 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset %endif RET +INIT_YMM fma3 +cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset +xor offsetq, offsetq +xorps m0, m0 +shl sized, 2 +mov lenq, sizeq +cmp lenq, 32 +jl .l16 +cmp lenq, 64 +jl .l32 +xorpsm1, m1 +cmp lenq, 128 +jl .l64 +andlenq, ~127 +xorpsm2, m2 +xorpsm3, m3 +.loop128: +movups m4, [v1q+offsetq] +movups m5, [v1q+offsetq + 32] +movups m6, [v1q+offsetq + 64] +movups m7, [v1q+offsetq + 96] +fmaddps m0, m4, [v2q+offsetq ], m0 +fmaddps m1, m5, [v2q+offsetq + 32], m1 +fmaddps m2, m6, [v2q+offsetq + 64], m2 +fmaddps m3, m7, [v2q+offsetq + 96], m3 +add offsetq, 128 +cmp offsetq, lenq +jl .loop128 +addpsm0, m2 +addpsm1, m3 +mov lenq, sizeq +and lenq, 127 +cmp lenq, 64 +jge .l64 +addpsm0, m1 +cmp lenq, 32 +jge .l32 +vextractf128 xmm2, m0, 1 +addpsxmm0, xmm2 +cmp lenq, 16 +jge .l16 +movhlps xmm1, xmm0 +addpsxmm0, xmm1 +movssxmm1, xmm0 +shufps xmm0, xmm0, 1 +addssxmm0, xmm1 +%if ARCH_X86_64 == 0 +movss r0m, xm0 +fld dword r0m +%endif +RET +.l64: +andlenq, ~63 +addlenq, offsetq +.loop64: +movups m4, [v1q+offsetq] +movups m5, [v1q+offsetq + 32] +fmaddps m0, m4, [v2q+offsetq], m0 +fmaddps m1, m5, [v2q+offsetq + 32], m1 +add offsetq, 64 +cmp offsetq, lenq +jl .loop64 +addpsm0, m1 +mov lenq, sizeq +and lenq, 63 +cmp lenq, 32 +jge .l32 +vextractf128 xmm2, m0, 1 +addpsxmm0, xmm2 +cmp lenq, 16 +jge .l16 +movhlps xmm1, xmm0 
+addpsxmm0, xmm1 +movssxmm1, xmm0 +shufps xmm0, xmm0, 1 +addssxmm0, xmm1 +%if ARCH_X86_64 == 0 +movss r0m, xm0 +fld dword r0m +%endif +RET +.l32: +andlenq, ~31 +addlenq, offsetq +.loop32: +movups m4, [v1q+offsetq] +fmaddps m0, m4, [v2q+offsetq], m0 +add offsetq, 32 +cmp offsetq, lenq +jl .loop32 +vextractf128 xmm2, m0, 1 +addpsxmm0, xmm2 +mov lenq, sizeq +and lenq, 31 +cmp lenq, 16 +jge .l16 +movhlps xmm1, xmm0 +addpsxmm0, xmm1 +movssxmm1, xmm0 +shufps xmm0, xmm0, 1 +addssxmm0, xmm1 +%if ARCH_X86_64 == 0 +movss r0m, xm0 +fld dword r0m +%endif +RET +.l16: +andlenq, ~15 +addlenq, offsetq +.loop16: +movaps xmm1, [v1q+offsetq] +mulpsxmm1, [v2q+offsetq] +addpsxmm0, xmm1 +add offsetq, 16 +cmp offsetq, lenq +jl .loop16 +movhlps xmm1, xmm0 +addpsxmm0, xmm1 +movssxmm1, xmm0 +shufps xmm0, xmm0, 1 +addssxmm0, xmm1 +%if ARCH_X86_64 == 0 +movss r0m, xm0 +fld dword r0m +%endif +RET + ;- ; void ff_butterflies_float(float *src0, float *src1, int len); ;- diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index ad17bc2044..ad6b506259 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -74,6 +74,7 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0, const float *src1, int len); float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); +float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order); void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len); @@ -112,5 +113,6 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; fdsp->vector_fmul_add= ff_vector_fmul_add_fma3; fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3; +fdsp->scalarproduct_float = ff_scalarproduct_float_fma3; } } ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link
[FFmpeg-cvslog] avcodec/flac_parser: avoid returning too negative number
ffmpeg | branch: master | Paul B Mahol | Thu Sep 8 09:59:09 2022 +0200| [cf2cf31805448dd11692313440a21821773a6128] | committer: Paul B Mahol avcodec/flac_parser: avoid returning too negative number If the return value is very small, the parser code will assert. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=cf2cf31805448dd11692313440a21821773a6128 --- libavcodec/flac_parser.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libavcodec/flac_parser.c b/libavcodec/flac_parser.c index 5b3a4e6e67..bd91cc1a05 100644 --- a/libavcodec/flac_parser.c +++ b/libavcodec/flac_parser.c @@ -663,8 +663,11 @@ static int get_best_header(FLACParseContext *fpc, const uint8_t **poutbuf, /* Return the negative overread index so the client can compute pos. This should be the amount overread to the beginning of the child */ -if (child) -return child->offset - flac_fifo_size(&fpc->fifo_buf); +if (child) { +int64_t offset = child->offset - flac_fifo_size(&fpc->fifo_buf); +if (offset > -(1 << 28)) +return offset; +} return 0; } ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] fate/spdif: Add spdif tests
ffmpeg | branch: master | Andreas Rheinhardt | Sun Sep 11 18:34:47 2022 +0200| [9ad3db3ad932d484708194f419544c33cb3c71e6] | committer: Andreas Rheinhardt fate/spdif: Add spdif tests These tests test both the demuxer as well as the muxer wherever possible. It is not always possible due to the fact that the muxer supports more codecs than the demuxer. The spdif demuxer does currently not set the need_parsing flag. If one were to set this to AVSTREAM_PARSE_FULL, the test results would change as follows: - For spdif-aac-remux, the packets are currently padded to 16bits, i.e. if the actual packet size is odd, there is a padding byte. The parser splits this byte away into a one byte packet of its own. Insanely, these one byte packets get the same duration as normal packets, i.e. timing is ruined. - The DCA-remux tests get proper duration/timestamps. - In the spdif-mp2-remux test the demuxer marks the stream as being MP2; the parser sets it to MP3 and this triggers the "Codec change in IEC 61937" codepath; this test therefore returns only two packets with the parser. - For spdif-mp3-remux some bytes end up in different packets: Some input packets of this file have an odd length (417B instead of 418B like all the other packets) and are padded to 418B. Without a parser, all returned packets from the spdif-demuxer are 418B. With a parser, the packets that were originally 417B are 417B again, but the padding byte has not been discarded, but added to the next packet which is now 419B. This fixes "Multiple frames in a packet" warning and avoids an "Invalid data found when processing input" error when decoding. 
Signed-off-by: Andreas Rheinhardt > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9ad3db3ad932d484708194f419544c33cb3c71e6 --- tests/Makefile |1 + tests/fate/spdif.mak | 44 ++ tests/ref/fate/spdif-aac-remux | 93 +++ tests/ref/fate/spdif-ac3-remux | 63 ++ tests/ref/fate/spdif-dca-core-bswap|1 + tests/ref/fate/spdif-dca-core-remux| 14 + tests/ref/fate/spdif-dca-master|1 + tests/ref/fate/spdif-dca-master-core |1 + tests/ref/fate/spdif-dca-master-core-remux | 1179 tests/ref/fate/spdif-eac3 |1 + tests/ref/fate/spdif-mlp |1 + tests/ref/fate/spdif-mp2-remux | 49 ++ tests/ref/fate/spdif-mp3-remux | 47 ++ tests/ref/fate/spdif-truehd|1 + 14 files changed, 1496 insertions(+) diff --git a/tests/Makefile b/tests/Makefile index d9c509a415..06494a9cc4 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -231,6 +231,7 @@ include $(SRC_PATH)/tests/fate/real.mak include $(SRC_PATH)/tests/fate/screen.mak include $(SRC_PATH)/tests/fate/segment.mak include $(SRC_PATH)/tests/fate/source.mak +include $(SRC_PATH)/tests/fate/spdif.mak include $(SRC_PATH)/tests/fate/speedhq.mak include $(SRC_PATH)/tests/fate/subtitles.mak include $(SRC_PATH)/tests/fate/truehd.mak diff --git a/tests/fate/spdif.mak b/tests/fate/spdif.mak new file mode 100644 index 00..093b8138e8 --- /dev/null +++ b/tests/fate/spdif.mak @@ -0,0 +1,44 @@ +# This padds the AAC frames to 16 bit words (the actual size is +# still available in the ADTS headers). 
+FATE_SPDIF_REMUX-$(call ALLYES, AAC_DEMUXER AAC_DECODER) += fate-spdif-aac-remux +fate-spdif-aac-remux: CMD = transcode aac $(TARGET_SAMPLES)/aac/foo.aac spdif "-c copy" "-c copy" + +FATE_SPDIF_REMUX-$(call ALLYES, AC3_DEMUXER AC3_DECODER) += fate-spdif-ac3-remux +fate-spdif-ac3-remux: CMD = transcode ac3 $(TARGET_SAMPLES)/ac3/monsters_inc_5.1_448_small.ac3 spdif "-c copy" "-c copy" + +FATE_SPDIF_REMUX-$(call ALLYES, DTS_DEMUXER DCA_DECODER) += fate-spdif-dca-core-remux +fate-spdif-dca-core-remux: CMD = transcode dts $(TARGET_SAMPLES)/dts/dcadec-suite/core_51_24_48_768_0.dtshd spdif "-c copy" "-c copy" + +FATE_SPDIF-$(call DEMMUX, DTSHD, SPDIF) += fate-spdif-dca-core-bswap +fate-spdif-dca-core-bswap: CMD = md5 -i $(TARGET_SAMPLES)/dts/dcadec-suite/core_51_24_48_768_0.dtshd -c copy -spdif_flags +be -f spdif + +# Only the core will be transferred, extensions are discarded. +FATE_SPDIF_REMUX-$(call ALLYES, DTS_DEMUXER DCA_DECODER) += fate-spdif-dca-master-core-remux +fate-spdif-dca-master-core-remux: CMD = transcode dts $(TARGET_SAMPLES)/dts/master_audio_7.1_24bit.dts spdif "-c copy" "-c copy" + +FATE_SPDIF-$(call DEMMUX, DTS, SPDIF) += fate-spdif-dca-master fate-spdif-dca-master-core +fate-spdif-dca-master: CMD = md5 -i $(TARGET_SAMPLES)/dts/master_audio_7.1_24bit.dts -c copy -dtshd_rate 192000 -f spdif +# This test uses a too low bitrate and therefore switches to only transmit the core. +fate-spdif-dca-master-core: CMD = md5 -i $(TARGET_SAMPLES)/dts/master_audio_7.1_24bit.dts -c copy -dtshd_rate 96000 -f spdif + +FATE_SPDIF-$(call DEMMUX, EAC3, SPDIF) += fate-spdif-eac3 +fate-spdif-eac3: CMD = md5 -i
[FFmpeg-cvslog] avcodec/arm/sbcenc: avoid callee preserved vfp registers
ffmpeg | branch: master | James Cowgill | Sun Aug 25 09:18:00 2019 +0100| [50a4dff69f6477b06f00eae1cac2a53ae22fe9a5] | committer: Martin Storsjö avcodec/arm/sbcenc: avoid callee preserved vfp registers When compiling FFmpeg with GCC-9, some very random segfaults were observed in code which had previously called down into the SBC encoder NEON assembly routines. This was caused by these functions clobbering some of the vfp callee saved registers (d8 - d15 aka q4 - q7). GCC was using these registers to save local variables, but after these functions returned, they would contain garbage. Fix by reallocating the registers in the two affected functions in the following way: ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11 ff_sbc_analyze_8_neon: q2-q9 => q8-q15 The reason for using these replacements is to keep closely related sets of registers consecutively numbered which hopefully makes the code more easy to follow. Since this commit only reallocates registers, it should have no performance impact. Signed-off-by: James Cowgill Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=50a4dff69f6477b06f00eae1cac2a53ae22fe9a5 --- libavcodec/arm/sbcdsp_neon.S | 220 +-- 1 file changed, 110 insertions(+), 110 deletions(-) diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S index d83d21d202..914abfb6cc 100644 --- a/libavcodec/arm/sbcdsp_neon.S +++ b/libavcodec/arm/sbcdsp_neon.S @@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1 /* TODO: merge even and odd cases (or even merge all four calls to this * function) in order to have only aligned reads from 'in' array * and reduce number of load instructions */ -vld1.16 {d4, d5}, [r0, :64]! -vld1.16 {d8, d9}, [r2, :128]! +vld1.16 {d16, d17}, [r0, :64]! +vld1.16 {d20, d21}, [r2, :128]! -vmull.s16 q0, d4, d8 -vld1.16 {d6, d7}, [r0, :64]! -vmull.s16 q1, d5, d9 -vld1.16 {d10, d11}, [r2, :128]! +vmull.s16 q0, d16, d20 +vld1.16 {d18, d19}, [r0, :64]! 
+vmull.s16 q1, d17, d21 +vld1.16 {d22, d23}, [r2, :128]! -vmlal.s16 q0, d6, d10 -vld1.16 {d4, d5}, [r0, :64]! -vmlal.s16 q1, d7, d11 -vld1.16 {d8, d9}, [r2, :128]! +vmlal.s16 q0, d18, d22 +vld1.16 {d16, d17}, [r0, :64]! +vmlal.s16 q1, d19, d23 +vld1.16 {d20, d21}, [r2, :128]! -vmlal.s16 q0, d4, d8 -vld1.16 {d6, d7}, [r0, :64]! -vmlal.s16 q1, d5, d9 -vld1.16 {d10, d11}, [r2, :128]! +vmlal.s16 q0, d16, d20 +vld1.16 {d18, d19}, [r0, :64]! +vmlal.s16 q1, d17, d21 +vld1.16 {d22, d23}, [r2, :128]! -vmlal.s16 q0, d6, d10 -vld1.16 {d4, d5}, [r0, :64]! -vmlal.s16 q1, d7, d11 -vld1.16 {d8, d9}, [r2, :128]! +vmlal.s16 q0, d18, d22 +vld1.16 {d16, d17}, [r0, :64]! +vmlal.s16 q1, d19, d23 +vld1.16 {d20, d21}, [r2, :128]! -vmlal.s16 q0, d4, d8 -vmlal.s16 q1, d5, d9 +vmlal.s16 q0, d16, d20 +vmlal.s16 q1, d17, d21 vpadd.s32 d0, d0, d1 vpadd.s32 d1, d2, d3 vrshrn.s32 d0, q0, SBC_PROTO_FIXED_SCALE -vld1.16 {d2, d3, d4, d5}, [r2, :128]! +vld1.16 {d16, d17, d18, d19}, [r2, :128]! vdup.i32d1, d0[1] /* TODO: can be eliminated */ vdup.i32d0, d0[0] /* TODO: can be eliminated */ -vmull.s16 q3, d2, d0 -vmull.s16 q4, d3, d0 -vmlal.s16 q3, d4, d1 -vmlal.s16 q4, d5, d1 +vmull.s16 q10, d16, d0 +vmull.s16 q11, d17, d0 +vmlal.s16 q10, d18, d1 +vmlal.s16 q11, d19, d1 -vpadd.s32 d0, d6, d7 /* TODO: can be eliminated */ -vpadd.s32 d1, d8, d9 /* TODO: can be eliminated */ +vpadd.s32 d0, d20, d21 /* TODO: can be eliminated */ +vpadd.s32 d1, d22, d23 /* TODO: can be eliminated */ vst1.32 {d0, d1}, [r1, :128] @@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1 /* TODO: merge even and odd cases (or even merge all four calls to this * function) in order to have only aligned reads from 'in' array * and reduce number of load instructions */ -vld1.16 {d4, d5}, [r0, :64]! -vld1.16 {d8, d9}, [r2, :128]! - -vmull.s16 q6, d4, d8 -vld1.16 {d6, d7}, [r0, :64]! -vmull.s16 q7, d5, d9 -vld1.16