[FFmpeg-cvslog] libavfilter: add vf_colorrange_cuda, CUDA-accelerated color conversion filter

2022-09-13 Thread Roman Arzumanyan
ffmpeg | branch: master | Roman Arzumanyan  | Sat Sep 
10 11:05:56 2022 +0300| [cc81ab283c72921a23f7dc149c6a2b386eaf77c6] | committer: 
Timo Rothenpieler

libavfilter: add vf_colorrange_cuda, CUDA-accelerated color conversion filter

Signed-off-by: Timo Rothenpieler 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=cc81ab283c72921a23f7dc149c6a2b386eaf77c6
---

 configure |   6 +-
 doc/filters.texi  |  32 +++
 libavfilter/Makefile  |   3 +
 libavfilter/allfilters.c  |   1 +
 libavfilter/version.h |   2 +-
 libavfilter/vf_colorspace_cuda.c  | 435 ++
 libavfilter/vf_colorspace_cuda.cu |  94 
 7 files changed, 570 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index b7dc1d8656..240ae942d1 100755
--- a/configure
+++ b/configure
@@ -3149,10 +3149,12 @@ qsvvpp_select="qsv"
 vaapi_encode_deps="vaapi"
 v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
 
-chromakey_cuda_filter_deps="ffnvcodec"
-chromakey_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 bilateral_cuda_filter_deps="ffnvcodec"
 bilateral_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+chromakey_cuda_filter_deps="ffnvcodec"
+chromakey_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+colorspace_cuda_filter_deps="ffnvcodec"
+colorspace_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 hwupload_cuda_filter_deps="ffnvcodec"
 scale_npp_filter_deps="ffnvcodec libnpp"
 scale2ref_npp_filter_deps="ffnvcodec libnpp"
diff --git a/doc/filters.texi b/doc/filters.texi
index dbc08163d8..6aa350a63c 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -9725,6 +9725,38 @@ For example to convert the input to SMPTE-240M, use the 
command:
 colorspace=smpte240m
 @end example
 
+@section colorspace_cuda
+
+CUDA accelerated implementation of the colorspace filter.
+
+It is by no means feature complete compared to the software colorspace filter,
+and at the current time only supports color range conversion between jpeg/full
+and mpeg/limited range.
+
+The filter accepts the following options:
+
+@table @option
+@item range
+Specify output color range.
+
+The accepted values are:
+@table @samp
+@item tv
+TV (restricted) range
+
+@item mpeg
+MPEG (restricted) range
+
+@item pc
+PC (full) range
+
+@item jpeg
+JPEG (full) range
+
+@end table
+
+@end table
+
 @section colortemperature
 Adjust color temperature in video to simulate variations in ambient color 
temperature.
 
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 841ec47141..ff2a06c262 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -230,6 +230,9 @@ OBJS-$(CONFIG_COLORLEVELS_FILTER)+= 
vf_colorlevels.o
 OBJS-$(CONFIG_COLORMAP_FILTER)   += vf_colormap.o
 OBJS-$(CONFIG_COLORMATRIX_FILTER)+= vf_colormatrix.o
 OBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o colorspacedsp.o
+OBJS-$(CONFIG_COLORSPACE_CUDA_FILTER)+= vf_colorspace_cuda.o \
+vf_colorspace_cuda.ptx.o \
+cuda/load_helper.o
 OBJS-$(CONFIG_COLORTEMPERATURE_FILTER)   += vf_colortemperature.o
 OBJS-$(CONFIG_CONVOLUTION_FILTER)+= vf_convolution.o
 OBJS-$(CONFIG_CONVOLUTION_OPENCL_FILTER) += vf_convolution_opencl.o 
opencl.o \
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 79e8a16bbc..119de40b25 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -213,6 +213,7 @@ extern const AVFilter ff_vf_colorlevels;
 extern const AVFilter ff_vf_colormap;
 extern const AVFilter ff_vf_colormatrix;
 extern const AVFilter ff_vf_colorspace;
+extern const AVFilter ff_vf_colorspace_cuda;
 extern const AVFilter ff_vf_colortemperature;
 extern const AVFilter ff_vf_convolution;
 extern const AVFilter ff_vf_convolution_opencl;
diff --git a/libavfilter/version.h b/libavfilter/version.h
index fc0df70dee..5aac9c513a 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -31,7 +31,7 @@
 
 #include "version_major.h"
 
-#define LIBAVFILTER_VERSION_MINOR  48
+#define LIBAVFILTER_VERSION_MINOR  49
 #define LIBAVFILTER_VERSION_MICRO 100
 
 
diff --git a/libavfilter/vf_colorspace_cuda.c b/libavfilter/vf_colorspace_cuda.c
new file mode 100644
index 00..131c4ad72b
--- /dev/null
+++ b/libavfilter/vf_colorspace_cuda.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice 

[FFmpeg-cvslog] lavu/riscv: add optimisations

2022-09-13 Thread Rémi Denis-Courmont
ffmpeg | branch: master | Rémi Denis-Courmont  | Mon Sep 12 
18:53:20 2022 +0300| [c177108ae1144fd4e6cedb4a702260dbaa179825] | committer: 
James Almer

lavu/riscv: add <intmath.h> optimisations

This provides some micro-optimisations for signed integer clipping, and
support for bit weight with the Zbb extension.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c177108ae1144fd4e6cedb4a702260dbaa179825
---

 libavutil/intmath.h   |   5 ++-
 libavutil/riscv/intmath.h | 103 ++
 2 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/libavutil/intmath.h b/libavutil/intmath.h
index 9573109e9d..c54d23b7bf 100644
--- a/libavutil/intmath.h
+++ b/libavutil/intmath.h
@@ -28,8 +28,9 @@
 
 #if ARCH_ARM
 #   include "arm/intmath.h"
-#endif
-#if ARCH_X86
+#elif ARCH_RISCV
+#   include "riscv/intmath.h"
+#elif ARCH_X86
 #   include "x86/intmath.h"
 #endif
 
diff --git a/libavutil/riscv/intmath.h b/libavutil/riscv/intmath.h
new file mode 100644
index 00..78f7ba930a
--- /dev/null
+++ b/libavutil/riscv/intmath.h
@@ -0,0 +1,103 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_RISCV_INTMATH_H
+#define AVUTIL_RISCV_INTMATH_H
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+
+/*
+ * The compiler is forced to sign-extend the result anyhow, so it is faster to
+ * compute it explicitly and use it.
+ */
+#define av_clip_int8 av_clip_int8_rvi
+static av_always_inline av_const int8_t av_clip_int8_rvi(int a)
+{
+union { uint8_t u; int8_t s; } u = { .u = a };
+
+if (a != u.s)
+a = ((a >> 31) ^ 0x7F);
+return a;
+}
+
+#define av_clip_int16 av_clip_int16_rvi
+static av_always_inline av_const int16_t av_clip_int16_rvi(int a)
+{
+union { uint16_t u; int16_t s; } u = { .u = a };
+
+if (a != u.s)
+a = ((a >> 31) ^ 0x7FFF);
+return a;
+}
+
+#define av_clipl_int32 av_clipl_int32_rvi
+static av_always_inline av_const int32_t av_clipl_int32_rvi(int64_t a)
+{
+union { uint32_t u; int32_t s; } u = { .u = a };
+
+if (a != u.s)
+a = ((a >> 63) ^ 0x7FFFFFFF);
+return a;
+}
+
+#define av_clip_intp2 av_clip_intp2_rvi
+static av_always_inline av_const int av_clip_intp2_rvi(int a, int p)
+{
+const int shift = 32 - p;
+int b = (a << shift) >> shift;
+
+if (a != b)
+b = (a >> 31) ^ ((1 << p) - 1);
+return b;
+}
+
+#if defined (__riscv_zbb) && (__riscv_zbb > 0) && HAVE_INLINE_ASM
+
+#define av_popcount av_popcount_rvb
+static av_always_inline av_const int av_popcount_rvb(uint32_t x)
+{
+int ret;
+
+#if (__riscv_xlen >= 64)
+__asm__ ("cpopw %0, %1\n" : "=r" (ret) : "r" (x));
+#else
+__asm__ ("cpop %0, %1\n" : "=r" (ret) : "r" (x));
+#endif
+return ret;
+}
+
+#if (__riscv_xlen >= 64)
+#define av_popcount64 av_popcount64_rvb
+static av_always_inline av_const int av_popcount64_rvb(uint64_t x)
+{
+int ret;
+
+#if (__riscv_xlen >= 128)
+__asm__ ("cpopd %0, %1\n" : "=r" (ret) : "r" (x));
+#else
+__asm__ ("cpop %0, %1\n" : "=r" (ret) : "r" (x));
+#endif
+return ret;
+}
+#endif /* __riscv_xlen >= 64 */
+#endif /* __riscv_zbb */
+
+#endif /* AVUTIL_RISCV_INTMATH_H */

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] lavu/riscv: byte-swap operations

2022-09-13 Thread Rémi Denis-Courmont
ffmpeg | branch: master | Rémi Denis-Courmont  | Mon Sep 12 
18:53:19 2022 +0300| [df2057041b6079bea2fc5e6b31b00756f3da7d54] | committer: 
James Almer

lavu/riscv: byte-swap operations

If the target supports the Basic bit-manipulation (Zbb) extension, then
the REV8 instruction is available to reverse byte order.

Note that this instruction only exists at the "XLEN" register size,
so we need to right shift the result down to the data width.

If Zbb is not supported, then this patchset does nothing. Support for
run-time detection is left for the future. Currently, there are no
bits in auxv/ELF HWCAP for Z-extensions, so there are no clean ways to
do this.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=df2057041b6079bea2fc5e6b31b00756f3da7d54
---

 libavutil/bswap.h   |  2 ++
 libavutil/riscv/bswap.h | 74 +
 2 files changed, 76 insertions(+)

diff --git a/libavutil/bswap.h b/libavutil/bswap.h
index 91cb79538d..4840ab433f 100644
--- a/libavutil/bswap.h
+++ b/libavutil/bswap.h
@@ -40,6 +40,8 @@
 #   include "arm/bswap.h"
 #elif ARCH_AVR32
 #   include "avr32/bswap.h"
+#elif ARCH_RISCV
+#   include "riscv/bswap.h"
 #elif ARCH_SH4
 #   include "sh4/bswap.h"
 #elif ARCH_X86
diff --git a/libavutil/riscv/bswap.h b/libavutil/riscv/bswap.h
new file mode 100644
index 00..de1429c0f7
--- /dev/null
+++ b/libavutil/riscv/bswap.h
@@ -0,0 +1,74 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_RISCV_BSWAP_H
+#define AVUTIL_RISCV_BSWAP_H
+
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/attributes.h"
+
+#if defined (__riscv_zbb) && (__riscv_zbb > 0) && HAVE_INLINE_ASM
+
+static av_always_inline av_const uintptr_t av_bswap_xlen(uintptr_t x)
+{
+uintptr_t y;
+
+__asm__("rev8 %0, %1" : "=r" (y) : "r" (x));
+return y;
+}
+
+#define av_bswap16 av_bswap16
+
+static av_always_inline av_const uint_fast16_t av_bswap16(uint_fast16_t x)
+{
+return av_bswap_xlen(x) >> (__riscv_xlen - 16);
+}
+
+#if (__riscv_xlen == 32)
+#define av_bswap32 av_bswap_xlen
+#define av_bswap64 av_bswap64
+
+static av_always_inline av_const uint64_t av_bswap64(uint64_t x)
+{
+return (((uint64_t)av_bswap32(x)) << 32) | av_bswap32(x >> 32);
+}
+
+#else
+#define av_bswap32 av_bswap32
+
+static av_always_inline av_const uint_fast32_t av_bswap32(uint_fast32_t x)
+{
+return av_bswap_xlen(x) >> (__riscv_xlen - 32);
+}
+
+#if (__riscv_xlen == 64)
+#define av_bswap64 av_bswap_xlen
+
+#else
+#define av_bswap64 av_bswap64
+
+static av_always_inline av_const uint_fast64_t av_bswap64(uint_fast64_t x)
+{
+return av_bswap_xlen(x) >> (__riscv_xlen - 64);
+}
+
+#endif /* __riscv_xlen > 64 */
+#endif /* __riscv_xlen > 32 */
+#endif /* __riscv_zbb */
+#endif /* AVUTIL_RISCV_BSWAP_H */

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] configure/riscv: detect fast CLZ

2022-09-13 Thread Rémi Denis-Courmont
ffmpeg | branch: master | Rémi Denis-Courmont  | Mon Sep 12 
18:53:18 2022 +0300| [ff14e3739393147b4596245ea511ec43a4ce6448] | committer: 
James Almer

configure/riscv: detect fast CLZ

RISC-V defines the CLZ instruction as part of the ratified Zbb subset
of the (not yet ratified) bit manipulation extension (B). We can detect
it from the __riscv_zbb predefined constant. At least GCC 12 already
supports this correctly.

Note that the macro will be non-zero if supported, zero if enabled
in the compiler flags (e.g. -march=rv64gzbb) but not known to the
compiler, and undefined otherwise.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ff14e3739393147b4596245ea511ec43a4ce6448
---

 configure | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/configure b/configure
index 9e51abd0d3..b7dc1d8656 100755
--- a/configure
+++ b/configure
@@ -5334,6 +5334,12 @@ elif enabled ppc; then
 ;;
 esac
 
+elif enabled riscv; then
+
+if test_cpp_condition stddef.h "__riscv_zbb"; then
+enable fast_clz
+fi
+
 elif enabled sparc; then
 
 case $cpu in

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] lavu/riscv: AV_READ_TIME cycle counter

2022-09-13 Thread Rémi Denis-Courmont
ffmpeg | branch: master | Rémi Denis-Courmont  | Mon Sep 12 
18:53:17 2022 +0300| [d808070547a867a8f3f7b97fdff3574576213c07] | committer: 
James Almer

lavu/riscv: AV_READ_TIME cycle counter

This uses the architected RISC-V 64-bit cycle counter from the
RISC-V unprivileged instruction set.

In 64-bit and 128-bit, this is a straightforward CSR read.
In 32-bit mode, the 64-bit value is exposed as two CSRs, which
cannot be read atomically, so a loop is necessary to detect and fix up
the race condition where the bottom half wraps exactly between the two
reads.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d808070547a867a8f3f7b97fdff3574576213c07
---

 libavutil/riscv/timer.h | 53 +
 libavutil/timer.h   |  2 ++
 2 files changed, 55 insertions(+)

diff --git a/libavutil/riscv/timer.h b/libavutil/riscv/timer.h
new file mode 100644
index 00..a34157a566
--- /dev/null
+++ b/libavutil/riscv/timer.h
@@ -0,0 +1,53 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_RISCV_TIMER_H
+#define AVUTIL_RISCV_TIMER_H
+
+#include "config.h"
+
+#if HAVE_INLINE_ASM
+#include <stdint.h>
+
+static inline uint64_t rdcycle64(void)
+{
+#if (__riscv_xlen >= 64)
+uintptr_t cycles;
+
+__asm__ volatile ("rdcycle %0" : "=r"(cycles));
+
+#else
+uint64_t cycles;
+uint32_t hi, lo, check;
+
+__asm__ volatile (
+"1: rdcycleh %0\n"
+"   rdcycle  %1\n"
+"   rdcycleh %2\n"
+"   bne %0, %2, 1b\n" : "=r" (hi), "=r" (lo), "=r" (check));
+
+cycles = (((uint64_t)hi) << 32) | lo;
+
+#endif
+return cycles;
+}
+
+#define AV_READ_TIME rdcycle64
+
+#endif
+#endif /* AVUTIL_RISCV_TIMER_H */
diff --git a/libavutil/timer.h b/libavutil/timer.h
index 48e576739f..d3db5a27ef 100644
--- a/libavutil/timer.h
+++ b/libavutil/timer.h
@@ -57,6 +57,8 @@
 #   include "arm/timer.h"
 #elif ARCH_PPC
 #   include "ppc/timer.h"
+#elif ARCH_RISCV
+#   include "riscv/timer.h"
 #elif ARCH_X86
 #   include "x86/timer.h"
 #endif

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] doc: reference the RISC-V specification

2022-09-13 Thread Rémi Denis-Courmont
ffmpeg | branch: master | Rémi Denis-Courmont  | Mon Sep 12 
18:53:16 2022 +0300| [092ce9712f63fc2641ec831d09c8ca0731083ae4] | committer: 
James Almer

doc: reference the RISC-V specification

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=092ce9712f63fc2641ec831d09c8ca0731083ae4
---

 doc/optimization.txt | 5 +
 1 file changed, 5 insertions(+)

diff --git a/doc/optimization.txt b/doc/optimization.txt
index 974e2f9af2..3ed29fe38c 100644
--- a/doc/optimization.txt
+++ b/doc/optimization.txt
@@ -267,6 +267,11 @@ CELL/SPU:
 
http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/30B3520C93F437AB87257060006FFE5E/$file/Language_Extensions_for_CBEA_2.4.pdf
 
http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/9F820A5FFA3ECE8C8725716A0062585F/$file/CBE_Handbook_v1.1_24APR2007_pub.pdf
 
+RISC-V-specific:
+
+The RISC-V Instruction Set Manual, Volume 1, Unprivileged ISA:
+https://riscv.org/technical/specifications/
+
 GCC asm links:
 --
 official doc but quite ugly

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] x86/float_dsp: use three operand form for some instructions

2022-09-13 Thread James Almer
ffmpeg | branch: master | James Almer  | Tue Sep 13 13:50:09 
2022 -0300| [bda3a9faf4a2f201b24fb38a04da86410c9205ae] | committer: James Almer

x86/float_dsp: use three operand form for some instructions

Fixes compilation with old yasm

Signed-off-by: James Almer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bda3a9faf4a2f201b24fb38a04da86410c9205ae
---

 libavutil/x86/float_dsp.asm | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 8f8e6dddf5..ff608f5f5a 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -443,19 +443,19 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
 INIT_YMM fma3
 cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
 xor   offsetq, offsetq
-xorps  m0, m0
+xorps  m0, m0, m0
 shl sized, 2
 mov  lenq, sizeq
 cmp  lenq, 32
 jl   .l16
 cmp  lenq, 64
 jl   .l32
-xorpsm1, m1
+xorpsm1, m1, m1
 cmp  lenq, 128
 jl   .l64
 andlenq, ~127
-xorpsm2, m2
-xorpsm3, m3
+xorpsm2, m2, m2
+xorpsm3, m3, m3
 .loop128:
 movups   m4, [v1q+offsetq]
 movups   m5, [v1q+offsetq + 32]
@@ -468,13 +468,13 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, 
offset
 add   offsetq, 128
 cmp   offsetq, lenq
 jl .loop128
-addpsm0, m2
-addpsm1, m3
+addpsm0, m0, m2
+addpsm1, m1, m3
 mov  lenq, sizeq
 and  lenq, 127
 cmp  lenq, 64
 jge .l64
-addpsm0, m1
+addpsm0, m0, m1
 cmp  lenq, 32
 jge .l32
 vextractf128 xmm2, m0, 1
@@ -502,7 +502,7 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, 
offset
 add   offsetq, 64
 cmp   offsetq, lenq
 jl .loop64
-addpsm0, m1
+addpsm0, m0, m1
 mov  lenq, sizeq
 and  lenq, 63
 cmp  lenq, 32

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] avcodec/x86/audiodsp: add scalarproduct avx2

2022-09-13 Thread Paul B Mahol
ffmpeg | branch: master | Paul B Mahol  | Mon Sep 12 18:53:31 
2022 +0200| [37a503ac879ca7677beb7423c33a6c5d24dd6396] | committer: Paul B Mahol

avcodec/x86/audiodsp: add scalarproduct avx2

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=37a503ac879ca7677beb7423c33a6c5d24dd6396
---

 libavcodec/x86/audiodsp.asm| 18 ++
 libavcodec/x86/audiodsp_init.c |  6 ++
 2 files changed, 24 insertions(+)

diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
index b604b0443c..f64077cb13 100644
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -44,6 +44,24 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
 movd   eax, m2
 RET
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal scalarproduct_int16, 3,3,2, v1, v2, order
+add orderd, orderd
+add v1q, orderq
+add v2q, orderq
+neg orderq
+pxorm1, m1
+.loop:
+movum0, [v1q + orderq]
+pmaddwd m0, [v2q + orderq]
+paddd   m1, m0
+add orderq, mmsize
+jl .loop
+HADDD   m1, m0
+movd   eax, xm1
+RET
+%endif
 
 ;-
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
diff --git a/libavcodec/x86/audiodsp_init.c b/libavcodec/x86/audiodsp_init.c
index aa5e43e570..68aa3b2129 100644
--- a/libavcodec/x86/audiodsp_init.c
+++ b/libavcodec/x86/audiodsp_init.c
@@ -24,6 +24,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/audiodsp.h"
 
+int32_t ff_scalarproduct_int16_avx2(const int16_t *v1, const int16_t *v2,
+int order);
+
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
 int order);
 
@@ -53,4 +56,7 @@ av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
 
 if (EXTERNAL_SSE4(cpu_flags))
 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
+
+if (EXTERNAL_AVX2_FAST(cpu_flags))
+c->scalarproduct_int16 = ff_scalarproduct_int16_avx2;
 }

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] avutil/x86/float_dsp: add fma3 for scalarproduct

2022-09-13 Thread Paul B Mahol
ffmpeg | branch: master | Paul B Mahol  | Wed Jan 20 16:58:31 
2021 +0100| [72acff9f593f977944a62652fc9dd346ec53225a] | committer: Paul B Mahol

avutil/x86/float_dsp: add fma3 for scalarproduct

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=72acff9f593f977944a62652fc9dd346ec53225a
---

 libavutil/x86/float_dsp.asm| 127 +
 libavutil/x86/float_dsp_init.c |   2 +
 2 files changed, 129 insertions(+)

diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index cca4d019c7..8f8e6dddf5 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -440,6 +440,133 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
 %endif
 RET
 
+INIT_YMM fma3
+cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
+xor   offsetq, offsetq
+xorps  m0, m0
+shl sized, 2
+mov  lenq, sizeq
+cmp  lenq, 32
+jl   .l16
+cmp  lenq, 64
+jl   .l32
+xorpsm1, m1
+cmp  lenq, 128
+jl   .l64
+andlenq, ~127
+xorpsm2, m2
+xorpsm3, m3
+.loop128:
+movups   m4, [v1q+offsetq]
+movups   m5, [v1q+offsetq + 32]
+movups   m6, [v1q+offsetq + 64]
+movups   m7, [v1q+offsetq + 96]
+fmaddps  m0, m4, [v2q+offsetq ], m0
+fmaddps  m1, m5, [v2q+offsetq + 32], m1
+fmaddps  m2, m6, [v2q+offsetq + 64], m2
+fmaddps  m3, m7, [v2q+offsetq + 96], m3
+add   offsetq, 128
+cmp   offsetq, lenq
+jl .loop128
+addpsm0, m2
+addpsm1, m3
+mov  lenq, sizeq
+and  lenq, 127
+cmp  lenq, 64
+jge .l64
+addpsm0, m1
+cmp  lenq, 32
+jge .l32
+vextractf128 xmm2, m0, 1
+addpsxmm0, xmm2
+cmp  lenq, 16
+jge .l16
+movhlps  xmm1, xmm0
+addpsxmm0, xmm1
+movssxmm1, xmm0
+shufps   xmm0, xmm0, 1
+addssxmm0, xmm1
+%if ARCH_X86_64 == 0
+movss r0m, xm0
+fld dword r0m
+%endif
+RET
+.l64:
+andlenq, ~63
+addlenq, offsetq
+.loop64:
+movups   m4, [v1q+offsetq]
+movups   m5, [v1q+offsetq + 32]
+fmaddps  m0, m4, [v2q+offsetq], m0
+fmaddps  m1, m5, [v2q+offsetq + 32], m1
+add   offsetq, 64
+cmp   offsetq, lenq
+jl .loop64
+addpsm0, m1
+mov  lenq, sizeq
+and  lenq, 63
+cmp  lenq, 32
+jge .l32
+vextractf128 xmm2, m0, 1
+addpsxmm0, xmm2
+cmp  lenq, 16
+jge .l16
+movhlps  xmm1, xmm0
+addpsxmm0, xmm1
+movssxmm1, xmm0
+shufps   xmm0, xmm0, 1
+addssxmm0, xmm1
+%if ARCH_X86_64 == 0
+movss r0m, xm0
+fld dword r0m
+%endif
+RET
+.l32:
+andlenq, ~31
+addlenq, offsetq
+.loop32:
+movups   m4, [v1q+offsetq]
+fmaddps  m0, m4, [v2q+offsetq], m0
+add   offsetq, 32
+cmp   offsetq, lenq
+jl .loop32
+vextractf128 xmm2, m0, 1
+addpsxmm0, xmm2
+mov  lenq, sizeq
+and  lenq, 31
+cmp  lenq, 16
+jge .l16
+movhlps  xmm1, xmm0
+addpsxmm0, xmm1
+movssxmm1, xmm0
+shufps   xmm0, xmm0, 1
+addssxmm0, xmm1
+%if ARCH_X86_64 == 0
+movss r0m, xm0
+fld dword r0m
+%endif
+RET
+.l16:
+andlenq, ~15
+addlenq, offsetq
+.loop16:
+movaps   xmm1, [v1q+offsetq]
+mulpsxmm1, [v2q+offsetq]
+addpsxmm0, xmm1
+add   offsetq, 16
+cmp   offsetq, lenq
+jl .loop16
+movhlps  xmm1, xmm0
+addpsxmm0, xmm1
+movssxmm1, xmm0
+shufps   xmm0, xmm0, 1
+addssxmm0, xmm1
+%if ARCH_X86_64 == 0
+movss r0m, xm0
+fld dword r0m
+%endif
+RET
+
 ;-
 ; void ff_butterflies_float(float *src0, float *src1, int len);
 ;-
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index ad17bc2044..ad6b506259 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -74,6 +74,7 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float 
*src0,
  const float *src1, int len);
 
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
+float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
 
 void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict 
src1, int len);
 
@@ -112,5 +113,6 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
 fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
 fdsp->vector_fmul_add= ff_vector_fmul_add_fma3;
 fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
+fdsp->scalarproduct_float = ff_scalarproduct_float_fma3;
 }
 }

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link 

[FFmpeg-cvslog] avcodec/flac_parser: avoid returning too negative number

2022-09-13 Thread Paul B Mahol
ffmpeg | branch: master | Paul B Mahol  | Thu Sep  8 09:59:09 
2022 +0200| [cf2cf31805448dd11692313440a21821773a6128] | committer: Paul B Mahol

avcodec/flac_parser: avoid returning too negative number

If the return value is too negative, the parser code will assert.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=cf2cf31805448dd11692313440a21821773a6128
---

 libavcodec/flac_parser.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/libavcodec/flac_parser.c b/libavcodec/flac_parser.c
index 5b3a4e6e67..bd91cc1a05 100644
--- a/libavcodec/flac_parser.c
+++ b/libavcodec/flac_parser.c
@@ -663,8 +663,11 @@ static int get_best_header(FLACParseContext *fpc, const 
uint8_t **poutbuf,
 
 /* Return the negative overread index so the client can compute pos.
This should be the amount overread to the beginning of the child */
-if (child)
-return child->offset - flac_fifo_size(&fpc->fifo_buf);
+if (child) {
+int64_t offset = child->offset - flac_fifo_size(&fpc->fifo_buf);
+if (offset > -(1 << 28))
+return offset;
+}
 return 0;
 }
 

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] fate/spdif: Add spdif tests

2022-09-13 Thread Andreas Rheinhardt
ffmpeg | branch: master | Andreas Rheinhardt  | 
Sun Sep 11 18:34:47 2022 +0200| [9ad3db3ad932d484708194f419544c33cb3c71e6] | 
committer: Andreas Rheinhardt

fate/spdif: Add spdif tests

These tests test both the demuxer as well as the muxer
wherever possible. It is not always possible due to the fact
that the muxer supports more codecs than the demuxer.

The spdif demuxer does currently not set the need_parsing flag.
If one were to set this to AVSTREAM_PARSE_FULL, the test results
would change as follows:
- For spdif-aac-remux, the packets are currently padded to 16bits,
i.e. if the actual packet size is odd, there is a padding byte.
The parser splits this byte away into a one byte packet of its own.
Insanely, these one byte packets get the same duration as normal
packets, i.e. timing is ruined.
- The DCA-remux tests get proper duration/timestamps.
- In the spdif-mp2-remux test the demuxer marks the stream as
being MP2; the parser sets it to MP3 and this triggers
the "Codec change in IEC 61937" codepath; this test therefore
returns only two packets with the parser.
- For spdif-mp3-remux some bytes end up in different packets:
Some input packets of this file have an odd length (417B instead
of 418B like all the other packets) and are padded to 418B.
Without a parser, all returned packets from the spdif-demuxer
are 418B. With a parser, the packets that were originally 417B
are 417B again, but the padding byte has not been discarded,
but added to the next packet which is now 419B.
This fixes "Multiple frames in a packet" warning and avoids
an "Invalid data found when processing input" error when decoding.

Signed-off-by: Andreas Rheinhardt 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9ad3db3ad932d484708194f419544c33cb3c71e6
---

 tests/Makefile |1 +
 tests/fate/spdif.mak   |   44 ++
 tests/ref/fate/spdif-aac-remux |   93 +++
 tests/ref/fate/spdif-ac3-remux |   63 ++
 tests/ref/fate/spdif-dca-core-bswap|1 +
 tests/ref/fate/spdif-dca-core-remux|   14 +
 tests/ref/fate/spdif-dca-master|1 +
 tests/ref/fate/spdif-dca-master-core   |1 +
 tests/ref/fate/spdif-dca-master-core-remux | 1179 
 tests/ref/fate/spdif-eac3  |1 +
 tests/ref/fate/spdif-mlp   |1 +
 tests/ref/fate/spdif-mp2-remux |   49 ++
 tests/ref/fate/spdif-mp3-remux |   47 ++
 tests/ref/fate/spdif-truehd|1 +
 14 files changed, 1496 insertions(+)

diff --git a/tests/Makefile b/tests/Makefile
index d9c509a415..06494a9cc4 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -231,6 +231,7 @@ include $(SRC_PATH)/tests/fate/real.mak
 include $(SRC_PATH)/tests/fate/screen.mak
 include $(SRC_PATH)/tests/fate/segment.mak
 include $(SRC_PATH)/tests/fate/source.mak
+include $(SRC_PATH)/tests/fate/spdif.mak
 include $(SRC_PATH)/tests/fate/speedhq.mak
 include $(SRC_PATH)/tests/fate/subtitles.mak
 include $(SRC_PATH)/tests/fate/truehd.mak
diff --git a/tests/fate/spdif.mak b/tests/fate/spdif.mak
new file mode 100644
index 00..093b8138e8
--- /dev/null
+++ b/tests/fate/spdif.mak
@@ -0,0 +1,44 @@
+# This padds the AAC frames to 16 bit words (the actual size is
+# still available in the ADTS headers).
+FATE_SPDIF_REMUX-$(call ALLYES, AAC_DEMUXER AAC_DECODER) += fate-spdif-aac-remux
+fate-spdif-aac-remux: CMD = transcode aac $(TARGET_SAMPLES)/aac/foo.aac spdif "-c copy" "-c copy"
+
+FATE_SPDIF_REMUX-$(call ALLYES, AC3_DEMUXER AC3_DECODER) += fate-spdif-ac3-remux
+fate-spdif-ac3-remux: CMD = transcode ac3 $(TARGET_SAMPLES)/ac3/monsters_inc_5.1_448_small.ac3 spdif "-c copy" "-c copy"
+
+FATE_SPDIF_REMUX-$(call ALLYES, DTS_DEMUXER DCA_DECODER) += fate-spdif-dca-core-remux
+fate-spdif-dca-core-remux: CMD = transcode dts $(TARGET_SAMPLES)/dts/dcadec-suite/core_51_24_48_768_0.dtshd spdif "-c copy" "-c copy"
+
+FATE_SPDIF-$(call DEMMUX, DTSHD, SPDIF) += fate-spdif-dca-core-bswap
+fate-spdif-dca-core-bswap: CMD = md5 -i $(TARGET_SAMPLES)/dts/dcadec-suite/core_51_24_48_768_0.dtshd -c copy -spdif_flags +be -f spdif
+
+# Only the core will be transferred, extensions are discarded.
+FATE_SPDIF_REMUX-$(call ALLYES, DTS_DEMUXER DCA_DECODER) += fate-spdif-dca-master-core-remux
+fate-spdif-dca-master-core-remux: CMD = transcode dts $(TARGET_SAMPLES)/dts/master_audio_7.1_24bit.dts spdif "-c copy" "-c copy"
+
+FATE_SPDIF-$(call DEMMUX, DTS, SPDIF) += fate-spdif-dca-master fate-spdif-dca-master-core
+fate-spdif-dca-master:  CMD = md5 -i $(TARGET_SAMPLES)/dts/master_audio_7.1_24bit.dts -c copy -dtshd_rate 192000 -f spdif
+# This test uses a too low bitrate and therefore switches to only transmit the core.
+fate-spdif-dca-master-core: CMD = md5 -i $(TARGET_SAMPLES)/dts/master_audio_7.1_24bit.dts -c copy -dtshd_rate  96000 -f spdif
+
+FATE_SPDIF-$(call DEMMUX, EAC3, SPDIF) += fate-spdif-eac3
+fate-spdif-eac3: CMD = md5 -i 

[FFmpeg-cvslog] avcodec/arm/sbcenc: avoid callee preserved vfp registers

2022-09-13 Thread James Cowgill
ffmpeg | branch: master | James Cowgill  | Sun Aug 25 
09:18:00 2019 +0100| [50a4dff69f6477b06f00eae1cac2a53ae22fe9a5] | committer: 
Martin Storsjö

avcodec/arm/sbcenc: avoid callee preserved vfp registers

When compiling FFmpeg with GCC-9, some very random segfaults were
observed in code which had previously called down into the SBC encoder
NEON assembly routines. This was caused by these functions clobbering
some of the vfp callee saved registers (d8 - d15 aka q4 - q7). GCC was
using these registers to save local variables, but after these
functions returned, they would contain garbage.

Fix by reallocating the registers in the two affected functions in
the following way:
 ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11
 ff_sbc_analyze_8_neon: q2-q9 => q8-q15

The reason for using these replacements is to keep closely related
sets of registers consecutively numbered, which hopefully makes the
code easier to follow. Since this commit only reallocates
registers, it should have no performance impact.

Signed-off-by: James Cowgill 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=50a4dff69f6477b06f00eae1cac2a53ae22fe9a5
---

 libavcodec/arm/sbcdsp_neon.S | 220 +--
 1 file changed, 110 insertions(+), 110 deletions(-)

diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
index d83d21d202..914abfb6cc 100644
--- a/libavcodec/arm/sbcdsp_neon.S
+++ b/libavcodec/arm/sbcdsp_neon.S
@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
+vld1.16 {d16, d17}, [r0, :64]!
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmull.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmull.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmull.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmull.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmlal.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmlal.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmlal.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vmlal.s16   q1, d5, d9
+vmlal.s16   q0, d16, d20
+vmlal.s16   q1, d17, d21
 
 vpadd.s32   d0, d0, d1
 vpadd.s32   d1, d2, d3
 
 vrshrn.s32  d0, q0, SBC_PROTO_FIXED_SCALE
 
-vld1.16 {d2, d3, d4, d5}, [r2, :128]!
+vld1.16 {d16, d17, d18, d19}, [r2, :128]!
 
 vdup.i32    d1, d0[1]  /* TODO: can be eliminated */
 vdup.i32    d0, d0[0]  /* TODO: can be eliminated */
 
-vmull.s16   q3, d2, d0
-vmull.s16   q4, d3, d0
-vmlal.s16   q3, d4, d1
-vmlal.s16   q4, d5, d1
+vmull.s16   q10, d16, d0
+vmull.s16   q11, d17, d0
+vmlal.s16   q10, d18, d1
+vmlal.s16   q11, d19, d1
 
-vpadd.s32   d0, d6, d7 /* TODO: can be eliminated */
-vpadd.s32   d1, d8, d9 /* TODO: can be eliminated */
+vpadd.s32   d0, d20, d21 /* TODO: can be eliminated */
+vpadd.s32   d1, d22, d23 /* TODO: can be eliminated */
 
 vst1.32 {d0, d1}, [r1, :128]
 
@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
-
-vmull.s16   q6, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmull.s16   q7, d5, d9
-vld1.16