[FFmpeg-devel] [PATCH] avcodec/ac3: Remove unused fixed-point ARMv7 DSP

2024-04-17 Thread Geoff Hill
This diff removes 4 unused ARMv7 NEON fixed-point DSP functions.

The functions were originally moved here by 4958f35a2 (Dec 2013).

After 9e05421db (Jan 2021), as part of the refactor of the AC3
DSP to consistently use 32-bit sample format in the encoder, these
functions were removed from the DSP function table, but the ARMv7
implementations were kept.

Signed-off-by: Geoff Hill 
---
 libavcodec/arm/ac3dsp_neon.S | 63 
 1 file changed, 63 deletions(-)

diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S
index 89d0ae8048..dc829541aa 100644
--- a/libavcodec/arm/ac3dsp_neon.S
+++ b/libavcodec/arm/ac3dsp_neon.S
@@ -20,25 +20,6 @@
 
 #include "libavutil/arm/asm.S"
 
-function ff_ac3_max_msb_abs_int16_neon, export=1
-vmov.i16q0,  #0
-vmov.i16q2,  #0
-1:  vld1.16 {q1}, [r0,:128]!
-vabs.s16q1,  q1
-vld1.16 {q3}, [r0,:128]!
-vabs.s16q3,  q3
-vorrq0,  q0,  q1
-vorrq2,  q2,  q3
-subsr1,  r1,  #16
-bgt 1b
-vorrq0,  q0,  q2
-vorrd0,  d0,  d1
-vpmax.u16   d0,  d0,  d0
-vpmax.u16   d0,  d0,  d0
-vmov.u16r0,  d0[0]
-bx  lr
-endfunc
-
 function ff_ac3_exponent_min_neon, export=1
 cmp r1,  #0
 it  eq
@@ -59,27 +40,6 @@ function ff_ac3_exponent_min_neon, export=1
 pop {pc}
 endfunc
 
-function ff_ac3_lshift_int16_neon, export=1
-vdup.16 q0,  r2
-1:  vld1.16 {q1}, [r0,:128]
-vshl.s16q1,  q1,  q0
-vst1.16 {q1}, [r0,:128]!
-subsr1,  r1,  #8
-bgt 1b
-bx  lr
-endfunc
-
-function ff_ac3_rshift_int32_neon, export=1
-rsb r2,  r2,  #0
-vdup.32 q0,  r2
-1:  vld1.32 {q1}, [r0,:128]
-vshl.s32q1,  q1,  q0
-vst1.32 {q1}, [r0,:128]!
-subsr1,  r1,  #4
-bgt 1b
-bx  lr
-endfunc
-
 function ff_float_to_fixed24_neon, export=1
 1:  vld1.32 {q0-q1},  [r1,:128]!
 vcvt.s32.f32q0,  q0,  #24
@@ -109,29 +69,6 @@ function ff_ac3_extract_exponents_neon, export=1
 bx  lr
 endfunc
 
-function ff_apply_window_int16_neon, export=1
-push{r4,lr}
-add r4,  r1,  r3,  lsl #1
-add lr,  r0,  r3,  lsl #1
-sub r4,  r4,  #16
-sub lr,  lr,  #16
-mov r12, #-16
-1:
-vld1.16 {q0}, [r1,:128]!
-vld1.16 {q2}, [r2,:128]!
-vld1.16 {q1}, [r4,:128], r12
-vrev64.16   q3,  q2
-vqrdmulh.s16q0,  q0,  q2
-vqrdmulh.s16d2,  d2,  d7
-vqrdmulh.s16d3,  d3,  d6
-vst1.16 {q0}, [r0,:128]!
-vst1.16 {q1}, [lr,:128], r12
-subsr3,  r3,  #16
-bgt 1b
-
-pop {r4,pc}
-endfunc
-
 function ff_ac3_sum_square_butterfly_int32_neon, export=1
 vmov.i64q0,  #0
 vmov.i64q1,  #0
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4 5/5] avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON

2024-04-06 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  5 
 libavcodec/aarch64/ac3dsp_neon.S | 30 
 tests/checkasm/ac3dsp.c  | 26 
 3 files changed, 61 insertions(+)

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c 
b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index e95436c651..e367353e11 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -32,6 +32,10 @@ void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
 const int32_t *coef0,
 const int32_t *coef1,
 int len);
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+const float *coef0,
+const float *coef1,
+int len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 {
@@ -42,4 +46,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 c->extract_exponents = ff_ac3_extract_exponents_neon;
 c->float_to_fixed24 = ff_float_to_fixed24_neon;
 c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index 77f9d20275..20beb6cc50 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -87,3 +87,33 @@ function ff_ac3_sum_square_butterfly_int32_neon, export=1
 st1 {v0.1d-v3.1d}, [x0]
 ret
 endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1
+moviv0.4s, #0
+moviv1.4s, #0
+moviv2.4s, #0
+moviv3.4s, #0
+1:  ld1 {v30.4s}, [x1], #16
+ld1 {v31.4s}, [x2], #16
+faddv16.4s, v30.4s, v31.4s
+fsubv17.4s, v30.4s, v31.4s
+fmlav0.4s, v30.4s, v30.4s
+fmlav1.4s, v31.4s, v31.4s
+fmlav2.4s, v16.4s, v16.4s
+fmlav3.4s, v17.4s, v17.4s
+subsw3, w3, #4
+b.gt1b
+faddp   v0.4s, v0.4s, v0.4s
+faddp   v0.2s, v0.2s, v0.2s
+st1 {v0.s}[0], [x0], #4
+faddp   v1.4s, v1.4s, v1.4s
+faddp   v1.2s, v1.2s, v1.2s
+st1 {v1.s}[0], [x0], #4
+faddp   v2.4s, v2.4s, v2.4s
+faddp   v2.2s, v2.2s, v2.2s
+st1 {v2.s}[0], [x0], #4
+faddp   v3.4s, v3.4s, v3.4s
+faddp   v3.2s, v3.2s, v3.2s
+st1 {v3.s}[0], [x0]
+ret
+endfunc
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index 573a76c764..442e965f3b 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -165,6 +165,31 @@ static void 
check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
 report("ac3_sum_square_butterfly_int32");
 }
 
+static void check_ac3_sum_square_butterfly_float(AC3DSPContext *c) {
+LOCAL_ALIGNED_32(float, lt, [ELEMS]);
+LOCAL_ALIGNED_32(float, rt, [ELEMS]);
+LOCAL_ALIGNED_16(float, v1, [4]);
+LOCAL_ALIGNED_16(float, v2, [4]);
+
+declare_func(void, float[4], const float *, const float *, int);
+
+randomize_float(lt, ELEMS);
+randomize_float(rt, ELEMS);
+
+if (check_func(c->sum_square_butterfly_float,
+   "ac3_sum_square_bufferfly_float")) {
+call_ref(v1, lt, rt, ELEMS);
+call_new(v2, lt, rt, ELEMS);
+
+if (!float_near_ulp_array(v1, v2, 10, 4))
+fail();
+
+bench_new(v2, lt, rt, ELEMS);
+}
+
+report("ac3_sum_square_butterfly_float");
+}
+
 void checkasm_check_ac3dsp(void)
 {
 AC3DSPContext c;
@@ -174,4 +199,5 @@ void checkasm_check_ac3dsp(void)
 check_ac3_extract_exponents(&c);
 check_float_to_fixed24(&c);
 check_ac3_sum_square_butterfly_int32(&c);
+check_ac3_sum_square_butterfly_float(&c);
 }
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4 4/5] avcodec/ac3: Implement sum_square_butterfly_int32 for aarch64 NEON

2024-04-06 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  5 +
 libavcodec/aarch64/ac3dsp_neon.S | 23 
 tests/checkasm/ac3dsp.c  | 27 
 3 files changed, 55 insertions(+)

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c 
b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index 1bdc215b51..e95436c651 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -28,6 +28,10 @@
 void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int 
nb_coefs);
 void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+const int32_t *coef0,
+const int32_t *coef1,
+int len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 {
@@ -37,4 +41,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 c->ac3_exponent_min = ff_ac3_exponent_min_neon;
 c->extract_exponents = ff_ac3_extract_exponents_neon;
 c->float_to_fixed24 = ff_float_to_fixed24_neon;
+c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index c350c1f173..77f9d20275 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -64,3 +64,26 @@ function ff_float_to_fixed24_neon, export=1
 b.ne1b
 ret
 endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1
+moviv0.2d, #0
+moviv1.2d, #0
+moviv2.2d, #0
+moviv3.2d, #0
+1:  ld1 {v4.2s}, [x1], #8
+ld1 {v5.2s}, [x2], #8
+add v6.2s, v4.2s, v5.2s
+sub v7.2s, v4.2s, v5.2s
+smlal   v0.2d, v4.2s, v4.2s
+smlal   v1.2d, v5.2s, v5.2s
+smlal   v2.2d, v6.2s, v6.2s
+smlal   v3.2d, v7.2s, v7.2s
+subsw3, w3, #2
+b.gt1b
+addpd0, v0.2d
+addpd1, v1.2d
+addpd2, v2.2d
+addpd3, v3.2d
+st1 {v0.1d-v3.1d}, [x0]
+ret
+endfunc
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index dc1b169e68..573a76c764 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -139,6 +139,32 @@ static void check_float_to_fixed24(AC3DSPContext *c) {
 report("float_to_fixed24");
 }
 
+static void check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
+#define ELEMS 240
+LOCAL_ALIGNED_16(int32_t, lt, [ELEMS]);
+LOCAL_ALIGNED_16(int32_t, rt, [ELEMS]);
+LOCAL_ALIGNED_16(uint64_t, v1, [4]);
+LOCAL_ALIGNED_16(uint64_t, v2, [4]);
+
+declare_func(void, int64_t[4], const int32_t *, const int32_t *, int);
+
+randomize_i24(lt, ELEMS);
+randomize_i24(rt, ELEMS);
+
+if (check_func(c->sum_square_butterfly_int32,
+   "ac3_sum_square_bufferfly_int32")) {
+call_ref(v1, lt, rt, ELEMS);
+call_new(v2, lt, rt, ELEMS);
+
+if (memcmp(v1, v2, sizeof(int64_t[4])) != 0)
+fail();
+
+bench_new(v2, lt, rt, ELEMS);
+}
+
+report("ac3_sum_square_butterfly_int32");
+}
+
 void checkasm_check_ac3dsp(void)
 {
 AC3DSPContext c;
@@ -147,4 +173,5 @@ void checkasm_check_ac3dsp(void)
 check_ac3_exponent_min(&c);
 check_ac3_extract_exponents(&c);
 check_float_to_fixed24(&c);
+check_ac3_sum_square_butterfly_int32(&c);
 }
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4 3/5] avcodec/ac3: Implement ac3_extract_exponents for aarch64 NEON

2024-04-06 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  2 ++
 libavcodec/aarch64/ac3dsp_neon.S | 14 +
 tests/checkasm/ac3dsp.c  | 38 
 3 files changed, 54 insertions(+)

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c 
b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index 8874b41393..1bdc215b51 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -26,6 +26,7 @@
 #include "config.h"
 
 void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int 
nb_coefs);
+void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
@@ -34,5 +35,6 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 if (!have_neon(cpu_flags)) return;
 
 c->ac3_exponent_min = ff_ac3_exponent_min_neon;
+c->extract_exponents = ff_ac3_extract_exponents_neon;
 c->float_to_fixed24 = ff_float_to_fixed24_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index f916c32538..c350c1f173 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -37,6 +37,20 @@ function ff_ac3_exponent_min_neon, export=1
 3:  ret
 endfunc
 
+function ff_ac3_extract_exponents_neon, export=1
+moviv1.4s, #8
+1:  ld1 {v0.4s}, [x1], #16
+abs v0.4s, v0.4s
+clz v0.4s, v0.4s
+sub v0.4s, v0.4s, v1.4s
+xtn v0.4h, v0.4s
+xtn v0.8b, v0.8h
+st1 {v0.s}[0], [x0], #4
+subsw2, w2, #4
+b.gt1b
+ret
+endfunc
+
 function ff_float_to_fixed24_neon, export=1
 1:  ld1 {v0.4s, v1.4s}, [x1], #32
 fcvtzs  v0.4s, v0.4s, #24
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index 06f31339f9..dc1b169e68 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -19,6 +19,7 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include 
 #include 
 
 #include "libavutil/mem.h"
@@ -36,6 +37,16 @@
 }  \
 } while (0)
 
+#define randomize_i24(buf, len)  \
+do { \
+int i;   \
+for (i = 0; i < len; i++) {  \
+int32_t v = (int32_t)rnd();  \
+int32_t u = (v & 0xFF);  \
+buf[i] = (v < 0) ? -u : u;   \
+}\
+} while (0)
+
 #define randomize_float(buf, len)   \
 do {\
 int i;  \
@@ -77,6 +88,32 @@ static void check_ac3_exponent_min(AC3DSPContext *c) {
 report("ac3_exponent_min");
 }
 
+static void check_ac3_extract_exponents(AC3DSPContext *c) {
+#define MAX_EXPS 3072
+LOCAL_ALIGNED_16(int32_t, src, [MAX_EXPS]);
+LOCAL_ALIGNED_16(uint8_t, v1, [MAX_EXPS]);
+LOCAL_ALIGNED_16(uint8_t, v2, [MAX_EXPS]);
+int n;
+
+declare_func(void, uint8_t *, int32_t *, int);
+
+for (n = 512; n <= MAX_EXPS; n += 256) {
+if (check_func(c->extract_exponents, "ac3_extract_exponents_n%d", n)) {
+randomize_i24(src, n);
+
+call_ref(v1, src, n);
+call_new(v2, src, n);
+
+if (memcmp(v1, v2, n) != 0)
+fail();
+
+bench_new(v1, src, n);
+}
+}
+
+report("ac3_extract_exponents");
+}
+
 static void check_float_to_fixed24(AC3DSPContext *c) {
 #define BUF_SIZE 1024
 LOCAL_ALIGNED_32(float, src, [BUF_SIZE]);
@@ -108,5 +145,6 @@ void checkasm_check_ac3dsp(void)
 ff_ac3dsp_init(&c);
 
 check_ac3_exponent_min(&c);
+check_ac3_extract_exponents(&c);
 check_float_to_fixed24(&c);
 }
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4 2/5] avcodec/ac3: Implement ac3_exponent_min for aarch64 NEON

2024-04-06 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  2 ++
 libavcodec/aarch64/ac3dsp_neon.S | 16 +
 tests/checkasm/ac3dsp.c  | 41 
 3 files changed, 59 insertions(+)

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c 
b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index e3320de0f5..8874b41393 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -25,6 +25,7 @@
 #include "libavcodec/ac3dsp.h"
 #include "config.h"
 
+void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int 
nb_coefs);
 void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
@@ -32,5 +33,6 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 int cpu_flags = av_get_cpu_flags();
 if (!have_neon(cpu_flags)) return;
 
+c->ac3_exponent_min = ff_ac3_exponent_min_neon;
 c->float_to_fixed24 = ff_float_to_fixed24_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index c4d204b51a..f916c32538 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -21,6 +21,22 @@
 
 #include "libavutil/aarch64/asm.S"
 
+function ff_ac3_exponent_min_neon, export=1
+cbz w1, 3f
+1:  ld1 {v0.16b}, [x0]
+mov w3, w1
+add x4, x0, #256
+2:  ld1 {v1.16b}, [x4]
+uminv0.16b, v0.16b, v1.16b
+add x4, x4, #256
+subsw3, w3, #1
+b.gt2b
+st1 {v0.16b}, [x0], #16
+subsw2, w2, #16
+b.gt1b
+3:  ret
+endfunc
+
 function ff_float_to_fixed24_neon, export=1
 1:  ld1 {v0.4s, v1.4s}, [x1], #32
 fcvtzs  v0.4s, v0.4s, #24
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index b1064fccb4..06f31339f9 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -28,6 +28,14 @@
 
 #include "checkasm.h"
 
+#define randomize_exp(buf, len)\
+do {   \
+int i; \
+for (i = 0; i < len; i++) {\
+buf[i] = (uint8_t)rnd();   \
+}  \
+} while (0)
+
 #define randomize_float(buf, len)   \
 do {\
 int i;  \
@@ -37,6 +45,38 @@
 }   \
 } while (0)
 
+static void check_ac3_exponent_min(AC3DSPContext *c) {
+#define MAX_COEFS 256
+#define MAX_CTXT 6
+#define EXP_SIZE (MAX_CTXT * MAX_COEFS)
+
+LOCAL_ALIGNED_16(uint8_t, src, [EXP_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, v1, [EXP_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, v2, [EXP_SIZE]);
+int n;
+
+declare_func(void, uint8_t *, int, int);
+
+for (n = 0; n < MAX_CTXT; ++n) {
+if (check_func(c->ac3_exponent_min, "ac3_exponent_min_reuse%d", n)) {
+randomize_exp(src, EXP_SIZE);
+
+memcpy(v1, src, EXP_SIZE);
+memcpy(v2, src, EXP_SIZE);
+
+call_ref(v1, n, MAX_COEFS);
+call_new(v2, n, MAX_COEFS);
+
+if (memcmp(v1, v2, EXP_SIZE) != 0)
+fail();
+
+bench_new(v2, n, MAX_COEFS);
+}
+}
+
+report("ac3_exponent_min");
+}
+
 static void check_float_to_fixed24(AC3DSPContext *c) {
 #define BUF_SIZE 1024
 LOCAL_ALIGNED_32(float, src, [BUF_SIZE]);
@@ -67,5 +107,6 @@ void checkasm_check_ac3dsp(void)
 AC3DSPContext c;
 ff_ac3dsp_init(&c);
 
+check_ac3_exponent_min(&c);
 check_float_to_fixed24(&c);
 }
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4 1/5] avcodec/ac3: Implement float_to_fixed24 for aarch64 NEON

2024-04-06 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/Makefile  |  2 ++
 libavcodec/aarch64/ac3dsp_init_aarch64.c | 36 
 libavcodec/aarch64/ac3dsp_neon.S | 36 
 libavcodec/ac3dsp.c  |  4 ++-
 libavcodec/ac3dsp.h  |  3 +-
 tests/checkasm/ac3dsp.c  |  1 +
 6 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/aarch64/ac3dsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/ac3dsp_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index beb6a02f5f..95ad4dd202 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -1,4 +1,5 @@
 # subsystems
+OBJS-$(CONFIG_AC3DSP)   += aarch64/ac3dsp_init_aarch64.o
 OBJS-$(CONFIG_FMTCONVERT)   += aarch64/fmtconvert_init.o
 OBJS-$(CONFIG_H264CHROMA)   += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)  += aarch64/h264dsp_init_aarch64.o
@@ -35,6 +36,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP)   += aarch64/videodsp.o
 
 # subsystems
 NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_AC3DSP)  += aarch64/ac3dsp_neon.o
 NEON-OBJS-$(CONFIG_FMTCONVERT)  += aarch64/fmtconvert_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)  += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o  
\
diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c 
b/libavcodec/aarch64/ac3dsp_init_aarch64.c
new file mode 100644
index 00..e3320de0f5
--- /dev/null
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2024 Geoff Hill 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/ac3dsp.h"
+#include "config.h"
+
+void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
+
+av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
+{
+int cpu_flags = av_get_cpu_flags();
+if (!have_neon(cpu_flags)) return;
+
+c->float_to_fixed24 = ff_float_to_fixed24_neon;
+}
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
new file mode 100644
index 00..c4d204b51a
--- /dev/null
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard 
+ * Copyright (c) 2024 Geoff Hill 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_float_to_fixed24_neon, export=1
+1:  ld1 {v0.4s, v1.4s}, [x1], #32
+fcvtzs  v0.4s, v0.4s, #24
+ld1 {v2.4s, v3.4s}, [x1], #32
+fcvtzs  v1.4s, v1.4s, #24
+fcvtzs  v2.4s, v2.4s, #24
+st1 {v0.4s, v1.4s}, [x0], #32
+fcvtzs  v3.4s, v3.4s, #24
+st1 {v2.4s, v3.4s}, [x0], #32
+subsw2, w2, #16
+b.ne1b
+ret
+endfunc
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 8397e03d32..730fa70fff 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -389,7 +389,9 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c)
 c->downmix   = NULL;
 c->downmix_fixed = NULL;
 
-#if ARCH_ARM
+#if ARCH_AARCH64
+ff_ac3dsp_init_aarch64(c);
+#elif ARCH_ARM
 ff_ac3dsp_init_arm(c);
 #elif ARCH_X86
 ff_ac3dsp

[FFmpeg-devel] [PATCH v4 0/5] avcodec/ac3: Add aarch64 NEON DSP

2024-04-06 Thread Geoff Hill
Thanks Martin for your review and testing.

Here's v4 with the following changes:

  * Use fmla in the sum_square_butterfly_float loop. Faster.

  * Removed redundant loop bound zero checks in extract_exponents,
    sum_square_butterfly_int32 and sum_square_butterfly_float.

  * Fixed randomize_i24() to also generate negative values.

  * Carry the copyright from the arm implementation over to aarch64,
    since I used that version as a reference.

  * Fix indentation to match existing aarch64 assembly style.

Tested once again on aarch64 and x86.

On AWS Graviton2 (t4g.medium), GCC 12.3:

$ tests/checkasm/checkasm --bench --test=ac3dsp
...
NEON:
 - ac3dsp.ac3_exponent_min   [OK]
 - ac3dsp.ac3_extract_exponents  [OK]
 - ac3dsp.float_to_fixed24   [OK]
 - ac3dsp.ac3_sum_square_butterfly_int32 [OK]
 - ac3dsp.ac3_sum_square_butterfly_float [OK]
checkasm: all 20 tests passed
ac3_exponent_min_reuse0_c: 7.5
ac3_exponent_min_reuse0_neon: 7.5
ac3_exponent_min_reuse1_c: 1044.0
ac3_exponent_min_reuse1_neon: 57.0
ac3_exponent_min_reuse2_c: 2073.0
ac3_exponent_min_reuse2_neon: 73.7
ac3_exponent_min_reuse3_c: 2596.2
ac3_exponent_min_reuse3_neon: 154.0
ac3_exponent_min_reuse4_c: 3107.2
ac3_exponent_min_reuse4_neon: 169.2
ac3_exponent_min_reuse5_c: 3615.2
ac3_exponent_min_reuse5_neon: 185.2
ac3_extract_exponents_n512_c: 1672.0
ac3_extract_exponents_n512_neon: 517.5
ac3_extract_exponents_n768_c: 2505.0
ac3_extract_exponents_n768_neon: 770.5
ac3_extract_exponents_n1024_c: 3304.0
ac3_extract_exponents_n1024_neon: 1022.0
ac3_extract_exponents_n1280_c: 4163.5
ac3_extract_exponents_n1280_neon: 1279.5
ac3_extract_exponents_n1536_c: 5001.2
ac3_extract_exponents_n1536_neon: 1553.2
ac3_extract_exponents_n1792_c: 5823.5
ac3_extract_exponents_n1792_neon: 1851.7
ac3_extract_exponents_n2048_c: 6601.5
ac3_extract_exponents_n2048_neon: 2116.2
ac3_extract_exponents_n2304_c: 7425.2
ac3_extract_exponents_n2304_neon: 2382.7
ac3_extract_exponents_n2560_c: 8278.5
ac3_extract_exponents_n2560_neon: 2620.5
ac3_extract_exponents_n2816_c: 9079.5
ac3_extract_exponents_n2816_neon: 2893.2
ac3_extract_exponents_n3072_c: 10026.5
ac3_extract_exponents_n3072_neon: 3127.0
ac3_sum_square_bufferfly_float_c: 1647.5
ac3_sum_square_bufferfly_float_neon: 229.5
ac3_sum_square_bufferfly_int32_c: 963.5
ac3_sum_square_bufferfly_int32_neon: 546.5
float_to_fixed24_c: 2460.5
float_to_fixed24_neon: 561.5


Geoff Hill (5):
  avcodec/ac3: Implement float_to_fixed24 for aarch64 NEON
  avcodec/ac3: Implement ac3_exponent_min for aarch64 NEON
  avcodec/ac3: Implement ac3_extract_exponents for aarch64 NEON
  avcodec/ac3: Implement sum_square_butterfly_int32 for aarch64 NEON
  avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON

 libavcodec/aarch64/Makefile  |   2 +
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  50 +
 libavcodec/aarch64/ac3dsp_neon.S | 119 
 libavcodec/ac3dsp.c  |   4 +-
 libavcodec/ac3dsp.h  |   3 +-
 tests/checkasm/ac3dsp.c  | 133 +++
 6 files changed, 309 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/aarch64/ac3dsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/ac3dsp_neon.S

-- 
2.42.0
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 5/5] avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON

2024-04-02 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  5 
 libavcodec/aarch64/ac3dsp_neon.S | 35 
 tests/checkasm/ac3dsp.c  | 26 ++
 3 files changed, 66 insertions(+)

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c 
b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index e95436c651..e367353e11 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -32,6 +32,10 @@ void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
 const int32_t *coef0,
 const int32_t *coef1,
 int len);
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+const float *coef0,
+const float *coef1,
+int len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 {
@@ -42,4 +46,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 c->extract_exponents = ff_ac3_extract_exponents_neon;
 c->float_to_fixed24 = ff_float_to_fixed24_neon;
 c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index fa8fcf2e47..4a78ec0b2a 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -88,3 +88,38 @@ function ff_ac3_sum_square_butterfly_int32_neon, export=1
 st1 {v0.1d-v3.1d}, [x0]
 1:  ret
 endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1
+cbz w3, 1f
+moviv0.4s, #0
+moviv1.4s, #0
+moviv2.4s, #0
+moviv3.4s, #0
+0:  ld1 {v30.4s}, [x1], #16
+ld1 {v31.4s}, [x2], #16
+faddv16.4s, v30.4s, v31.4s
+fsubv17.4s, v30.4s, v31.4s
+fmulv30.4s, v30.4s, v30.4s
+faddv0.4s, v0.4s, v30.4s
+fmulv31.4s, v31.4s, v31.4s
+faddv1.4s, v1.4s, v31.4s
+fmulv16.4s, v16.4s, v16.4s
+faddv2.4s, v2.4s, v16.4s
+fmulv17.4s, v17.4s, v17.4s
+faddv3.4s, v3.4s, v17.4s
+subsw3, w3, #4
+b.gt0b
+faddp   v0.4s, v0.4s, v0.4s
+faddp   v0.2s, v0.2s, v0.2s
+st1 {v0.s}[0], [x0], #4
+faddp   v1.4s, v1.4s, v1.4s
+faddp   v1.2s, v1.2s, v1.2s
+st1 {v1.s}[0], [x0], #4
+faddp   v2.4s, v2.4s, v2.4s
+faddp   v2.2s, v2.2s, v2.2s
+st1 {v2.s}[0], [x0], #4
+faddp   v3.4s, v3.4s, v3.4s
+faddp   v3.2s, v3.2s, v3.2s
+st1 {v3.s}[0], [x0]
+1:  ret
+endfunc
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index c920dc9eb0..ef5186cfc1 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -162,6 +162,31 @@ static void 
check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
 report("ac3_sum_square_butterfly_int32");
 }
 
+static void check_ac3_sum_square_butterfly_float(AC3DSPContext *c) {
+LOCAL_ALIGNED_32(float, lt, [ELEMS]);
+LOCAL_ALIGNED_32(float, rt, [ELEMS]);
+LOCAL_ALIGNED_16(float, v1, [4]);
+LOCAL_ALIGNED_16(float, v2, [4]);
+
+declare_func(void, float[4], const float *, const float *, int);
+
+randomize_float(lt, ELEMS);
+randomize_float(rt, ELEMS);
+
+if (check_func(c->sum_square_butterfly_float,
+   "ac3_sum_square_bufferfly_float")) {
+call_ref(v1, lt, rt, ELEMS);
+call_new(v2, lt, rt, ELEMS);
+
+if (!float_near_ulp_array(v1, v2, 10, 4))
+fail();
+
+bench_new(v2, lt, rt, ELEMS);
+}
+
+report("ac3_sum_square_butterfly_float");
+}
+
 void checkasm_check_ac3dsp(void)
 {
 AC3DSPContext c;
@@ -171,4 +196,5 @@ void checkasm_check_ac3dsp(void)
 check_ac3_extract_exponents(&c);
 check_float_to_fixed24(&c);
 check_ac3_sum_square_butterfly_int32(&c);
+check_ac3_sum_square_butterfly_float(&c);
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 4/5] avcodec/ac3: Implement sum_square_butterfly_int32 for aarch64 NEON

2024-04-02 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  5 +
 libavcodec/aarch64/ac3dsp_neon.S | 24 +
 tests/checkasm/ac3dsp.c  | 27 
 3 files changed, 56 insertions(+)

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c 
b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index 1bdc215b51..e95436c651 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -28,6 +28,10 @@
 void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int 
nb_coefs);
 void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+const int32_t *coef0,
+const int32_t *coef1,
+int len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 {
@@ -37,4 +41,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 c->ac3_exponent_min = ff_ac3_exponent_min_neon;
 c->extract_exponents = ff_ac3_extract_exponents_neon;
 c->float_to_fixed24 = ff_float_to_fixed24_neon;
+c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index b26f71a3f6..fa8fcf2e47 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -64,3 +64,27 @@ function ff_float_to_fixed24_neon, export=1
 b.ne0b
 ret
 endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1
+cbz w3, 1f
+moviv0.2d, #0
+moviv1.2d, #0
+moviv2.2d, #0
+moviv3.2d, #0
+0:  ld1 {v4.2s}, [x1], #8
+ld1 {v5.2s}, [x2], #8
+add v6.2s, v4.2s, v5.2s
+sub v7.2s, v4.2s, v5.2s
+smlal   v0.2d, v4.2s, v4.2s
+smlal   v1.2d, v5.2s, v5.2s
+smlal   v2.2d, v6.2s, v6.2s
+smlal   v3.2d, v7.2s, v7.2s
+subsw3, w3, #2
+b.gt0b
+addpd0, v0.2d
+addpd1, v1.2d
+addpd2, v2.2d
+addpd3, v3.2d
+st1 {v0.1d-v3.1d}, [x0]
+1:  ret
+endfunc
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index a8a20349f9..c920dc9eb0 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -136,6 +136,32 @@ static void check_float_to_fixed24(AC3DSPContext *c) {
 report("float_to_fixed24");
 }
 
+static void check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
+#define ELEMS 240
+LOCAL_ALIGNED_16(int32_t, lt, [ELEMS]);
+LOCAL_ALIGNED_16(int32_t, rt, [ELEMS]);
+LOCAL_ALIGNED_16(uint64_t, v1, [4]);
+LOCAL_ALIGNED_16(uint64_t, v2, [4]);
+
+declare_func(void, int64_t[4], const int32_t *, const int32_t *, int);
+
+randomize_i24(lt, ELEMS);
+randomize_i24(rt, ELEMS);
+
+if (check_func(c->sum_square_butterfly_int32,
+   "ac3_sum_square_bufferfly_int32")) {
+call_ref(v1, lt, rt, ELEMS);
+call_new(v2, lt, rt, ELEMS);
+
+if (memcmp(v1, v2, sizeof(int64_t[4])) != 0)
+fail();
+
+bench_new(v2, lt, rt, ELEMS);
+}
+
+report("ac3_sum_square_butterfly_int32");
+}
+
 void checkasm_check_ac3dsp(void)
 {
 AC3DSPContext c;
@@ -144,4 +170,5 @@ void checkasm_check_ac3dsp(void)
 check_ac3_exponent_min(&c);
 check_ac3_extract_exponents(&c);
 check_float_to_fixed24(&c);
+check_ac3_sum_square_butterfly_int32(&c);
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 3/5] avcodec/ac3: Implement ac3_extract_exponents for aarch64 NEON

2024-04-02 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  2 ++
 libavcodec/aarch64/ac3dsp_neon.S | 15 ++
 tests/checkasm/ac3dsp.c  | 36 
 3 files changed, 53 insertions(+)

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c 
b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index 8874b41393..1bdc215b51 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -26,6 +26,7 @@
 #include "config.h"
 
 void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
@@ -34,5 +35,6 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 if (!have_neon(cpu_flags)) return;
 
 c->ac3_exponent_min = ff_ac3_exponent_min_neon;
+c->extract_exponents = ff_ac3_extract_exponents_neon;
 c->float_to_fixed24 = ff_float_to_fixed24_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index c8bdbb1bd3..b26f71a3f6 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -36,6 +36,21 @@ function ff_ac3_exponent_min_neon, export=1
 2:  ret
 endfunc
 
+function ff_ac3_extract_exponents_neon, export=1
+cbzw2, 1f
+movi   v1.4s, #8
+0:  ld1{v0.4s}, [x1], #16
+absv0.4s, v0.4s
+clzv0.4s, v0.4s
+subv0.4s, v0.4s, v1.4s
+xtnv0.4h, v0.4s
+xtnv0.8b, v0.8h
+st1{v0.s}[0], [x0], #4
+subs   w2, w2, #4
+b.gt   0b
+1:  ret
+endfunc
+
 function ff_float_to_fixed24_neon, export=1
 0:  ld1 {v0.4s, v1.4s}, [x1], #32
 fcvtzs  v0.4s, v0.4s, #24
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index acb00b6fe1..a8a20349f9 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -18,6 +18,7 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include 
 #include 
 
 #include "libavutil/mem.h"
@@ -35,6 +36,14 @@
 }  \
 } while (0)\
 
+#define randomize_i24(buf, len)   \
+do {  \
+int i;\
+for (i = 0; i < len; i++) {   \
+buf[i] = (int32_t)rnd() & 0xFF;   \
+} \
+} while (0)   \
+
 #define randomize_float(buf, len)   \
 do {\
 int i;  \
@@ -76,6 +85,32 @@ static void check_ac3_exponent_min(AC3DSPContext *c) {
 report("ac3_exponent_min");
 }
 
+static void check_ac3_extract_exponents(AC3DSPContext *c) {
+#define MAX_EXPS 3072
+LOCAL_ALIGNED_16(int32_t, src, [MAX_EXPS]);
+LOCAL_ALIGNED_16(uint8_t, v1, [MAX_EXPS]);
+LOCAL_ALIGNED_16(uint8_t, v2, [MAX_EXPS]);
+int n;
+
+declare_func(void, uint8_t *, int32_t *, int);
+
+for (n = 512; n <= MAX_EXPS; n += 256) {
+if (check_func(c->extract_exponents, "ac3_extract_exponents_n%d", n)) {
+randomize_i24(src, n);
+
+call_ref(v1, src, n);
+call_new(v2, src, n);
+
+if (memcmp(v1, v2, n) != 0)
+fail();
+
+bench_new(v1, src, n);
+}
+}
+
+report("ac3_extract_exponents");
+}
+
 static void check_float_to_fixed24(AC3DSPContext *c) {
 #define BUF_SIZE 1024
 LOCAL_ALIGNED_32(float, src, [BUF_SIZE]);
@@ -107,5 +142,6 @@ void checkasm_check_ac3dsp(void)
 ff_ac3dsp_init(&c);
 
 check_ac3_exponent_min(&c);
+check_ac3_extract_exponents(&c);
 check_float_to_fixed24(&c);
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 2/5] avcodec/ac3: Implement ac3_exponent_min for aarch64 NEON

2024-04-02 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  2 ++
 libavcodec/aarch64/ac3dsp_neon.S | 16 +
 tests/checkasm/ac3dsp.c  | 41 
 3 files changed, 59 insertions(+)

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index e3320de0f5..8874b41393 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -25,6 +25,7 @@
 #include "libavcodec/ac3dsp.h"
 #include "config.h"
 
+void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
@@ -32,5 +33,6 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 int cpu_flags = av_get_cpu_flags();
 if (!have_neon(cpu_flags)) return;
 
+c->ac3_exponent_min = ff_ac3_exponent_min_neon;
 c->float_to_fixed24 = ff_float_to_fixed24_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index 6924645b7e..c8bdbb1bd3 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -20,6 +20,22 @@
 
 #include "libavutil/aarch64/asm.S"
 
+function ff_ac3_exponent_min_neon, export=1
+cbz w1, 2f
+0:  ld1 {v0.16b}, [x0]
+mov w3, w1
+add x4, x0, #256
+1:  ld1 {v1.16b}, [x4]
+uminv0.16b, v0.16b, v1.16b
+add x4, x4, #256
+subsw3, w3, #1
+b.gt1b
+st1 {v0.16b}, [x0], #16
+subsw2, w2, #16
+b.gt0b
+2:  ret
+endfunc
+
 function ff_float_to_fixed24_neon, export=1
 0:  ld1 {v0.4s, v1.4s}, [x1], #32
 fcvtzs  v0.4s, v0.4s, #24
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index 344e1fe5c2..acb00b6fe1 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -27,6 +27,14 @@
 
 #include "checkasm.h"
 
+#define randomize_exp(buf, len)\
+do {   \
+int i; \
+for (i = 0; i < len; i++) {\
+buf[i] = (uint8_t)rnd();   \
+}  \
+} while (0)\
+
 #define randomize_float(buf, len)   \
 do {\
 int i;  \
@@ -36,6 +44,38 @@
 }   \
 } while (0)
 
+static void check_ac3_exponent_min(AC3DSPContext *c) {
+#define MAX_COEFS 256
+#define MAX_CTXT 6
+#define EXP_SIZE (MAX_CTXT * MAX_COEFS)
+
+LOCAL_ALIGNED_16(uint8_t, src, [EXP_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, v1, [EXP_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, v2, [EXP_SIZE]);
+int n;
+
+declare_func(void, uint8_t *, int, int);
+
+for (n = 0; n < MAX_CTXT; ++n) {
+if (check_func(c->ac3_exponent_min, "ac3_exponent_min_reuse%d", n)) {
+randomize_exp(src, EXP_SIZE);
+
+memcpy(v1, src, EXP_SIZE);
+memcpy(v2, src, EXP_SIZE);
+
+call_ref(v1, n, MAX_COEFS);
+call_new(v2, n, MAX_COEFS);
+
+if (memcmp(v1, v2, EXP_SIZE) != 0)
+fail();
+
+bench_new(v2, n, MAX_COEFS);
+}
+}
+
+report("ac3_exponent_min");
+}
+
 static void check_float_to_fixed24(AC3DSPContext *c) {
 #define BUF_SIZE 1024
 LOCAL_ALIGNED_32(float, src, [BUF_SIZE]);
@@ -66,5 +106,6 @@ void checkasm_check_ac3dsp(void)
 AC3DSPContext c;
 ff_ac3dsp_init(&c);
 
+check_ac3_exponent_min(&c);
 check_float_to_fixed24(&c);
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 1/5] avcodec/ac3: Implement float_to_fixed24 for aarch64 NEON

2024-04-02 Thread Geoff Hill
Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/Makefile  |  2 ++
 libavcodec/aarch64/ac3dsp_init_aarch64.c | 36 
 libavcodec/aarch64/ac3dsp_neon.S | 35 +++
 libavcodec/ac3dsp.c  |  4 ++-
 libavcodec/ac3dsp.h  |  3 +-
 5 files changed, 78 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/aarch64/ac3dsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/ac3dsp_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index beb6a02f5f..95ad4dd202 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -1,4 +1,5 @@
 # subsystems
+OBJS-$(CONFIG_AC3DSP)   += aarch64/ac3dsp_init_aarch64.o
 OBJS-$(CONFIG_FMTCONVERT)   += aarch64/fmtconvert_init.o
 OBJS-$(CONFIG_H264CHROMA)   += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)  += aarch64/h264dsp_init_aarch64.o
@@ -35,6 +36,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP)   += aarch64/videodsp.o
 
 # subsystems
 NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_AC3DSP)  += aarch64/ac3dsp_neon.o
 NEON-OBJS-$(CONFIG_FMTCONVERT)  += aarch64/fmtconvert_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)  += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o  \
diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c
new file mode 100644
index 00..e3320de0f5
--- /dev/null
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2024 Geoff Hill 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/ac3dsp.h"
+#include "config.h"
+
+void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
+
+av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
+{
+int cpu_flags = av_get_cpu_flags();
+if (!have_neon(cpu_flags)) return;
+
+c->float_to_fixed24 = ff_float_to_fixed24_neon;
+}
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
new file mode 100644
index 00..6924645b7e
--- /dev/null
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024 Geoff Hill 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_float_to_fixed24_neon, export=1
+0:  ld1 {v0.4s, v1.4s}, [x1], #32
+fcvtzs  v0.4s, v0.4s, #24
+ld1 {v2.4s, v3.4s}, [x1], #32
+fcvtzs  v1.4s, v1.4s, #24
+fcvtzs  v2.4s, v2.4s, #24
+st1 {v0.4s, v1.4s}, [x0], #32
+fcvtzs  v3.4s, v3.4s, #24
+st1 {v2.4s, v3.4s}, [x0], #32
+subsw2, w2, #16
+b.ne0b
+ret
+endfunc
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 8397e03d32..730fa70fff 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -389,7 +389,9 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c)
 c->downmix   = NULL;
 c->downmix_fixed = NULL;
 
-#if ARCH_ARM
+#if ARCH_AARCH64
+ff_ac3dsp_init_aarch64(c);
+#elif ARCH_ARM
 ff_ac3dsp_init_arm(c);
 #elif ARCH_X86
 ff_ac3dsp_init_x86(c);
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index ae33b361a9..b1b2bced8f 100644
--- a/libavcodec/ac3dsp.h
+

[FFmpeg-devel] [PATCH v3 0/5] avcodec/ac3: Add aarch64 NEON DSP

2024-04-02 Thread Geoff Hill
Here's v3 to push the AC-3 ARMv8 NEON experiment a step further.

This version implements 5 of the AC-3 encoder DSP functions,
and adds checkasm tests where missing.

I've tested that the checkasm tests pass on aarch64 and x86.

On AWS Graviton2 (t4g.medium), GCC 12.3:

$ tests/checkasm/checkasm --bench --verbose --test=ac3dsp
...
NEON:
 - ac3dsp.ac3_exponent_min   [OK]
 - ac3dsp.ac3_extract_exponents  [OK]
 - ac3dsp.float_to_fixed24   [OK]
 - ac3dsp.ac3_sum_square_butterfly_int32 [OK]
 - ac3dsp.ac3_sum_square_butterfly_float [OK]
checkasm: all 20 tests passed
ac3_exponent_min_reuse0_c: 9.0
ac3_exponent_min_reuse0_neon: 9.7
ac3_exponent_min_reuse1_c: 1037.5
ac3_exponent_min_reuse1_neon: 54.0
ac3_exponent_min_reuse2_c: 1820.7
ac3_exponent_min_reuse2_neon: 135.2
ac3_exponent_min_reuse3_c: 2080.5
ac3_exponent_min_reuse3_neon: 167.7
ac3_exponent_min_reuse4_c: 2493.2
ac3_exponent_min_reuse4_neon: 200.0
ac3_exponent_min_reuse5_c: 2970.0
ac3_exponent_min_reuse5_neon: 231.7
ac3_extract_exponents_n512_c: 1717.5
ac3_extract_exponents_n512_neon: 506.7
ac3_extract_exponents_n768_c: 2562.7
ac3_extract_exponents_n768_neon: 769.7
ac3_extract_exponents_n1024_c: 3389.2
ac3_extract_exponents_n1024_neon: 1019.0
ac3_extract_exponents_n1280_c: 4210.7
ac3_extract_exponents_n1280_neon: 1267.5
ac3_extract_exponents_n1536_c: 5071.5
ac3_extract_exponents_n1536_neon: 1522.0
ac3_extract_exponents_n1792_c: 5896.5
ac3_extract_exponents_n1792_neon: 1784.0
ac3_extract_exponents_n2048_c: 6779.2
ac3_extract_exponents_n2048_neon: 2051.0
ac3_extract_exponents_n2304_c: 7559.5
ac3_extract_exponents_n2304_neon: 2290.0
ac3_extract_exponents_n2560_c: 8397.2
ac3_extract_exponents_n2560_neon: 2552.5
ac3_extract_exponents_n2816_c: 9224.2
ac3_extract_exponents_n2816_neon: 2797.7
ac3_extract_exponents_n3072_c: 10026.2
ac3_extract_exponents_n3072_neon: 3047.7
ac3_sum_square_bufferfly_float_c: 1605.7
ac3_sum_square_bufferfly_float_neon: 365.7
ac3_sum_square_bufferfly_int32_c: 965.5
ac3_sum_square_bufferfly_int32_neon: 486.2
float_to_fixed24_c: 2453.7
float_to_fixed24_neon: 516.2

Geoff Hill (5):
  avcodec/ac3: Implement float_to_fixed24 for aarch64 NEON
  avcodec/ac3: Implement ac3_exponent_min for aarch64 NEON
  avcodec/ac3: Implement ac3_extract_exponents for aarch64 NEON
  avcodec/ac3: Implement sum_square_butterfly_int32 for aarch64 NEON
  avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON

 libavcodec/aarch64/Makefile  |   2 +
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  50 +
 libavcodec/aarch64/ac3dsp_neon.S | 125 ++
 libavcodec/ac3dsp.c  |   4 +-
 libavcodec/ac3dsp.h  |   3 +-
 tests/checkasm/ac3dsp.c  | 130 +++
 6 files changed, 312 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/aarch64/ac3dsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/ac3dsp_neon.S

-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2] avcodec/ac3: Implement float_to_fixed24 for aarch64 neon

2024-03-28 Thread Geoff Hill
Start porting AC-3 ARMv7 NEON algorithms over to aarch64.

This one is low-hanging fruit since checkasm tests exist.

Fixed the Makefile compared to v1.

Tested on AWS Graviton2 (t4g.medium), GCC 12.3:

$ tests/checkasm/checkasm --verbose --bench --test=ac3dsp
...
NEON:
 - ac3dsp.float_to_fixed24 [OK]
checkasm: all 1 tests passed
float_to_fixed24_c: 2450.7
float_to_fixed24_neon: 574.0

Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/Makefile  |  2 ++
 libavcodec/aarch64/ac3dsp_init_aarch64.c | 36 +++
 libavcodec/aarch64/ac3dsp_neon.S | 37 
 libavcodec/ac3dsp.c  |  4 ++-
 libavcodec/ac3dsp.h  |  3 +-
 5 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/aarch64/ac3dsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/ac3dsp_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index beb6a02f5f..95ad4dd202 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -1,4 +1,5 @@
 # subsystems
+OBJS-$(CONFIG_AC3DSP)   += aarch64/ac3dsp_init_aarch64.o
 OBJS-$(CONFIG_FMTCONVERT)   += aarch64/fmtconvert_init.o
 OBJS-$(CONFIG_H264CHROMA)   += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)  += aarch64/h264dsp_init_aarch64.o
@@ -35,6 +36,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP)   += aarch64/videodsp.o
 
 # subsystems
 NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_AC3DSP)  += aarch64/ac3dsp_neon.o
 NEON-OBJS-$(CONFIG_FMTCONVERT)  += aarch64/fmtconvert_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)  += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o  \
diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c
new file mode 100644
index 00..e3320de0f5
--- /dev/null
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2024 Geoff Hill 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/ac3dsp.h"
+#include "config.h"
+
+void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
+
+av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
+{
+int cpu_flags = av_get_cpu_flags();
+if (!have_neon(cpu_flags)) return;
+
+c->float_to_fixed24 = ff_float_to_fixed24_neon;
+}
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
new file mode 100644
index 00..77106ea586
--- /dev/null
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Geoff Hill 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len)
+
+function ff_float_to_fixed24_neon, export=1
+0:  ld1 {v0.4s, v1.4s}, [x1], #32
+fcvtzs  v0.4s, v0.4s, #24
+ld1 {v2.4s, v3.4s}, [x1], #32
+fcvtzs  v1.4s, v1.4s, #24
+fcvtzs  v2.4s, v2.4s, #24
+st1 {v0.4s, v1.4s}, [x0], #32
+fcvtzs  v3.4s, v3.4s, #24
+st1 {v2.4s, v3.4s}, [x0], #32
+subsw2, w2, #16
+b.ne0b
+ret
+endfunc
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 8397e03d32..730fa70

[FFmpeg-devel] [PATCH] avcodec/ac3: Implement float_to_fixed24 for aarch64 neon

2024-03-27 Thread Geoff Hill
Tested on AWS Graviton2 (t4g.medium), NixOS 23.11, GCC 12.3:

float_to_fixed24_c: 2462.7
float_to_fixed24_neon: 513.5

Signed-off-by: Geoff Hill 
---
 libavcodec/aarch64/Makefile  |  2 ++
 libavcodec/aarch64/ac3dsp_init_aarch64.c | 36 +++
 libavcodec/aarch64/ac3dsp_neon.S | 37 
 libavcodec/ac3dsp.c  |  4 ++-
 libavcodec/ac3dsp.h  |  3 +-
 5 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/aarch64/ac3dsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/ac3dsp_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index beb6a02f5f..5151b7510e 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -35,6 +35,8 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP)   += aarch64/videodsp.o
 
 # subsystems
 NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_AC3DSP)  += aarch64/ac3dsp_init_aarch64.o   \
+   aarch64/ac3dsp_neon.o
 NEON-OBJS-$(CONFIG_FMTCONVERT)  += aarch64/fmtconvert_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)  += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o  \
diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c
new file mode 100644
index 00..e3320de0f5
--- /dev/null
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2024 Geoff Hill 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/ac3dsp.h"
+#include "config.h"
+
+void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
+
+av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
+{
+int cpu_flags = av_get_cpu_flags();
+if (!have_neon(cpu_flags)) return;
+
+c->float_to_fixed24 = ff_float_to_fixed24_neon;
+}
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
new file mode 100644
index 00..77106ea586
--- /dev/null
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Geoff Hill 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len)
+
+function ff_float_to_fixed24_neon, export=1
+0:  ld1 {v0.4s, v1.4s}, [x1], #32
+fcvtzs  v0.4s, v0.4s, #24
+ld1 {v2.4s, v3.4s}, [x1], #32
+fcvtzs  v1.4s, v1.4s, #24
+fcvtzs  v2.4s, v2.4s, #24
+st1 {v0.4s, v1.4s}, [x0], #32
+fcvtzs  v3.4s, v3.4s, #24
+st1 {v2.4s, v3.4s}, [x0], #32
+subsw2, w2, #16
+b.ne0b
+ret
+endfunc
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 8397e03d32..730fa70fff 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -389,7 +389,9 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c)
 c->downmix   = NULL;
 c->downmix_fixed = NULL;
 
-#if ARCH_ARM
+#if ARCH_AARCH64
+ff_ac3dsp_init_aarch64(c);
+#elif ARCH_ARM
 ff_ac3dsp_init_arm(c);
 #elif ARCH_X86
 ff_ac3dsp_init_x86(c);
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index ae33b361a9..b1b2bced8f 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@