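The existing Neon assembly implementation of dequant_normal only works
for standard bit-depth (SBD) builds, so add the missing code to make it
work for high bit-depth (HBD) builds as well.
The scale and shift arguments passed to REPORT_SPEEDUP are updated so
that the benchmark also exercises the high bit-depth code path.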
---
source/common/aarch64/asm-primitives.cpp | 2 +-
source/common/aarch64/pixel-util.S | 13 +++++++++++--
source/test/mbdstharness.cpp | 6 ++++--
3 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index c1317eb74..a8560d269 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -714,7 +714,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
// dequant_scaling
p.dequant_scaling = PFX(dequant_scaling_neon);
- p.dequant_normal = PFX(dequant_normal_neon);
// ssim_4x4x2_core
p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
@@ -743,6 +742,7 @@ void setupNeonPrimitives(EncoderPrimitives &p)
#endif
// quant
+ p.dequant_normal = PFX(dequant_normal_neon);
p.quant = PFX(quant_neon);
p.nquant = PFX(nquant_neon);
}
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 1825466ea..495bac1fa 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -1626,7 +1626,16 @@ endfunc
// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
function PFX(dequant_normal_neon)
- lsr w2, w2, #4 // num / 16
+// X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num);
+// X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
+// X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num);
+#if HIGH_BIT_DEPTH
+ cmp w3, #32768
+ blt .dqn_skip
+ lsr w3, w3, #(BIT_DEPTH - 8)
+ sub w4, w4, #(BIT_DEPTH - 8)
+.dqn_skip:
+#endif
neg w4, w4
dup v0.8h, w3
dup v1.4s, w4
@@ -1648,7 +1657,7 @@ function PFX(dequant_normal_neon)
sqxtn v3.4h, v18.4s
sqxtn2 v3.8h, v19.4s
- sub w2, w2, #1
+ sub w2, w2, #16
st1 {v2.8h, v3.8h}, [x1], #32
cbnz w2, .dqn_loop1
ret
diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp
index cceadd833..05027d109 100644
--- a/source/test/mbdstharness.cpp
+++ b/source/test/mbdstharness.cpp
@@ -524,7 +524,7 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
{
if (!check_dequant_primitive(ref.dequant_normal, opt.dequant_normal))
{
- printf("dequant: Failed!\n");
+ printf("dequant_normal: Failed!\n");
return false;
}
}
@@ -655,8 +655,10 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
if (opt.dequant_normal)
{
+ int scale = 72 << X265_DEPTH;
+ int shift = X265_DEPTH - 4;
printf("dequant_normal\t");
- REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
+ REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, scale, shift);
}
if (opt.dequant_scaling)
--
2.34.1
From ca57ccc3f57b6e6e75cb3683d2c6c5ce61089595 Mon Sep 17 00:00:00 2001
Message-Id: <ca57ccc3f57b6e6e75cb3683d2c6c5ce61089595.1744030934.git.microdaryl.rob...@arm.com>
In-Reply-To: <[email protected]>
References: <[email protected]>
From: Micro Daryl Robles <[email protected]>
Date: Fri, 7 Mar 2025 12:07:13 +0000
Subject: [PATCH 1/2] AArch64: Fix Neon asm implementation of dequant_normal
for HBD
The existing Neon assembly of dequant_normal only works for SBD, so add
the missing code to make it work for HBD.
The parameter values used in REPORT_SPEEDUP are updated to fully test
the high bit-depth version.
---
source/common/aarch64/asm-primitives.cpp | 2 +-
source/common/aarch64/pixel-util.S | 13 +++++++++++--
source/test/mbdstharness.cpp | 6 ++++--
3 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index c1317eb74..a8560d269 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -714,7 +714,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
// dequant_scaling
p.dequant_scaling = PFX(dequant_scaling_neon);
- p.dequant_normal = PFX(dequant_normal_neon);
// ssim_4x4x2_core
p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
@@ -743,6 +742,7 @@ void setupNeonPrimitives(EncoderPrimitives &p)
#endif
// quant
+ p.dequant_normal = PFX(dequant_normal_neon);
p.quant = PFX(quant_neon);
p.nquant = PFX(nquant_neon);
}
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 1825466ea..495bac1fa 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -1626,7 +1626,16 @@ endfunc
// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
function PFX(dequant_normal_neon)
- lsr w2, w2, #4 // num / 16
+// X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num);
+// X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
+// X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num);
+#if HIGH_BIT_DEPTH
+ cmp w3, #32768
+ blt .dqn_skip
+ lsr w3, w3, #(BIT_DEPTH - 8)
+ sub w4, w4, #(BIT_DEPTH - 8)
+.dqn_skip:
+#endif
neg w4, w4
dup v0.8h, w3
dup v1.4s, w4
@@ -1648,7 +1657,7 @@ function PFX(dequant_normal_neon)
sqxtn v3.4h, v18.4s
sqxtn2 v3.8h, v19.4s
- sub w2, w2, #1
+ sub w2, w2, #16
st1 {v2.8h, v3.8h}, [x1], #32
cbnz w2, .dqn_loop1
ret
diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp
index cceadd833..05027d109 100644
--- a/source/test/mbdstharness.cpp
+++ b/source/test/mbdstharness.cpp
@@ -524,7 +524,7 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
{
if (!check_dequant_primitive(ref.dequant_normal, opt.dequant_normal))
{
- printf("dequant: Failed!\n");
+ printf("dequant_normal: Failed!\n");
return false;
}
}
@@ -655,8 +655,10 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
if (opt.dequant_normal)
{
+ int scale = 72 << X265_DEPTH;
+ int shift = X265_DEPTH - 4;
printf("dequant_normal\t");
- REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
+ REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, scale, shift);
}
if (opt.dequant_scaling)
--
2.34.1
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel