[x265] [PATCH 1/2] AArch64: Fix Neon asm implementation of dequant_normal for HBD

Micro Daryl Robles Thu, 10 Apr 2025 13:23:43 -0700

The existing Neon assembly implementation of dequant_normal only works for
standard bit-depth (SBD) builds, so add the missing code to make it work for
high bit-depth (HBD) builds as well.


The parameter values used in REPORT_SPEEDUP are updated to fully test
the high bit-depth version.
---
 source/common/aarch64/asm-primitives.cpp |  2 +-
 source/common/aarch64/pixel-util.S       | 13 +++++++++++--
 source/test/mbdstharness.cpp             |  6 ++++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index c1317eb74..a8560d269 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -714,7 +714,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
 
     // dequant_scaling
     p.dequant_scaling = PFX(dequant_scaling_neon);
-    p.dequant_normal  = PFX(dequant_normal_neon);
 
     // ssim_4x4x2_core
     p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
@@ -743,6 +742,7 @@ void setupNeonPrimitives(EncoderPrimitives &p)
 #endif
 
     // quant
+    p.dequant_normal = PFX(dequant_normal_neon);
     p.quant = PFX(quant_neon);
     p.nquant = PFX(nquant_neon);
 }
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 1825466ea..495bac1fa 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -1626,7 +1626,16 @@ endfunc
 
 // void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
 function PFX(dequant_normal_neon)
-    lsr             w2, w2, #4              // num / 16
+//  X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num);
+//  X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
+//  X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num);
+#if HIGH_BIT_DEPTH
+    cmp             w3, #32768
+    blt             .dqn_skip
+    lsr             w3, w3, #(BIT_DEPTH - 8)
+    sub             w4, w4, #(BIT_DEPTH - 8)
+.dqn_skip:
+#endif
     neg             w4, w4
     dup             v0.8h, w3
     dup             v1.4s, w4
@@ -1648,7 +1657,7 @@ function PFX(dequant_normal_neon)
     sqxtn           v3.4h, v18.4s
     sqxtn2          v3.8h, v19.4s
 
-    sub             w2, w2, #1
+    sub             w2, w2, #16
     st1             {v2.8h, v3.8h}, [x1], #32
     cbnz            w2, .dqn_loop1
     ret
diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp
index cceadd833..05027d109 100644
--- a/source/test/mbdstharness.cpp
+++ b/source/test/mbdstharness.cpp
@@ -524,7 +524,7 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
     {
         if (!check_dequant_primitive(ref.dequant_normal, opt.dequant_normal))
         {
-            printf("dequant: Failed!\n");
+            printf("dequant_normal: Failed!\n");
             return false;
         }
     }
@@ -655,8 +655,10 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
 
     if (opt.dequant_normal)
     {
+        int scale = 72 << X265_DEPTH;
+        int shift = X265_DEPTH - 4;
         printf("dequant_normal\t");
-        REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
+        REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, scale, shift);
     }
 
     if (opt.dequant_scaling)
-- 
2.34.1

From ca57ccc3f57b6e6e75cb3683d2c6c5ce61089595 Mon Sep 17 00:00:00 2001
Message-Id: <ca57ccc3f57b6e6e75cb3683d2c6c5ce61089595.1744030934.git.microdaryl.rob...@arm.com>
In-Reply-To: <[email protected]>
References: <[email protected]>
From: Micro Daryl Robles <[email protected]>
Date: Fri, 7 Mar 2025 12:07:13 +0000
Subject: [PATCH 1/2] AArch64: Fix Neon asm implementation of dequant_normal
 for HBD

The existing Neon assembly implementation of dequant_normal only works for
standard bit-depth (SBD) builds, so add the missing code to make it work for
high bit-depth (HBD) builds as well.

The parameter values used in REPORT_SPEEDUP are updated to fully test
the high bit-depth version.
---
 source/common/aarch64/asm-primitives.cpp |  2 +-
 source/common/aarch64/pixel-util.S       | 13 +++++++++++--
 source/test/mbdstharness.cpp             |  6 ++++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index c1317eb74..a8560d269 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -714,7 +714,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
 
     // dequant_scaling
     p.dequant_scaling = PFX(dequant_scaling_neon);
-    p.dequant_normal  = PFX(dequant_normal_neon);
 
     // ssim_4x4x2_core
     p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
@@ -743,6 +742,7 @@ void setupNeonPrimitives(EncoderPrimitives &p)
 #endif
 
     // quant
+    p.dequant_normal = PFX(dequant_normal_neon);
     p.quant = PFX(quant_neon);
     p.nquant = PFX(nquant_neon);
 }
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 1825466ea..495bac1fa 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -1626,7 +1626,16 @@ endfunc
 
 // void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
 function PFX(dequant_normal_neon)
-    lsr             w2, w2, #4              // num / 16
+//  X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num);
+//  X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
+//  X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num);
+#if HIGH_BIT_DEPTH
+    cmp             w3, #32768
+    blt             .dqn_skip
+    lsr             w3, w3, #(BIT_DEPTH - 8)
+    sub             w4, w4, #(BIT_DEPTH - 8)
+.dqn_skip:
+#endif
     neg             w4, w4
     dup             v0.8h, w3
     dup             v1.4s, w4
@@ -1648,7 +1657,7 @@ function PFX(dequant_normal_neon)
     sqxtn           v3.4h, v18.4s
     sqxtn2          v3.8h, v19.4s
 
-    sub             w2, w2, #1
+    sub             w2, w2, #16
     st1             {v2.8h, v3.8h}, [x1], #32
     cbnz            w2, .dqn_loop1
     ret
diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp
index cceadd833..05027d109 100644
--- a/source/test/mbdstharness.cpp
+++ b/source/test/mbdstharness.cpp
@@ -524,7 +524,7 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
     {
         if (!check_dequant_primitive(ref.dequant_normal, opt.dequant_normal))
         {
-            printf("dequant: Failed!\n");
+            printf("dequant_normal: Failed!\n");
             return false;
         }
     }
@@ -655,8 +655,10 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
 
     if (opt.dequant_normal)
     {
+        int scale = 72 << X265_DEPTH;
+        int shift = X265_DEPTH - 4;
         printf("dequant_normal\t");
-        REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
+        REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, scale, shift);
     }
 
     if (opt.dequant_scaling)
-- 
2.34.1

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 1/2] AArch64: Fix Neon asm implementation of dequant_normal for HBD

Reply via email to