Unrolling improves performance by 7-9% on Arm Neoverse server platforms.
---
 source/common/loopfilter.cpp | 66 ++++++++++++++++++++++++------------
 1 file changed, 44 insertions(+), 22 deletions(-)

diff --git a/source/common/loopfilter.cpp b/source/common/loopfilter.cpp
index 3aad542b8..562776282 100644
--- a/source/common/loopfilter.cpp
+++ b/source/common/loopfilter.cpp
@@ -152,27 +152,6 @@ static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset,
     }
 }
 
-/* Deblocking of one line/column for the chrominance component
-* \param src     pointer to picture data
-* \param offset  offset value for picture data
-* \param tc      tc value
-* \param maskP   indicator to disable filtering on partP
-* \param maskQ   indicator to disable filtering on partQ */
-static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4 = (int16_t)src[0];
-        int16_t m3 = (int16_t)src[-offset];
-        int16_t m5 = (int16_t)src[offset];
-        int16_t m2 = (int16_t)src[-offset * 2];
-
-        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
-        src[-offset]  = x265_clip(m3 + (delta & maskP));
-        src[0]        = x265_clip(m4 - (delta & maskQ));
-    }
-}
-
 void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc,
                          int32_t maskP, int32_t maskQ)
 {
@@ -216,6 +195,49 @@ void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t
     src[3 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
 }
 
+void pelFilterChroma_H_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc,
+                         int32_t maskP, int32_t maskQ)
+{
+    assert(srcStep == 1);
+    (void)srcStep;
+
+    int16_t m2 = (int16_t)src[0 - offset * 2];
+    int16_t m3 = (int16_t)src[0 - offset * 1];
+    int16_t m4 = (int16_t)src[0 + offset * 0];
+    int16_t m5 = (int16_t)src[0 + offset * 1];
+
+    int32_t delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[0 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[0 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[1 - offset * 2];
+    m3 = (int16_t)src[1 - offset * 1];
+    m4 = (int16_t)src[1 + offset * 0];
+    m5 = (int16_t)src[1 + offset * 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[1 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[1 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[2 - offset * 2];
+    m3 = (int16_t)src[2 - offset * 1];
+    m4 = (int16_t)src[2 + offset * 0];
+    m5 = (int16_t)src[2 + offset * 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[2 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[2 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[3 - offset * 2];
+    m3 = (int16_t)src[3 - offset * 1];
+    m4 = (int16_t)src[3 + offset * 0];
+    m5 = (int16_t)src[3 + offset * 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[3 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[3 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+}
+
 }
 
 namespace X265_NS {
@@ -235,6 +257,6 @@ void setupLoopFilterPrimitives_c(EncoderPrimitives &p)
     p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
     p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
     p.pelFilterChroma[0]     = pelFilterChroma_V_c;
-    p.pelFilterChroma[1]     = pelFilterChroma_c;
+    p.pelFilterChroma[1]     = pelFilterChroma_H_c;
 }
 }
-- 
2.34.1

From 8a10fcb8ced5dc43d64400c747b62d25a6747fed Mon Sep 17 00:00:00 2001
Message-Id: <8a10fcb8ced5dc43d64400c747b62d25a6747fed.1739282617.git.microdaryl.rob...@arm.com>
In-Reply-To: <[email protected]>
References: <[email protected]>
From: Micro Daryl Robles <[email protected]>
Date: Thu, 22 Aug 2024 16:49:17 +0100
Subject: [PATCH 5/5] Unroll C implementation of pelFilterChroma_H

Unrolling improves performance by 7-9% on Arm Neoverse server platforms.
---
 source/common/loopfilter.cpp | 66 ++++++++++++++++++++++++------------
 1 file changed, 44 insertions(+), 22 deletions(-)

diff --git a/source/common/loopfilter.cpp b/source/common/loopfilter.cpp
index 3aad542b8..562776282 100644
--- a/source/common/loopfilter.cpp
+++ b/source/common/loopfilter.cpp
@@ -152,27 +152,6 @@ static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset,
     }
 }
 
-/* Deblocking of one line/column for the chrominance component
-* \param src     pointer to picture data
-* \param offset  offset value for picture data
-* \param tc      tc value
-* \param maskP   indicator to disable filtering on partP
-* \param maskQ   indicator to disable filtering on partQ */
-static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4 = (int16_t)src[0];
-        int16_t m3 = (int16_t)src[-offset];
-        int16_t m5 = (int16_t)src[offset];
-        int16_t m2 = (int16_t)src[-offset * 2];
-
-        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
-        src[-offset]  = x265_clip(m3 + (delta & maskP));
-        src[0]        = x265_clip(m4 - (delta & maskQ));
-    }
-}
-
 void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc,
                          int32_t maskP, int32_t maskQ)
 {
@@ -216,6 +195,49 @@ void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t
     src[3 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
 }
 
+void pelFilterChroma_H_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc,
+                         int32_t maskP, int32_t maskQ)
+{
+    assert(srcStep == 1);
+    (void)srcStep;
+
+    int16_t m2 = (int16_t)src[0 - offset * 2];
+    int16_t m3 = (int16_t)src[0 - offset * 1];
+    int16_t m4 = (int16_t)src[0 + offset * 0];
+    int16_t m5 = (int16_t)src[0 + offset * 1];
+
+    int32_t delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[0 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[0 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[1 - offset * 2];
+    m3 = (int16_t)src[1 - offset * 1];
+    m4 = (int16_t)src[1 + offset * 0];
+    m5 = (int16_t)src[1 + offset * 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[1 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[1 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[2 - offset * 2];
+    m3 = (int16_t)src[2 - offset * 1];
+    m4 = (int16_t)src[2 + offset * 0];
+    m5 = (int16_t)src[2 + offset * 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[2 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[2 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+    m2 = (int16_t)src[3 - offset * 2];
+    m3 = (int16_t)src[3 - offset * 1];
+    m4 = (int16_t)src[3 + offset * 0];
+    m5 = (int16_t)src[3 + offset * 1];
+
+    delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+    src[3 - offset * 1] = x265_clip(m3 + (delta & maskP));
+    src[3 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+}
+
 }
 
 namespace X265_NS {
@@ -235,6 +257,6 @@ void setupLoopFilterPrimitives_c(EncoderPrimitives &p)
     p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
     p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
     p.pelFilterChroma[0]     = pelFilterChroma_V_c;
-    p.pelFilterChroma[1]     = pelFilterChroma_c;
+    p.pelFilterChroma[1]     = pelFilterChroma_H_c;
 }
 }
-- 
2.34.1

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to