The 256-bit SVE vector length loop for 16xh already processes only one
full vector per iteration, so a further specialization for even longer
vectors cannot be useful. In particular, the implementation here for
vector lengths of at least 512 bits appears to be incorrect and
unreachable, so simply delete it and use the 256-bit implementation
instead.
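
For context, a sketch of the sizing behind the "one full vector per
iteration" claim, assuming the int16_t source layout implied by the
halfword adds and strides in the loop: a 16-pixel row is 16 x 16 bits =
32 bytes, which is exactly what the ptrue p0.b, vl32 loop loads per
iteration, so a longer vector has no additional in-row data to consume.
A minimal C check of that sizing:

    /* Illustrative only: one 16-pixel row of 16-bit intermediates fills a
     * 256-bit (32-byte) vector. */
    #include <stdint.h>
    _Static_assert(16 * sizeof(int16_t) == 32, "16 halfwords == 256 bits");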

The existing code for 32xh applies the add and sqrshrnb instructions in
the wrong order for the second vector, producing an incorrect result, so
reorder them to fix it.
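
As a minimal scalar sketch of the per-pixel arithmetic the corrected
sequence computes (the helper name and prototype are invented for
illustration, not taken from x265):

    #include <stdint.h>

    /* Models: add z.h (16-bit sums), sqrshrnb z.b, z.h, #7 (signed
     * saturating rounding shift-right-narrow), then add z.b, z.b, #0x80.
     * The +0x80 bias must be applied to the already-narrowed 8-bit value. */
    static inline uint8_t add_avg_pixel(int16_t a, int16_t b)
    {
        int32_t sum = (int32_t)a + b;   /* vector add is 16-bit; inputs are
                                           assumed not to overflow it */
        int32_t v = (sum + 64) >> 7;    /* rounding shift right by 7 */
        if (v > INT8_MAX) v = INT8_MAX; /* saturate to int8, as sqrshrnb */
        if (v < INT8_MIN) v = INT8_MIN;
        return (uint8_t)(v + 0x80);     /* re-bias into [0, 255] */
    }

With the swapped order, the byte-wise +0x80 was applied to z1 while it
still held the 16-bit sums, so the narrowing shift read corrupted inputs
and the narrowed bytes never received the bias; reordering the two
instructions is all that is needed.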

Co-authored-by: Hari Limaye <[email protected]>
---
 source/common/aarch64/mc-a-sve2.S | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/source/common/aarch64/mc-a-sve2.S b/source/common/aarch64/mc-a-sve2.S
index e4540ce9b..00fb0048f 100644
--- a/source/common/aarch64/mc-a-sve2.S
+++ b/source/common/aarch64/mc-a-sve2.S
@@ -511,8 +511,6 @@ function PFX(addAvg_16x\h\()_sve2)
     cbnz            w12, .Loop_eq_16_sve2_addavg_16x\h
     ret
 .vl_gt_16_addAvg_16x\h\():
-    cmp             x9, #32
-    bgt             .vl_gt_32_addAvg_16x\h
     ptrue           p0.b, vl32
 .Loop_gt_16_sve2_addavg_16x\h\():
     sub             w12, w12, #1
@@ -523,25 +521,9 @@ function PFX(addAvg_16x\h\()_sve2)
     add             z0.h, p0/m, z0.h, z1.h
     sqrshrnb        z0.b, z0.h, #7
     add             z0.b, z0.b, #0x80
-    st1b            {z0.h}, p1, [x2]
-    add             x2, x2, x5
-    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
-    ret
-.vl_gt_32_addAvg_16x\h\():
-    mov             x10, #48
-    mov             x11, #0
-    whilelt         p0.b, x11, x10
-.Loop_gt_32_sve2_addavg_16x\h\():
-    sub             w12, w12, #1
-    ld1b            {z0.b}, p0/z, [x0]
-    add             x0, x0, x3, lsl #1
-    add             x1, x1, x4, lsl #1
-    add             z0.h, p0/m, z0.h, z1.h
-    sqrshrnb        z0.b, z0.h, #7
-    add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .Loop_gt_32_sve2_addavg_16x\h
+    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
     ret
 endfunc
 .endm
@@ -674,9 +656,9 @@ function PFX(addAvg_32x\h\()_sve2)
     add             z0.h, p0/m, z0.h, z2.h
     add             z1.h, p0/m, z1.h, z3.h
     sqrshrnb        z0.b, z0.h, #7
-    add             z1.b, z1.b, #0x80
-    sqrshrnb        z1.b, z1.h, #7
     add             z0.b, z0.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z1.b, z1.b, #0x80
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p0, [x2, #1, mul vl]
     add             x2, x2, x5
-- 
2.34.1
