From: George Steed <[email protected]>
Date: Mon, 23 Dec 2024 14:10:38 +0000
Subject: [PATCH 1/6] mc-a-sve2.S: Fix addAvg_{16,32}xh_sve2 for longer SVE vectors

The 256-bit SVE vector length loop for 16xh already processes only one
full vector per iteration, so a further specialization for even longer
vectors cannot be useful. In particular, the implementation here for
vector lengths of at least 512 bits appears to be incorrect and
unreachable, so simply delete it and use the 256-bit implementation
instead.
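(Not part of the patch: a minimal C sketch of the per-row data
footprint, assuming the usual 16-bit intermediate samples feeding
addAvg; the type and constant names below are made up for
illustration, not the x265 API.)

    #include <stdint.h>

    /* One row of the 16xh kernel is 16 x 16-bit intermediate samples,
     * i.e. 16 * 2 = 32 bytes = 256 bits, so a 256-bit vector already
     * covers a full row and a >= 512-bit specialization has no extra
     * elements left to process per iteration. */
    enum { BLOCK_WIDTH = 16 };                /* the "16" in 16xh */
    typedef int16_t addavg_row[BLOCK_WIDTH];  /* hypothetical name */

    _Static_assert(sizeof(addavg_row) == 32,  /* 32 bytes = 256 bits */
                   "one 16xh input row fits a single 256-bit vector");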
The existing code for 32xh flips the add and sqrshrnb instructions,
leading to an incorrect result, so reorder them to fix it (a scalar
model of the per-lane arithmetic follows the patch).

Co-authored-by: Hari Limaye <[email protected]>
---
 source/common/aarch64/mc-a-sve2.S | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/source/common/aarch64/mc-a-sve2.S b/source/common/aarch64/mc-a-sve2.S
index e4540ce9b..00fb0048f 100644
--- a/source/common/aarch64/mc-a-sve2.S
+++ b/source/common/aarch64/mc-a-sve2.S
@@ -511,8 +511,6 @@ function PFX(addAvg_16x\h\()_sve2)
     cbnz            w12, .Loop_eq_16_sve2_addavg_16x\h
     ret
 .vl_gt_16_addAvg_16x\h\():
-    cmp             x9, #32
-    bgt             .vl_gt_32_addAvg_16x\h
     ptrue           p0.b, vl32
 .Loop_gt_16_sve2_addavg_16x\h\():
     sub             w12, w12, #1
@@ -523,25 +521,9 @@ function PFX(addAvg_16x\h\()_sve2)
     add             z0.h, p0/m, z0.h, z1.h
     sqrshrnb        z0.b, z0.h, #7
     add             z0.b, z0.b, #0x80
-    st1b            {z0.h}, p1, [x2]
-    add             x2, x2, x5
-    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
-    ret
-.vl_gt_32_addAvg_16x\h\():
-    mov             x10, #48
-    mov             x11, #0
-    whilelt         p0.b, x11, x10
-.Loop_gt_32_sve2_addavg_16x\h\():
-    sub             w12, w12, #1
-    ld1b            {z0.b}, p0/z, [x0]
-    add             x0, x0, x3, lsl #1
-    add             x1, x1, x4, lsl #1
-    add             z0.h, p0/m, z0.h, z1.h
-    sqrshrnb        z0.b, z0.h, #7
-    add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .Loop_gt_32_sve2_addavg_16x\h
+    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
     ret
 endfunc
 .endm
@@ -674,9 +656,9 @@ function PFX(addAvg_32x\h\()_sve2)
     add             z0.h, p0/m, z0.h, z2.h
     add             z1.h, p0/m, z1.h, z3.h
     sqrshrnb        z0.b, z0.h, #7
-    add             z1.b, z1.b, #0x80
-    sqrshrnb        z1.b, z1.h, #7
     add             z0.b, z0.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z1.b, z1.b, #0x80
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p0, [x2, #1, mul vl]
     add             x2, x2, x5
-- 
2.34.1
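For reference (not part of the patch): a scalar C model of the per-lane
arithmetic the corrected 32xh sequence performs, narrow with rounding
and saturation first, then add the 0x80 offset back to the unsigned
range. The function name and standalone form are made up for
illustration; the constants mirror the instructions in the diff.

    #include <stdint.h>

    /* Models one output byte: sqrshrnb z.b, z.h, #7 followed by
     * add z.b, z.b, #0x80. Illustrative only, not x265 reference code. */
    static uint8_t narrow_then_offset(int16_t sum)
    {
        int32_t r = (sum + 64) >> 7;   /* rounded arithmetic shift right by 7 */
        if (r > 127)  r = 127;         /* signed saturation to the int8 range */
        if (r < -128) r = -128;
        return (uint8_t)(r + 0x80);    /* shift back to the unsigned range */
    }

With the pre-patch ordering, the add z1.b, z1.b, #0x80 ran while z1
still held the 16-bit sums, so it altered the inputs of the following
sqrshrnb instead of offsetting its narrowed output; the reordered
sequence matches the model above.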
