The existing code for vectors of at least 1024 bits uses predication to
ensure that only the low 1024 bits of the vector are operated on.
However, the address arithmetic is performed relative to the vector
length, so for 2048-bit vectors the offset of the second load is
incorrect. Since we are operating on the fixed low 1024 bits of the
vectors here (32 bytes, each widened to a 32-bit element), hard-code the
address offset to 32 bytes to fix this.
---
source/common/aarch64/pixel-util-sve2.S | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index 5b872f437..c7ff0b35e 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -1339,12 +1339,13 @@ function PFX(ssimDist64_sve2)
.vl_gt_112_ssimDist64:
ssimDist_start_sve2
ptrue p0.s, vl32
+ mov x5, #32
.vl_gt_112_loop_ssimDist64_sve2:
sub w12, w12, #1
ld1b {z2.s}, p0/z, [x0]
- ld1b {z3.s}, p0/z, [x0, #1, mul vl]
+ ld1b {z3.s}, p0/z, [x0, x5]
ld1b {z23.s}, p0/z, [x2]
- ld1b {z24.s}, p0/z, [x2, #1, mul vl]
+ ld1b {z24.s}, p0/z, [x2, x5]
ssimDist_1_sve2 z2, z3, z23, z24
add x0, x0, x1
add x2, x2, x3
@@ -1596,10 +1597,11 @@ function PFX(normFact64_sve2)
.vl_gt_112_normFact64:
normFact_start_sve2
ptrue p0.s, vl32
+ mov x4, #32
.vl_gt_112_loop_normFact64_sve2:
sub w12, w12, #1
ld1b {z4.s}, p0/z, [x0]
- ld1b {z5.s}, p0/z, [x0, #1, mul vl]
+ ld1b {z5.s}, p0/z, [x0, x4]
normFact_1_sve2 z4, z5
add x0, x0, x1
cbnz w12, .vl_gt_112_loop_normFact64_sve2
--
2.34.1
>From 361042d076bb8dab0f874ed9731c5c2371d6c469 Mon Sep 17 00:00:00 2001
Message-Id: <361042d076bb8dab0f874ed9731c5c2371d6c469.1736179734.git.george.st...@arm.com>
In-Reply-To: <[email protected]>
References: <[email protected]>
From: George Steed <[email protected]>
Date: Mon, 23 Dec 2024 11:05:46 +0000
Subject: [PATCH 6/6] pixel-util-sve2.S: Fix normFact/ssimDist64 for longer SVE vectors
The existing code for vectors of at least 1024 bits uses predication to
ensure that only the low 1024 bits of the vector are operated on.
However, the address arithmetic is performed relative to the vector
length, so for 2048-bit vectors the offset of the second load is
incorrect. Since we are operating on the fixed low 1024 bits of the
vectors here (32 bytes, each widened to a 32-bit element), hard-code the
address offset to 32 bytes to fix this.
---
source/common/aarch64/pixel-util-sve2.S | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index 5b872f437..c7ff0b35e 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -1339,12 +1339,13 @@ function PFX(ssimDist64_sve2)
.vl_gt_112_ssimDist64:
ssimDist_start_sve2
ptrue p0.s, vl32
+ mov x5, #32
.vl_gt_112_loop_ssimDist64_sve2:
sub w12, w12, #1
ld1b {z2.s}, p0/z, [x0]
- ld1b {z3.s}, p0/z, [x0, #1, mul vl]
+ ld1b {z3.s}, p0/z, [x0, x5]
ld1b {z23.s}, p0/z, [x2]
- ld1b {z24.s}, p0/z, [x2, #1, mul vl]
+ ld1b {z24.s}, p0/z, [x2, x5]
ssimDist_1_sve2 z2, z3, z23, z24
add x0, x0, x1
add x2, x2, x3
@@ -1596,10 +1597,11 @@ function PFX(normFact64_sve2)
.vl_gt_112_normFact64:
normFact_start_sve2
ptrue p0.s, vl32
+ mov x4, #32
.vl_gt_112_loop_normFact64_sve2:
sub w12, w12, #1
ld1b {z4.s}, p0/z, [x0]
- ld1b {z5.s}, p0/z, [x0, #1, mul vl]
+ ld1b {z5.s}, p0/z, [x0, x4]
normFact_1_sve2 z4, z5
add x0, x0, x1
cbnz w12, .vl_gt_112_loop_normFact64_sve2
--
2.34.1
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel