[x265] [PATCH 6/6] pixel-util-sve2.S: Fix normFact/ssimDist64 for longer SVE vectors

George Steed Mon, 06 Jan 2025 09:18:22 -0800

The existing code for vectors of at least 1024 bits uses predication to
ensure that only the low 1024 bits of the vector are operated on.
However, the address arithmetic is performed relative to the vector
length, so for 2048-bit vectors this is incorrect. Since we are operating
on the fixed low 1024 bits of the vectors here, hard-code the address
offset to 32 bytes to fix this.
---
 source/common/aarch64/pixel-util-sve2.S | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)


diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index 5b872f437..c7ff0b35e 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -1339,12 +1339,13 @@ function PFX(ssimDist64_sve2)
 .vl_gt_112_ssimDist64:
     ssimDist_start_sve2
     ptrue           p0.s, vl32
+    mov             x5, #32
 .vl_gt_112_loop_ssimDist64_sve2:
     sub             w12, w12, #1
     ld1b            {z2.s}, p0/z, [x0]
-    ld1b            {z3.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z3.s}, p0/z, [x0, x5]
     ld1b            {z23.s}, p0/z, [x2]
-    ld1b            {z24.s}, p0/z, [x2, #1, mul vl]
+    ld1b            {z24.s}, p0/z, [x2, x5]
     ssimDist_1_sve2 z2, z3, z23, z24
     add             x0, x0, x1
     add             x2, x2, x3
@@ -1596,10 +1597,11 @@ function PFX(normFact64_sve2)
 .vl_gt_112_normFact64:
     normFact_start_sve2
     ptrue           p0.s, vl32
+    mov             x4, #32
 .vl_gt_112_loop_normFact64_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
-    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z5.s}, p0/z, [x0, x4]
     normFact_1_sve2 z4, z5
     add             x0, x0, x1
     cbnz            w12, .vl_gt_112_loop_normFact64_sve2
-- 
2.34.1

From 361042d076bb8dab0f874ed9731c5c2371d6c469 Mon Sep 17 00:00:00 2001
Message-Id: <361042d076bb8dab0f874ed9731c5c2371d6c469.1736179734.git.george.st...@arm.com>
In-Reply-To: <[email protected]>
References: <[email protected]>
From: George Steed <[email protected]>
Date: Mon, 23 Dec 2024 11:05:46 +0000
Subject: [PATCH 6/6] pixel-util-sve2.S: Fix normFact/ssimDist64 for longer SVE
 vectors

The existing code for vectors of at least 1024 bits uses predication to
ensure that only the low 1024 bits of the vector are operated on.
However, the address arithmetic is performed relative to the vector
length, so for 2048-bit vectors this is incorrect. Since we are operating
on the fixed low 1024 bits of the vectors here, hard-code the address
offset to 32 bytes to fix this.
---
 source/common/aarch64/pixel-util-sve2.S | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index 5b872f437..c7ff0b35e 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -1339,12 +1339,13 @@ function PFX(ssimDist64_sve2)
 .vl_gt_112_ssimDist64:
     ssimDist_start_sve2
     ptrue           p0.s, vl32
+    mov             x5, #32
 .vl_gt_112_loop_ssimDist64_sve2:
     sub             w12, w12, #1
     ld1b            {z2.s}, p0/z, [x0]
-    ld1b            {z3.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z3.s}, p0/z, [x0, x5]
     ld1b            {z23.s}, p0/z, [x2]
-    ld1b            {z24.s}, p0/z, [x2, #1, mul vl]
+    ld1b            {z24.s}, p0/z, [x2, x5]
     ssimDist_1_sve2 z2, z3, z23, z24
     add             x0, x0, x1
     add             x2, x2, x3
@@ -1596,10 +1597,11 @@ function PFX(normFact64_sve2)
 .vl_gt_112_normFact64:
     normFact_start_sve2
     ptrue           p0.s, vl32
+    mov             x4, #32
 .vl_gt_112_loop_normFact64_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
-    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z5.s}, p0/z, [x0, x4]
     normFact_1_sve2 z4, z5
     add             x0, x0, x1
     cbnz            w12, .vl_gt_112_loop_normFact64_sve2
-- 
2.34.1

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 6/6] pixel-util-sve2.S: Fix normFact/ssimDist64 for longer SVE vectors

Reply via email to