The existing filterPixelToShort_48x64 high bit-depth code for vectors of
at least 256 bits uses predication to ensure that only the low 256 bits
of the vector are operated on; however, the address arithmetic is
performed relative to the vector length, so for vectors of 512 bits or
longer this is incorrect. Since we are operating on the fixed low 256
bits of the vectors here, fix the code by hard-coding the address offset
to multiples of 32 bytes.
---
source/common/aarch64/p2s-sve.S | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/source/common/aarch64/p2s-sve.S b/source/common/aarch64/p2s-sve.S
index 85bb14b3d..11e63ddab 100644
--- a/source/common/aarch64/p2s-sve.S
+++ b/source/common/aarch64/p2s-sve.S
@@ -401,10 +401,12 @@ function PFX(filterPixelToShort_48x64_sve)
ret
.vl_gt_16_filterPixelToShort_high_48x64:
ptrue p0.h, vl16
+ mov x4, #16
+ mov x5, #32
.rept 64
ld1h {z0.h}, p0/z, [x0]
- ld1h {z1.h}, p0/z, [x0, #1, mul vl]
- ld1h {z2.h}, p0/z, [x0, #2, mul vl]
+ ld1h {z1.h}, p0/z, [x0, x4, lsl #1]
+ ld1h {z2.h}, p0/z, [x0, x5, lsl #1]
add x0, x0, x1
lsl z0.h, p0/m, z0.h, #P2S_SHIFT
lsl z1.h, p0/m, z1.h, #P2S_SHIFT
@@ -413,8 +415,8 @@ function PFX(filterPixelToShort_48x64_sve)
add z1.h, p0/m, z1.h, z31.h
add z2.h, p0/m, z2.h, z31.h
st1h {z0.h}, p0, [x2]
- st1h {z1.h}, p0, [x2, #1, mul vl]
- st1h {z2.h}, p0, [x2, #2, mul vl]
+ st1h {z1.h}, p0, [x2, x4, lsl #1]
+ st1h {z2.h}, p0, [x2, x5, lsl #1]
add x2, x2, x3
.endr
ret
--
2.34.1
From 9b2d75b095b847bca9b0ad315132ff17e96b1506 Mon Sep 17 00:00:00 2001
From: George Steed <[email protected]>
Date: Sun, 9 Mar 2025 18:03:36 +0000
Subject: [PATCH] p2s-sve.S: Fix filterPixelToShort_48x64 for longer SVE
vectors
The existing filterPixelToShort_48x64 high bit-depth code for vectors of
at least 256 bits uses predication to ensure that only the low 256 bits
of the vector are operated on; however, the address arithmetic is
performed relative to the vector length, so for vectors of 512 bits or
longer this is incorrect. Since we are operating on the fixed low 256
bits of the vectors here, fix the code by hard-coding the address offset
to multiples of 32 bytes.
---
source/common/aarch64/p2s-sve.S | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/source/common/aarch64/p2s-sve.S b/source/common/aarch64/p2s-sve.S
index 85bb14b3d..11e63ddab 100644
--- a/source/common/aarch64/p2s-sve.S
+++ b/source/common/aarch64/p2s-sve.S
@@ -401,10 +401,12 @@ function PFX(filterPixelToShort_48x64_sve)
ret
.vl_gt_16_filterPixelToShort_high_48x64:
ptrue p0.h, vl16
+ mov x4, #16
+ mov x5, #32
.rept 64
ld1h {z0.h}, p0/z, [x0]
- ld1h {z1.h}, p0/z, [x0, #1, mul vl]
- ld1h {z2.h}, p0/z, [x0, #2, mul vl]
+ ld1h {z1.h}, p0/z, [x0, x4, lsl #1]
+ ld1h {z2.h}, p0/z, [x0, x5, lsl #1]
add x0, x0, x1
lsl z0.h, p0/m, z0.h, #P2S_SHIFT
lsl z1.h, p0/m, z1.h, #P2S_SHIFT
@@ -413,8 +415,8 @@ function PFX(filterPixelToShort_48x64_sve)
add z1.h, p0/m, z1.h, z31.h
add z2.h, p0/m, z2.h, z31.h
st1h {z0.h}, p0, [x2]
- st1h {z1.h}, p0, [x2, #1, mul vl]
- st1h {z2.h}, p0, [x2, #2, mul vl]
+ st1h {z1.h}, p0, [x2, x4, lsl #1]
+ st1h {z2.h}, p0, [x2, x5, lsl #1]
add x2, x2, x3
.endr
ret
--
2.34.1
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel