Hi Hari Limaye,



Thank you for fixing the AArch64 build issues; these 12 patches look good to me.




Regards,

Chen

At 2024-05-03 05:19:36, "Hari Limaye" <hari.lim...@arm.com> wrote:
>The assembly routine x265_costCoeffNxN_neon is buggy and produces an
>incorrect result on Apple Silicon, causing the pixel testbench to fail
>on these platforms.
>
>x265_costCoeffNxN assumes that parameter `int subPosBase`, the second
>parameter of type `int` passed on the stack, is at position `sp + 8`;
>this assumption is consistent with the AArch64 PCS, as arguments smaller
>than 8 bytes are widened to 8 bytes (aapcs64 6.8.2 C.16).
>However arm64e diverges from AAPCS64: 'Function arguments may consume
>slots on the stack that are not multiples of 8 bytes'.
>---
> source/common/aarch64/asm.S        | 12 +++++++++++-
> source/common/aarch64/pixel-util.S |  4 ++--
> 2 files changed, 13 insertions(+), 3 deletions(-)
>
>diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S
>index ce0668103..742978631 100644
>--- a/source/common/aarch64/asm.S
>+++ b/source/common/aarch64/asm.S
>@@ -72,6 +72,16 @@
>
> #define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
>
>+// Alignment of stack arguments of size less than 8 bytes.
>+#ifdef __APPLE__
>+#define STACK_ARG_ALIGNMENT 4
>+#else
>+#define STACK_ARG_ALIGNMENT 8
>+#endif
>+
>+// Get offset from SP of stack argument at index `idx`.
>+#define STACK_ARG_OFFSET(idx) (idx * STACK_ARG_ALIGNMENT)
>+
> #ifdef __APPLE__
> .macro endfunc
> ELF .size \name, . - \name
>@@ -184,4 +194,4 @@ ELF     .size   \name, . - \name
>     vtrn            \t3, \t4, \s3, \s4
> .endm
>
>-#endif
>\ No newline at end of file
>+#endif
>diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
>index 9b3c11504..378c6891c 100644
>--- a/source/common/aarch64/pixel-util.S
>+++ b/source/common/aarch64/pixel-util.S
>@@ -2311,7 +2311,7 @@ endfunc
> //    uint8_t *baseCtx,      // x6
> //    int offset,            // x7
> //    int scanPosSigOff,     // sp
>-//    int subPosBase)        // sp + 8
>+//    int subPosBase)        // sp + 8, or sp + 4 on APPLE
> function PFX(costCoeffNxN_neon)
>     // abs(coeff)
>     add             x2, x2, x2
>@@ -2410,7 +2410,7 @@ function PFX(costCoeffNxN_neon)
>     add             x4, x4, x15
>    str             h2, [x13]              // absCoeff[numNonZero] = tmpCoeff[blkPos]
>
>-    ldr             x9, [sp, #8]           // subPosBase
>+    ldr             x9, [sp, #STACK_ARG_OFFSET(1)]           // subPosBase
>     uxth            w9, w9
>     cmp             w9, #0
>     cset            x2, eq
>--
>2.42.1
>
>IMPORTANT NOTICE: The contents of this email and any attachments are 
>confidential and may also be privileged. If you are not the intended 
>recipient, please notify the sender immediately and do not disclose the 
>contents to any other person, use it for any purpose, or store or copy the 
>information in any medium. Thank you.
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to