The branch main has been updated by kevans:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=42f0ac5f1bd2bb5c779ce51c369a0e47c62cbf9b

commit 42f0ac5f1bd2bb5c779ce51c369a0e47c62cbf9b
Author:     Tino Reichardt <[email protected]>
AuthorDate: 2023-04-26 19:40:26 +0000
Commit:     Kyle Evans <[email protected]>
CommitDate: 2023-04-27 00:46:47 +0000

    Fix BLAKE3 aarch64 assembly for FreeBSD and macOS
    
    The x18 register isn't usable within FreeBSD kernel space, so the
    BLAKE3 aarch64 assembly has to be changed to avoid using it.
    
    The source files are here: https://github.com/mcmilk/BLAKE3-tests
    
    Reviewed-by: Kyle Evans <[email protected]>
    Signed-off-by: Tino Reichardt <[email protected]>
    Closes #14728
---
 .../icp/asm-aarch64/blake3/b3_aarch64_sse2.S       | 4163 +++++++++---------
 .../icp/asm-aarch64/blake3/b3_aarch64_sse41.S      | 4447 ++++++++++----------
 2 files changed, 4078 insertions(+), 4532 deletions(-)

diff --git a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
index 8237f0eb5a4e..dc2719d142db 100644
--- a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
+++ b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
@@ -22,480 +22,61 @@
 /*
  * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
  * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
- * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * Copyright (c) 2022-2023 Tino Reichardt <[email protected]>
  *
  * This is converted assembly: SSE2 -> ARMv8-A
  * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ *
+ * Should work on FreeBSD, Linux and macOS
+ * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh
  */
 
 #if defined(__aarch64__)
        .text
-       .section        .rodata.cst16,"aM",@progbits,16
-       .p2align        4
-.LCPI0_0:
-       .word   1779033703
-       .word   3144134277
-       .word   1013904242
-       .word   2773480762
-.LCPI0_1:
-       .xword  0
-       .xword  -4294967296
-.LCPI0_2:
-       .xword  -1
-       .xword  4294967295
+       .section        .note.gnu.property,"a",@note
+       .p2align        3
+       .word   4
+       .word   16
+       .word   5
+       .asciz  "GNU"
+       .word   3221225472
+       .word   4
+       .word   3
+       .word   0
+.Lsec_end0:
        .text
        .globl  zfs_blake3_compress_in_place_sse2
        .p2align        2
        .type   zfs_blake3_compress_in_place_sse2,@function
 zfs_blake3_compress_in_place_sse2:
        .cfi_startproc
-       ldp     q3, q2, [x0]
-       ldp     q5, q6, [x1]
-       add     x10, x1, #32
-       lsr     x11, x3, #32
-       fmov    s4, w3
-       ld2     { v17.4s, v18.4s }, [x10]
-       adrp    x10, .LCPI0_2
-       and     w8, w2, #0xff
-       mov     v4.s[1], w11
-       ldr     q1, [x10, :lo12:.LCPI0_2]
-       and     w9, w4, #0xff
-       adrp    x12, .LCPI0_0
-       mov     v4.s[2], w8
-       uzp1    v19.4s, v5.4s, v6.4s
-       add     v3.4s, v2.4s, v3.4s
-       ldr     q7, [x12, :lo12:.LCPI0_0]
-       mov     v4.s[3], w9
-       add     v3.4s, v3.4s, v19.4s
-       uzp2    v5.4s, v5.4s, v6.4s
-       ext     v21.16b, v18.16b, v18.16b, #12
-       uzp1    v6.4s, v19.4s, v19.4s
-       ext     v22.16b, v19.16b, v19.16b, #12
-       eor     v4.16b, v3.16b, v4.16b
-       ext     v20.16b, v17.16b, v17.16b, #12
-       ext     v6.16b, v6.16b, v19.16b, #8
-       ext     v19.16b, v19.16b, v22.16b, #12
-       zip1    v22.2d, v21.2d, v5.2d
-       rev32   v24.8h, v4.8h
-       mov     v4.16b, v1.16b
-       zip2    v23.4s, v5.4s, v21.4s
-       uzp2    v6.4s, v6.4s, v5.4s
-       bsl     v4.16b, v22.16b, v20.16b
-       add     v3.4s, v3.4s, v5.4s
-       zip1    v5.4s, v23.4s, v20.4s
-       zip1    v22.4s, v20.4s, v23.4s
-       add     v23.4s, v24.4s, v7.4s
-       ext     v7.16b, v6.16b, v6.16b, #4
-       ext     v25.16b, v4.16b, v4.16b, #12
-       ext     v5.16b, v22.16b, v5.16b, #8
-       eor     v2.16b, v23.16b, v2.16b
-       uzp1    v4.4s, v4.4s, v25.4s
-       uzp1    v22.4s, v7.4s, v7.4s
-       ext     v25.16b, v7.16b, v7.16b, #12
-       ext     v22.16b, v22.16b, v7.16b, #8
-       ext     v7.16b, v7.16b, v25.16b, #12
-       ushr    v25.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       orr     v2.16b, v2.16b, v25.16b
-       add     v3.4s, v3.4s, v2.4s
-       eor     v24.16b, v3.16b, v24.16b
-       add     v3.4s, v3.4s, v17.4s
-       ushr    v17.4s, v24.4s, #8
-       shl     v18.4s, v24.4s, #24
-       orr     v17.16b, v18.16b, v17.16b
-       add     v18.4s, v17.4s, v23.4s
-       eor     v2.16b, v18.16b, v2.16b
-       ushr    v23.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       ext     v3.16b, v3.16b, v3.16b, #12
-       orr     v2.16b, v2.16b, v23.16b
-       ext     v17.16b, v17.16b, v17.16b, #8
-       add     v3.4s, v2.4s, v3.4s
-       adrp    x11, .LCPI0_1
-       eor     v17.16b, v3.16b, v17.16b
-       ldr     q16, [x11, :lo12:.LCPI0_1]
-       ext     v18.16b, v18.16b, v18.16b, #4
-       rev32   v24.8h, v17.8h
-       movi    v0.2d, #0xffffffff00000000
-       add     v23.4s, v3.4s, v21.4s
-       mov     v21.s[1], v20.s[2]
-       add     v20.4s, v18.4s, v24.4s
-       bit     v19.16b, v21.16b, v0.16b
-       eor     v3.16b, v20.16b, v2.16b
-       uzp2    v2.4s, v22.4s, v19.4s
-       zip1    v17.2d, v5.2d, v19.2d
-       zip2    v18.4s, v19.4s, v5.4s
-       ushr    v21.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       ext     v22.16b, v2.16b, v2.16b, #4
-       bsl     v16.16b, v4.16b, v17.16b
-       zip1    v17.4s, v18.4s, v4.4s
-       zip1    v18.4s, v4.4s, v18.4s
-       orr     v21.16b, v3.16b, v21.16b
-       ext     v25.16b, v16.16b, v16.16b, #12
-       ext     v3.16b, v18.16b, v17.16b, #8
-       uzp1    v18.4s, v22.4s, v22.4s
-       ext     v26.16b, v22.16b, v22.16b, #12
-       add     v23.4s, v23.4s, v21.4s
-       uzp1    v17.4s, v16.4s, v25.4s
-       ext     v16.16b, v18.16b, v22.16b, #8
-       ext     v18.16b, v22.16b, v26.16b, #12
-       eor     v22.16b, v23.16b, v24.16b
-       add     v6.4s, v23.4s, v6.4s
-       ushr    v23.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       orr     v22.16b, v22.16b, v23.16b
-       add     v20.4s, v22.4s, v20.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ushr    v23.4s, v21.4s, #7
-       shl     v21.4s, v21.4s, #25
-       ext     v6.16b, v6.16b, v6.16b, #4
-       orr     v21.16b, v21.16b, v23.16b
-       ext     v22.16b, v22.16b, v22.16b, #8
-       add     v6.4s, v21.4s, v6.4s
-       eor     v22.16b, v6.16b, v22.16b
-       ext     v20.16b, v20.16b, v20.16b, #12
-       add     v6.4s, v6.4s, v19.4s
-       rev32   v19.8h, v22.8h
-       add     v20.4s, v20.4s, v19.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ushr    v22.4s, v21.4s, #12
-       shl     v21.4s, v21.4s, #20
-       orr     v21.16b, v21.16b, v22.16b
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       ushr    v22.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v19.16b, v19.16b, v22.16b
-       add     v20.4s, v19.4s, v20.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       ushr    v22.4s, v21.4s, #7
-       shl     v21.4s, v21.4s, #25
-       add     v6.4s, v6.4s, v4.4s
-       orr     v21.16b, v21.16b, v22.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       ext     v20.16b, v20.16b, v20.16b, #4
-       rev32   v19.8h, v19.8h
-       add     v20.4s, v20.4s, v19.4s
-       add     v6.4s, v6.4s, v5.4s
-       mov     v5.s[1], v4.s[2]
-       eor     v4.16b, v20.16b, v21.16b
-       ushr    v21.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       orr     v21.16b, v4.16b, v21.16b
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       add     v2.4s, v6.4s, v2.4s
-       ushr    v6.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v6.16b, v19.16b, v6.16b
-       add     v19.4s, v6.4s, v20.4s
-       eor     v20.16b, v19.16b, v21.16b
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v2.4s, v20.4s, v2.4s
-       eor     v6.16b, v2.16b, v6.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       rev32   v6.8h, v6.8h
-       add     v19.4s, v19.4s, v6.4s
-       mov     v22.16b, v0.16b
-       eor     v20.16b, v19.16b, v20.16b
-       bsl     v22.16b, v5.16b, v7.16b
-       ushr    v21.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       add     v2.4s, v2.4s, v22.4s
-       orr     v20.16b, v20.16b, v21.16b
-       add     v2.4s, v2.4s, v20.4s
-       eor     v6.16b, v2.16b, v6.16b
-       ushr    v21.4s, v6.4s, #8
-       shl     v6.4s, v6.4s, #24
-       orr     v6.16b, v6.16b, v21.16b
-       add     v19.4s, v6.4s, v19.4s
-       eor     v20.16b, v19.16b, v20.16b
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       add     v2.4s, v2.4s, v17.4s
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v2.4s, v2.4s, v20.4s
-       eor     v6.16b, v2.16b, v6.16b
-       uzp2    v5.4s, v16.4s, v22.4s
-       zip1    v7.2d, v3.2d, v22.2d
-       zip2    v16.4s, v22.4s, v3.4s
-       ext     v19.16b, v19.16b, v19.16b, #4
-       rev32   v22.8h, v6.8h
-       ext     v23.16b, v5.16b, v5.16b, #4
-       bif     v7.16b, v17.16b, v1.16b
-       zip1    v24.4s, v16.4s, v17.4s
-       zip1    v16.4s, v17.4s, v16.4s
-       add     v21.4s, v2.4s, v3.4s
-       mov     v3.s[1], v17.s[2]
-       add     v17.4s, v19.4s, v22.4s
-       mov     v19.16b, v0.16b
-       ext     v25.16b, v7.16b, v7.16b, #12
-       ext     v4.16b, v16.16b, v24.16b, #8
-       uzp1    v16.4s, v23.4s, v23.4s
-       bsl     v19.16b, v3.16b, v18.16b
-       eor     v2.16b, v17.16b, v20.16b
-       uzp1    v7.4s, v7.4s, v25.4s
-       ext     v25.16b, v16.16b, v23.16b, #8
-       zip1    v3.2d, v4.2d, v19.2d
-       ushr    v20.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       ext     v24.16b, v23.16b, v23.16b, #12
-       uzp2    v6.4s, v25.4s, v19.4s
-       zip2    v18.4s, v19.4s, v4.4s
-       bif     v3.16b, v7.16b, v1.16b
-       orr     v20.16b, v2.16b, v20.16b
-       ext     v16.16b, v23.16b, v24.16b, #12
-       ext     v23.16b, v6.16b, v6.16b, #4
-       zip1    v24.4s, v18.4s, v7.4s
-       zip1    v18.4s, v7.4s, v18.4s
-       ext     v25.16b, v3.16b, v3.16b, #12
-       add     v21.4s, v21.4s, v20.4s
-       ext     v2.16b, v18.16b, v24.16b, #8
-       uzp1    v18.4s, v23.4s, v23.4s
-       ext     v24.16b, v23.16b, v23.16b, #12
-       uzp1    v3.4s, v3.4s, v25.4s
-       eor     v22.16b, v21.16b, v22.16b
-       ext     v25.16b, v18.16b, v23.16b, #8
-       dup     v18.4s, v2.s[3]
-       ext     v23.16b, v23.16b, v24.16b, #12
-       add     v5.4s, v21.4s, v5.4s
-       trn1    v21.4s, v3.4s, v3.4s
-       ushr    v24.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       ext     v18.16b, v21.16b, v18.16b, #8
-       orr     v21.16b, v22.16b, v24.16b
-       add     v17.4s, v21.4s, v17.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ushr    v22.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       ext     v5.16b, v5.16b, v5.16b, #4
-       orr     v20.16b, v20.16b, v22.16b
-       ext     v21.16b, v21.16b, v21.16b, #8
-       add     v5.4s, v20.4s, v5.4s
-       eor     v21.16b, v5.16b, v21.16b
-       ext     v17.16b, v17.16b, v17.16b, #12
-       add     v5.4s, v5.4s, v19.4s
-       rev32   v19.8h, v21.8h
-       add     v17.4s, v17.4s, v19.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ushr    v21.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       orr     v20.16b, v20.16b, v21.16b
-       add     v5.4s, v5.4s, v20.4s
-       eor     v19.16b, v5.16b, v19.16b
-       ushr    v21.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v19.16b, v19.16b, v21.16b
-       add     v17.4s, v19.4s, v17.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ext     v5.16b, v5.16b, v5.16b, #12
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       add     v5.4s, v5.4s, v7.4s
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v5.4s, v5.4s, v20.4s
-       eor     v19.16b, v5.16b, v19.16b
-       ext     v17.16b, v17.16b, v17.16b, #4
-       rev32   v22.8h, v19.8h
-       add     v21.4s, v5.4s, v4.4s
-       mov     v4.s[1], v7.s[2]
-       add     v19.4s, v17.4s, v22.4s
-       bit     v16.16b, v4.16b, v0.16b
-       eor     v5.16b, v19.16b, v20.16b
-       uzp2    v4.4s, v25.4s, v16.4s
-       zip1    v7.2d, v2.2d, v16.2d
-       zip2    v17.4s, v16.4s, v2.4s
-       ushr    v20.4s, v5.4s, #12
-       shl     v5.4s, v5.4s, #20
-       ext     v24.16b, v4.16b, v4.16b, #4
-       bif     v7.16b, v3.16b, v1.16b
-       zip1    v25.4s, v17.4s, v3.4s
-       zip1    v17.4s, v3.4s, v17.4s
-       orr     v20.16b, v5.16b, v20.16b
-       ext     v26.16b, v7.16b, v7.16b, #12
-       ext     v5.16b, v17.16b, v25.16b, #8
-       uzp1    v17.4s, v24.4s, v24.4s
-       ext     v25.16b, v24.16b, v24.16b, #12
-       bit     v23.16b, v18.16b, v0.16b
-       add     v21.4s, v21.4s, v20.4s
-       uzp1    v7.4s, v7.4s, v26.4s
-       ext     v26.16b, v17.16b, v24.16b, #8
-       ext     v17.16b, v24.16b, v25.16b, #12
-       eor     v22.16b, v21.16b, v22.16b
-       add     v6.4s, v21.4s, v6.4s
-       zip1    v21.2d, v5.2d, v23.2d
-       zip2    v24.4s, v23.4s, v5.4s
-       bif     v21.16b, v7.16b, v1.16b
-       zip1    v1.4s, v24.4s, v7.4s
-       zip1    v24.4s, v7.4s, v24.4s
-       ext     v1.16b, v24.16b, v1.16b, #8
-       ushr    v24.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       orr     v22.16b, v22.16b, v24.16b
-       add     v19.4s, v22.4s, v19.4s
-       ext     v24.16b, v21.16b, v21.16b, #12
-       eor     v20.16b, v19.16b, v20.16b
-       uzp1    v21.4s, v21.4s, v24.4s
-       ushr    v24.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       orr     v20.16b, v20.16b, v24.16b
-       ext     v6.16b, v6.16b, v6.16b, #4
-       ext     v22.16b, v22.16b, v22.16b, #8
-       add     v6.4s, v20.4s, v6.4s
-       eor     v22.16b, v6.16b, v22.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       add     v6.4s, v6.4s, v16.4s
-       rev32   v16.8h, v22.8h
-       add     v19.4s, v19.4s, v16.4s
-       eor     v20.16b, v19.16b, v20.16b
-       ushr    v22.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       orr     v20.16b, v20.16b, v22.16b
-       add     v6.4s, v6.4s, v20.4s
-       eor     v16.16b, v6.16b, v16.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       add     v3.4s, v6.4s, v3.4s
-       ushr    v6.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       orr     v6.16b, v16.16b, v6.16b
-       add     v16.4s, v6.4s, v19.4s
-       eor     v19.16b, v16.16b, v20.16b
-       ushr    v20.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       orr     v19.16b, v19.16b, v20.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v3.4s, v3.4s, v19.4s
-       eor     v6.16b, v3.16b, v6.16b
-       ext     v16.16b, v16.16b, v16.16b, #4
-       add     v2.4s, v3.4s, v2.4s
-       rev32   v3.8h, v6.8h
-       add     v6.4s, v16.4s, v3.4s
-       eor     v16.16b, v6.16b, v19.16b
-       ushr    v19.4s, v16.4s, #12
-       shl     v16.4s, v16.4s, #20
-       orr     v16.16b, v16.16b, v19.16b
-       add     v2.4s, v2.4s, v16.4s
-       eor     v3.16b, v2.16b, v3.16b
-       add     v2.4s, v2.4s, v4.4s
-       ushr    v4.4s, v3.4s, #8
-       shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v4.16b
-       add     v4.4s, v3.4s, v6.4s
-       eor     v6.16b, v4.16b, v16.16b
-       ushr    v16.4s, v6.4s, #7
-       shl     v6.4s, v6.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v6.16b, v6.16b, v16.16b
-       ext     v3.16b, v3.16b, v3.16b, #8
-       add     v2.4s, v6.4s, v2.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ext     v4.16b, v4.16b, v4.16b, #12
-       rev32   v3.8h, v3.8h
-       add     v4.4s, v4.4s, v3.4s
-       eor     v6.16b, v4.16b, v6.16b
-       ushr    v16.4s, v6.4s, #12
-       shl     v6.4s, v6.4s, #20
-       add     v2.4s, v2.4s, v23.4s
-       orr     v6.16b, v6.16b, v16.16b
-       add     v2.4s, v2.4s, v6.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ushr    v16.4s, v3.4s, #8
-       shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v16.16b
-       add     v4.4s, v3.4s, v4.4s
-       eor     v6.16b, v4.16b, v6.16b
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v16.4s, v6.4s, #7
-       shl     v6.4s, v6.4s, #25
-       add     v2.4s, v2.4s, v7.4s
-       orr     v6.16b, v6.16b, v16.16b
-       ext     v3.16b, v3.16b, v3.16b, #8
-       add     v2.4s, v2.4s, v6.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ext     v4.16b, v4.16b, v4.16b, #4
-       rev32   v3.8h, v3.8h
-       add     v2.4s, v2.4s, v5.4s
-       mov     v5.s[1], v7.s[2]
-       add     v4.4s, v4.4s, v3.4s
-       bsl     v0.16b, v5.16b, v17.16b
-       eor     v5.16b, v4.16b, v6.16b
-       ushr    v6.4s, v5.4s, #12
-       shl     v5.4s, v5.4s, #20
-       orr     v5.16b, v5.16b, v6.16b
-       add     v2.4s, v2.4s, v5.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ushr    v6.4s, v3.4s, #8
-       shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v6.16b
-       add     v4.4s, v3.4s, v4.4s
-       uzp2    v18.4s, v26.4s, v18.4s
-       eor     v5.16b, v4.16b, v5.16b
-       add     v2.4s, v2.4s, v18.4s
-       ushr    v6.4s, v5.4s, #7
-       shl     v5.4s, v5.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v5.16b, v5.16b, v6.16b
-       ext     v3.16b, v3.16b, v3.16b, #8
-       add     v2.4s, v5.4s, v2.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ext     v4.16b, v4.16b, v4.16b, #12
-       add     v0.4s, v2.4s, v0.4s
-       rev32   v2.8h, v3.8h
-       add     v3.4s, v4.4s, v2.4s
-       eor     v4.16b, v3.16b, v5.16b
-       ushr    v5.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       orr     v4.16b, v4.16b, v5.16b
-       add     v0.4s, v0.4s, v4.4s
-       eor     v2.16b, v0.16b, v2.16b
-       ushr    v5.4s, v2.4s, #8
-       shl     v2.4s, v2.4s, #24
-       orr     v2.16b, v2.16b, v5.16b
-       add     v3.4s, v2.4s, v3.4s
-       eor     v4.16b, v3.16b, v4.16b
-       ext     v0.16b, v0.16b, v0.16b, #12
-       ushr    v5.4s, v4.4s, #7
-       shl     v4.4s, v4.4s, #25
-       add     v0.4s, v0.4s, v21.4s
-       orr     v4.16b, v4.16b, v5.16b
-       ext     v2.16b, v2.16b, v2.16b, #8
-       add     v0.4s, v0.4s, v4.4s
-       eor     v2.16b, v0.16b, v2.16b
-       ext     v3.16b, v3.16b, v3.16b, #4
-       add     v0.4s, v0.4s, v1.4s
-       rev32   v1.8h, v2.8h
-       add     v2.4s, v3.4s, v1.4s
-       eor     v3.16b, v2.16b, v4.16b
-       ushr    v4.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       orr     v3.16b, v3.16b, v4.16b
-       add     v0.4s, v0.4s, v3.4s
-       eor     v1.16b, v0.16b, v1.16b
-       ushr    v4.4s, v1.4s, #8
-       shl     v1.4s, v1.4s, #24
-       orr     v1.16b, v1.16b, v4.16b
-       add     v2.4s, v1.4s, v2.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ext     v0.16b, v0.16b, v0.16b, #4
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v4.4s, v3.4s, #7
-       shl     v3.4s, v3.4s, #25
-       ext     v1.16b, v1.16b, v1.16b, #8
+       hint    #25
+       .cfi_negate_ra_state
+       sub     sp, sp, #96
+       stp     x29, x30, [sp, #64]
+       add     x29, sp, #64
+       str     x19, [sp, #80]
+       .cfi_def_cfa w29, 32
+       .cfi_offset w19, -16
+       .cfi_offset w30, -24
+       .cfi_offset w29, -32
+       mov     x19, x0
+       mov     w5, w4
+       mov     x4, x3
+       mov     w3, w2
+       mov     x2, x1
+       mov     x0, sp
+       mov     x1, x19
+       bl      compress_pre
+       ldp     q0, q1, [sp]
+       ldp     q2, q3, [sp, #32]
        eor     v0.16b, v2.16b, v0.16b
-       orr     v2.16b, v3.16b, v4.16b
-       eor     v1.16b, v2.16b, v1.16b
-       stp     q0, q1, [x0]
+       eor     v1.16b, v3.16b, v1.16b
+       ldp     x29, x30, [sp, #64]
+       stp     q0, q1, [x19]
+       ldr     x19, [sp, #80]
+       add     sp, sp, #96
+       hint    #29
        ret
 .Lfunc_end0:
        .size   zfs_blake3_compress_in_place_sse2, 
.Lfunc_end0-zfs_blake3_compress_in_place_sse2
@@ -504,483 +85,518 @@ zfs_blake3_compress_in_place_sse2:
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
 .LCPI1_0:
-       .word   1779033703
-       .word   3144134277
-       .word   1013904242
-       .word   2773480762
-.LCPI1_1:
-       .xword  0
-       .xword  -4294967296
-.LCPI1_2:
-       .xword  -1
-       .xword  4294967295
+       .xword  -4942790177982912921
+       .xword  -6534734903820487822
        .text
-       .globl  zfs_blake3_compress_xof_sse2
        .p2align        2
-       .type   zfs_blake3_compress_xof_sse2,@function
-zfs_blake3_compress_xof_sse2:
+       .type   compress_pre,@function
+compress_pre:
        .cfi_startproc
-       ldp     q3, q2, [x0]
-       ldp     q5, q6, [x1]
-       add     x10, x1, #32
-       lsr     x11, x3, #32
-       fmov    s4, w3
-       ld2     { v17.4s, v18.4s }, [x10]
-       adrp    x10, .LCPI1_2
-       and     w8, w2, #0xff
-       mov     v4.s[1], w11
-       ldr     q1, [x10, :lo12:.LCPI1_2]
-       and     w9, w4, #0xff
-       adrp    x12, .LCPI1_0
-       mov     v4.s[2], w8
-       uzp1    v19.4s, v5.4s, v6.4s
-       add     v3.4s, v2.4s, v3.4s
-       ldr     q7, [x12, :lo12:.LCPI1_0]
-       mov     v4.s[3], w9
-       add     v3.4s, v3.4s, v19.4s
-       uzp2    v5.4s, v5.4s, v6.4s
-       ext     v21.16b, v18.16b, v18.16b, #12
-       uzp1    v6.4s, v19.4s, v19.4s
-       ext     v22.16b, v19.16b, v19.16b, #12
-       eor     v4.16b, v3.16b, v4.16b
-       ext     v20.16b, v17.16b, v17.16b, #12
-       ext     v6.16b, v6.16b, v19.16b, #8
-       ext     v19.16b, v19.16b, v22.16b, #12
-       zip1    v22.2d, v21.2d, v5.2d
-       rev32   v24.8h, v4.8h
-       mov     v4.16b, v1.16b
-       zip2    v23.4s, v5.4s, v21.4s
-       uzp2    v6.4s, v6.4s, v5.4s
-       bsl     v4.16b, v22.16b, v20.16b
-       add     v3.4s, v3.4s, v5.4s
-       zip1    v5.4s, v23.4s, v20.4s
-       zip1    v22.4s, v20.4s, v23.4s
-       add     v23.4s, v24.4s, v7.4s
-       ext     v7.16b, v6.16b, v6.16b, #4
-       ext     v25.16b, v4.16b, v4.16b, #12
-       ext     v5.16b, v22.16b, v5.16b, #8
-       eor     v2.16b, v23.16b, v2.16b
-       uzp1    v4.4s, v4.4s, v25.4s
-       uzp1    v22.4s, v7.4s, v7.4s
-       ext     v25.16b, v7.16b, v7.16b, #12
-       ext     v22.16b, v22.16b, v7.16b, #8
-       ext     v7.16b, v7.16b, v25.16b, #12
-       ushr    v25.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       orr     v2.16b, v2.16b, v25.16b
-       add     v3.4s, v3.4s, v2.4s
-       eor     v24.16b, v3.16b, v24.16b
-       add     v3.4s, v3.4s, v17.4s
-       ushr    v17.4s, v24.4s, #8
-       shl     v18.4s, v24.4s, #24
-       orr     v17.16b, v18.16b, v17.16b
-       add     v18.4s, v17.4s, v23.4s
-       eor     v2.16b, v18.16b, v2.16b
-       ushr    v23.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       ext     v3.16b, v3.16b, v3.16b, #12
-       orr     v2.16b, v2.16b, v23.16b
-       ext     v17.16b, v17.16b, v17.16b, #8
-       add     v3.4s, v2.4s, v3.4s
-       adrp    x11, .LCPI1_1
-       eor     v17.16b, v3.16b, v17.16b
-       ldr     q16, [x11, :lo12:.LCPI1_1]
-       ext     v18.16b, v18.16b, v18.16b, #4
-       rev32   v24.8h, v17.8h
-       movi    v0.2d, #0xffffffff00000000
-       add     v23.4s, v3.4s, v21.4s
-       mov     v21.s[1], v20.s[2]
-       add     v20.4s, v18.4s, v24.4s
-       bit     v19.16b, v21.16b, v0.16b
-       eor     v3.16b, v20.16b, v2.16b
-       uzp2    v2.4s, v22.4s, v19.4s
-       zip1    v17.2d, v5.2d, v19.2d
-       zip2    v18.4s, v19.4s, v5.4s
-       ushr    v21.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       ext     v22.16b, v2.16b, v2.16b, #4
-       bsl     v16.16b, v4.16b, v17.16b
-       zip1    v17.4s, v18.4s, v4.4s
-       zip1    v18.4s, v4.4s, v18.4s
-       orr     v21.16b, v3.16b, v21.16b
-       ext     v25.16b, v16.16b, v16.16b, #12
-       ext     v3.16b, v18.16b, v17.16b, #8
-       uzp1    v18.4s, v22.4s, v22.4s
-       ext     v26.16b, v22.16b, v22.16b, #12
-       add     v23.4s, v23.4s, v21.4s
-       uzp1    v17.4s, v16.4s, v25.4s
-       ext     v16.16b, v18.16b, v22.16b, #8
-       ext     v18.16b, v22.16b, v26.16b, #12
-       eor     v22.16b, v23.16b, v24.16b
-       add     v6.4s, v23.4s, v6.4s
-       ushr    v23.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       orr     v22.16b, v22.16b, v23.16b
-       add     v20.4s, v22.4s, v20.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ushr    v23.4s, v21.4s, #7
-       shl     v21.4s, v21.4s, #25
-       ext     v6.16b, v6.16b, v6.16b, #4
-       orr     v21.16b, v21.16b, v23.16b
-       ext     v22.16b, v22.16b, v22.16b, #8
-       add     v6.4s, v21.4s, v6.4s
-       eor     v22.16b, v6.16b, v22.16b
-       ext     v20.16b, v20.16b, v20.16b, #12
-       add     v6.4s, v6.4s, v19.4s
-       rev32   v19.8h, v22.8h
-       add     v20.4s, v20.4s, v19.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ushr    v22.4s, v21.4s, #12
-       shl     v21.4s, v21.4s, #20
-       orr     v21.16b, v21.16b, v22.16b
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       ushr    v22.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v19.16b, v19.16b, v22.16b
-       add     v20.4s, v19.4s, v20.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       ushr    v22.4s, v21.4s, #7
-       shl     v21.4s, v21.4s, #25
-       add     v6.4s, v6.4s, v4.4s
-       orr     v21.16b, v21.16b, v22.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       ext     v20.16b, v20.16b, v20.16b, #4
-       rev32   v19.8h, v19.8h
-       add     v20.4s, v20.4s, v19.4s
-       add     v6.4s, v6.4s, v5.4s
-       mov     v5.s[1], v4.s[2]
-       eor     v4.16b, v20.16b, v21.16b
-       ushr    v21.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       orr     v21.16b, v4.16b, v21.16b
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       add     v2.4s, v6.4s, v2.4s
-       ushr    v6.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v6.16b, v19.16b, v6.16b
-       add     v19.4s, v6.4s, v20.4s
-       eor     v20.16b, v19.16b, v21.16b
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v2.4s, v20.4s, v2.4s
-       eor     v6.16b, v2.16b, v6.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       rev32   v6.8h, v6.8h
-       add     v19.4s, v19.4s, v6.4s
-       mov     v22.16b, v0.16b
-       eor     v20.16b, v19.16b, v20.16b
-       bsl     v22.16b, v5.16b, v7.16b
-       ushr    v21.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       add     v2.4s, v2.4s, v22.4s
-       orr     v20.16b, v20.16b, v21.16b
-       add     v2.4s, v2.4s, v20.4s
-       eor     v6.16b, v2.16b, v6.16b
-       ushr    v21.4s, v6.4s, #8
-       shl     v6.4s, v6.4s, #24
-       orr     v6.16b, v6.16b, v21.16b
-       add     v19.4s, v6.4s, v19.4s
-       eor     v20.16b, v19.16b, v20.16b
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       add     v2.4s, v2.4s, v17.4s
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v2.4s, v2.4s, v20.4s
-       eor     v6.16b, v2.16b, v6.16b
-       uzp2    v5.4s, v16.4s, v22.4s
-       zip1    v7.2d, v3.2d, v22.2d
-       zip2    v16.4s, v22.4s, v3.4s
-       ext     v19.16b, v19.16b, v19.16b, #4
-       rev32   v22.8h, v6.8h
-       ext     v23.16b, v5.16b, v5.16b, #4
-       bif     v7.16b, v17.16b, v1.16b
-       zip1    v24.4s, v16.4s, v17.4s
-       zip1    v16.4s, v17.4s, v16.4s
-       add     v21.4s, v2.4s, v3.4s
-       mov     v3.s[1], v17.s[2]
-       add     v17.4s, v19.4s, v22.4s
-       mov     v19.16b, v0.16b
-       ext     v25.16b, v7.16b, v7.16b, #12
-       ext     v4.16b, v16.16b, v24.16b, #8
-       uzp1    v16.4s, v23.4s, v23.4s
-       bsl     v19.16b, v3.16b, v18.16b
-       eor     v2.16b, v17.16b, v20.16b
-       uzp1    v7.4s, v7.4s, v25.4s
-       ext     v25.16b, v16.16b, v23.16b, #8
-       zip1    v3.2d, v4.2d, v19.2d
-       ushr    v20.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       ext     v24.16b, v23.16b, v23.16b, #12
-       uzp2    v6.4s, v25.4s, v19.4s
-       zip2    v18.4s, v19.4s, v4.4s
-       bif     v3.16b, v7.16b, v1.16b
-       orr     v20.16b, v2.16b, v20.16b
-       ext     v16.16b, v23.16b, v24.16b, #12
-       ext     v23.16b, v6.16b, v6.16b, #4
-       zip1    v24.4s, v18.4s, v7.4s
-       zip1    v18.4s, v7.4s, v18.4s
-       ext     v25.16b, v3.16b, v3.16b, #12
-       add     v21.4s, v21.4s, v20.4s
-       ext     v2.16b, v18.16b, v24.16b, #8
-       uzp1    v18.4s, v23.4s, v23.4s
-       ext     v24.16b, v23.16b, v23.16b, #12
-       uzp1    v3.4s, v3.4s, v25.4s
-       eor     v22.16b, v21.16b, v22.16b
-       ext     v25.16b, v18.16b, v23.16b, #8
-       dup     v18.4s, v2.s[3]
-       ext     v23.16b, v23.16b, v24.16b, #12
-       add     v5.4s, v21.4s, v5.4s
-       trn1    v21.4s, v3.4s, v3.4s
-       ushr    v24.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       ext     v18.16b, v21.16b, v18.16b, #8
-       orr     v21.16b, v22.16b, v24.16b
-       add     v17.4s, v21.4s, v17.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ushr    v22.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       ext     v5.16b, v5.16b, v5.16b, #4
-       orr     v20.16b, v20.16b, v22.16b
-       ext     v21.16b, v21.16b, v21.16b, #8
-       add     v5.4s, v20.4s, v5.4s
-       eor     v21.16b, v5.16b, v21.16b
-       ext     v17.16b, v17.16b, v17.16b, #12
-       add     v5.4s, v5.4s, v19.4s
-       rev32   v19.8h, v21.8h
-       add     v17.4s, v17.4s, v19.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ushr    v21.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       orr     v20.16b, v20.16b, v21.16b
-       add     v5.4s, v5.4s, v20.4s
-       eor     v19.16b, v5.16b, v19.16b
-       ushr    v21.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v19.16b, v19.16b, v21.16b
-       add     v17.4s, v19.4s, v17.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ext     v5.16b, v5.16b, v5.16b, #12
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       add     v5.4s, v5.4s, v7.4s
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v5.4s, v5.4s, v20.4s
-       eor     v19.16b, v5.16b, v19.16b
-       ext     v17.16b, v17.16b, v17.16b, #4
-       rev32   v22.8h, v19.8h
-       add     v21.4s, v5.4s, v4.4s
-       mov     v4.s[1], v7.s[2]
-       add     v19.4s, v17.4s, v22.4s
-       bit     v16.16b, v4.16b, v0.16b
-       eor     v5.16b, v19.16b, v20.16b
-       uzp2    v4.4s, v25.4s, v16.4s
-       zip1    v7.2d, v2.2d, v16.2d
-       zip2    v17.4s, v16.4s, v2.4s
-       ushr    v20.4s, v5.4s, #12
-       shl     v5.4s, v5.4s, #20
-       ext     v24.16b, v4.16b, v4.16b, #4
-       bif     v7.16b, v3.16b, v1.16b
-       zip1    v25.4s, v17.4s, v3.4s
-       zip1    v17.4s, v3.4s, v17.4s
-       orr     v20.16b, v5.16b, v20.16b
-       ext     v26.16b, v7.16b, v7.16b, #12
-       ext     v5.16b, v17.16b, v25.16b, #8
-       uzp1    v17.4s, v24.4s, v24.4s
-       ext     v25.16b, v24.16b, v24.16b, #12
-       bit     v23.16b, v18.16b, v0.16b
-       add     v21.4s, v21.4s, v20.4s
-       uzp1    v7.4s, v7.4s, v26.4s
-       ext     v26.16b, v17.16b, v24.16b, #8
-       ext     v17.16b, v24.16b, v25.16b, #12
-       eor     v22.16b, v21.16b, v22.16b
-       add     v6.4s, v21.4s, v6.4s
-       zip1    v21.2d, v5.2d, v23.2d
-       zip2    v24.4s, v23.4s, v5.4s
-       bif     v21.16b, v7.16b, v1.16b
-       zip1    v1.4s, v24.4s, v7.4s
-       zip1    v24.4s, v7.4s, v24.4s
-       ext     v1.16b, v24.16b, v1.16b, #8
-       ushr    v24.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       orr     v22.16b, v22.16b, v24.16b
-       add     v19.4s, v22.4s, v19.4s
-       ext     v24.16b, v21.16b, v21.16b, #12
-       eor     v20.16b, v19.16b, v20.16b
-       uzp1    v21.4s, v21.4s, v24.4s
-       ushr    v24.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       orr     v20.16b, v20.16b, v24.16b
-       ext     v6.16b, v6.16b, v6.16b, #4
-       ext     v22.16b, v22.16b, v22.16b, #8
-       add     v6.4s, v20.4s, v6.4s
-       eor     v22.16b, v6.16b, v22.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       add     v6.4s, v6.4s, v16.4s
-       rev32   v16.8h, v22.8h
-       add     v19.4s, v19.4s, v16.4s
-       eor     v20.16b, v19.16b, v20.16b
-       ushr    v22.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       orr     v20.16b, v20.16b, v22.16b
-       add     v6.4s, v6.4s, v20.4s
-       eor     v16.16b, v6.16b, v16.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       add     v3.4s, v6.4s, v3.4s
-       ushr    v6.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       orr     v6.16b, v16.16b, v6.16b
-       add     v16.4s, v6.4s, v19.4s
-       eor     v19.16b, v16.16b, v20.16b
-       ushr    v20.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       orr     v19.16b, v19.16b, v20.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v3.4s, v3.4s, v19.4s
-       eor     v6.16b, v3.16b, v6.16b
-       ext     v16.16b, v16.16b, v16.16b, #4
-       add     v2.4s, v3.4s, v2.4s
-       rev32   v3.8h, v6.8h
-       add     v6.4s, v16.4s, v3.4s
-       eor     v16.16b, v6.16b, v19.16b
-       ushr    v19.4s, v16.4s, #12
-       shl     v16.4s, v16.4s, #20
-       orr     v16.16b, v16.16b, v19.16b
-       add     v2.4s, v2.4s, v16.4s
-       eor     v3.16b, v2.16b, v3.16b
-       add     v2.4s, v2.4s, v4.4s
-       ushr    v4.4s, v3.4s, #8
-       shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v4.16b
-       add     v4.4s, v3.4s, v6.4s
-       eor     v6.16b, v4.16b, v16.16b
-       ushr    v16.4s, v6.4s, #7
-       shl     v6.4s, v6.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v6.16b, v6.16b, v16.16b
-       ext     v3.16b, v3.16b, v3.16b, #8
-       add     v2.4s, v6.4s, v2.4s
+       hint    #34
+       fmov    s1, w3
+       movi    d0, #0x0000ff000000ff
+       ldr     q2, [x1]
+       fmov    d3, x4
+       adrp    x8, .LCPI1_0
+       mov     v1.s[1], w5
+       str     q2, [x0]
+       ldr     q4, [x8, :lo12:.LCPI1_0]
+       add     x8, x2, #32
+       ldr     q5, [x1, #16]
+       and     v0.8b, v1.8b, v0.8b
+       stp     q5, q4, [x0, #16]
+       mov     v3.d[1], v0.d[0]
+       str     q3, [x0, #48]
+       ldp     q0, q6, [x2]
+       uzp1    v1.4s, v0.4s, v6.4s
+       uzp2    v0.4s, v0.4s, v6.4s
+       add     v2.4s, v2.4s, v1.4s
+       uzp1    v18.4s, v1.4s, v1.4s
+       add     v2.4s, v2.4s, v5.4s
        eor     v3.16b, v2.16b, v3.16b
-       ext     v4.16b, v4.16b, v4.16b, #12
+       add     v2.4s, v2.4s, v0.4s
        rev32   v3.8h, v3.8h
-       add     v4.4s, v4.4s, v3.4s
-       eor     v6.16b, v4.16b, v6.16b
-       ushr    v16.4s, v6.4s, #12
-       shl     v6.4s, v6.4s, #20
-       add     v2.4s, v2.4s, v23.4s
-       orr     v6.16b, v6.16b, v16.16b
-       add     v2.4s, v2.4s, v6.4s
+       add     v4.4s, v3.4s, v4.4s
+       eor     v5.16b, v4.16b, v5.16b
+       ushr    v6.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v5.16b, v5.16b, v6.16b
+       add     v2.4s, v2.4s, v5.4s
        eor     v3.16b, v2.16b, v3.16b
-       ushr    v16.4s, v3.4s, #8
+       ushr    v6.4s, v3.4s, #8
        shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v16.16b
+       orr     v3.16b, v3.16b, v6.16b
+       ld2     { v6.4s, v7.4s }, [x8]
        add     v4.4s, v3.4s, v4.4s
-       eor     v6.16b, v4.16b, v6.16b
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v16.4s, v6.4s, #7
-       shl     v6.4s, v6.4s, #25
-       add     v2.4s, v2.4s, v7.4s
-       orr     v6.16b, v6.16b, v16.16b
        ext     v3.16b, v3.16b, v3.16b, #8
        add     v2.4s, v2.4s, v6.4s
-       eor     v3.16b, v2.16b, v3.16b
+       eor     v5.16b, v4.16b, v5.16b
        ext     v4.16b, v4.16b, v4.16b, #4
-       rev32   v3.8h, v3.8h
+       ext     v6.16b, v6.16b, v6.16b, #12
+       ext     v2.16b, v2.16b, v2.16b, #12
+       ushr    v16.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
*** 7941 LINES SKIPPED ***

Reply via email to