PR #22588 opened by Shreesh Adiga (tantei3) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22588 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22588.patch
Add a crc32 specialization for aarch64 that uses both PMULL and crc32 instructions to fold 192 bytes per loop iteration (9x PMULL and 6 crc32 per iteration), obtaining higher performance for large inputs >8kB. This approach is based on the zlib-ng implementation, which is also described at https://github.com/corsix/fast-crc32. For smaller buffers it was observed to be slightly slower, so this logic is used only for input sizes >8192; for smaller sizes the 4x PMULL folding method is used instead, with scalar crc32 instructions processing the remaining input. Testing with an input size of ~100 kB shows that on Cortex-x925 it is about 2x faster than the current PMULL implementation: ``` crc_32_IEEE_LE_c: 1766.9 ( 1.00x) crc_32_IEEE_LE_crc: 86.0 (20.54x) crc_32_IEEE_LE_pmull_eor3: 26.1 (67.67x) ``` From 6fccef628747c798c6e47839bb48b41566d97e06 Mon Sep 17 00:00:00 2001 From: Shreesh Adiga <[email protected]> Date: Mon, 23 Mar 2026 17:25:00 +0530 Subject: [PATCH] avutil/crc: add aarch64 hybrid crc32 NEON PMULL+EOR SIMD implementation Add a crc32 specialization for aarch64 that uses both PMULL and crc32 instructions to fold 192 bytes per loop iteration (9x PMULL and 6 crc32 per iteration), obtaining higher performance for large inputs >8kB. This approach is based on the zlib-ng implementation, which is also described at https://github.com/corsix/fast-crc32. For smaller buffers it was observed to be slightly slower, so this logic is used only for input sizes >8192; for smaller sizes the 4x PMULL folding method is used instead, with scalar crc32 instructions processing the remaining input. 
Testing for input size of ~100 kB shows that on Cortex-x925 it is about 2x faster than the current PMULL implementation: crc_32_IEEE_LE_c: 1766.9 ( 1.00x) crc_32_IEEE_LE_crc: 86.0 (20.54x) crc_32_IEEE_LE_pmull_eor3: 26.1 (67.67x) --- libavutil/aarch64/crc.S | 288 +++++++++++++++++++++++++++++++++++++++- libavutil/aarch64/crc.h | 29 +++- tests/checkasm/crc.c | 8 +- 3 files changed, 317 insertions(+), 8 deletions(-) diff --git a/libavutil/aarch64/crc.S b/libavutil/aarch64/crc.S index 6ff109aa71..462c0df92e 100644 --- a/libavutil/aarch64/crc.S +++ b/libavutil/aarch64/crc.S @@ -122,13 +122,13 @@ endconst // assume Vfold is v16 and v0 is filled with 0 // uses v17 as temp -.macro FOLD_64_TO_32 le, Vconst +.macro FOLD_64_TO_32 le, Vconst, output_reg .if ! \le pmull v17.1q, v16.1d, \Vconst\().1d pmull2 v17.1q, v17.2d, \Vconst\().2d eor v16.16b, v16.16b, v17.16b - fmov w0, s16 - rev w0, w0 + fmov \output_reg\(), s16 + rev \output_reg\(), \output_reg\() .else mov v16.s[0], wzr pmull v17.1q, v16.1d, \Vconst\().1d @@ -136,7 +136,7 @@ endconst ext \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8 pmull v17.1q, v17.1d, \Vconst\().1d eor v16.16b, v16.16b, v17.16b - mov w0, v16.s[2] + mov \output_reg\(), v16.s[2] .endif .endm @@ -259,7 +259,7 @@ function ff_crc_neon_pmull, export=1 7: // reduce 64 to 32 ldr q3, [x0, #(CTX_OFFSET + 48)] - FOLD_64_TO_32 \le, v3 + FOLD_64_TO_32 \le, v3, w0 ret 8: // less than 64 bytes @@ -329,6 +329,284 @@ endfunc crc_fn_template 0 crc_fn_template 1 +#if HAVE_ARM_CRC +ENABLE_ARM_CRC +// uses x7, x6, x4 and v31 as temporary registers. 
+.macro CRC_SHIFT crc_reg, nbits_reg, output_neon_reg + mov x7, #-2 +1: + and x6, x\nbits_reg\(), #1 + lsr x\nbits_reg\(), x\nbits_reg\(), #1 + sub x\nbits_reg\(), x\nbits_reg\(), #16 + add x7, x6, x7, lsl #1 + cmp x\nbits_reg\(), #191 + b.hi 1b + mvn x6, x7 + mov w7, #-2147483648 + lsr w7, w7, w\nbits_reg\() + lsr x\nbits_reg\(), x\nbits_reg\(), #5 +2: + subs x\nbits_reg\(), x\nbits_reg\(), #1 + crc32w w7, w7, wzr + b.ne 2b + lsr x4, x6, #1 + cbz x4, 4f + and w\nbits_reg\(), w6, #1 +3: + fmov s31, w7 + pmull v31.8h, v31.8b, v31.8b + fmov x7, d31 + lsl x7, x7, x\nbits_reg\() + and w\nbits_reg\(), w4, #1 + crc32x w7, wzr, x7 + lsr x4, x4, #1 + cbnz x4, 3b +4: + fmov s\output_neon_reg\(), w\crc_reg\() + fmov s31, w7 + pmull v\output_neon_reg\().1q, v\output_neon_reg\().1d, v31.1d +.endm + +// This routine is based on zlib-ng's implementation based on +// https://github.com/zlib-ng/zlib-ng/commit/b5638a82e726c9941bd3a1e7a23182d038eb831f +// https://github.com/corsix/fast-crc32 +function ff_crc32_pmull_eor3_aarch64, export=1 + neg x8, x2 + tst x8, #0xf + b.eq 4f // buf 16b aligned + cbz x3, 11f + tbz w8, #0, 1f + ldrb w9, [x2], #1 + sub x3, x3, #1 + crc32b w1, w1, w9 +1: + tbz w8, #1, 2f + subs x9, x3, #2 + b.lo 9f + mov x3, x9 + ldrh w10, [x2], #2 + crc32h w1, w1, w10 +2: + tbz w8, #2, 3f + subs x9, x3, #4 + b.lo 9f + mov x3, x9 + ldr w10, [x2], #4 + crc32w w1, w1, w10 +3: + tbz w8, #3, 4f + subs x9, x3, #8 + b.lo 9f + mov x3, x9 + ldr x10, [x2], #8 + crc32x w1, w1, x10 + +4: // buf 16b aligned + cmp x3, #8192 + b.lo 12f // 4x fold + + mov x8, #-6148914691236517206 + ldur q17, [x0, #(CTX_OFFSET + 0)] + movk x8, #43691 + mov w10, wzr + umulh x8, x3, x8 + ldur q18, [x0, #(CTX_OFFSET + 16)] + mov w11, wzr + lsr x9, x8, #7 + add x8, x9, x9, lsl #1 + lsl x12, x9, #4 + lsl x13, x9, #5 + lsl x8, x8, #4 + add x16, x2, x12 + sub x15, x16, #32 + sub x17, x3, x8 + add x8, x2, x8 + ldp q6, q16, [x8] + ldr q0, [x8, #128] + ldp q4, q7, [x8, #32] + sub x3, x17, #144 + ldp q3, q5, 
[x8, #64] + ldp q2, q1, [x8, #96] + add x8, x8, #144 + +5: // 192b hybrid fold + pmull v19.1q, v6.1d, v17.1d + ldp q26, q28, [x8] + pmull2 v6.1q, v6.2d, v17.2d + ldp x16, x6, [x2] + pmull v20.1q, v16.1d, v17.1d + add x17, x2, x12 + add x7, x2, x13 + pmull2 v16.1q, v16.2d, v17.2d + add x2, x2, #16 + sub x3, x3, #144 + pmull v21.1q, v4.1d, v17.1d + ldp x4, x17, [x17] + pmull2 v4.1q, v4.2d, v17.2d + eor3 v6.16b, v6.16b, v19.16b, v26.16b + crc32x w10, w10, x4 + pmull v22.1q, v7.1d, v17.1d + ldp x5, x7, [x7] + pmull2 v7.1q, v7.2d, v17.2d + eor3 v16.16b, v16.16b, v20.16b, v28.16b + crc32x w11, w11, x5 + ldp q26, q20, [x8, #32] + cmp x2, x15 + pmull v23.1q, v3.1d, v17.1d + crc32x w16, w1, x16 + crc32x w1, w16, x6 + pmull2 v3.1q, v3.2d, v17.2d + crc32x w10, w10, x17 + crc32x w11, w11, x7 + pmull v24.1q, v5.1d, v17.1d + eor3 v4.16b, v4.16b, v21.16b, v26.16b + eor3 v7.16b, v7.16b, v22.16b, v20.16b + pmull2 v5.1q, v5.2d, v17.2d + ldp q21, q26, [x8, #64] + pmull v25.1q, v2.1d, v17.1d + ldp q28, q20, [x8, #96] + pmull2 v2.1q, v2.2d, v17.2d + ldr q22, [x8, #128] + add x8, x8, #144 + pmull v27.1q, v1.1d, v17.1d + eor3 v3.16b, v3.16b, v23.16b, v21.16b + pmull2 v1.1q, v1.2d, v17.2d + eor3 v5.16b, v5.16b, v24.16b, v26.16b + pmull v19.1q, v0.1d, v17.1d + pmull2 v0.1q, v0.2d, v17.2d + eor3 v2.16b, v2.16b, v25.16b, v28.16b + eor3 v1.16b, v1.16b, v27.16b, v20.16b + eor3 v0.16b, v0.16b, v19.16b, v22.16b + b.ls 5b // 192b hybrid fold + + add x17, x2, x13 + mov x13, #-33 + add x12, x2, x12 + ldur q22, [x0, #(CTX_OFFSET + 32)] + ldp x16, x15, [x2] + ldur q23, [x0, #(CTX_OFFSET + 48)] + ldp x14, x0, [x12] + crc32x w16, w1, x16 + crc32x w10, w10, x14 + crc32x w12, w16, x15 + ldp x7, x17, [x17] + crc32x w14, w11, x7 + crc32x w11, w10, x0 + crc32x w10, w14, x17 + mov w14, #1408 + madd x14, x9, x14, x13 + + pmull v20.1q, v6.1d, v18.1d + pmull2 v6.1q, v6.2d, v18.2d + pmull v21.1q, v5.1d, v18.1d + pmull2 v5.1q, v5.2d, v18.2d + eor3 v6.16b, v6.16b, v20.16b, v16.16b + pmull v20.1q, v7.1d, v18.1d + 
pmull2 v7.1q, v7.2d, v18.2d + eor3 v2.16b, v5.16b, v21.16b, v2.16b + pmull v16.1q, v6.1d, v18.1d + pmull2 v6.1q, v6.2d, v18.2d + pmull v5.1q, v2.1d, v22.1d + eor3 v3.16b, v7.16b, v20.16b, v3.16b + pmull2 v2.1q, v2.2d, v22.2d + + CRC_SHIFT 12, 14, 29 + mov w14, #1280 + eor3 v4.16b, v6.16b, v16.16b, v4.16b + pmull v16.1q, v1.1d, v18.1d + pmull2 v1.1q, v1.2d, v18.2d + pmull v6.1q, v4.1d, v22.1d + pmull2 v4.1q, v4.2d, v22.2d + madd x14, x9, x14, x13 + CRC_SHIFT 11, 14, 28 + eor3 v3.16b, v4.16b, v6.16b, v3.16b + eor3 v4.16b, v1.16b, v16.16b, v0.16b + pmull v0.1q, v3.1d, v23.1d + eor3 v2.16b, v2.16b, v5.16b, v4.16b + pmull2 v1.1q, v3.2d, v23.2d + mov w12, #1152 + madd x9, x9, x12, x13 + eor3 v0.16b, v1.16b, v0.16b, v2.16b + CRC_SHIFT 10, 9, 27 + fmov x9, d0 + crc32x w9, wzr, x9 + eor3 v1.16b, v27.16b, v28.16b, v29.16b + dup v2.2d, v0.d[1] + eor v1.16b, v1.16b, v2.16b + mov x2, x8 + fmov x10, d1 + crc32x w1, w9, x10 + +6: // process tail (<192 bytes) + bic x5, x3, #15 + and x3, x3, #0xf + cbz x5, 8f +7: + ldp x6, x7, [x2], #16 + subs x5, x5, #16 + crc32x w1, w1, x6 + crc32x w1, w1, x7 + b.ne 7b +8: + tbz x3, #3, 9f + ldr x10, [x2], #8 + sub x3, x3, #8 + crc32x w1, w1, x10 +9: + cbz x3, 11f +10: + ldrb w10, [x2], #1 + subs x3, x3, #1 + crc32b w1, w1, w10 + b.ne 10b +11: + mov w0, w1 + ret + +12: // 4x fold + cmp x3, #192 + b.lo 6b // process tail (<192 bytes) + + ldur q3, [x0, #(CTX_OFFSET + 64)] + movi v0.2d, #0 + fmov s1, w1 + ld1 {v16.16b-v19.16b}, [x2], #64 + sub x3, x3, #64 + eor v16.16b, v16.16b, v1.16b + ldur q25, [x0, #(CTX_OFFSET + 80)] + + bic x5, x3, #63 + and x3, x3, #0x3f + +13: // fold 4x loop + ld1 {v20.16b-v23.16b}, [x2], #64 + pmull v4.1q, v16.1d, v3.1d + pmull v5.1q, v17.1d, v3.1d + pmull v6.1q, v18.1d, v3.1d + pmull v7.1q, v19.1d, v3.1d + pmull2 v16.1q, v16.2d, v3.2d + pmull2 v17.1q, v17.2d, v3.2d + pmull2 v18.1q, v18.2d, v3.2d + pmull2 v19.1q, v19.2d, v3.2d + subs x5, x5, #64 + eor3 v16.16b, v16.16b, v4.16b, v20.16b + eor3 v17.16b, v17.16b, v5.16b, 
v21.16b + eor3 v18.16b, v18.16b, v6.16b, v22.16b + eor3 v19.16b, v19.16b, v7.16b, v23.16b + b.ne 13b // fold 4x loop + + FOLD_SINGLE v16, v25, v17, v4 + ldur q26, [x0, #(CTX_OFFSET + 96)] + FOLD_SINGLE v16, v25, v18, v4 + ldur q27, [x0, #(CTX_OFFSET + 112)] + FOLD_SINGLE v16, v25, v19, v4 + FOLD_128_TO_64 1, v26 + FOLD_64_TO_32 1, v27, w1 + b 6b // process tail (<192 bytes) +endfunc +DISABLE_ARM_CRC +#endif + DISABLE_PMULL DISABLE_EOR3 #endif diff --git a/libavutil/aarch64/crc.h b/libavutil/aarch64/crc.h index e31625606a..90c7a834d0 100644 --- a/libavutil/aarch64/crc.h +++ b/libavutil/aarch64/crc.h @@ -52,6 +52,7 @@ enum { CRC_C = 0, PMULL_BE, PMULL_LE, + CRC32_PMULL_LE, }; static const AVCRC crc_table_pmull[AV_CRC_MAX][17] = { @@ -149,6 +150,24 @@ static inline void crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, AV_WN64(dst + 56, poly_ | (1ULL << 32)); } } + +#if HAVE_ARM_CRC +FF_VISIBILITY_PUSH_HIDDEN +uint32_t ff_crc32_pmull_eor3_aarch64(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, + size_t length); +FF_VISIBILITY_POP_HIDDEN +static const AVCRC crc_table_crc32_pmull[] = { + CRC32_PMULL_LE, + 0x26b70c3d, 0x0, 0x3f41287a, 0x0, + 0xae689191, 0x0, 0xccaa009e, 0x0, + 0xf1da05aa, 0x0, 0x81256527, 0x0, + 0x8f352d95, 0x0, 0x1d9513d7, 0x0, + 0x54442bd4, 0x1, 0xc6e41596, 0x1, + 0x751997d0, 0x1, 0xccaa009e, 0x0, + 0xccaa009e, 0x0, 0x63cd6124, 0x1, + 0xf7011640, 0x1, 0xdb710641, 0x1, +}; +#endif #endif static inline av_cold int ff_crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) @@ -169,13 +188,16 @@ static inline uint32_t ff_crc_aarch64(const AVCRC *ctx, uint32_t crc, { switch (ctx[0]) { #if HAVE_PMULL && HAVE_EOR3 +#if HAVE_ARM_CRC + case CRC32_PMULL_LE: return ff_crc32_pmull_eor3_aarch64(ctx, crc, buffer, length); +#endif case PMULL_BE: return ff_crc_neon_pmull(ctx, crc, buffer, length); case PMULL_LE: return ff_crc_le_neon_pmull(ctx, crc, buffer, length); #endif #if HAVE_ARM_CRC case (AV_CRC_32_IEEE_LE + 1): return 
ff_crc32_aarch64(ctx, crc, buffer, length); #endif - default: av_unreachable("AARCH64 has PMULL_LE, PMULL_BE and AV_CRC_32_IEEE_LE arch-specific CRC code"); + default: av_unreachable("AARCH64 has PMULL_LE, PMULL_BE, CRC32_PMULL_LE, and AV_CRC_32_IEEE_LE arch-specific CRC code"); } return 0; } @@ -185,6 +207,11 @@ static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id) int cpu_flags = av_get_cpu_flags(); #if HAVE_PMULL && HAVE_EOR3 if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) { +#if HAVE_ARM_CRC + if (crc_id == AV_CRC_32_IEEE_LE && have_arm_crc(cpu_flags)) { + return crc_table_crc32_pmull; + } +#endif return crc_table_pmull[crc_id]; } #endif diff --git a/tests/checkasm/crc.c b/tests/checkasm/crc.c index 60d9ef6018..4e19880b53 100644 --- a/tests/checkasm/crc.c +++ b/tests/checkasm/crc.c @@ -43,7 +43,7 @@ static void check_crc(const AVCRC *table_new, const char *name, unsigned idx) if (!table_ref) return; - DECLARE_ALIGNED(4, uint8_t, buf)[8192]; + DECLARE_ALIGNED(4, uint8_t, buf)[8192 * 16]; size_t offset = rnd() & 31; static size_t sizes[AV_CRC_MAX + 1]; static unsigned sizes_initialized = 0; @@ -51,7 +51,11 @@ static void check_crc(const AVCRC *table_new, const char *name, unsigned idx) if (!(sizes_initialized & (1 << idx))) { sizes_initialized |= 1 << idx; - sizes[idx] = rnd() % (sizeof(buf) - 1 - offset); + if (idx == AV_CRC_32_IEEE_LE) { + sizes[idx] = (rnd() % (8192 - 1 - offset)) + 100 * 1024; + } else { + sizes[idx] = rnd() % (8192 - 1 - offset); + } } size_t size = sizes[idx]; -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
