Benchmarks (checkasm cycle counts, lower is better):

                                        A53       A72
h264_idct4_add_10bpp_c:               187.7     115.2
h264_idct4_add_10bpp_neon:             72.5      45.0
h264_idct4_add_dc_10bpp_c:             96.0      61.2
h264_idct4_add_dc_10bpp_neon:          36.0      19.5
h264_idct8_add4_10bpp_c:             2115.5    1424.2
h264_idct8_add4_10bpp_neon:           734.0     459.5
h264_idct8_add_10bpp_c:              1017.5     709.0
h264_idct8_add_10bpp_neon:            345.5     216.5
h264_idct8_add_dc_10bpp_c:            316.0     235.5
h264_idct8_add_dc_10bpp_neon:          69.7      44.0
h264_idct_add16_10bpp_c:             2540.2    1498.5
h264_idct_add16_10bpp_neon:          1080.5     616.0
h264_idct_add16intra_10bpp_c:         784.7     439.5
h264_idct_add16intra_10bpp_neon:      641.0     462.2
Signed-off-by: Mikhail Nitenko <mnite...@gmail.com> --- there is a function that is not covered by tests, but I tested it with sample videos, not sure what to do with it libavcodec/aarch64/h264dsp_init_aarch64.c | 28 ++ libavcodec/aarch64/h264idct_neon.S | 524 ++++++++++++++++++++++ 2 files changed, 552 insertions(+) diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c index 6bf3ecb8a1..78ed9d06cd 100644 --- a/libavcodec/aarch64/h264dsp_init_aarch64.c +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -106,6 +106,24 @@ void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t strid void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, int beta); +void ff_h264_idct_add_neon_10(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_dc_add_neon_10(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_add16_neon_10(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); +void ff_h264_idct_add16intra_neon_10(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); +void ff_h264_idct_add8_neon_10(uint8_t **dest, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); + +void ff_h264_idct8_add_neon_10(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_dc_add_neon_10(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_add4_neon_10(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); + av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { @@ -162,5 +180,15 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10; c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; } + + c->h264_idct_add = 
ff_h264_idct_add_neon_10; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon_10; + c->h264_idct_add16 = ff_h264_idct_add16_neon_10; + c->h264_idct_add16intra = ff_h264_idct_add16intra_neon_10; + if (chroma_format_idc <= 1) + c->h264_idct_add8 = ff_h264_idct_add8_neon_10; + c->h264_idct8_add = ff_h264_idct8_add_neon_10; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon_10; + c->h264_idct8_add4 = ff_h264_idct8_add4_neon_10; } } diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S index 7de44205d3..f238a2cd3f 100644 --- a/libavcodec/aarch64/h264idct_neon.S +++ b/libavcodec/aarch64/h264idct_neon.S @@ -411,3 +411,527 @@ const scan8 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8 endconst + +function ff_h264_idct_add_neon_10, export=1 +.L_ff_h264_idct_add_neon_10: + ld1 {v0.4S, v1.4S, v2.4S, v3.4S}, [x1] + + sxtw x2, w2 + movi v30.8H, #0 + + add v4.4S, v0.4S, v2.4S + sshr v16.4S, v1.4S, #1 + st1 {v30.8H}, [x1], #16 + st1 {v30.8H}, [x1], #16 + sshr v17.4S, v3.4S, #1 + st1 {v30.8H}, [x1], #16 + st1 {v30.8H}, [x1], #16 + sub v5.4S, v0.4S, v2.4S + sub v6.4S, v16.4S, v3.4S + add v7.4S, v1.4S, v17.4S + add v0.4S, v4.4S, v7.4S + add v1.4S, v5.4S, v6.4S + sub v2.4S, v5.4S, v6.4S + sub v3.4S, v4.4S, v7.4S + + transpose_4x4S v0, v1, v2, v3, v4, v5, v6, v7 + + add v4.4S, v0.4S, v2.4S + ld1 {v18.D}[0], [x0], x2 + sshr v16.4S, v3.4S, #1 + sshr v17.4S, v1.4S, #1 + ld1 {v18.D}[1], [x0], x2 + sub v5.4S, v0.4S, v2.4S + ld1 {v19.D}[1], [x0], x2 + add v6.4S, v16.4S, v1.4S + sub v7.4S, v17.4S, v3.4S + ld1 {v19.D}[0], [x0], x2 + sub x0, x0, x2, lsl #2 + add v0.4S, v4.4S, v6.4S + add v1.4S, v5.4S, v7.4S + sub v2.4S, v4.4S, v6.4S + sub v3.4S, v5.4S, v7.4S + + srshr v0.4S, v0.4S, #6 + srshr v1.4S, v1.4S, #6 + srshr v2.4S, v2.4S, #6 + srshr v3.4S, v3.4S, #6 + + uaddw v0.4S, v0.4S, v18.4H + uaddw2 v1.4S, v1.4S, v18.8H + uaddw v2.4S, v2.4S, v19.4H + uaddw2 v3.4S, v3.4S, v19.8H + + sqxtun v0.4H, v0.4S + sqxtun2 v0.8H, v1.4S + sqxtun v1.4H, 
v2.4S + sqxtun2 v1.8H, v3.4S + + st1 {v0.D}[0], [x0], x2 + st1 {v0.D}[1], [x0], x2 + st1 {v1.D}[1], [x0], x2 + st1 {v1.D}[0], [x0], x2 + + sub x1, x1, #64 + ret +endfunc + +function ff_h264_idct_dc_add_neon_10, export=1 +.L_ff_h264_idct_dc_add_neon_10: + sxtw x2, w2 + mov x3, #0 + ld1r {v2.4S}, [x1] + dup v3.4S, v2.S[0] + str x3, [x1] + srshr v2.4S, v2.4S, #6 + srshr v3.4S, v3.4S, #6 + ld1 {v0.D}[0], [x0], x2 + ld1 {v0.D}[1], [x0], x2 + uaddw v4.4S, v2.4S, v0.4H + uaddw2 v5.4S, v3.4S, v0.8H + ld1 {v1.D}[0], [x0], x2 + ld1 {v1.D}[1], [x0], x2 + uaddw v6.4S, v2.4S, v1.4H + uaddw2 v7.4S, v3.4S, v1.8H + sqxtun v0.4H, v4.4S + sqxtun2 v0.8H, v5.4S + sqxtun v1.4H, v6.4S + sqxtun2 v1.8H, v7.4S + sub x0, x0, x2, lsl #2 + + mvni v4.8H, #0xFC, lsl #8 + smin v0.8H, v0.8H, v4.8H + smin v1.8H, v1.8H, v4.8H + + st1 {v0.D}[0], [x0], x2 + st1 {v0.D}[1], [x0], x2 + st1 {v1.D}[0], [x0], x2 + st1 {v1.D}[1], [x0], x2 + ret +endfunc + +function ff_h264_idct_add16_neon_10, export=1 + mov x12, x30 + mov x6, x0 // dest + mov x5, x1 // block_offset + mov x1, x2 // block + mov w9, w3 // stride + movrel x7, scan8 + mov x10, #16 + movrel x13, .L_ff_h264_idct_dc_add_neon_10 + movrel x14, .L_ff_h264_idct_add_neon_10 +1: mov w2, w9 + ldrb w3, [x7], #1 + ldrsw x0, [x5], #4 + ldrb w3, [x4, w3, uxtw] + subs w3, w3, #1 + b.lt 2f + ldrsh w3, [x1] + add x0, x0, x6 + ccmp w3, #0, #4, eq + csel x15, x13, x14, ne + blr x15 +2: subs x10, x10, #1 + add x1, x1, #64 + b.ne 1b + ret x12 +endfunc + +function ff_h264_idct_add16intra_neon_10, export=1 + mov x12, x30 + mov x6, x0 // dest + mov x5, x1 // block_offset + mov x1, x2 // block + mov w9, w3 // stride + movrel x7, scan8 + mov x10, #16 + movrel x13, .L_ff_h264_idct_dc_add_neon_10 + movrel x14, .L_ff_h264_idct_add_neon_10 +1: mov w2, w9 + ldrb w3, [x7], #1 + ldrsw x0, [x5], #4 + ldrb w3, [x4, w3, uxtw] + add x0, x0, x6 + cmp w3, #0 + ldrsh w3, [x1] + csel x15, x13, x14, eq + ccmp w3, #0, #0, eq + b.eq 2f + blr x15 +2: subs x10, x10, #1 + add x1, x1, #64 + 
b.ne 1b + ret x12 +endfunc + +function ff_h264_idct_add8_neon_10, export=1 // NO TESTS but test video looks fine (did not look fine before the fixes so it is definitely working somehow) + sub sp, sp, #0x40 + stp x19, x20, [sp] + mov x12, x30 + ldp x6, x15, [x0] // dest[0], dest[1] + add x5, x1, #16*4 // block_offset + add x9, x2, #32*32 // block + mov w19, w3 // stride + movrel x13, .L_ff_h264_idct_dc_add_neon_10 + movrel x14, .L_ff_h264_idct_add_neon_10 + movrel x7, scan8, 16 + mov x10, #0 // i + mov x11, #16 +1: mov w2, w19 + ldrb w3, [x7, x10] // scan8[i] + ldrsw x0, [x5, x10, lsl #2] // block_offset[i] + ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ] + add x0, x0, x6 // block_offset[i] + dst[j-1] + add x1, x9, x10, lsl #6 // block + i * 16 * 2 + cmp w3, #0 + ldrsw x3, [x1] // block[i*16] + csel x20, x13, x14, eq + ccmp x3, #0, #0, eq + b.eq 2f + blr x20 +2: add x10, x10, #1 + cmp x10, #4 + csel x10, x11, x10, eq + csel x6, x15, x6, eq + cmp x10, #20 + b.lt 1b + ldp x19, x20, [sp] + add sp, sp, #0x40 + ret x12 +endfunc + +.macro idct8x8_cols_10 pass + .if \pass == 0 + va .req v0 + vaa .req v1 + vb .req v28 + vbb .req v29 + + sshr v0.4S, v20.4S, #1 + sshr v1.4S, v21.4S, #1 + add v2.4S, v16.4S, v24.4S // a0 + add v3.4S, v17.4S, v25.4S + + ld1 {v28.4S, v29.4S, v30.4S, v31.4S}, [x1] + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + + sub v4.4S, v16.4S, v24.4S + sub v5.4S, v17.4S, v25.4S + sshr v6.4S, v28.4S, #1 + sshr v7.4S, v29.4S, #1 + sub v0.4S, v0.4S, v28.4S + sub v1.4S, v1.4S, v29.4S + add v6.4S, v6.4S, v20.4S // a6 + add v7.4S, v7.4S, v21.4S + .else + va .req v28 + vaa .req v29 + vb .req v0 + vbb .req v1 + + sshr v28.4S, v20.4S, #1 + sshr v29.4S, v21.4S, #1 + sshr v6.4S, v0.4S, #1 + sshr v7.4S, v1.4S, #1 + add v2.4S, v16.4S, v24.4S + add v3.4S, v17.4S, v25.4S + sub v4.4S, v16.4S, v24.4S + sub v5.4S, v17.4S, v25.4S + sub v28.4S, v28.4S, v0.4S + sub v29.4S, v29.4S, v1.4S + add v6.4S, v6.4S, v20.4S + add v7.4S, 
v7.4S, v21.4S + .endif + add v20.4S, v4.4S, va.4S + add v21.4S, v5.4S, vaa.4S + sub v24.4S, v4.4S, va.4S + sub v25.4S, v5.4S, vaa.4S + add v16.4S, v2.4S, v6.4S // b0 + add v17.4S, v3.4S, v7.4S + sub vb.4S, v2.4S, v6.4S + sub vbb.4S, v3.4S, v7.4S + sub v2.4S, v26.4S, v22.4S + sub v3.4S, v27.4S, v23.4S + add v4.4S, v30.4S, v18.4S + add v5.4S, v31.4S, v19.4S + sub va.4S, v30.4S, v18.4S + sub vaa.4S, v31.4S, v19.4S + add v6.4S, v26.4S, v22.4S + add v7.4S, v27.4S, v23.4S + sub v2.4S, v2.4S, v30.4S + sub v3.4S, v3.4S, v31.4S + sub v4.4S, v4.4S, v22.4S + sub v5.4S, v5.4S, v23.4S + add va.4S, va.4S, v26.4S + add vaa.4S, vaa.4S, v27.4S + add v6.4S, v6.4S, v18.4S + add v7.4S, v7.4S, v19.4S + sshr v18.4S, v18.4S, #1 + sshr v19.4S, v19.4S, #1 + sshr v22.4S, v22.4S, #1 + sshr v23.4S, v23.4S, #1 + sshr v26.4S, v26.4S, #1 + sshr v27.4S, v27.4S, #1 + sshr v30.4S, v30.4S, #1 + sshr v31.4S, v31.4S, #1 + sub v2.4S, v2.4S, v30.4S + sub v3.4S, v3.4S, v31.4S + sub v4.4S, v4.4S, v22.4S + sub v5.4S, v5.4S, v23.4S + add va.4S, va.4S, v26.4S + add vaa.4S, vaa.4S, v27.4S + add v6.4S, v6.4S, v18.4S // a7 + add v7.4S, v7.4S, v19.4S + sshr v18.4S, v2.4S, #2 + sshr v19.4S, v3.4S, #2 + sshr v22.4S, v4.4S, #2 + sshr v23.4S, v5.4S, #2 + sshr v26.4S, va.4S, #2 + sshr v27.4S, vaa.4S, #2 + sshr v30.4S, v6.4S, #2 + sshr v31.4S, v7.4S, #2 + sub v6.4S, v6.4S, v18.4S + sub v7.4S, v7.4S, v19.4S + sub va.4S, v22.4S, va.4S + sub vaa.4S, v23.4S, vaa.4S + add v4.4S, v4.4S, v26.4S + add v5.4S, v5.4S, v27.4S + add v2.4S, v2.4S, v30.4S + add v3.4S, v3.4S, v31.4S + .if \pass == 0 + sub v30.4S, v16.4S, v6.4S + sub v31.4S, v17.4S, v7.4S + add v16.4S, v16.4S, v6.4S + add v17.4S, v17.4S, v7.4S + add v18.4S, v20.4S, v0.4S + add v19.4S, v21.4S, v1.4S + sub v0.4S, v20.4S, v0.4S + sub v1.4S, v21.4S, v1.4S + add v20.4S, v24.4S, v4.4S + add v21.4S, v25.4S, v5.4S + add v22.4S, v28.4S, v2.4S + add v23.4S, v29.4S, v3.4S + sub v26.4S, v24.4S, v4.4S + sub v27.4S, v25.4S, v5.4S + sub v24.4S, v28.4S, v2.4S + sub v25.4S, v29.4S, 
v3.4S + .else + sub v30.4S, v16.4S, v6.4S + sub v31.4S, v17.4S, v7.4S + add v16.4S, v16.4S, v6.4S + add v17.4S, v17.4S, v7.4S + add v18.4S, v20.4S, v28.4S + add v19.4S, v21.4S, v29.4S + sub v28.4S, v20.4S, v28.4S + sub v29.4S, v21.4S, v29.4S + add v20.4S, v24.4S, v4.4S + add v21.4S, v25.4S, v5.4S + sub v26.4S, v24.4S, v4.4S + sub v27.4S, v25.4S, v5.4S + add v22.4S, v0.4S, v2.4S + add v23.4S, v1.4S, v3.4S + sub v24.4S, v0.4S, v2.4S + sub v25.4S, v1.4S, v3.4S + .endif + .unreq va + .unreq vaa + .unreq vb + .unreq vbb +.endm + +function ff_h264_idct8_add_neon_10, export=1 +.L_ff_h264_idct8_add_neon_10: + movi v7.4S, #0 + sxtw x2, w2 + + ld1 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1] + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1] + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1] + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + st1 {v7.4S}, [x1], #16 + + idct8x8_cols_10 0 + + transpose_8x8S v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v0, v1, v30, v31, v2, v3, v4, v5 + idct8x8_cols_10 1 + + mov x3, x0 + srshr v16.4S, v16.4S, #6 + srshr v17.4S, v17.4S, #6 + ld1 {v0.8H}, [x0], x2 + srshr v18.4S, v18.4S, #6 + srshr v19.4S, v19.4S, #6 + ld1 {v1.8H}, [x0], x2 + srshr v20.4S, v20.4S, #6 + srshr v21.4S, v21.4S, #6 + ld1 {v2.8H}, [x0], x2 + srshr v22.4S, v22.4S, #6 + srshr v23.4S, v23.4S, #6 + ld1 {v3.8H}, [x0], x2 + srshr v24.4S, v24.4S, #6 + srshr v25.4S, v25.4S, #6 + ld1 {v4.8H}, [x0], x2 + srshr v26.4S, v26.4S, #6 + srshr v27.4S, v27.4S, #6 + ld1 {v5.8H}, [x0], x2 + srshr v28.4S, v28.4S, #6 + srshr v29.4S, v29.4S, #6 + ld1 {v6.8H}, [x0], x2 + srshr v30.4S, v30.4S, #6 + srshr v31.4S, v31.4S, #6 + ld1 {v7.8H}, [x0], x2 + uaddw v16.4S, v16.4S, v0.4H + uaddw2 v17.4S, v17.4S, v0.8H + uaddw v18.4S, v18.4S, v1.4H + uaddw2 v19.4S, v19.4S, v1.8H + 
uaddw v20.4S, v20.4S, v2.4H + uaddw2 v21.4S, v21.4S, v2.8H + sqxtun v0.4H, v16.4S + sqxtun2 v0.8H, v17.4S + uaddw v22.4S, v22.4S, v3.4H + uaddw2 v23.4S, v23.4S, v3.8H + sqxtun v1.4H, v18.4S + sqxtun2 v1.8H, v19.4S + uaddw v24.4S, v24.4S, v4.4H + uaddw2 v25.4S, v25.4S, v4.8H + sqxtun v2.4H, v20.4S + sqxtun2 v2.8H, v21.4S + st1 {v0.8H}, [x3], x2 + uaddw v26.4S, v26.4S, v5.4H + uaddw2 v27.4S, v27.4S, v5.8H + sqxtun v3.4H, v22.4S + sqxtun2 v3.8H, v23.4S + st1 {v1.8H}, [x3], x2 + uaddw v28.4S, v28.4S, v6.4H + uaddw2 v29.4S, v29.4S, v6.8H + sqxtun v4.4H, v24.4S + sqxtun2 v4.8H, v25.4S + st1 {v2.8H}, [x3], x2 + uaddw v30.4S, v30.4S, v7.4H + uaddw2 v31.4S, v31.4S, v7.8H + sqxtun v5.4H, v26.4S + sqxtun2 v5.8H, v27.4S + st1 {v3.8H}, [x3], x2 + sqxtun v6.4H, v28.4S + sqxtun2 v6.8H, v29.4S + sqxtun v7.4H, v30.4S + sqxtun2 v7.8H, v31.4S + st1 {v4.8H}, [x3], x2 + st1 {v5.8H}, [x3], x2 + st1 {v6.8H}, [x3], x2 + st1 {v7.8H}, [x3], x2 + + sub x1, x1, #256 + ret +endfunc + +function ff_h264_idct8_dc_add_neon_10, export=1 +.L_ff_h264_idct8_dc_add_neon_10: + mov x3, #0 + sxtw x2, w2 + ld1r {v31.4S}, [x1] + str x3, [x1] + ld1 {v0.8H}, [x0], x2 + srshr v30.4S, v31.4S, #6 + srshr v31.4S, v31.4S, #6 + ld1 {v1.8H}, [x0], x2 + ld1 {v2.8H}, [x0], x2 + uaddw v16.4S, v31.4S, v0.4H + uaddw2 v17.4S, v31.4S, v0.8H + ld1 {v3.8H}, [x0], x2 + uaddw v18.4S, v31.4S, v1.4H + uaddw2 v19.4S, v31.4S, v1.8H + ld1 {v4.8H}, [x0], x2 + uaddw v20.4S, v31.4S, v2.4H + uaddw2 v21.4S, v31.4S, v2.8H + ld1 {v5.8H}, [x0], x2 + uaddw v22.4S, v31.4S, v3.4H + uaddw2 v23.4S, v31.4S, v3.8H + ld1 {v6.8H}, [x0], x2 + uaddw v24.4S, v31.4S, v4.4H + uaddw2 v25.4S, v31.4S, v4.8H + ld1 {v7.8H}, [x0], x2 + uaddw v26.4S, v31.4S, v5.4H + uaddw2 v27.4S, v31.4S, v5.8H + uaddw v28.4S, v31.4S, v6.4H + uaddw2 v29.4S, v31.4S, v6.8H + uaddw v30.4S, v31.4S, v7.4H + uaddw2 v31.4S, v31.4S, v7.8H + sqxtun v0.4H, v16.4S + sqxtun2 v0.8H, v17.4S + sqxtun v1.4H, v18.4S + sqxtun2 v1.8H, v19.4S + sqxtun v2.4H, v20.4S + sqxtun2 v2.8H, v21.4S + 
sqxtun v3.4H, v22.4S + sqxtun2 v3.8H, v23.4S + sub x0, x0, x2, lsl #3 + + mvni v16.8H, #0xFC, lsl #8 + smin v0.8H, v0.8H, v16.8H + st1 {v0.8H}, [x0], x2 + sqxtun v4.4H, v24.4S + sqxtun2 v4.8H, v25.4S + smin v1.8H, v1.8H, v16.8H + st1 {v1.8H}, [x0], x2 + sqxtun v5.4H, v26.4S + sqxtun2 v5.8H, v27.4S + smin v2.8H, v2.8H, v16.8H + st1 {v2.8H}, [x0], x2 + sqxtun v6.4H, v28.4S + sqxtun2 v6.8H, v29.4S + smin v3.8H, v3.8H, v16.8H + st1 {v3.8H}, [x0], x2 + sqxtun v7.4H, v30.4S + sqxtun2 v7.8H, v31.4S + smin v4.8H, v4.8H, v16.8H + st1 {v4.8H}, [x0], x2 + smin v5.8H, v5.8H, v16.8H + st1 {v5.8H}, [x0], x2 + smin v6.8H, v6.8H, v16.8H + st1 {v6.8H}, [x0], x2 + smin v7.8H, v7.8H, v16.8H + st1 {v7.8H}, [x0], x2 + ret +endfunc + +function ff_h264_idct8_add4_neon_10, export=1 + mov x12, x30 + mov x6, x0 + mov x5, x1 + mov x1, x2 + mov w2, w3 + movrel x7, scan8 + mov w10, #16 + movrel x13, .L_ff_h264_idct8_dc_add_neon_10 + movrel x14, .L_ff_h264_idct8_add_neon_10 +1: ldrb w9, [x7], #4 + ldrsw x0, [x5], #16 // block_offset + ldrb w9, [x4, w9, UXTW] // nnz + subs w9, w9, #1 + b.lt 2f + ldr w11, [x1] + add x0, x6, x0 + ccmp w11, #0, #4, eq + csel x15, x13, x14, ne + blr x15 +2: subs w10, w10, #4 + add x1, x1, #256 + b.ne 1b + ret x12 +endfunc -- 2.32.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".