Add AVX2 implementations for 10-bit H.264 16x16 intra prediction:
- pred16x16_vertical_10
- pred16x16_horizontal_10
- pred16x16_dc_10
- pred16x16_top_dc_10
- pred16x16_left_dc_10
- pred16x16_128_dc_10
10-bit 16x16 blocks are 32 bytes per row, perfectly matching AVX2's
256-bit YMM registers, allowing single-instruction row operations
versus two XMM operations with SSE2.
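For illustration, this is roughly what the vertical case computes (a minimal C
sketch for reference only, not the patch's code; the stride here is taken in
uint16_t elements, whereas the actual DSP functions receive a byte stride):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Copy the row above the block into all 16 rows.  Each row is
     * 16 * sizeof(uint16_t) = 32 bytes, i.e. exactly one 32-byte (YMM)
     * store per row in the AVX2 version. */
    static void pred16x16_vertical_10_ref(uint16_t *src, ptrdiff_t stride)
    {
        const uint16_t *top = src - stride;
        for (int y = 0; y < 16; y++)
            memcpy(src + y * stride, top, 16 * sizeof(*top));
    }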
checkasm benchmarks on Zen3 (cycles, lower is better):
                            C      SSE2   AVX2
pred16x16_dc_10            65.7   40.3   27.3   (1.48x vs SSE2)
pred16x16_128_dc_10        31.1   28.1   21.4   (1.31x vs SSE2)
pred16x16_horizontal_10    67.8   28.1   21.6   (1.30x vs SSE2)
pred16x16_left_dc_10       55.6   35.0   22.9   (1.53x vs SSE2)
pred16x16_top_dc_10        49.5   32.3   21.8   (1.48x vs SSE2)
pred16x16_vertical_10      32.3   28.3   24.1   (1.17x vs SSE2)
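The DC variants show the largest gains, since both the top-row reduction and
the 16 row stores map onto YMM operations. For reference, the arithmetic they
implement (per the formula in the asm comments below) is, as a hedged C sketch
with an element stride:

    /* DC = (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5,
     * then the whole 16x16 block is filled with that value. */
    static void pred16x16_dc_10_ref(uint16_t *src, ptrdiff_t stride)
    {
        unsigned sum = 16;                  /* rounding term      */
        for (int i = 0; i < 16; i++) {
            sum += src[i - stride];         /* 16 top neighbours  */
            sum += src[i * stride - 1];     /* 16 left neighbours */
        }
        uint16_t dc = sum >> 5;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                src[y * stride + x] = dc;
    }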
Merry Christmas from New Zealand!
---
libavcodec/x86/h264_intrapred_10bit.asm | 186 ++++++++++++++++++++++++
libavcodec/x86/h264_intrapred_init.c | 14 ++
2 files changed, 200 insertions(+)
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 2f30807332..78e2f263bc 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -1117,3 +1117,189 @@ cglobal pred16x16_128_dc_10, 2,3
dec r2d
jg .loop
RET
+
+;-----------------------------------------------------------------------------
+; AVX2 versions of pred16x16 10-bit functions
+; For 10-bit: 16 pixels * 2 bytes = 32 bytes = 1 YMM register (perfect match!)
+;-----------------------------------------------------------------------------
+
+%if HAVE_AVX2_EXTERNAL
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_vertical_10_avx2(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal pred16x16_vertical_10, 2, 4
+ sub r0, r1
+ movu m0, [r0] ; Load all 16 pixels (32 bytes) from top row
+ mov r2d, 4
+ lea r3, [r1*3]
+.loop:
+ movu [r0+r1*1], m0
+ movu [r0+r1*2], m0
+ movu [r0+r3 ], m0
+ lea r0, [r0+r1*2]
+ movu [r0+r1*2], m0
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_horizontal_10_avx2(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal pred16x16_horizontal_10, 2, 4
+ lea r2, [r1*3]
+ mov r3d, 4
+.loop:
+ vpbroadcastw m0, [r0-2]
+ movu [r0], m0
+ vpbroadcastw m0, [r0+r1-2]
+ movu [r0+r1], m0
+ vpbroadcastw m0, [r0+r1*2-2]
+ movu [r0+r1*2], m0
+ vpbroadcastw m0, [r0+r2-2]
+ movu [r0+r2], m0
+ lea r0, [r0+r1*4]
+ dec r3d
+ jg .loop
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_dc_10_avx2(pixel *src, ptrdiff_t stride)
+; DC = (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal pred16x16_dc_10, 2, 6
+ mov r5, r0 ; Save dest pointer
+ sub r0, r1
+ movu m0, [r0] ; Load top row (32 bytes)
+ vextracti128 xm1, m0, 1 ; Get high 128 bits
+ paddw xm0, xm1 ; Sum to 8 words
+ phaddw xm0, xm0 ; 4 words
+ phaddw xm0, xm0 ; 2 words
+ phaddw xm0, xm0 ; 1 word (top sum in low word)
+ movd r3d, xm0
+ and r3d, 0xFFFF ; Keep only low 16 bits
+
+ ; Sum left column using lea-based pointer advancement
+ lea r0, [r0+r1-2] ; Point to left pixel of row 0
+ movzx r4d, word [r0]
+ add r3d, r4d
+ movzx r4d, word [r0+r1]
+ add r3d, r4d
+%rep 7
+ lea r0, [r0+r1*2]
+ movzx r4d, word [r0]
+ add r3d, r4d
+ movzx r4d, word [r0+r1]
+ add r3d, r4d
+%endrep
+ add r3d, 16 ; Rounding
+ shr r3d, 5 ; Divide by 32
+
+ movd xm0, r3d
+ vpbroadcastw m0, xm0 ; Broadcast to all 16 words
+
+ ; Fill all 16 rows
+ mov r3d, 4
+ lea r4, [r1*3]
+.loop:
+ movu [r5+r1*0], m0
+ movu [r5+r1*1], m0
+ movu [r5+r1*2], m0
+ movu [r5+r4 ], m0
+ lea r5, [r5+r1*4]
+ dec r3d
+ jg .loop
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_top_dc_10_avx2(pixel *src, ptrdiff_t stride)
+; DC = (sum of 16 top pixels + 8) >> 4
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal pred16x16_top_dc_10, 2, 4
+ sub r0, r1
+ movu m0, [r0] ; Load top row
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ phaddw xm0, xm0
+ phaddw xm0, xm0
+ phaddw xm0, xm0
+ paddw xm0, [pw_8] ; Add 8 for rounding
+ psrlw xm0, 4 ; Divide by 16
+ vpbroadcastw m0, xm0
+
+ mov r2d, 4
+ lea r3, [r1*3]
+.loop:
+ movu [r0+r1*1], m0
+ movu [r0+r1*2], m0
+ movu [r0+r3 ], m0
+ lea r0, [r0+r1*2]
+ movu [r0+r1*2], m0
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_left_dc_10_avx2(pixel *src, ptrdiff_t stride)
+; DC = (sum of 16 left pixels + 8) >> 4
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal pred16x16_left_dc_10, 2, 5
+ mov r4, r0 ; Save dest pointer
+
+ ; Sum left column using lea-based pointer advancement
+ sub r0, 2 ; Point to left pixel of row 0
+ movzx r2d, word [r0]
+ movzx r3d, word [r0+r1]
+%rep 7
+ lea r0, [r0+r1*2]
+ movzx eax, word [r0]
+ add r2d, eax
+ movzx eax, word [r0+r1]
+ add r3d, eax
+%endrep
+ lea r2d, [r2+r3+8] ; Sum with rounding
+ shr r2d, 4 ; Divide by 16
+
+ movd xm0, r2d
+ vpbroadcastw m0, xm0
+
+ ; Fill all 16 rows
+ mov r2d, 4
+ lea r3, [r1*3]
+.loop:
+ movu [r4+r1*0], m0
+ movu [r4+r1*1], m0
+ movu [r4+r1*2], m0
+ movu [r4+r3 ], m0
+ lea r4, [r4+r1*4]
+ dec r2d
+ jg .loop
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_128_dc_10_avx2(pixel *src, ptrdiff_t stride)
+; Fill with constant 512 (1 << 9 for 10-bit midpoint)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal pred16x16_128_dc_10, 2, 4
+ vpbroadcastw m0, [pw_512]
+ mov r2d, 4
+ lea r3, [r1*3]
+.loop:
+ movu [r0+r1*0], m0
+ movu [r0+r1*1], m0
+ movu [r0+r1*2], m0
+ movu [r0+r3 ], m0
+ lea r0, [r0+r1*4]
+ dec r2d
+ jg .loop
+ RET
+
+%endif ; HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index aa9bc721f0..6918c7f985 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -97,6 +97,12 @@ PRED16x16(128_dc, 10, sse2)
PRED16x16(left_dc, 10, sse2)
PRED16x16(vertical, 10, sse2)
PRED16x16(horizontal, 10, sse2)
+PRED16x16(dc, 10, avx2)
+PRED16x16(top_dc, 10, avx2)
+PRED16x16(128_dc, 10, avx2)
+PRED16x16(left_dc, 10, avx2)
+PRED16x16(vertical, 10, avx2)
+PRED16x16(horizontal, 10, avx2)
/* 8-bit versions */
PRED16x16(vertical, 8, sse)
@@ -328,5 +334,13 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
 h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
 h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
}
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_avx2;
+ h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_avx2;
+ h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_avx2;
+ h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_avx2;
+ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_avx2;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_avx2;
+ }
}
}
--
2.51.0.windows.2