Hi,

Just looking for someone with i386 hardware who could test enabling ASM on i386. The difference between the straight C path and the SSSE3 code should be fairly noticeable.
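If it helps, a rough comparison could look something like the sketch below. This is only a suggestion, not part of the diff: it assumes the rebuilt port is installed, that a second C-only build configured with -Denable_asm=false is kept around for reference (the ./build-noasm path is just a placeholder), and that ffmpeg is available to remux the sample linked below, since the dav1d CLI only reads raw AV1 bitstreams such as IVF.

  # remux the sample to IVF, dropping audio and copying the AV1 stream
  $ ffmpeg -i 'Snowfall - 29314.mkv' -an -c:v copy snowfall.ivf

  # time a decode with the asm-enabled (SSSE3) build
  $ time dav1d -i snowfall.ivf -o /dev/null --muxer null

  # time the same decode with a C-only build (-Denable_asm=false) for comparison
  $ time ./build-noasm/tools/dav1d -i snowfall.ivf -o /dev/null --muxer null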
Sample AV1 encoded content to try..
https://comstyle.com/av1/Snowfall%20-%2029314.mkv

Index: Makefile
===================================================================
RCS file: /home/cvs/ports/multimedia/dav1d/Makefile,v
retrieving revision 1.22
diff -u -p -u -p -r1.22 Makefile
--- Makefile	24 Jun 2020 16:43:34 -0000	1.22
+++ Makefile	28 Jun 2020 04:40:27 -0000
@@ -4,6 +4,7 @@ COMMENT=	small and fast AV1 decoder
 
 VER=		0.7.1
 DISTNAME=	dav1d-${VER}
+REVISION=	0
 CATEGORIES=	multimedia
 MASTER_SITES=	https://downloads.videolan.org/pub/videolan/dav1d/${VER}/
 EXTRACT_SUFX=	.tar.xz
@@ -25,12 +26,8 @@ MODULES=	devel/meson
 COMPILER=	base-clang ports-gcc
 COMPILER_LANGS=	c
 
-.if ${MACHINE_ARCH} == "amd64"
+.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386"
 BUILD_DEPENDS+=	devel/nasm
-.endif
-
-.if ${MACHINE_ARCH} == "i386"
-CONFIGURE_ARGS+=-Denable_asm=false
 .endif
 
 .include <bsd.port.mk>
Index: patches/patch-src_x86_mc_sse_asm
===================================================================
RCS file: patches/patch-src_x86_mc_sse_asm
diff -N patches/patch-src_x86_mc_sse_asm
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ patches/patch-src_x86_mc_sse_asm	27 Jun 2020 04:39:39 -0000
@@ -0,0 +1,371 @@
+$OpenBSD$
+
+x86: Fix 32-bit build with PIC enabled.
+
+Index: src/x86/mc_sse.asm
+--- src/x86/mc_sse.asm.orig
++++ src/x86/mc_sse.asm
+@@ -1263,7 +1263,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, m
+ %if ARCH_X86_64
+     mova         m8, [pw_8]
+ %else
+-  %define m8 [pw_8]
++  %define m8 [t1-prep_sse2+pw_8]
+ %endif
+     pxor         m7, m7
+ %endif
+@@ -1272,13 +1272,11 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, m
+     pshuflw      m6, m6, q0000
+ %if cpuflag(ssse3)
+     punpcklqdq   m6, m6
+-%else
+- %if ARCH_X86_64
++%elif ARCH_X86_64
+     psrlw        m0, m8, 3
+     punpcklwd    m6, m0
+- %else
++%else
+     punpcklwd    m6, [base+pw_1]
+- %endif
+ %endif
+ %if ARCH_X86_32
+     mov          t1, t2 ; save base reg for w4
+@@ -1396,8 +1394,8 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, m
+     PUSH         r7
+ %endif
+     mov          r7, tmpq
++    mov          r5, srcq
+ %endif
+-    mov          t1, srcq
+ .hv_w16_hloop:
+     movu         m0, [srcq+strideq*0+8*0]
+     movu         m1, [srcq+strideq*0+8*1]
+@@ -1440,14 +1438,17 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, m
+     sub          hd, 2
+     jg .hv_w16_vloop
+     movzx        hd, t2w
+-    add          t1, 16
+-    mov          srcq, t1
+ %if ARCH_X86_64
++    add          r5, 16
+     add          r7, 2*16
++    mov          srcq, r5
+     mov          tmpq, r7
+ %else
++    mov          srcq, srcmp
+     mov          tmpq, tmpmp
++    add          srcq, 16
+     add          tmpq, 2*16
++    mov          srcmp, srcq
+     mov          tmpmp, tmpq
+ %endif
+     sub          t2d, 1<<16
+@@ -2624,22 +2625,20 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx,
+ %macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
+  %if cpuflag(ssse3)
+     phaddw       %1, %2
+- %else
+-  %ifnidn %1, %2
++ %elifnidn %1, %2
+   %if %4 == 1
+-    mova         %3, [pw_1]
++    mova         %3, [base+pw_1]
+   %endif
+     pmaddwd      %1, %3
+     pmaddwd      %2, %3
+     packssdw     %1, %2
+-  %else
++ %else
+   %if %4 == 1
+-    pmaddwd      %1, [pw_1]
++    pmaddwd      %1, [base+pw_1]
+  %else
+     pmaddwd      %1, %3
+  %endif
+     packssdw     %1, %1
+-  %endif
+ %endif
+ %endmacro
+ 
+@@ -2795,11 +2794,9 @@ PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+ %if ARCH_X86_32
+  %define base_reg r2
+  %define base base_reg-prep%+SUFFIX
+- %define W32_RESTORE_SSQ mov strideq, stridem
+ %else
+  %define base_reg r7
+  %define base 0
+- %define W32_RESTORE_SSQ
+ %endif
+ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ %assign org_stack_offset stack_offset
+@@ -2835,6 +2832,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+ %else
+     WIN64_SPILL_XMM 16
+ %endif
++%if ARCH_X86_32
++ %define strideq r6
++    mov          strideq, stridem
++%endif
+     cmp          wd, 4
+     je .h_w4
+     tzcnt        wd, wd
+@@ -2894,7 +2895,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     punpcklbw    m4, m4
+     psraw        m4, 8
+ %endif
+-    W32_RESTORE_SSQ
+ %if ARCH_X86_64
+     lea          stride3q, [strideq*3]
+ %endif
+@@ -2916,8 +2916,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     pshufb       m1, m5
+     pshufb       m2, m5
+     pshufb       m3, m5
+-%else
+- %if ARCH_X86_64
++%elif ARCH_X86_64
+     movd         m0, [srcq+strideq*0+0]
+     movd         m12, [srcq+strideq*0+1]
+     movd         m1, [srcq+strideq*1+0]
+@@ -2947,7 +2946,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     punpcklqdq   m1, m5 ; 1
+     punpcklqdq   m2, m13 ; 2
+     punpcklqdq   m3, m7 ; 3
+- %else
++%else
+     movd         m0, [srcq+strideq*0+0]
+     movd         m1, [srcq+strideq*0+1]
+     movd         m2, [srcq+strideq*0+2]
+@@ -2978,7 +2977,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     lea          srcq, [srcq+strideq*2]
+     punpckldq    m7, m5
+     punpcklqdq   m3, m7 ; 3
+- %endif
+ %endif
+     PMADDUBSW    m0, m4, m5, m7, 1 ; subpel_filters + 2
+     PMADDUBSW    m1, m4, m5, m7, 0
+@@ -2994,14 +2992,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     sub          hd, 4
+     jg .h_w4_loop
+     RET
+-    ;
+ .h_w8:
+-%if ARCH_X86_32
+-    mov          r3, r2
+- %define base_reg r3
+-    W32_RESTORE_SSQ
+-%endif
+-.h_w8_loop:
+ %if cpuflag(ssse3)
+     PREP_8TAP_H  0, srcq+strideq*0
+     PREP_8TAP_H  1, srcq+strideq*1
+@@ -3017,51 +3008,42 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     add          tmpq, 16
+     dec          hd
+ %endif
+-    jg .h_w8_loop
++    jg .h_w8
+     RET
+ .h_w16:
+-    mov          r6, -16*1
++    mov          r3, -16*1
+     jmp .h_start
+ .h_w32:
+-    mov          r6, -16*2
++    mov          r3, -16*2
+     jmp .h_start
+ .h_w64:
+-    mov          r6, -16*4
++    mov          r3, -16*4
+     jmp .h_start
+ .h_w128:
+-    mov          r6, -16*8
++    mov          r3, -16*8
+ .h_start:
+-%if ARCH_X86_32
+-    mov          r3, r2
+- %define base_reg r3
+-%endif
+-    sub          srcq, r6
+-    mov          r5, r6
+-    W32_RESTORE_SSQ
++    sub          srcq, r3
++    mov          r5, r3
+ .h_loop:
+ %if cpuflag(ssse3)
+-    PREP_8TAP_H  0, srcq+r6+8*0
+-    PREP_8TAP_H  1, srcq+r6+8*1
++    PREP_8TAP_H  0, srcq+r3+8*0
++    PREP_8TAP_H  1, srcq+r3+8*1
+     mova         [tmpq+16*0], m0
+     mova         [tmpq+16*1], m1
+     add          tmpq, 32
+-    add          r6, 16
++    add          r3, 16
+ %else
+-    PREP_8TAP_H  0, srcq+r6
++    PREP_8TAP_H  0, srcq+r3
+     mova         [tmpq], m0
+     add          tmpq, 16
+-    add          r6, 8
++    add          r3, 8
+ %endif
+     jl .h_loop
+     add          srcq, strideq
+-    mov          r6, r5
++    mov          r3, r5
+     dec          hd
+     jg .h_loop
+     RET
+-%if ARCH_X86_32
+- %define base_reg r2
+-%endif
+-    ;
+ .v:
+     LEA          base_reg, prep%+SUFFIX
+ %if ARCH_X86_32
+@@ -3086,7 +3068,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+-%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
++%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
+ %if cpuflag(ssse3)
+     ALLOC_STACK  -mmsize*4
+ %else
+@@ -3105,15 +3087,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     movd         m0, [myq+6]
+     PSHUFB_0X1X  m0, m2
+     mova         subpel3, m0
+- %if notcpuflag(ssse3)
+-    mov          r6, base_reg
+-  %define base_reg r6
+- %endif
+     mov          strideq, [rstk+stack_offset+gprsize*3]
+-    lea          strideq, [strideq*3]
+-    sub          [rstk+stack_offset+gprsize*2], strideq
+-    mov          strideq, [rstk+stack_offset+gprsize*3]
+-    mov          srcq, [rstk+stack_offset+gprsize*2]
++    lea          r5, [strideq*3]
++    sub          srcq, r5
+ %else
+ %define subpel0 m8
+ %define subpel1 m9
+@@ -3245,10 +3221,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     jg .v_w4_loop0
+ %endif
+     RET
+-%if ARCH_X86_32 && notcpuflag(ssse3)
+- %define base_reg r2
+-%endif
+-    ;
+ %if ARCH_X86_64
+ .v_w8:
+     lea          r5d, [wq - 8] ; horizontal loop
+@@ -3373,16 +3345,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     cmp          hd, 6
+     cmovs        myd, mxd
+     movq         m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+-    mov          r5, r2; use as new base
+- %define base_reg r5
+- %assign regs_used 2
++    mov          strideq, stridem
++ %assign regs_used 6
+     ALLOC_STACK  -mmsize*14
+ %assign regs_used 7
+-    mov          strideq, [rstk+stack_offset+gprsize*3]
+-    lea          strideq, [strideq*3 + 1]
+-    sub          [rstk+stack_offset+gprsize*2], strideq
+-    mov          strideq, [rstk+stack_offset+gprsize*3]
+-    mov          srcq, [rstk+stack_offset+gprsize*2]
++    lea          r5, [strideq*3+1]
++    sub          srcq, r5
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+@@ -3445,9 +3413,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+ %define hv4_line_1_3 13
+ %if ARCH_X86_32
+  %if cpuflag(ssse3)
+-   %define w8192reg [base+pw_8192]
++  %define w8192reg [base+pw_8192]
+  %else
+-   %define w8192reg [base+pw_2]
++  %define w8192reg [base+pw_2]
+  %endif
+  %define d32reg [base+pd_32]
+ %else
+@@ -3676,7 +3644,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+ %define hv8_line_6 4
+     shr          mxd, 16
+ %if ARCH_X86_32
+- %define base_reg r2
+  %define subpelh0 [rsp+mmsize*5]
+  %define subpelh1 [rsp+mmsize*6]
+  %define subpelv0 [rsp+mmsize*7]
+@@ -3692,16 +3659,16 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     cmp          hd, 6
+     cmovs        myd, mxd
+     movq         m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+-    ALLOC_STACK  -mmsize*13
++    mov          strideq, stridem
++ %assign regs_used 6
++    ALLOC_STACK  -mmsize*14
++ %assign regs_used 7
+ %if STACK_ALIGNMENT < mmsize
+-    mov          rstk, r2m
+-  %define tmpm [rsp+mmsize*13+gprsize*1]
+-  %define srcm [rsp+mmsize*13+gprsize*2]
+-  %define stridem [rsp+mmsize*13+gprsize*3]
+-    mov          stridem, rstk
++ %define tmpm [rsp+mmsize*13+gprsize*1]
++ %define srcm [rsp+mmsize*13+gprsize*2]
++ %define stridem [rsp+mmsize*13+gprsize*3]
++    mov          stridem, strideq
+ %endif
+-    mov          r6, r2
+- %define base_reg r6
+     pshufd       m0, m1, q0000
+     pshufd       m1, m1, q1111
+     punpcklbw    m5, m5
+@@ -3724,12 +3691,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+     mova         subpelv1, m3
+     mova         subpelv2, m4
+     mova         subpelv3, m5
+-    W32_RESTORE_SSQ
+-    lea          strided, [strided*3]
+-    sub          srcd, strided
+-    sub          srcd, 3
+-    mov          srcm, srcd
+-    W32_RESTORE_SSQ
++    lea          r5, [strideq*3+3]
++    sub          srcq, r5
++    mov          srcm, srcq
+ %else
+     ALLOC_STACK  mmsize*5, 16
+ %define subpelh0 m10
+@@ -3765,7 +3729,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+ %if notcpuflag(ssse3)
+     mova         m7, [base+pw_2]
+ %endif
+-    lea          stride3q,  [strideq*3]
++    lea          stride3q, [strideq*3]
+     sub          srcq, 3
+     sub          srcq, stride3q
+     mov          r6, srcq
+@@ -3939,11 +3903,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx
+ .hv_w8_outer:
+     movzx        hd, r5w
+ %if ARCH_X86_32
+-    add          dword tmpm, 8
+-    mov          tmpq, tmpm
+     mov          srcq, srcm
++    mov          tmpq, tmpm
+     add          srcq, 4
++    add          tmpq, 8
+     mov          srcm, srcq
++    mov          tmpm, tmpq
+ %else
+     add          r8, 8
+     mov          tmpq, r8