Re: [FFmpeg-devel] [PATCH v2 1/2] lavc/aarch64: add hevc qpel assembly

2022-02-07 Thread Martin Storsjö

On Thu, 3 Feb 2022, J. Dekker wrote:


Thanks: Rafal Dabrowa 
---
libavcodec/aarch64/Makefile   |1 +
libavcodec/aarch64/hevcdsp_init_aarch64.c |   67 +
libavcodec/aarch64/hevcdsp_qpel_neon.S| 2799 +
3 files changed, 2867 insertions(+)
create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

Had trouble testing on a Linux machine as well, but have a workflow
set up for that now, so it should be easier in the future. Passes FATE on
both macOS and Linux.



+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));


Passing a whole parenthesized expression like this, via one macro 
parameter, feels quite unorthodox to me, but it does seem to work now with 
all compilers I have to test with, so I guess it's tolerable that way.



+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+.Lqpel_filters:
+.byte  0,  0,  0,  0,  0,  0, 0,  0


This assembles incorrectly with gas-preprocessor targeting MSVC armasm64.

Normally we enclose all such constants in const/endconst, which sets up 
the appropriate section and all that. But if put into the const data 
section, it's probably too far away for an 'adr' instruction, so then 
you'd need to use the movrel macro (expanding to 'adrp' + 'add').


A less elegant workaround for armasm/gas-preprocessor is to just add a 
'.text' above this.



+.byte -1,  4,-10, 58, 17, -5, 1,  0
+.byte -1,  4,-11, 40, 40,-11, 4, -1
+.byte  0,  1, -5, 17, 58,-10, 4, -1
+
+.macro load_qpel_filterb freg, xreg
+adr \xreg, .Lqpel_filters
+add \xreg, \xreg, \freg, lsl #3
+ld4r   {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
+ld4r   {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]


Please follow the normal coding style: align the starting '{' just like 
the other characters at the start of the operand column; don't leave it 
outside. This goes for the whole file.



+neg v0.16b, v0.16b
+neg v2.16b, v2.16b
+neg v5.16b, v5.16b
+neg v7.16b, v7.16b


Why these negations? Can't you just change the corresponding umlsl/umlal 
instructions matchingly?


Also, can't those umlsl/umlal use the elementwise form, e.g. v0.b[0], so 
you wouldn't need to waste 8 full registers on the coefficients? (If 
you've got enough registers so you don't need to clobber v8-v15, there's 
probably no benefit in squeezing things tighter though. But if there's 
code that could be made more efficient if you'd have more spare registers, 
that could help.)



+.endm
+
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
+umlsl   \dst\().8h, \src0\().8b, v0.8b


Could this first one be plain 'umull' (if you wouldn't negate the 
coefficient), avoiding an extra 'movi v28.8h, #0'?



+umlal   \dst\().8h, \src1\().8b, v1.8b
+umlsl   \dst\().8h, \src2\().8b, v2.8b
+umlal   \dst\().8h, \src3\().8b, v3.8b
+umlal   \dst\().8h, \src4\().8b, v4.8b
+umlsl   \dst\().8h, \src5\().8b, v5.8b
+umlal   \dst\().8h, \src6\().8b, v6.8b
+umlsl   \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+umlsl2  \dst\().8h, \src0\().16b, v0.16b
+umlal2  \dst\().8h, \src1\().16b, v1.16b
+umlsl2  \dst\().8h, \src2\().16b, v2.16b
+umlal2  \dst\().8h, \src3\().16b, v3.16b
+umlal2  \dst\().8h, \src4\().16b, v4.16b
+umlsl2  \dst\().8h, \src5\().16b, v5.16b
+umlal2  \dst\().8h, \src6\().16b, v6.16b
+umlsl2  \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+.macro load_qpel_filterh freg, xreg
+adr \xreg, .Lqpel_filters
+add \xreg, \xreg, \freg, lsl #3
+ld1{v0.8b}, [\xreg]
+sxtlv0.8h, v0.8b
+.endm
+
+.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, 
shift=6
+smull   \dst\().4s, \src0\().4h, v0.h[0]
+smlal   \dst\().4s, \src1\().4h, v0.h[1]
+smlal   \dst\().4s, \src2\().4h, v0.h[2]
+smlal   \dst\().4s, \src3\().4h, v0.h[3]
+smlal   \dst\().4s, \src4\().4h, v0.h[4]
+smlal   \dst\().4s, \src5\().4h, v0.h[5]
+smlal   \dst\().4s, \src6\().4h, v0.h[6]
+smlal   \dst\().4s, \src7\().4h, v0.h[7]
+.ifc \op, sshr
+sshr\dst\().4s, \dst\().4s, \shift
+.else
+\op \dst\().4h, \dst\().4s, \shift
+.endif
+.endm
+
+.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, 
op, shift=6
+smull2  \dstt\().4s, \src0\().8h, v0.h[0]
+smlal2  

[FFmpeg-devel] [PATCH v2 1/2] lavc/aarch64: add hevc qpel assembly

2022-02-03 Thread J. Dekker
Thanks: Rafal Dabrowa 
---
 libavcodec/aarch64/Makefile   |1 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   67 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S| 2799 +
 3 files changed, 2867 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

 Had trouble testing on a Linux machine as well, but have a workflow
 set up for that now, so it should be easier in the future. Passes FATE on
 both macOS and Linux.

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..8592692479 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -63,4 +63,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9mc_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
aarch64/hevcdsp_init_aarch64.o  
\
+   aarch64/hevcdsp_qpel_neon.o 
\
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..3e5d85247e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,63 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   int16_t *sao_offset_val, int sao_left_class,
   int width, int height);
 
+#define NEON8_FNPROTO(fn, args) \
+void ff_hevc_put_hevc_##fn##4_8_neon args; \
+void ff_hevc_put_hevc_##fn##6_8_neon args; \
+void ff_hevc_put_hevc_##fn##8_8_neon args; \
+void ff_hevc_put_hevc_##fn##12_8_neon args; \
+void ff_hevc_put_hevc_##fn##16_8_neon args; \
+void ff_hevc_put_hevc_##fn##24_8_neon args; \
+void ff_hevc_put_hevc_##fn##32_8_neon args; \
+void ff_hevc_put_hevc_##fn##48_8_neon args; \
+void ff_hevc_put_hevc_##fn##64_8_neon args; \
 
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst,  ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst,  ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+int height, intptr_t mx, intptr_t my, int width));
+
+#define NEON8_FNASSIGN(member, v, h, fn) \
+member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon;  \
+member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon;  \
+member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon;  \
+member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -80,6 +136,17 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 // for the current size, but if enabled for bigger sizes, the cases
 // of non-multiple of 8 seem to arise.
 //c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+
+NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1,