[FFmpeg-cvslog] libswscale/aarch64: add another hscale specialization

2022-08-16 Thread Swinney , Jonathan
ffmpeg | branch: master | Swinney, Jonathan  | Sat Aug 13 
20:48:30 2022 +| [75ffca7eef557bcc714d924048a6e184b39fa470] | committer: 
Martin Storsjö

libswscale/aarch64: add another hscale specialization

This specialization handles the case where filtersize is 4 mod 8, e.g.
12, 20, etc. Aarch64 was previously using the c function for this case.
This implementation speeds up that case significantly.

hscale_8_to_15__fs_12_dstW_512_c: 6234.1
hscale_8_to_15__fs_12_dstW_512_neon: 1505.6

Signed-off-by: Jonathan Swinney 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=75ffca7eef557bcc714d924048a6e184b39fa470
---

 libswscale/aarch64/hscale.S  | 107 +++
 libswscale/aarch64/swscale.c |  18 
 2 files changed, 117 insertions(+), 8 deletions(-)

diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
index b7b21b7a0f..a16d3dca42 100644
--- a/libswscale/aarch64/hscale.S
+++ b/libswscale/aarch64/hscale.S
@@ -91,6 +91,113 @@ function ff_hscale8to15_X8_neon, export=1
 ret
 endfunc
 
+function ff_hscale8to15_X4_neon, export=1
+// x0  SwsContext *c (not used)
+// x1  int16_t *dst
+// w2  int dstW
+// x3  const uint8_t *src
+// x4  const int16_t *filter
+// x5  const int32_t *filterPos
+// w6  int filterSize
+
+// This function is for filter sizes that are 4 mod 8. In other words, anything 
that's 0 mod 4 but not
+// 0 mod 8. It also assumes that dstW is 0 mod 4.
+
+lsl w7, w6, #1  // w7 = filterSize * 2
+1:
+ldp w8, w9,  [x5]   // filterPos[idx + 0], 
[idx + 1]
+ldp w10, w11, [x5, #8]  // filterPos[idx + 2], 
[idx + 3]
+
+moviv16.2d, #0  // initialize 
accumulator for idx + 0
+moviv17.2d, #0  // initialize 
accumulator for idx + 1
+moviv18.2d, #0  // initialize 
accumulator for idx + 2
+moviv19.2d, #0  // initialize 
accumulator for idx + 3
+
+mov x12, x4 // filter pointer for 
idx + 0
+add x13, x4, x7 // filter pointer for 
idx + 1
+add x8, x3, w8, uxtw// srcp + 
filterPos[idx + 0]
+add x9, x3, w9, uxtw// srcp + 
filterPos[idx + 1]
+
+add x14, x13, x7// filter pointer for 
idx + 2
+add x10, x3, w10, uxtw  // srcp + 
filterPos[idx + 2]
+add x11, x3, w11, uxtw  // srcp + 
filterPos[idx + 3]
+
+mov w0, w6  // copy filterSize to 
a temp register, w0
+add x5, x5, #16 // advance the 
filterPos pointer
+add x15, x14, x7// filter pointer for 
idx + 3
+mov x16, xzr// temp register for 
offsetting filter pointers
+
+2:
+// This section loops over 8-wide chunks of filter size
+ldr d4, [x8], #8// load 8 bytes from 
srcp for idx + 0
+ldr q0, [x12, x16]  // load 8 values, 16 
bytes from filter for idx + 0
+
+ldr d5, [x9], #8// load 8 bytes from 
srcp for idx + 1
+ldr q1, [x13, x16]  // load 8 values, 16 
bytes from filter for idx + 1
+
+uxtlv4.8h, v4.8b// unsigned extend 
long for idx + 0
+uxtlv5.8h, v5.8b// unsigned extend 
long for idx + 1
+
+ldr d6, [x10], #8   // load 8 bytes from 
srcp for idx + 2
+ldr q2, [x14, x16]  // load 8 values, 16 
bytes from filter for idx + 2
+
+smlal   v16.4s, v0.4h, v4.4h// val += src[srcPos + 
j + 0..3] * filter[fs * i + j + 0..3], idx + 0
+smlal   v17.4s, v1.4h, v5.4h// val += src[srcPos + 
j + 0..3] * filter[fs * i + j + 0..3], idx + 1
+
+ldr d7, [x11], #8   // load 8 bytes from 
srcp for idx + 3
+ldr q3, [x15, x16]  // load 8 values, 16 
bytes from filter for idx + 3
+
+sub w0, w0, #8  // decrement the 
remaining filterSize counter
+smlal2  v16.4s, v0.8h, v4.8h// val += src[srcPos + 
j + 4..7] * filter[fs * i + j + 4..7], idx + 0
+smlal2  v17.4s, v1.8h, v5.8h// val += src[srcPos + 
j + 4..7] * filter[fs * i + j + 4..7], idx + 1
+uxtlv6.8h, v6.8b// unsigned extend 
long for idx + 2
+uxtlv7.8h, v7.8b   

[FFmpeg-cvslog] fftools/ffmpeg: store a separate copy of input codec parameters

2022-08-16 Thread Anton Khirnov
ffmpeg | branch: master | Anton Khirnov  | Sat Aug 13 
17:03:39 2022 +0200| [ab3147383006f1a31978efce23c6bad38a754e05] | committer: 
Anton Khirnov

fftools/ffmpeg: store a separate copy of input codec parameters

Use it instead of AVStream.codecpar in the main thread. While
AVStream.codecpar is documented to only be updated when the stream is
added or avformat_find_stream_info(), it is actually updated during
demuxing. Accessing it from a different thread then constitutes a race.

Ideally, some mechanism should eventually be provided for signalling
parameter updates to the user. Then the demuxing thread could pick up
the changes and propagate them to the decoder.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ab3147383006f1a31978efce23c6bad38a754e05
---

 fftools/ffmpeg.c | 39 ---
 fftools/ffmpeg.h |  6 ++
 fftools/ffmpeg_opt.c |  6 +-
 3 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index 8eb7759392..ef7177fc33 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -608,6 +608,7 @@ static void ffmpeg_cleanup(int ret)
 av_freep(&ist->dts_buffer);
 
 avcodec_free_context(&ist->dec_ctx);
+avcodec_parameters_free(&ist->par);
 
 av_freep(&input_streams[i]);
 }
@@ -1492,7 +1493,7 @@ static void print_final_stats(int64_t total_size)
 
 for (j = 0; j < f->nb_streams; j++) {
 InputStream *ist = input_streams[f->ist_index + j];
-enum AVMediaType type = ist->st->codecpar->codec_type;
+enum AVMediaType type = ist->par->codec_type;
 
 total_size+= ist->data_size;
 total_packets += ist->nb_packets;
@@ -1809,7 +1810,7 @@ static void flush_encoders(void)
 for (x = 0; x < fg->nb_inputs; x++) {
 InputFilter *ifilter = fg->inputs[x];
 if (ifilter->format < 0 &&
-ifilter_parameters_from_codecpar(ifilter, 
ifilter->ist->st->codecpar) < 0) {
+ifilter_parameters_from_codecpar(ifilter, 
ifilter->ist->par) < 0) {
 av_log(NULL, AV_LOG_ERROR, "Error copying paramerets 
from input stream\n");
 exit_program(1);
 }
@@ -1912,11 +1913,11 @@ static void do_streamcopy(InputStream *ist, 
OutputStream *ost, const AVPacket *p
 if (pkt->dts == AV_NOPTS_VALUE) {
 opkt->dts = av_rescale_q(ist->dts, AV_TIME_BASE_Q, ost->mux_timebase);
 } else if (ost->st->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
-int duration = av_get_audio_frame_duration2(ist->st->codecpar, 
pkt->size);
+int duration = av_get_audio_frame_duration2(ist->par, pkt->size);
 if(!duration)
-duration = ist->st->codecpar->frame_size;
+duration = ist->par->frame_size;
 opkt->dts = av_rescale_delta(ist->st->time_base, pkt->dts,
-(AVRational){1, 
ist->st->codecpar->sample_rate}, duration,
+(AVRational){1, ist->par->sample_rate}, 
duration,
 &ist->filter_in_rescale_delta_last, 
ost->mux_timebase);
 /* dts will be set immediately afterwards to what pts is now */
 opkt->pts = opkt->dts - ost_tb_start_time;
@@ -1976,7 +1977,7 @@ static int ifilter_send_frame(InputFilter *ifilter, 
AVFrame *frame, int keep_ref
 /* determine if the parameters for this input changed */
 need_reinit = ifilter->format != frame->format;
 
-switch (ifilter->ist->st->codecpar->codec_type) {
+switch (ifilter->ist->par->codec_type) {
 case AVMEDIA_TYPE_AUDIO:
 need_reinit |= ifilter->sample_rate!= frame->sample_rate ||
av_channel_layout_compare(&ifilter->ch_layout, 
&frame->ch_layout);
@@ -2056,7 +2057,7 @@ static int ifilter_send_eof(InputFilter *ifilter, int64_t 
pts)
 } else {
 // the filtergraph was never configured
 if (ifilter->format < 0) {
-ret = ifilter_parameters_from_codecpar(ifilter, 
ifilter->ist->st->codecpar);
+ret = ifilter_parameters_from_codecpar(ifilter, ifilter->ist->par);
 if (ret < 0)
 return ret;
 }
@@ -2212,9 +2213,9 @@ static int decode_video(InputStream *ist, AVPacket *pkt, 
int *got_output, int64_
 
 // The following line may be required in some cases where there is no 
parser
 // or the parser does not has_b_frames correctly
-if (ist->st->codecpar->video_delay < ist->dec_ctx->has_b_frames) {
+if (ist->par->video_delay < ist->dec_ctx->has_b_frames) {
 if (ist->dec_ctx->codec_id == AV_CODEC_ID_H264) {
-ist->st->codecpar->video_delay = ist->dec_ctx->has_b_frames;
+ist->par->video_delay = ist->dec_ctx->has_b_frames;
 } else
 av_log(ist->dec_ctx, AV_LOG_WARNING,
"video_delay is larger in

[FFmpeg-cvslog] doc/APIchanges: add missing rgbaf16 pixfmt entry

2022-08-16 Thread Timo Rothenpieler
ffmpeg | branch: master | Timo Rothenpieler  | Tue Aug 
16 12:31:03 2022 +0200| [317f5252c09d6deee7025907eea2dfe44935c891] | committer: 
Timo Rothenpieler

doc/APIchanges: add missing rgbaf16 pixfmt entry

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=317f5252c09d6deee7025907eea2dfe44935c891
---

 doc/APIchanges | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/APIchanges b/doc/APIchanges
index a196bc40d4..b3ba07ee7c 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -14,6 +14,9 @@ libavutil: 2021-04-27
 
 API changes, most recent first:
 
+2022-08-07 - e95b08a7dd - lavu 57.33.101 - pixfmt.h
+  Add AV_PIX_FMT_RGBAF16{BE,LE} pixel formats.
+
 2022-08-xx - xx - lavu 57.33.100 - hwcontext_qsv.h
   Add loader field to AVQSVDeviceContext
 

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] checkasm: updated tests for sw_scale

2022-08-16 Thread Swinney , Jonathan
ffmpeg | branch: master | Swinney, Jonathan  | Sat Aug 13 
20:55:55 2022 +| [4dcd191a507b76003bc8a1d0d845d99e3ba9fabf] | committer: 
Martin Storsjö

checkasm: updated tests for sw_scale

Change the reference to exactly match the C reference in swscale,
instead of exactly matching the x86 SIMD implementations (which
differ slightly). Test with and without SWS_ACCURATE_RND - if this
flag isn't set, the output must match the C reference exactly,
otherwise it is allowed to be off by 2.

Mark a couple x86 functions as unavailable when SWS_ACCURATE_RND
is set - apparently this discrepancy hasn't been noticed in other
exact tests before.

Add a test for yuv2plane1.

Signed-off-by: Jonathan Swinney 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4dcd191a507b76003bc8a1d0d845d99e3ba9fabf
---

 libswscale/x86/swscale.c  |   8 +-
 tests/checkasm/sw_scale.c | 188 --
 2 files changed, 154 insertions(+), 42 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 628f12137c..32d441245d 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -534,7 +534,8 @@ switch(c->dstBpc){ \
 ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
 ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
 HAVE_ALIGNED_STACK || ARCH_X86_64);
-ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2);
+if (!(c->flags & SWS_ACCURATE_RND))
+ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2);
 
 switch (c->srcFormat) {
 case AV_PIX_FMT_YA8:
@@ -583,14 +584,15 @@ switch(c->dstBpc){ \
 ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
 if (!isBE(c->dstFormat)) c->yuv2planeX = 
ff_yuv2planeX_16_sse4,
 HAVE_ALIGNED_STACK || ARCH_X86_64);
-if (c->dstBpc == 16 && !isBE(c->dstFormat))
+if (c->dstBpc == 16 && !isBE(c->dstFormat) && !(c->flags & 
SWS_ACCURATE_RND))
 c->yuv2plane1 = ff_yuv2plane1_16_sse4;
 }
 
 if (EXTERNAL_AVX(cpu_flags)) {
 ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
 HAVE_ALIGNED_STACK || ARCH_X86_64);
-ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx);
+if (!(c->flags & SWS_ACCURATE_RND))
+ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx);
 
 switch (c->srcFormat) {
 case AV_PIX_FMT_YUYV422:
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index b643a47c30..859993db6f 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -35,40 +35,140 @@
 AV_WN32(buf + j, rnd());  \
 } while (0)
 
-// This reference function is the same approximate algorithm employed by the
-// SIMD functions
-static void ref_function(const int16_t *filter, int filterSize,
- const int16_t **src, uint8_t 
*dest, int dstW,
- const uint8_t *dither, int 
offset)
+static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
 {
-int i, d;
-d = ((filterSize - 1) * 8 + dither[0]) >> 4;
-for ( i = 0; i < dstW; i++) {
-int16_t val = d;
+// This corresponds to the yuv2planeX_8_c function
+int i;
+for (i = 0; i < dstW; i++) {
+int val = dither[(i + offset) & 7] << 12;
 int j;
-union {
-int val;
-int16_t v[2];
-} t;
-for (j = 0; j < filterSize; j++){
-t.val = (int)src[j][i + offset] * (int)filter[j];
-val += t.v[1];
+for (j = 0; j < filterSize; j++)
+val += src[j][i] * filter[j];
+
+dest[i]= av_clip_uint8(val >> 19);
+}
+}
+
+static int cmp_off_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int 
accuracy)
+{
+for (size_t i = 0; i < n; i++) {
+if (abs(ref[i] - test[i]) > accuracy)
+return 1;
+}
+return 0;
+}
+
+static void print_data(uint8_t *p, size_t len, size_t offset)
+{
+size_t i = 0;
+for (; i < len; i++) {
+if (i % 8 == 0) {
+printf("0x%04zx: ", i+offset);
+}
+printf("0x%02x ", (uint32_t) p[i]);
+if (i % 8 == 7) {
+printf("\n");
 }
-dest[i]= av_clip_uint8(val>>3);
 }
+if (i % 8 != 0) {
+printf("\n");
+}
+}
+
+static size_t show_differences(uint8_t *a, uint8_t *b, size_t len)
+{
+for (size_t i = 0; i < len; i++) {
+if (a[i] != b[i]) {
+size_t offset_of_mismatch = i;
+size_t offset;
+if (i >= 8) i-=8;
+offset = i & (~7);
+printf("test a:\n");
+print_data(&a[offset], 32, offset);
+printf("\ntest b:\n");
+print_data(&b[offset], 32, offset);
+   

[FFmpeg-cvslog] swscale/aarch64: vscale optimization

2022-08-16 Thread Swinney , Jonathan
ffmpeg | branch: master | Swinney, Jonathan  | Sat Aug 13 
20:56:02 2022 +| [3e708722a2d06b8c7290d359ccaea9cd9885e701] | committer: 
Martin Storsjö

swscale/aarch64: vscale optimization

Use scalar times vector multiply accumulate instructions instead of
vector times vector to remove the need for replicating load instructions
which are slightly slower.

On AWS c7g (Graviton 3, Neoverse V1) instances:
yuv2yuvX_8_0_512_accurate_neon:  1144.8  987.4
yuv2yuvX_16_0_512_accurate_neon: 2080.5 1869.4

Signed-off-by: Jonathan Swinney 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3e708722a2d06b8c7290d359ccaea9cd9885e701
---

 libswscale/aarch64/output.S | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index af71de6050..991750cf31 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -34,16 +34,15 @@ function ff_yuv2planeX_8_neon, export=1
 mov x9, x2  // srcp= src
 mov x10, x0 // filterp = filter
 3:  ldp x11, x12, [x9], #16 // get 2 pointers: 
src[j] and src[j+1]
+ldr s7, [x10], #4   // read 2x16-bit 
coeff X and Y at filter[j] and filter[j+1]
 add x11, x11, x7, lsl #1// &src[j  ][i]
 add x12, x12, x7, lsl #1// &src[j+1][i]
 ld1 {v5.8H}, [x11]  // read 8x16-bit @ 
src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
 ld1 {v6.8H}, [x12]  // read 8x16-bit @ 
src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
-ld1r{v7.8H}, [x10], #2  // read 1x16-bit 
coeff X at filter[j  ] and duplicate across lanes
-ld1r{v16.8H}, [x10], #2 // read 1x16-bit 
coeff Y at filter[j+1] and duplicate across lanes
-smlal   v3.4S, v5.4H, v7.4H // val0 += 
{A,B,C,D} * X
-smlal2  v4.4S, v5.8H, v7.8H // val1 += 
{E,F,G,H} * X
-smlal   v3.4S, v6.4H, v16.4H// val0 += 
{I,J,K,L} * Y
-smlal2  v4.4S, v6.8H, v16.8H// val1 += 
{M,N,O,P} * Y
+smlal   v3.4S, v5.4H, v7.H[0]   // val0 += 
{A,B,C,D} * X
+smlal2  v4.4S, v5.8H, v7.H[0]   // val1 += 
{E,F,G,H} * X
+smlal   v3.4S, v6.4H, v7.H[1]   // val0 += 
{I,J,K,L} * Y
+smlal2  v4.4S, v6.8H, v7.H[1]   // val1 += 
{M,N,O,P} * Y
 subsw8, w8, #2  // tmpfilterSize 
-= 2
 b.gt3b  // loop until 
filterSize consumed
 

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] swscale/aarch64: add vscale specializations

2022-08-16 Thread Swinney , Jonathan
ffmpeg | branch: master | Swinney, Jonathan  | Sat Aug 13 
20:56:06 2022 +| [0d7caa5b09b680aa2c8ae677a8d0fdc134b7e658] | committer: 
Martin Storsjö

swscale/aarch64: add vscale specializations

This commit adds new code paths for vscale when filterSize is 2, 4, or
8. By using specialized code with unrolling to match the filterSize we
can improve performance.

On AWS c7g (Graviton 3, Neoverse V1) instances:
 before   after
yuv2yuvX_2_0_512_accurate_neon:  558.8268.9
yuv2yuvX_4_0_512_accurate_neon:  637.5434.9
yuv2yuvX_8_0_512_accurate_neon:  1144.8   806.2
yuv2yuvX_16_0_512_accurate_neon: 2080.5   1853.7

Signed-off-by: Jonathan Swinney 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=0d7caa5b09b680aa2c8ae677a8d0fdc134b7e658
---

 libswscale/aarch64/output.S  | 177 +++
 libswscale/aarch64/swscale.c |  13 
 2 files changed, 190 insertions(+)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 991750cf31..b8a2818c9b 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -21,13 +21,33 @@
 #include "libavutil/aarch64/asm.S"
 
 function ff_yuv2planeX_8_neon, export=1
+// x0 - const int16_t *filter,
+// x1 - int filterSize,
+// x2 - const int16_t **src,
+// x3 - uint8_t *dest,
+// w4 - int dstW,
+// x5 - const uint8_t *dither,
+// w6 - int offset
+
 ld1 {v0.8B}, [x5]   // load 8x8-bit 
dither
+and w6, w6, #7
 cbz w6, 1f  // check if 
offsetting present
 ext v0.8B, v0.8B, v0.8B, #3 // honor 
offsetting which can be 0 or 3 only
 1:  uxtlv0.8H, v0.8B// extend dither 
to 16-bit
 ushll   v1.4S, v0.4H, #12   // extend dither 
to 32-bit with left shift by 12 (part 1)
 ushll2  v2.4S, v0.8H, #12   // extend dither 
to 32-bit with left shift by 12 (part 2)
+cmp w1, #8  // if filterSize 
== 8, branch to specialized version
+b.eq6f
+cmp w1, #4  // if filterSize 
== 4, branch to specialized version
+b.eq8f
+cmp w1, #2  // if filterSize 
== 2, branch to specialized version
+b.eq10f
+
+// The filter size does not match any of the specialized implementations. It is 
either even or odd. If it is even
+// then use the first section below.
 mov x7, #0  // i = 0
+tbnzw1, #0, 4f  // if filterSize % 
2 != 0 branch to specialized version
+// fs % 2 == 0
 2:  mov v3.16B, v1.16B  // initialize 
accumulator part 1 with dithering value
 mov v4.16B, v2.16B  // initialize 
accumulator part 2 with dithering value
 mov w8, w1  // tmpfilterSize = 
filterSize
@@ -54,4 +74,161 @@ function ff_yuv2planeX_8_neon, export=1
 add x7, x7, #8  // i += 8
 b.gt2b  // loop until 
width consumed
 ret
+
+// If filter size is odd (most likely == 1), then use this section.
+// fs % 2 != 0
+4:  mov v3.16B, v1.16B  // initialize 
accumulator part 1 with dithering value
+mov v4.16B, v2.16B  // initialize 
accumulator part 2 with dithering value
+mov w8, w1  // tmpfilterSize = 
filterSize
+mov x9, x2  // srcp= src
+mov x10, x0 // filterp = filter
+5:  ldr x11, [x9], #8   // get 1 pointer: 
src[j]
+ldr h6, [x10], #2   // read 1 16 bit 
coeff X at filter[j]
+add x11, x11, x7, lsl #1// &src[j  ][i]
+ld1 {v5.8H}, [x11]  // read 8x16-bit @ 
src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
+smlal   v3.4S, v5.4H, v6.H[0]   // val0 += 
{A,B,C,D} * X
+smlal2  v4.4S, v5.8H, v6.H[0]   // val1 += 
{E,F,G,H} * X
+subsw8, w8, #1  // tmpfilterSize 
-= 1
+b.gt5b  // loop until 
filterSize consumed
+
+sqshrun v3.4h, v3.4s, #16   // clip16(val0>>16)
+sqshrun2v3.8h, v4.4s, #16   // clip16(val1>>16)
+uqshrn  v3.8b, v3.8h, #3// clip8(v

[FFmpeg-cvslog] checkasm/hevc_add_res: add 12bit test

2022-08-16 Thread J . Dekker
ffmpeg | branch: master | J. Dekker  | Thu Jun 23 20:04:05 
2022 +0200| [ea6ecb12aa9ebfbc985f71938a6cccf5046ca826] | committer: J. Dekker

checkasm/hevc_add_res: add 12bit test

Also fix the bug where in every other byte only the lower 2 bits were
used in the 8bit test.

Signed-off-by: J. Dekker 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ea6ecb12aa9ebfbc985f71938a6cccf5046ca826
---

 tests/checkasm/hevc_add_res.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/checkasm/hevc_add_res.c b/tests/checkasm/hevc_add_res.c
index 0c896adaca..f17d121939 100644
--- a/tests/checkasm/hevc_add_res.c
+++ b/tests/checkasm/hevc_add_res.c
@@ -36,14 +36,14 @@
 }   \
 } while (0)
 
-#define randomize_buffers2(buf, size) \
+#define randomize_buffers2(buf, size, mask)   \
 do {  \
 int j;\
 for (j = 0; j < size; j++)\
-AV_WN16A(buf + j * 2, rnd() & 0x3FF); \
+AV_WN16A(buf + j * 2, rnd() & mask); \
 } while (0)
 
-static void compare_add_res(int size, ptrdiff_t stride, int overflow_test)
+static void compare_add_res(int size, ptrdiff_t stride, int overflow_test, int 
mask)
 {
 LOCAL_ALIGNED_32(int16_t, res0, [32 * 32]);
 LOCAL_ALIGNED_32(int16_t, res1, [32 * 32]);
@@ -53,7 +53,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int 
overflow_test)
 declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *res, 
ptrdiff_t stride);
 
 randomize_buffers(res0, size);
-randomize_buffers2(dst0, size);
+randomize_buffers2(dst0, size, mask);
 if (overflow_test)
 res0[0] = 0x8000;
 memcpy(res1, res0, sizeof(*res0) * size);
@@ -69,6 +69,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int 
overflow_test)
 static void check_add_res(HEVCDSPContext h, int bit_depth)
 {
 int i;
+int mask = bit_depth == 8 ? 0xFFFF : bit_depth == 10 ? 0x03FF : 0x07FF;
 
 for (i = 2; i <= 5; i++) {
 int block_size = 1 << i;
@@ -76,9 +77,9 @@ static void check_add_res(HEVCDSPContext h, int bit_depth)
 ptrdiff_t stride = block_size << (bit_depth > 8);
 
 if (check_func(h.add_residual[i - 2], "hevc_add_res_%dx%d_%d", 
block_size, block_size, bit_depth)) {
-compare_add_res(size, stride, 0);
+compare_add_res(size, stride, 0, mask);
 // overflow test for res = -32768
-compare_add_res(size, stride, 1);
+compare_add_res(size, stride, 1, mask);
 }
 }
 }
@@ -87,7 +88,7 @@ void checkasm_check_hevc_add_res(void)
 {
 int bit_depth;
 
-for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
+for (bit_depth = 8; bit_depth <= 12; bit_depth++) {
 HEVCDSPContext h;
 
 ff_hevc_dsp_init(&h, bit_depth);

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] lavc/aarch64: reformat add_res funcs

2022-08-16 Thread J . Dekker
ffmpeg | branch: master | J. Dekker  | Thu Jun 23 20:04:06 
2022 +0200| [aa9eabb7a5283fd90b3274ac4b6ba0d16e42] | committer: J. Dekker

lavc/aarch64: reformat add_res funcs

Signed-off-by: J. Dekker 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=aa9eabb7a5283fd90b3274ac4b6ba0d16e42
---

 libavcodec/aarch64/hevcdsp_idct_neon.S | 216 -
 1 file changed, 108 insertions(+), 108 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..484eea8437 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -27,21 +27,21 @@
 #include "libavutil/aarch64/asm.S"
 
 const trans, align=4
-.short 64, 83, 64, 36
-.short 89, 75, 50, 18
-.short 90, 87, 80, 70
-.short 57, 43, 25, 9
-.short 90, 90, 88, 85
-.short 82, 78, 73, 67
-.short 61, 54, 46, 38
-.short 31, 22, 13, 4
+.short  64, 83, 64, 36
+.short  89, 75, 50, 18
+.short  90, 87, 80, 70
+.short  57, 43, 25, 9
+.short  90, 90, 88, 85
+.short  82, 78, 73, 67
+.short  61, 54, 46, 38
+.short  31, 22, 13, 4
 endconst
 
 .macro clip10 in1, in2, c1, c2
-smax\in1, \in1, \c1
-smax\in2, \in2, \c1
-smin\in1, \in1, \c2
-smin\in2, \in2, \c2
+smax\in1, \in1, \c1
+smax\in2, \in2, \c1
+smin\in1, \in1, \c2
+smin\in2, \in2, \c2
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -50,13 +50,13 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
 ld1 {v2.s}[1], [x0], x2
 ld1 {v2.s}[2], [x0], x2
 ld1 {v2.s}[3], [x0], x2
-sub  x0,  x0,  x2, lsl #2
-uxtl v6.8h,  v2.8b
-uxtl2v7.8h,  v2.16b
-sqaddv0.8h,  v0.8h, v6.8h
-sqaddv1.8h,  v1.8h, v7.8h
-sqxtun   v0.8b,  v0.8h
-sqxtun2  v0.16b, v1.8h
+sub x0, x0, x2, lsl #2
+uxtlv6.8h,  v2.8b
+uxtl2   v7.8h,  v2.16b
+sqadd   v0.8h,  v0.8h, v6.8h
+sqadd   v1.8h,  v1.8h, v7.8h
+sqxtun  v0.8b,  v0.8h
+sqxtun2 v0.16b, v1.8h
 st1 {v0.s}[0], [x0], x2
 st1 {v0.s}[1], [x0], x2
 st1 {v0.s}[2], [x0], x2
@@ -70,63 +70,63 @@ function ff_hevc_add_residual_4x4_10_neon, export=1
 ld1 {v2.d}[0], [x12], x2
 ld1 {v2.d}[1], [x12], x2
 ld1 {v3.d}[0], [x12], x2
-sqaddv0.8h, v0.8h, v2.8h
+sqadd   v0.8h, v0.8h, v2.8h
 ld1 {v3.d}[1], [x12], x2
-movi v4.8h, #0
-sqaddv1.8h, v1.8h, v3.8h
-mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
-clip10   v0.8h, v1.8h, v4.8h, v5.8h
-st1 {v0.d}[0],  [x0], x2
-st1 {v0.d}[1],  [x0], x2
-st1 {v1.d}[0],  [x0], x2
-st1 {v1.d}[1],  [x0], x2
+moviv4.8h, #0
+sqadd   v1.8h, v1.8h, v3.8h
+mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF
+clip10  v0.8h, v1.8h, v4.8h, v5.8h
+st1 {v0.d}[0], [x0],  x2
+st1 {v0.d}[1], [x0],  x2
+st1 {v1.d}[0], [x0],  x2
+st1 {v1.d}[1], [x0],  x2
 ret
 endfunc
 
 function ff_hevc_add_residual_8x8_8_neon, export=1
-add x12,  x0, x2
-add  x2,  x2, x2
-mov  x3,  #8
-1:  subs x3,  x3, #2
-ld1 {v2.d}[0], [x0]
-ld1 {v2.d}[1],[x12]
-uxtl v3.8h,  v2.8b
+add x12, x0, x2
+add x2, x2, x2
+mov x3, #8
+1:  subsx3, x3, #2
+ld1 {v2.d}[0], [x0]
+ld1 {v2.d}[1], [x12]
+uxtlv3.8h,  v2.8b
 ld1 {v0.8h-v1.8h}, [x1], #32
-uxtl2v2.8h,  v2.16b
-sqaddv0.8h,  v0.8h,   v3.8h
-sqaddv1.8h,  v1.8h,   v2.8h
-sqxtun   v0.8b,  v0.8h
-sqxtun2  v0.16b, v1.8h
-st1 {v0.d}[0], [x0], x2
-st1 {v0.d}[1],[x12], x2
-bne  1b
+uxtl2   v2.8h,  v2.16b
+sqadd   v0.8h,  v0.8h, v3.8h
+sqadd   v1.8h,  v1.8h, v2.8h
+sqxtun  v0.8b,  v0.8h
+sqxtun2 v0.16b, v1.8h
+s

[FFmpeg-cvslog] checkasm: Provide enough alignment in the new yuv2plane1 test

2022-08-16 Thread Martin Storsjö
ffmpeg | branch: master | Martin Storsjö  | Tue Aug 16 
23:46:35 2022 +0300| [21c2c57ba5a8b426ad9c07902ec957dbbb20cf36] | committer: 
Martin Storsjö

checkasm: Provide enough alignment in the new yuv2plane1 test

This fixes the checkasm test in some setups on x86.

Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=21c2c57ba5a8b426ad9c07902ec957dbbb20cf36
---

 tests/checkasm/sw_scale.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 859993db6f..cbe4460a99 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -114,9 +114,9 @@ static void check_yuv2yuv1(int accurate)
   const int16_t *src, uint8_t *dest,
   int dstW, const uint8_t *dither, int offset);
 
-LOCAL_ALIGNED_8(int16_t, src_pixels, [LARGEST_INPUT_SIZE]);
-LOCAL_ALIGNED_8(uint8_t, dst0, [LARGEST_INPUT_SIZE]);
-LOCAL_ALIGNED_8(uint8_t, dst1, [LARGEST_INPUT_SIZE]);
+LOCAL_ALIGNED_16(int16_t, src_pixels, [LARGEST_INPUT_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, dst0, [LARGEST_INPUT_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, dst1, [LARGEST_INPUT_SIZE]);
 LOCAL_ALIGNED_8(uint8_t, dither, [8]);
 
 randomize_buffers((uint8_t*)dither, 8);

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] x86: Don't hardcode the height to 8 in sad8_xy2_mmx

2022-08-16 Thread Martin Storsjö
ffmpeg | branch: master | Martin Storsjö  | Wed Jul 13 
23:39:40 2022 +0300| [dc55e6357818e21e26afbcdfdf2dd7368ec0e8fd] | committer: 
Martin Storsjö

x86: Don't hardcode the height to 8 in sad8_xy2_mmx

The height is hardcoded in some of the me_cmp functions, but not
in all of them. But in the case of all other functions, it's hardcoded
in the same place in SIMD functions as in the C reference functions,
while this one function differs from the behaviour of the C code.

(Before 542765ce3eccbca587d54262a512cbdb1407230d, there were a
couple other sad8_*_mmx functions with similar hardcoded height.)

Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=dc55e6357818e21e26afbcdfdf2dd7368ec0e8fd
---

 libavcodec/x86/me_cmp_init.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index f140381c98..bc1051c27e 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -202,13 +202,12 @@ static inline int sum_mmx(void)
 static int sad8_xy2_ ## suf(MpegEncContext *v, const uint8_t *blk2, \
 const uint8_t *blk1, ptrdiff_t stride, int h) \
 {   \
-av_assert2(h == 8); \
 __asm__ volatile (  \
 "pxor %%mm7, %%mm7 \n\t"\
 "pxor %%mm6, %%mm6 \n\t"\
 ::);\
 \
-sad8_4_ ## suf(blk1, blk2, stride, 8);  \
+sad8_4_ ## suf(blk1, blk2, stride, h);  \
 \
 return sum_ ## suf();   \
 }   \

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] tools: Make sure to create the tools directory before building decode_simple.o

2022-08-16 Thread Martin Storsjö
ffmpeg | branch: master | Martin Storsjö  | Mon Aug  8 
11:33:26 2022 +0300| [1eaa575cf11c054b0b724480f3070fc908faf8ef] | committer: 
Martin Storsjö

tools: Make sure to create the tools directory before building decode_simple.o

This directory dependency is normally added implicitly by rules
in ffbuild/common.mak; for tools it's created by a rule for TOOLOBJS.
TOOLOBJS is populated implicitly from TOOLS, and decode_simple.o
doesn't end up there because it's an odd occurrence of a lone
object file in the tools subdirectory, not belonging to any other
tool.

Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1eaa575cf11c054b0b724480f3070fc908faf8ef
---

 tools/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/Makefile b/tools/Makefile
index f4d1327b9f..4afa23342d 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -20,6 +20,8 @@ tools/target_io_dem_fuzzer.o: tools/target_dem_fuzzer.c
 tools/venc_data_dump$(EXESUF): tools/decode_simple.o
 tools/scale_slice_test$(EXESUF): tools/decode_simple.o
 
+tools/decode_simple.o: | tools
+
 OUTDIRS += tools
 
 clean::

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] checkasm: motion: Test different h parameters

2022-08-16 Thread Martin Storsjö
ffmpeg | branch: master | Martin Storsjö  | Tue Jul 12 
23:58:01 2022 +0300| [d69d12a5b9236b9d2f1fd247ea452f84cdd1aaf9] | committer: 
Martin Storsjö

checkasm: motion: Test different h parameters

Previously, the checkasm test always passed h=8, so no other cases
were tested.

Out of the me_cmp functions, in practice, some functions are hardcoded
to always assume an 8x8 block (ignoring the h parameter), while others
do use the parameter. For those with hardcoded height, both the
reference C function and the assembly implementations ignore the
parameter similarly.

The documentation for the functions indicates that heights between
w/2 and 2*w, within the range of 4 to 16, should be supported. This
patch just tests random heights in that range, without knowing what
width the current function actually uses.

Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d69d12a5b9236b9d2f1fd247ea452f84cdd1aaf9
---

 tests/checkasm/motion.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
index 631a9ed29f..87b20d1c10 100644
--- a/tests/checkasm/motion.c
+++ b/tests/checkasm/motion.c
@@ -45,7 +45,7 @@ static void test_motion(const char *name, me_cmp_func 
test_func)
 /* motion estimation can look up to 17 bytes ahead */
 static const int look_ahead = 17;
 
-int i, x, y, d1, d2;
+int i, x, y, h, d1, d2;
 uint8_t *ptr;
 
 LOCAL_ALIGNED_16(uint8_t, img1, [WIDTH * HEIGHT]);
@@ -68,14 +68,16 @@ static void test_motion(const char *name, me_cmp_func 
test_func)
 for (i = 0; i < ITERATIONS; i++) {
 x = rnd() % (WIDTH - look_ahead);
 y = rnd() % (HEIGHT - look_ahead);
+// Pick a random h between 4 and 16; pick an even value.
+h = 4 + ((rnd() % (16 + 1 - 4)) & ~1);
 
 ptr = img2 + y * WIDTH + x;
-d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
-d1 = call_new(NULL, img1, ptr, WIDTH, 8);
+d2 = call_ref(NULL, img1, ptr, WIDTH, h);
+d1 = call_new(NULL, img1, ptr, WIDTH, h);
 
 if (d1 != d2) {
 fail();
-printf("func: %s, x=%d y=%d, error: asm=%d c=%d\n", name, x, 
y, d1, d2);
+printf("func: %s, x=%d y=%d h=%d, error: asm=%d c=%d\n", name, 
x, y, h, d1, d2);
 break;
 }
 }

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".