from:"J. Dekker"

Re: [FFmpeg-devel] [PATCH] README.md: fix typo

2021-10-07 Thread J. Dekker


On 7 Oct 2021, at 12:40, Arif Driessen wrote:


Hi,

I think this is a typo...

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 447347c700..f54299d340 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ such as audio, video, subtitles and related metadata.
 * `libavcodec` provides implementation of a wider range of codecs.
 * `libavformat` implements streaming protocols, container formats and
basic I/O access.
 * `libavutil` includes hashers, decompressors and miscellaneous 
utility

functions.
-* `libavfilter` provides a mean to alter decoded Audio and Video 
through

chain of filters.
+* `libavfilter` provides a means to alter decoded Audio and Video 
through

chain of filters.


I think it's missing an article too: 'provides a means to alter decoded 
Audio and Video through a chain of filters' might be better here, 
alternatively: 'provides a means to alter decoded Audio and Video 
through a filtergraph'.


 * `libavdevice` provides an abstraction to access capture and 
playback

devices.
 * `libswresample` implements audio mixing and resampling routines.
 * `libswscale` implements color conversion and scaling routines.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] next release

2021-10-07 Thread J. Dekker


On 6 Oct 2021, at 11:57, Michael Niedermayer wrote:

Hi

Should the next release be called "LTS"?
No, we can specify it *as* LTS but it should still have an actual name. 
The danger in calling it 'LTS' is that it will then be confusing if we 
specify other releases as LTS: do we mean an LTS release or the release 
called 'LTS'?


Though it is definitely a good idea to start specifying specific releases as 
LTS, it should definitely be discussed how this is done. Will we 
maintain two versions per major, i.e. an LTS like 4.1 and then the latest 
in the major, 4.4? I think this would make the most sense, limiting the 
burden of maintainership as much as possible.


Those who want a very stable release stay on the LTS minor within the 
major version and those who want a stable major stay on the latest minor 
within the major.



Should the next release be 4.5 or 5.0 ?

5.0

Should it be made in december 2021 ? (as was suggested in jbs release 
mail)

This seems reasonable.

--
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/4] lavc/aarch64: add hevc sao edge 8x8

2021-10-07 Thread J. Dekker

--bench on AWS Graviton:

hevc_sao_edge_8x8_8_c: 516.0
hevc_sao_edge_8x8_8_neon: 81.0

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  3 ++
 libavcodec/aarch64/hevcdsp_sao_neon.S | 52 +++
 2 files changed, 55 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 747ff0412d..b93cec9e44 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -59,6 +59,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   int width, int height);
 void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, 
ptrdiff_t stride_dst,
   int16_t *sao_offset_val, int eo, int 
width, int height);
+void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t 
stride_dst,
+  int16_t *sao_offset_val, int eo, int 
width, int height);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -76,6 +78,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
 c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_edge_filter[0]  = ff_hevc_sao_edge_filter_8x8_8_neon;
 c->sao_edge_filter[1]  =
 c->sao_edge_filter[2]  =
 c->sao_edge_filter[3]  =
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index a7f054c075..c4b931aab7 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -151,3 +151,55 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
// no lines to filter
ret
 endfunc
+
+// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
+//int16 *sao_offset_val, int eo, int 
width, int height)
+function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
+   lsl w4,  w4, #2
+   adr x7, .Lsao_edge_pos
+   ldr w4, [x7, x4]
+   ld1 {v3.8h}, [x3]
+   mov v3.h[7], v3.h[0]
+   mov v3.h[0], v3.h[1]
+   mov v3.h[1], v3.h[2]
+   mov v3.h[2], v3.h[7]
+   uzp2v1.16b, v3.16b, v3.16b
+   uzp1v0.16b, v3.16b, v3.16b
+   moviv2.16b, #2
+   add x16, x0, x2
+   lsl x2, x2, #1
+   mov x15, #192
+   mov  x8, x1
+   sub  x9, x1, x4
+   add x10, x1, x4
+   mov x17, #4
+1: ld1 {v3.d}[0], [ x8], x15
+   ld1 {v4.d}[0], [ x9], x15
+   ld1 {v5.d}[0], [x10], x15
+   ld1 {v3.d}[1], [ x8], x15
+   ld1 {v4.d}[1], [ x9], x15
+   ld1 {v5.d}[1], [x10], x15
+   cmhiv16.16b, v4.16b, v3.16b
+   cmhiv17.16b, v3.16b, v4.16b
+   cmhiv18.16b, v5.16b, v3.16b
+   cmhiv19.16b, v3.16b, v5.16b
+   sub v20.16b, v16.16b, v17.16b
+   sub v21.16b, v18.16b, v19.16b
+   add v20.16b, v20.16b, v21.16b
+   add v20.16b, v20.16b, v2.16b
+   tbl v16.16b, {v0.16b}, v20.16b
+   tbl v17.16b, {v1.16b}, v20.16b
+   zip1v18.16b, v16.16b, v17.16b
+   zip2v19.16b, v16.16b, v17.16b
+   uxtlv20.8h, v3.8b
+   uxtl2   v21.8h, v3.16b
+   sqadd   v20.8h, v18.8h, v20.8h
+   sqadd   v21.8h, v19.8h, v21.8h
+   sqxtun  v6.8b, v20.8h
+   sqxtun  v7.8b, v21.8h
+   st1 {v6.8b}, [ x0], x2
+   st1 {v7.8b}, [x16], x2
+   subsx17, x17, #1
+   b.ne1b
+   ret
+endfunc
-- 
2.30.1 (Apple Git-130)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/4] lavc/aarch64: add hevc sao edge 16x16

2021-10-07 Thread J. Dekker

--bench on AWS Graviton:

hevc_sao_edge_16x16_8_c: 1857.0
hevc_sao_edge_16x16_8_neon: 211.0
hevc_sao_edge_32x32_8_c: 7802.2
hevc_sao_edge_32x32_8_neon: 808.2
hevc_sao_edge_48x48_8_c: 16764.2
hevc_sao_edge_48x48_8_neon: 1796.5
hevc_sao_edge_64x64_8_c: 32647.5
hevc_sao_edge_64x64_8_neon: 3118.5

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  8 ++-
 libavcodec/aarch64/hevcdsp_sao_neon.S | 66 +++
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c785e46f79..747ff0412d 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   ptrdiff_t stride_dst, ptrdiff_t stride_src,
   int16_t *sao_offset_val, int sao_left_class,
   int width, int height);
-
-
+void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, 
ptrdiff_t stride_dst,
+  int16_t *sao_offset_val, int eo, int 
width, int height);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -76,6 +76,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
 c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_edge_filter[1]  =
+c->sao_edge_filter[2]  =
+c->sao_edge_filter[3]  =
+c->sao_edge_filter[4]  = ff_hevc_sao_edge_filter_16x16_8_neon;
 }
 if (bit_depth == 10) {
 c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index f9fed8345b..a7f054c075 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -85,3 +85,69 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 bne 1b
 ret
 endfunc
+
+// ASSUMES STRIDE_SRC = 192
+.Lsao_edge_pos:
+.word 1 // horizontal
+.word 192 // vertical
+.word 192 + 1 // 45 degree
+.word 192 - 1 // 135 degree
+
+// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff 
stride_dst,
+//  int16 *sao_offset_val, int eo, int 
width, int height)
+function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
+   lsl w4, w4, #2
+   adr x7, .Lsao_edge_pos
+   ld1 {v3.8h}, [x3]  // load sao_offset_val
+   sxtwx5, w5
+   ldr w4, [x7, x4]   // stride_src
+   mov v3.h[7], v3.h[0]   // reorder to [1,2,0,3,4]
+   mov v3.h[0], v3.h[1]
+   mov v3.h[1], v3.h[2]
+   mov v3.h[2], v3.h[7]
+   // split 16bit values into two tables
+   uzp2v1.16b, v3.16b, v3.16b // sao_offset_val -> upper
+   uzp1v0.16b, v3.16b, v3.16b // sao_offset_val -> lower
+   moviv2.16b, #2
+   mov x15, #192
+   // strides between end of line and next src/dst
+   sub x15, x15, x5   // stride_src - width
+   sub x16, x2, x5// stride_dst - width
+   mov x11, x1// copy base src
+1: // new line
+   mov x14, x5// copy width
+   sub x12, x11, x4   // src_a (prev) = src - 
sao_edge_pos
+   add x13, x11, x4   // src_b (next) = src + 
sao_edge_pos
+2: // process 16 bytes
+   ld1 {v3.16b}, [x11], #16   // load src
+   ld1 {v4.16b}, [x12], #16   // load src_a (prev)
+   ld1 {v5.16b}, [x13], #16   // load src_b (next)
+   cmhiv16.16b, v4.16b, v3.16b// (prev > cur)
+   cmhiv17.16b, v3.16b, v4.16b// (cur > prev)
+   cmhiv18.16b, v5.16b, v3.16b// (next > cur)
+   cmhiv19.16b, v3.16b, v5.16b// (cur > next)
+   sub v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev) = 
(cur > prev) - (cur < prev)
+   sub v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next) = 
(cur > next) - (cur < next)
+   add v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1
+   add v20.16b, v20.16b, v2.16b   // offset_val = diff + 2
+   tbl v16.16b, {v0.16b}, v20.16b
+   tbl v17.16b, {v1.16b}, v20.16b
+   zip1

[FFmpeg-devel] [PATCH 3/4] lavc/aarch64: add hevc sao band 8x8 tiling

2021-10-07 Thread J. Dekker

--bench on AWS Graviton:

hevc_sao_band_8x8_8_c: 317.5
hevc_sao_band_8x8_8_neon: 97.5
hevc_sao_band_16x16_8_c: 1115.0
hevc_sao_band_16x16_8_neon: 322.7
hevc_sao_band_32x32_8_c: 4599.2
hevc_sao_band_32x32_8_neon: 1246.2
hevc_sao_band_48x48_8_c: 10021.7
hevc_sao_band_48x48_8_neon: 2740.5
hevc_sao_band_64x64_8_c: 17635.0
hevc_sao_band_64x64_8_neon: 4875.7

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 +-
 libavcodec/aarch64/hevcdsp_sao_neon.S | 9 ++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b93cec9e44..2002530266 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -77,7 +77,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[1]  = ff_hevc_idct_8x8_dc_8_neon;
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
-c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_band_filter[0]  =
+c->sao_band_filter[1]  =
+c->sao_band_filter[2]  =
+c->sao_band_filter[3]  =
+c->sao_band_filter[4]  = ff_hevc_sao_band_filter_8x8_8_neon;
 c->sao_edge_filter[0]  = ff_hevc_sao_edge_filter_8x8_8_neon;
 c->sao_edge_filter[1]  =
 c->sao_edge_filter[2]  =
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index c4b931aab7..263747149f 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -35,6 +35,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 stpxzr, xzr, [sp, #32]
 stpxzr, xzr, [sp, #48]
 mov w8,  #4
+sxtwx6,  w6
 0:
 ldrsh   x9, [x4,  x8, lsl #1] // x9 = sao_offset_val[k+1]
 subsw8,  w8,  #1
@@ -44,8 +45,10 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 bne 0b
 ld1{v16.16b-v19.16b}, [sp], #64
 movi   v20.8h,   #1
+sub x2,  x2, x6 // stride_dst - width
+sub x3,  x3, x6 // stride_src - width
 1:  // beginning of line
-mov w8,  w6
+mov x8,  x6
 2:
 // Simple layout for accessing 16bit values
 // with 8bit LUT.
@@ -56,7 +59,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 // +--->
 //i-0 i-1 i-2 i-3
 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-ld1{v2.8b}, [x1]
+ld1{v2.8b}, [x1], #8
 // load src[x]
 uxtlv0.8h,  v2.8b
 // >> shift
@@ -74,7 +77,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 // clip + narrow
 sqxtun  v4.8b,  v1.8h
 // store
-st1{v4.8b}, [x0]
+st1{v4.8b}, [x0], #8
 // done 8 pixels
 subsw8, w8,  #8
 bne 2b
-- 
2.30.1 (Apple Git-130)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 4/4] lavc/aarch64: clean-up sao band 8x8 function formatting

2021-10-07 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_sao_neon.S | 103 +++---
 1 file changed, 44 insertions(+), 59 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index 263747149f..c2519da7f5 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -3,7 +3,7 @@
  *
  * AArch64 NEON optimised SAO functions for HEVC decoding
  *
- * Copyright (c) 2020 Josh Dekker 
+ * Copyright (c) 2020-2021  J. Dekker 
  *
  * This file is part of FFmpeg.
  *
@@ -29,64 +29,49 @@
 //  int16_t *sao_offset_val, int sao_left_class,
 //  int width, int height)
 function ff_hevc_sao_band_filter_8x8_8_neon, export=1
-sub sp,  sp, #64
-stpxzr, xzr, [sp]
-stpxzr, xzr, [sp, #16]
-stpxzr, xzr, [sp, #32]
-stpxzr, xzr, [sp, #48]
-mov w8,  #4
-sxtwx6,  w6
-0:
-ldrsh   x9, [x4,  x8, lsl #1] // x9 = sao_offset_val[k+1]
-subsw8,  w8,  #1
-addw10,  w8,  w5 // x10 = k + sao_left_class
-andw10, w10, #0x1F
-strhw9, [sp, x10, lsl #1]
-bne 0b
-ld1{v16.16b-v19.16b}, [sp], #64
-movi   v20.8h,   #1
-sub x2,  x2, x6 // stride_dst - width
-sub x3,  x3, x6 // stride_src - width
-1:  // beginning of line
-mov x8,  x6
-2:
-// Simple layout for accessing 16bit values
-// with 8bit LUT.
-//
-//   00  01  02  03  04  05  06  07
-// +--->
-// |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|
-// +--->
-//i-0 i-1 i-2 i-3
-// dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-ld1{v2.8b}, [x1], #8
-// load src[x]
-uxtlv0.8h,  v2.8b
-// >> shift
-ushrv2.8h,  v0.8h, #3 // BIT_DEPTH - 3
-// x2 (access lower short)
-shl v1.8h,  v2.8h, #1 // low (x2, accessing short)
-// +1 access upper short
-add v3.8h,  v1.8h, v20.8h
-// shift insert index to upper byte
-sli v1.8h,  v3.8h, #8
-// table
-tbxv2.16b, {v16.16b-v19.16b}, v1.16b
-// src[x] + table
-add v1.8h,  v0.8h, v2.8h
-// clip + narrow
-sqxtun  v4.8b,  v1.8h
-// store
-st1{v4.8b}, [x0], #8
-// done 8 pixels
-subsw8, w8,  #8
-bne 2b
-// finished line
-subsw7, w7,  #1
-add x0, x0,  x2 // dst += stride_dst
-add x1, x1,  x3 // src += stride_src
-bne 1b
-ret
+   sub sp,  sp, #64
+   stpxzr, xzr, [sp]
+   stpxzr, xzr, [sp, #16]
+   stpxzr, xzr, [sp, #32]
+   stpxzr, xzr, [sp, #48]
+   mov w8,  #4
+   sxtwx6,  w6
+0: ldrsh   x9, [x4,  x8, lsl #1]  // sao_offset_val[k+1]
+   subsw8,  w8,  #1
+   addw10,  w8,  w5   // k + sao_left_class
+   andw10, w10, #0x1F
+   strhw9, [sp, x10, lsl #1]
+   bne 0b
+   ld1{v16.16b-v19.16b}, [sp], #64
+   movi   v20.8h,   #1
+   sub x2,  x2, x6// stride_dst - width
+   sub x3,  x3, x6// stride_src - width
+1: mov x8,  x6// beginning of line
+2: // Simple layout for accessing 16bit values
+   // with 8bit LUT.
+   //
+   //   00  01  02  03  04  05  06  07
+   // +--->
+   // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|
+   // +--->
+   //i-0 i-1 i-2 i-3
+   ld1{v2.8b}, [x1], #8   // dst[x] = 
av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+   uxtlv0.8h,  v2.8b  // load src[x]
+   ushrv2.8h,  v0.8h, #3  // >> BIT_DEPTH - 3
+   shl v1.8h,  v2.8h, #1  // low (x2, accessing short)
+   add v3.8h,  v1.8h, v20.8h  // +1 access upper short
+   sli v1.8h,  v3.8h, #8  // shift insert index to 
upper byte
+   tbxv2.16b, {v16.16b-v19.16b}, v1.16b // table
+   add v1.8h,  v0.8h, v2.8h   // src[x] + table
+   sqxtun  v4.8b,  v1.8h  // clip + narrow

[FFmpeg-devel] ARM Hardware Request

2021-10-07 Thread J. Dekker


Hi,

I'm writing ARM64 ASM and testing on Apple M1 and Graviton2 N1 devices. 
This, however, is not a good spread of micro-architectures among 
commonly used ARM64 devices. Additional micro-architectures which I 
would like to test are 'High Efficiency' A53 and A55; 'high performance' 
A72, A73, and A77, and 'Prime' X1.


I've identified the following devices along with the micro-architectures 
they use and a price estimate:


- NanoPi M4B, Rockchip RK3399 (A53/A72) - 99 EUR
  & Extras (~23 EUR)
- ODROID-N2+, Amlogic S922x (A53/A73) - 71 EUR
- OnePlus 9, Snapdragon 888 (A55/A77/X1) - 799 EUR

As a note, the cheapest Snapdragon 888 devices were phones rather than 
development boards (with the 'official' development board being more 
than double the example given here). I don't think the OnePlus is the 
cheapest, but seems like it would be the most reasonable to test 
(unlocked boot-loader, etc).


I am preemptively requesting reimbursement for these devices; 
suggestions are welcome.


--
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] ARM Hardware Request

2021-10-08 Thread J. Dekker

On 8 Oct 2021, at 10:25, Thilo Borgmann wrote:
>> Hi,
>>
>> I'm writing ARM64 ASM and testing on Apple M1 and Graviton2 N1 devices. 
>> This, however, is not a good spread of micro-architectures among commonly 
>> used ARM64 devices. Additional micro-architectures which I would like to 
>> test are 'High Efficiency' A53 and A55; 'high performance' A72, A73, and 
>> A77, and 'Prime' X1.
>
> I’m in favor of buying some ARM hardware for that as well as for FATE maybe.
>

Could buy 2x of whatever we decide, alternatively I'd be able to run FATE on 
them as well.

> [...]

>> - OnePlus 9, Snapdragon 888 (A55/A77/X1) - 799 EUR
>
> for this well it’s not cheap, maybe someone has an alternative idea or even 
> some hardware to donate that covers some of the cores?

An official devkit is ~1600 EUR, obviously the phone form factor is not ideal 
for automated testing but it is significantly cheaper. But yes, if anyone can 
find a Snapdragon 888 in a devkit form (i.e. something I can plug into a 
network easily) then that would be preferred. Or if someone is able to find one 
or multiple devices which cover the A55/A77/X1 micro-architectures then that 
works as well.

> [...]

-- 
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] Revert "arm: hevc_qpel: Fix the assembly to work with non-multiple of 8 widths"

2021-10-16 Thread J. Dekker

This reverts commit 2589060b92eeeb944c6e2b50e38412c0c5fabcf4.

Signed-off-by: J. Dekker 
---
 libavcodec/arm/hevcdsp_qpel_neon.S | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S 
b/libavcodec/arm/hevcdsp_qpel_neon.S
index f71bec05ed..caa6efa766 100644
--- a/libavcodec/arm/hevcdsp_qpel_neon.S
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -237,7 +237,7 @@
 vld1.8{d23}, [r2], r3
 bne 8b
 subs  r5, #8
-ble   99f
+beq   99f
 mov r4, r12
 add r6, #16
 mov r0, r6
@@ -280,7 +280,7 @@
 vld1.8{d23}, [r2], r3
 bne 8b
 subs  r5, #8
-ble   99f
+beq   99f
 mov r4, r12
 add r6, #8
 mov r0, r6
@@ -310,7 +310,7 @@
 vld1.8{d23}, [r2], r3
 bne 8b
 subs  r5, #8
-ble   99f
+beq   99f
 mov r4, r12
 add r6, #8
 mov r0, r6
@@ -377,7 +377,7 @@ endfunc
 vst1.16   {q7}, [r0], r1
 bne   8b
 subs  r5, #8
-ble   99f
+beq  99f
 mov   r4, r12
 add   r6, #16
 mov   r0, r6
@@ -417,7 +417,7 @@ endfunc
 vst1.8d0, [r0], r1
 bne   8b
 subs  r5, #8
-ble   99f
+beq  99f
 mov   r4, r12
 add   r6, #8
 mov   r0, r6
@@ -446,7 +446,7 @@ endfunc
 vst1.8 d0, [r0], r1
 bne   8b
 subs  r5, #8
-ble   99f
+beq  99f
 mov   r4, r12
 add   r6, #8
 add   r10, #16
@@ -533,7 +533,7 @@ endfunc
 \filterh q7
 bne 8b
 subs  r5, #8
-ble 99f
+beq 99f
 mov r4, r12
 add r6, #16
 mov r0, r6
@@ -594,7 +594,7 @@ endfunc
 \filterh q7
 bne 8b
 subs  r5, #8
-ble 99f
+beq 99f
 mov r4, r12
 add r6, #8
 mov r0, r6
@@ -641,7 +641,7 @@ endfunc
 \filterh q7
 bne 8b
 subs  r5, #8
-ble 99f
+beq 99f
 mov r4, r12
 add r6, #8
 mov r0, r6
-- 
2.30.1 (Apple Git-130)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] lavc/arm: dont assign hevc_qpel non-multiple of 8 width stubs

2021-10-16 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 libavcodec/arm/hevcdsp_init_neon.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libavcodec/arm/hevcdsp_init_neon.c 
b/libavcodec/arm/hevcdsp_init_neon.c
index 201a088dac..112edb5edd 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -270,7 +270,8 @@ av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const 
int bit_depth)
 put_hevc_qpel_uw_neon[3][1]  = ff_hevc_put_qpel_uw_h1v3_neon_8;
 put_hevc_qpel_uw_neon[3][2]  = ff_hevc_put_qpel_uw_h2v3_neon_8;
 put_hevc_qpel_uw_neon[3][3]  = ff_hevc_put_qpel_uw_h3v3_neon_8;
-for (x = 0; x < 10; x++) {
+for (x = 3; x < 10; x++) {
+if (x == 4) continue;
 c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper;
 c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper;
 c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper;
-- 
2.30.1 (Apple Git-130)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: clean-up sao band 8x8 function formatting

2021-10-26 Thread J. Dekker

On 19 Oct 2021, at 10:40, Martin Storsjö wrote:

> On Thu, 7 Oct 2021, J. Dekker wrote:
>
>> Signed-off-by: J. Dekker 
>> ---
>> libavcodec/aarch64/hevcdsp_sao_neon.S | 103 +++---
>> 1 file changed, 44 insertions(+), 59 deletions(-)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
>> b/libavcodec/aarch64/hevcdsp_sao_neon.S
>> index 263747149f..c2519da7f5 100644
>> --- a/libavcodec/aarch64/hevcdsp_sao_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
>> @@ -3,7 +3,7 @@
>>  *
>>  * AArch64 NEON optimised SAO functions for HEVC decoding
>>  *
>> - * Copyright (c) 2020 Josh Dekker 
>> + * Copyright (c) 2020-2021  J. Dekker 
>>  *
>>  * This file is part of FFmpeg.
>>  *
>> @@ -29,64 +29,49 @@
>> //  int16_t *sao_offset_val, int sao_left_class,
>> //  int width, int height)
>> function ff_hevc_sao_band_filter_8x8_8_neon, export=1
>> -sub sp,  sp, #64
>> -stpxzr, xzr, [sp]
>
> This one had the right indentation to start with, don't reindent it according 
> to the new incorrectly indented code you're adding.

Yep. This was a mistake, some of (my) previously pushed code is actually 
incorrect here as well. Instructions should be indented to 9 Columns with the 
first argument being at 25 Columns.

You mentioned in the past about shifting the first argument left by 1 character 
when it began with a curly brace but I don't see this used in any other files 
(except the one added by me). Should this still be done?

Also do you want a patch to reformat previous code to fit this 9,25 cols 
alignment?

> Also if you're going to reformat this, could you align the left edge of the 
> operand columns instead of aligning the commas, i.e. making it match the rest 
> of the asm we have? I.e. like this:
>
> sp,  sp,  #64
> xzr, xzr, [sp]

Sure, I thought comma aligning here was strange.

-- 
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 1/6] lavc/arm: dont assign hevc_qpel functions for non-multiple of 8 widths

2021-11-16 Thread J. Dekker

The assembly is written assuming that the width is a multiple of 8.

However, the real issue is that the functions were erroneously assigned to
the 2, 4, 6 & 12 widths. This behaviour never broke the decoder as
samples which trigger the functions for these widths have not been found
in the wild. This relies on the mappings in ff_hevc_pel_weight[].

Signed-off-by: J. Dekker 
---
 libavcodec/arm/hevcdsp_init_neon.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

 Updated commit message.

diff --git a/libavcodec/arm/hevcdsp_init_neon.c 
b/libavcodec/arm/hevcdsp_init_neon.c
index 201a088dac..112edb5edd 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -270,7 +270,8 @@ av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const 
int bit_depth)
 put_hevc_qpel_uw_neon[3][1]  = ff_hevc_put_qpel_uw_h1v3_neon_8;
 put_hevc_qpel_uw_neon[3][2]  = ff_hevc_put_qpel_uw_h2v3_neon_8;
 put_hevc_qpel_uw_neon[3][3]  = ff_hevc_put_qpel_uw_h3v3_neon_8;
-for (x = 0; x < 10; x++) {
+for (x = 3; x < 10; x++) {
+if (x == 4) continue;
 c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper;
 c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper;
 c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper;
-- 
2.30.1 (Apple Git-130)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 2/6] Revert "arm: hevc_qpel: Fix the assembly to work with non-multiple of 8 widths"

2021-11-16 Thread J. Dekker

This reverts commit 2589060b92eeeb944c6e2b50e38412c0c5fabcf4 which was
originally to fix the FATE test. The real cause of the test breakage was
fixed in 8dc8f04036eb27c8ad419839d4ed3bc67c44fe7a.

Signed-off-by: J. Dekker 
---
 libavcodec/arm/hevcdsp_qpel_neon.S | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

 Updated commit message.

diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S 
b/libavcodec/arm/hevcdsp_qpel_neon.S
index f71bec05ed..caa6efa766 100644
--- a/libavcodec/arm/hevcdsp_qpel_neon.S
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -237,7 +237,7 @@
 vld1.8{d23}, [r2], r3
 bne 8b
 subs  r5, #8
-ble   99f
+beq   99f
 mov r4, r12
 add r6, #16
 mov r0, r6
@@ -280,7 +280,7 @@
 vld1.8{d23}, [r2], r3
 bne 8b
 subs  r5, #8
-ble   99f
+beq   99f
 mov r4, r12
 add r6, #8
 mov r0, r6
@@ -310,7 +310,7 @@
 vld1.8{d23}, [r2], r3
 bne 8b
 subs  r5, #8
-ble   99f
+beq   99f
 mov r4, r12
 add r6, #8
 mov r0, r6
@@ -377,7 +377,7 @@ endfunc
 vst1.16   {q7}, [r0], r1
 bne   8b
 subs  r5, #8
-ble   99f
+beq  99f
 mov   r4, r12
 add   r6, #16
 mov   r0, r6
@@ -417,7 +417,7 @@ endfunc
 vst1.8d0, [r0], r1
 bne   8b
 subs  r5, #8
-ble   99f
+beq  99f
 mov   r4, r12
 add   r6, #8
 mov   r0, r6
@@ -446,7 +446,7 @@ endfunc
 vst1.8 d0, [r0], r1
 bne   8b
 subs  r5, #8
-ble   99f
+beq  99f
 mov   r4, r12
 add   r6, #8
 add   r10, #16
@@ -533,7 +533,7 @@ endfunc
 \filterh q7
 bne 8b
 subs  r5, #8
-ble 99f
+beq 99f
 mov r4, r12
 add r6, #16
 mov r0, r6
@@ -594,7 +594,7 @@ endfunc
 \filterh q7
 bne 8b
 subs  r5, #8
-ble 99f
+beq 99f
 mov r4, r12
 add r6, #8
 mov r0, r6
@@ -641,7 +641,7 @@ endfunc
 \filterh q7
 bne 8b
 subs  r5, #8
-ble 99f
+beq 99f
 mov r4, r12
 add r6, #8
 mov r0, r6
-- 
2.30.1 (Apple Git-130)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 3/6] lavc/aarch64: add hevc sao edge 16x16

2021-11-16 Thread J. Dekker

--bench on AWS Graviton:

hevc_sao_edge_16x16_8_c: 1857.0
hevc_sao_edge_16x16_8_neon: 211.0
hevc_sao_edge_32x32_8_c: 7802.2
hevc_sao_edge_32x32_8_neon: 808.2
hevc_sao_edge_48x48_8_c: 16764.2
hevc_sao_edge_48x48_8_neon: 1796.5
hevc_sao_edge_64x64_8_c: 32647.5
hevc_sao_edge_64x64_8_neon: 3118.5

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  8 ++-
 libavcodec/aarch64/hevcdsp_sao_neon.S | 65 +++
 2 files changed, 71 insertions(+), 2 deletions(-)

 Used inline shift for ldr & rescheduled uxtl/uxtl2 before zip/zip2.

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c785e46f79..747ff0412d 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   ptrdiff_t stride_dst, ptrdiff_t stride_src,
   int16_t *sao_offset_val, int sao_left_class,
   int width, int height);
-
-
+void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, 
ptrdiff_t stride_dst,
+  int16_t *sao_offset_val, int eo, int 
width, int height);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -76,6 +76,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
 c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_edge_filter[1]  =
+c->sao_edge_filter[2]  =
+c->sao_edge_filter[3]  =
+c->sao_edge_filter[4]  = ff_hevc_sao_edge_filter_16x16_8_neon;
 }
 if (bit_depth == 10) {
 c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index f9fed8345b..f1b9ced93f 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -85,3 +85,68 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 bne 1b
 ret
 endfunc
+
+// ASSUMES STRIDE_SRC = 192
+.Lsao_edge_pos:
+.word 1 // horizontal
+.word 192 // vertical
+.word 192 + 1 // 45 degree
+.word 192 - 1 // 135 degree
+
+// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff 
stride_dst,
+//  int16 *sao_offset_val, int eo, int 
width, int height)
+function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
+   adr x7, .Lsao_edge_pos
+   ld1 {v3.8h}, [x3]  // load sao_offset_val
+   sxtwx5, w5
+   ldr w4, [x7, w4, uxtw #2]  // stride_src
+   mov v3.h[7], v3.h[0]   // reorder to [1,2,0,3,4]
+   mov v3.h[0], v3.h[1]
+   mov v3.h[1], v3.h[2]
+   mov v3.h[2], v3.h[7]
+   // split 16bit values into two tables
+   uzp2v1.16b, v3.16b, v3.16b // sao_offset_val -> upper
+   uzp1v0.16b, v3.16b, v3.16b // sao_offset_val -> lower
+   moviv2.16b, #2
+   mov x15, #192
+   // strides between end of line and next src/dst
+   sub x15, x15, x5   // stride_src - width
+   sub x16, x2, x5// stride_dst - width
+   mov x11, x1// copy base src
+1: // new line
+   mov x14, x5// copy width
+   sub x12, x11, x4   // src_a (prev) = src - 
sao_edge_pos
+   add x13, x11, x4   // src_b (next) = src + 
sao_edge_pos
+2: // process 16 bytes
+   ld1 {v3.16b}, [x11], #16   // load src
+   ld1 {v4.16b}, [x12], #16   // load src_a (prev)
+   ld1 {v5.16b}, [x13], #16   // load src_b (next)
+   cmhiv16.16b, v4.16b, v3.16b// (prev > cur)
+   cmhiv17.16b, v3.16b, v4.16b// (cur > prev)
+   cmhiv18.16b, v5.16b, v3.16b// (next > cur)
+   cmhiv19.16b, v3.16b, v5.16b// (cur > next)
+   sub v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev) = 
(cur > prev) - (cur < prev)
+   sub v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next) = 
(cur > next) - (cur < next)
+   add v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1
+   add v20.16b, v20.16b, v2.16b   // offset_val = diff + 2
+   tbl v16.16b, {v0.16b}, v20.16b
+   tbl v17.16

[FFmpeg-devel] [PATCH v2 4/6] lavc/aarch64: add hevc sao edge 8x8

2021-11-16 Thread J. Dekker

--bench on AWS Graviton:

hevc_sao_edge_8x8_8_c: 516.0
hevc_sao_edge_8x8_8_neon: 81.0

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  3 ++
 libavcodec/aarch64/hevcdsp_sao_neon.S | 51 +++
 2 files changed, 54 insertions(+)

 Used inline shift for ldr & rescheduled uxtl/uxtl2 before zip/zip2.

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 747ff0412d..b93cec9e44 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -59,6 +59,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   int width, int height);
 void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, 
ptrdiff_t stride_dst,
   int16_t *sao_offset_val, int eo, int 
width, int height);
+void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t 
stride_dst,
+  int16_t *sao_offset_val, int eo, int 
width, int height);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -76,6 +78,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
 c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_edge_filter[0]  = ff_hevc_sao_edge_filter_8x8_8_neon;
 c->sao_edge_filter[1]  =
 c->sao_edge_filter[2]  =
 c->sao_edge_filter[3]  =
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index f1b9ced93f..e844cc8980 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -150,3 +150,54 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
// no lines to filter
ret
 endfunc
+
+// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
+//int16 *sao_offset_val, int eo, int 
width, int height)
+function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
+   adr x7, .Lsao_edge_pos
+   ldr w4, [x7, w4, uxtw #2]
+   ld1 {v3.8h}, [x3]
+   mov v3.h[7], v3.h[0]
+   mov v3.h[0], v3.h[1]
+   mov v3.h[1], v3.h[2]
+   mov v3.h[2], v3.h[7]
+   uzp2v1.16b, v3.16b, v3.16b
+   uzp1v0.16b, v3.16b, v3.16b
+   moviv2.16b, #2
+   add x16, x0, x2
+   lsl x2, x2, #1
+   mov x15, #192
+   mov  x8, x1
+   sub  x9, x1, x4
+   add x10, x1, x4
+   mov x17, #4
+1: ld1 {v3.d}[0], [ x8], x15
+   ld1 {v4.d}[0], [ x9], x15
+   ld1 {v5.d}[0], [x10], x15
+   ld1 {v3.d}[1], [ x8], x15
+   ld1 {v4.d}[1], [ x9], x15
+   ld1 {v5.d}[1], [x10], x15
+   cmhiv16.16b, v4.16b, v3.16b
+   cmhiv17.16b, v3.16b, v4.16b
+   cmhiv18.16b, v5.16b, v3.16b
+   cmhiv19.16b, v3.16b, v5.16b
+   sub v20.16b, v16.16b, v17.16b
+   sub v21.16b, v18.16b, v19.16b
+   add v20.16b, v20.16b, v21.16b
+   add v20.16b, v20.16b, v2.16b
+   tbl v16.16b, {v0.16b}, v20.16b
+   tbl v17.16b, {v1.16b}, v20.16b
+   uxtlv20.8h, v3.8b
+   uxtl2   v21.8h, v3.16b
+   zip1v18.16b, v16.16b, v17.16b
+   zip2v19.16b, v16.16b, v17.16b
+   sqadd   v20.8h, v18.8h, v20.8h
+   sqadd   v21.8h, v19.8h, v21.8h
+   sqxtun  v6.8b, v20.8h
+   sqxtun  v7.8b, v21.8h
+   st1 {v6.8b}, [ x0], x2
+   st1 {v7.8b}, [x16], x2
+   subsx17, x17, #1
+   b.ne1b
+   ret
+endfunc
-- 
2.30.1 (Apple Git-130)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 5/6] lavc/aarch64: add hevc sao band 8x8 tiling

2021-11-16 Thread J. Dekker

--bench on AWS Graviton:

hevc_sao_band_8x8_8_c: 317.5
hevc_sao_band_8x8_8_neon: 97.5
hevc_sao_band_16x16_8_c: 1115.0
hevc_sao_band_16x16_8_neon: 322.7
hevc_sao_band_32x32_8_c: 4599.2
hevc_sao_band_32x32_8_neon: 1246.2
hevc_sao_band_48x48_8_c: 10021.7
hevc_sao_band_48x48_8_neon: 2740.5
hevc_sao_band_64x64_8_c: 17635.0
hevc_sao_band_64x64_8_neon: 4875.7

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 +-
 libavcodec/aarch64/hevcdsp_sao_neon.S | 9 ++---
 2 files changed, 11 insertions(+), 4 deletions(-)

 No change since previous patch which was ACK'd, just want to push this
 set together.

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b93cec9e44..2002530266 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -77,7 +77,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[1]  = ff_hevc_idct_8x8_dc_8_neon;
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
-c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_band_filter[0]  =
+c->sao_band_filter[1]  =
+c->sao_band_filter[2]  =
+c->sao_band_filter[3]  =
+c->sao_band_filter[4]  = ff_hevc_sao_band_filter_8x8_8_neon;
 c->sao_edge_filter[0]  = ff_hevc_sao_edge_filter_8x8_8_neon;
 c->sao_edge_filter[1]  =
 c->sao_edge_filter[2]  =
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index e844cc8980..82b234aa47 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -35,6 +35,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 stpxzr, xzr, [sp, #32]
 stpxzr, xzr, [sp, #48]
 mov w8,  #4
+sxtwx6,  w6
 0:
 ldrsh   x9, [x4,  x8, lsl #1] // x9 = sao_offset_val[k+1]
 subsw8,  w8,  #1
@@ -44,8 +45,10 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 bne 0b
 ld1{v16.16b-v19.16b}, [sp], #64
 movi   v20.8h,   #1
+sub x2,  x2, x6 // stride_dst - width
+sub x3,  x3, x6 // stride_src - width
 1:  // beginning of line
-mov w8,  w6
+mov x8,  x6
 2:
 // Simple layout for accessing 16bit values
 // with 8bit LUT.
@@ -56,7 +59,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 // +--->
 //i-0 i-1 i-2 i-3
 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-ld1{v2.8b}, [x1]
+ld1{v2.8b}, [x1], #8
 // load src[x]
 uxtlv0.8h,  v2.8b
 // >> shift
@@ -74,7 +77,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 // clip + narrow
 sqxtun  v4.8b,  v1.8h
 // store
-st1{v4.8b}, [x0]
+st1{v4.8b}, [x0], #8
 // done 8 pixels
 subsw8, w8,  #8
 bne 2b
-- 
2.30.1 (Apple Git-130)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 6/6] lavc/aarch64: clean-up sao band 8x8 function formatting

2021-11-16 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_sao_neon.S | 195 --
 1 file changed, 90 insertions(+), 105 deletions(-)

 Now matches the 9,25 indentation like other ASM.

diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index 82b234aa47..3ca34705db 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -3,7 +3,7 @@
  *
  * AArch64 NEON optimised SAO functions for HEVC decoding
  *
- * Copyright (c) 2020 Josh Dekker 
+ * Copyright (c) 2020-2021  J. Dekker 
  *
  * This file is part of FFmpeg.
  *
@@ -29,64 +29,49 @@
 //  int16_t *sao_offset_val, int sao_left_class,
 //  int width, int height)
 function ff_hevc_sao_band_filter_8x8_8_neon, export=1
-sub sp,  sp, #64
-stpxzr, xzr, [sp]
-stpxzr, xzr, [sp, #16]
-stpxzr, xzr, [sp, #32]
-stpxzr, xzr, [sp, #48]
-mov w8,  #4
-sxtwx6,  w6
-0:
-ldrsh   x9, [x4,  x8, lsl #1] // x9 = sao_offset_val[k+1]
-subsw8,  w8,  #1
-addw10,  w8,  w5 // x10 = k + sao_left_class
-andw10, w10, #0x1F
-strhw9, [sp, x10, lsl #1]
-bne 0b
-ld1{v16.16b-v19.16b}, [sp], #64
-movi   v20.8h,   #1
-sub x2,  x2, x6 // stride_dst - width
-sub x3,  x3, x6 // stride_src - width
-1:  // beginning of line
-mov x8,  x6
-2:
-// Simple layout for accessing 16bit values
-// with 8bit LUT.
-//
-//   00  01  02  03  04  05  06  07
-// +--->
-// |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|
-// +--->
-//i-0 i-1 i-2 i-3
-// dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-ld1{v2.8b}, [x1], #8
-// load src[x]
-uxtlv0.8h,  v2.8b
-// >> shift
-ushrv2.8h,  v0.8h, #3 // BIT_DEPTH - 3
-// x2 (access lower short)
-shl v1.8h,  v2.8h, #1 // low (x2, accessing short)
-// +1 access upper short
-add v3.8h,  v1.8h, v20.8h
-// shift insert index to upper byte
-sli v1.8h,  v3.8h, #8
-// table
-tbxv2.16b, {v16.16b-v19.16b}, v1.16b
-// src[x] + table
-add v1.8h,  v0.8h, v2.8h
-// clip + narrow
-sqxtun  v4.8b,  v1.8h
-// store
-st1{v4.8b}, [x0], #8
-// done 8 pixels
-subsw8, w8,  #8
-bne 2b
-// finished line
-subsw7, w7,  #1
-add x0, x0,  x2 // dst += stride_dst
-add x1, x1,  x3 // src += stride_src
-bne 1b
-ret
+   sub sp,  sp, #64
+   stpxzr, xzr, [sp]
+   stpxzr, xzr, [sp, #16]
+   stpxzr, xzr, [sp, #32]
+   stpxzr, xzr, [sp, #48]
+   mov w8,  #4
+   sxtwx6,  w6
+0: ldrsh   x9, [x4,  x8, lsl #1]  // sao_offset_val[k+1]
+   subsw8,  w8,  #1
+   addw10,  w8,  w5   // k + sao_left_class
+   andw10, w10, #0x1F
+   strhw9, [sp, x10, lsl #1]
+   bne 0b
+   ld1{v16.16b-v19.16b}, [sp], #64
+   movi   v20.8h,   #1
+   sub x2,  x2, x6// stride_dst - width
+   sub x3,  x3, x6// stride_src - width
+1: mov x8,  x6// beginning of line
+2: // Simple layout for accessing 16bit values
+   // with 8bit LUT.
+   //
+   //   00  01  02  03  04  05  06  07
+   // +--->
+   // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|
+   // +--->
+   //i-0 i-1 i-2 i-3
+   ld1{v2.8b}, [x1], #8   // dst[x] = 
av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+   uxtlv0.8h,  v2.8b  // load src[x]
+   ushrv2.8h,  v0.8h, #3  // >> BIT_DEPTH - 3
+   shl v1.8h,  v2.8h, #1  // low (x2, accessing short)
+   add v3.8h,  v1.8h, v20.8h  // +1 access upper short
+   sli v1.8h,  v3.8h, #8  // shift insert index to 
upper byte
+   tbxv2.16b, {v16.16b-v19.16b}, v1.16b // table
+   add v1.8h,  v0.8h, v2.8h   // src[x] + table
+

[FFmpeg-devel] [PATCH v3 1/6] lavc/arm: dont assign hevc_qpel functions for non-multiple of 8 widths

2022-01-03 Thread J. Dekker

The assembly is written assuming that the width is a multiple of 8.

However the real issue is the functions were erroneously assigned to
the 2, 4, 6 & 12 widths. This behaviour never broke the decoder as
samples which trigger the functions for these widths have not been found
in the wild. This relies on the mappings in ff_hevc_pel_weight[].

Signed-off-by: J. Dekker 
---
 libavcodec/arm/hevcdsp_init_neon.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

 This set has already been reviewed by Martin, sending to list for
 transparency.

diff --git a/libavcodec/arm/hevcdsp_init_neon.c 
b/libavcodec/arm/hevcdsp_init_neon.c
index 201a088dac..112edb5edd 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -270,7 +270,8 @@ av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const 
int bit_depth)
 put_hevc_qpel_uw_neon[3][1]  = ff_hevc_put_qpel_uw_h1v3_neon_8;
 put_hevc_qpel_uw_neon[3][2]  = ff_hevc_put_qpel_uw_h2v3_neon_8;
 put_hevc_qpel_uw_neon[3][3]  = ff_hevc_put_qpel_uw_h3v3_neon_8;
-for (x = 0; x < 10; x++) {
+for (x = 3; x < 10; x++) {
+if (x == 4) continue;
 c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper;
 c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper;
 c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper;
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3 2/6] Revert "arm: hevc_qpel: Fix the assembly to work with non-multiple of 8 widths"

2022-01-03 Thread J. Dekker

This reverts commit 2589060b92eeeb944c6e2b50e38412c0c5fabcf4 which was
originally to fix the FATE test. The real cause of the test breakage was
fixed in 8dc8f04036eb27c8ad419839d4ed3bc67c44fe7a.

Signed-off-by: J. Dekker 
---
 libavcodec/arm/hevcdsp_qpel_neon.S | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S 
b/libavcodec/arm/hevcdsp_qpel_neon.S
index f71bec05ed..caa6efa766 100644
--- a/libavcodec/arm/hevcdsp_qpel_neon.S
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -237,7 +237,7 @@
 vld1.8{d23}, [r2], r3
 bne 8b
 subs  r5, #8
-ble   99f
+beq   99f
 mov r4, r12
 add r6, #16
 mov r0, r6
@@ -280,7 +280,7 @@
 vld1.8{d23}, [r2], r3
 bne 8b
 subs  r5, #8
-ble   99f
+beq   99f
 mov r4, r12
 add r6, #8
 mov r0, r6
@@ -310,7 +310,7 @@
 vld1.8{d23}, [r2], r3
 bne 8b
 subs  r5, #8
-ble   99f
+beq   99f
 mov r4, r12
 add r6, #8
 mov r0, r6
@@ -377,7 +377,7 @@ endfunc
 vst1.16   {q7}, [r0], r1
 bne   8b
 subs  r5, #8
-ble   99f
+beq  99f
 mov   r4, r12
 add   r6, #16
 mov   r0, r6
@@ -417,7 +417,7 @@ endfunc
 vst1.8d0, [r0], r1
 bne   8b
 subs  r5, #8
-ble   99f
+beq  99f
 mov   r4, r12
 add   r6, #8
 mov   r0, r6
@@ -446,7 +446,7 @@ endfunc
 vst1.8 d0, [r0], r1
 bne   8b
 subs  r5, #8
-ble   99f
+beq  99f
 mov   r4, r12
 add   r6, #8
 add   r10, #16
@@ -533,7 +533,7 @@ endfunc
 \filterh q7
 bne 8b
 subs  r5, #8
-ble 99f
+beq 99f
 mov r4, r12
 add r6, #16
 mov r0, r6
@@ -594,7 +594,7 @@ endfunc
 \filterh q7
 bne 8b
 subs  r5, #8
-ble 99f
+beq 99f
 mov r4, r12
 add r6, #8
 mov r0, r6
@@ -641,7 +641,7 @@ endfunc
 \filterh q7
 bne 8b
 subs  r5, #8
-ble 99f
+beq 99f
 mov r4, r12
 add r6, #8
 mov r0, r6
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3 3/6] lavc/aarch64: add hevc sao edge 16x16

2022-01-03 Thread J. Dekker

bench on AWS Graviton:

hevc_sao_edge_16x16_8_c: 1857.0
hevc_sao_edge_16x16_8_neon: 211.0
hevc_sao_edge_32x32_8_c: 7802.2
hevc_sao_edge_32x32_8_neon: 808.2
hevc_sao_edge_48x48_8_c: 16764.2
hevc_sao_edge_48x48_8_neon: 1796.5
hevc_sao_edge_64x64_8_c: 32647.5
hevc_sao_edge_64x64_8_neon: 3118.5

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  8 ++-
 libavcodec/aarch64/hevcdsp_sao_neon.S | 65 +++
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c785e46f79..747ff0412d 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   ptrdiff_t stride_dst, ptrdiff_t stride_src,
   int16_t *sao_offset_val, int sao_left_class,
   int width, int height);
-
-
+void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, 
ptrdiff_t stride_dst,
+  int16_t *sao_offset_val, int eo, int 
width, int height);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -76,6 +76,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
 c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_edge_filter[1]  =
+c->sao_edge_filter[2]  =
+c->sao_edge_filter[3]  =
+c->sao_edge_filter[4]  = ff_hevc_sao_edge_filter_16x16_8_neon;
 }
 if (bit_depth == 10) {
 c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index f9fed8345b..4b895959d8 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -85,3 +85,68 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 bne 1b
 ret
 endfunc
+
+// ASSUMES STRIDE_SRC = 192
+.Lsao_edge_pos:
+.word 1 // horizontal
+.word 192 // vertical
+.word 192 + 1 // 45 degree
+.word 192 - 1 // 135 degree
+
+// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff 
stride_dst,
+//  int16 *sao_offset_val, int eo, int 
width, int height)
+function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
+adr x7, .Lsao_edge_pos
+ld1 {v3.8h}, [x3]  // load sao_offset_val
+sxtwx5, w5
+ldr w4, [x7, w4, uxtw #2]  // stride_src
+mov v3.h[7], v3.h[0]   // reorder to [1,2,0,3,4]
+mov v3.h[0], v3.h[1]
+mov v3.h[1], v3.h[2]
+mov v3.h[2], v3.h[7]
+// split 16bit values into two tables
+uzp2v1.16b, v3.16b, v3.16b // sao_offset_val -> upper
+uzp1v0.16b, v3.16b, v3.16b // sao_offset_val -> lower
+moviv2.16b, #2
+mov x15, #192
+// strides between end of line and next src/dst
+sub x15, x15, x5   // stride_src - width
+sub x16, x2, x5// stride_dst - width
+mov x11, x1// copy base src
+1:  // new line
+mov x14, x5// copy width
+sub x12, x11, x4   // src_a (prev) = src - 
sao_edge_pos
+add x13, x11, x4   // src_b (next) = src + 
sao_edge_pos
+2:  // process 16 bytes
+ld1 {v3.16b}, [x11], #16   // load src
+ld1 {v4.16b}, [x12], #16   // load src_a (prev)
+ld1 {v5.16b}, [x13], #16   // load src_b (next)
+cmhiv16.16b, v4.16b, v3.16b// (prev > cur)
+cmhiv17.16b, v3.16b, v4.16b// (cur > prev)
+cmhiv18.16b, v5.16b, v3.16b// (next > cur)
+cmhiv19.16b, v3.16b, v5.16b// (cur > next)
+sub v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev) = 
(cur > prev) - (cur < prev)
+sub v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next) = 
(cur > next) - (cur < next)
+add v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1
+add v20.16b, v20.16b, v2.16b   // offset_val = diff + 2
+tbl v16.16b, {v0.16b}, v20.16b
+tbl v17.16b, {v1.16b}, v20.16b
+uxtl

[FFmpeg-devel] [PATCH v3 4/6] lavc/aarch64: add hevc sao edge 8x8

2022-01-03 Thread J. Dekker

bench on AWS Graviton:

hevc_sao_edge_8x8_8_c: 516.0
hevc_sao_edge_8x8_8_neon: 81.0

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  3 ++
 libavcodec/aarch64/hevcdsp_sao_neon.S | 51 +++
 2 files changed, 54 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 747ff0412d..b93cec9e44 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -59,6 +59,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   int width, int height);
 void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, 
ptrdiff_t stride_dst,
   int16_t *sao_offset_val, int eo, int 
width, int height);
+void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t 
stride_dst,
+  int16_t *sao_offset_val, int eo, int 
width, int height);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -76,6 +78,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
 c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_edge_filter[0]  = ff_hevc_sao_edge_filter_8x8_8_neon;
 c->sao_edge_filter[1]  =
 c->sao_edge_filter[2]  =
 c->sao_edge_filter[3]  =
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index 4b895959d8..167b9676d8 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -150,3 +150,54 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
 // no lines to filter
 ret
 endfunc
+
+// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
+//int16 *sao_offset_val, int eo, int 
width, int height)
+function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
+adr x7, .Lsao_edge_pos
+ldr w4, [x7, w4, uxtw #2]
+ld1 {v3.8h}, [x3]
+mov v3.h[7], v3.h[0]
+mov v3.h[0], v3.h[1]
+mov v3.h[1], v3.h[2]
+mov v3.h[2], v3.h[7]
+uzp2v1.16b, v3.16b, v3.16b
+uzp1v0.16b, v3.16b, v3.16b
+moviv2.16b, #2
+add x16, x0, x2
+lsl x2,  x2, #1
+mov x15, #192
+mov x8,  x1
+sub x9,  x1, x4
+add x10, x1, x4
+mov x17, #4
+1:  ld1 {v3.d}[0], [ x8], x15
+ld1 {v4.d}[0], [ x9], x15
+ld1 {v5.d}[0], [x10], x15
+ld1 {v3.d}[1], [ x8], x15
+ld1 {v4.d}[1], [ x9], x15
+ld1 {v5.d}[1], [x10], x15
+cmhiv16.16b, v4.16b, v3.16b
+cmhiv17.16b, v3.16b, v4.16b
+cmhiv18.16b, v5.16b, v3.16b
+cmhiv19.16b, v3.16b, v5.16b
+sub v20.16b, v16.16b, v17.16b
+sub v21.16b, v18.16b, v19.16b
+add v20.16b, v20.16b, v21.16b
+add v20.16b, v20.16b, v2.16b
+tbl v16.16b, {v0.16b}, v20.16b
+tbl v17.16b, {v1.16b}, v20.16b
+uxtlv20.8h, v3.8b
+uxtl2   v21.8h, v3.16b
+zip1v18.16b, v16.16b, v17.16b
+zip2v19.16b, v16.16b, v17.16b
+sqadd   v20.8h, v18.8h, v20.8h
+sqadd   v21.8h, v19.8h, v21.8h
+sqxtun  v6.8b, v20.8h
+sqxtun  v7.8b, v21.8h
+st1 {v6.8b}, [ x0], x2
+st1 {v7.8b}, [x16], x2
+subsx17, x17, #1
+b.ne1b
+ret
+endfunc
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3 5/6] lavc/aarch64: clean-up sao band 8x8 function formatting

2022-01-03 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_sao_neon.S | 65 +++
 1 file changed, 25 insertions(+), 40 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index 167b9676d8..73b0b3b056 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -30,24 +30,21 @@
 //  int width, int height)
 function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 sub sp,  sp, #64
-stpxzr, xzr, [sp]
-stpxzr, xzr, [sp, #16]
-stpxzr, xzr, [sp, #32]
-stpxzr, xzr, [sp, #48]
+stp xzr, xzr, [sp]
+stp xzr, xzr, [sp, #16]
+stp xzr, xzr, [sp, #32]
+stp xzr, xzr, [sp, #48]
 mov w8,  #4
-0:
-ldrsh   x9, [x4,  x8, lsl #1] // x9 = sao_offset_val[k+1]
+0:  ldrsh   x9, [x4,  x8, lsl #1]  // sao_offset_val[k+1]
 subsw8,  w8,  #1
-addw10,  w8,  w5 // x10 = k + sao_left_class
-andw10, w10, #0x1F
+add w10, w8,  w5   // k + sao_left_class
+and w10, w10, #0x1F
 strhw9, [sp, x10, lsl #1]
 bne 0b
-ld1{v16.16b-v19.16b}, [sp], #64
-movi   v20.8h,   #1
-1:  // beginning of line
-mov w8,  w6
-2:
-// Simple layout for accessing 16bit values
+ld1 {v16.16b-v19.16b}, [sp], #64
+moviv20.8h,   #1
+1:  mov w8,  w6// beginning of line
+2:  // Simple layout for accessing 16bit values
 // with 8bit LUT.
 //
 //   00  01  02  03  04  05  06  07
@@ -55,33 +52,21 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|
 // +--->
 //i-0 i-1 i-2 i-3
-// dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-ld1{v2.8b}, [x1]
-// load src[x]
-uxtlv0.8h,  v2.8b
-// >> shift
-ushrv2.8h,  v0.8h, #3 // BIT_DEPTH - 3
-// x2 (access lower short)
-shl v1.8h,  v2.8h, #1 // low (x2, accessing short)
-// +1 access upper short
-add v3.8h,  v1.8h, v20.8h
-// shift insert index to upper byte
-sli v1.8h,  v3.8h, #8
-// table
-tbxv2.16b, {v16.16b-v19.16b}, v1.16b
-// src[x] + table
-add v1.8h,  v0.8h, v2.8h
-// clip + narrow
-sqxtun  v4.8b,  v1.8h
-// store
-st1{v4.8b}, [x0]
-// done 8 pixels
-subsw8, w8,  #8
+ld1 {v2.8b}, [x1]  // dst[x] = 
av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+uxtlv0.8h,  v2.8b  // load src[x]
+ushrv2.8h,  v0.8h, #3  // >> BIT_DEPTH - 3
+shl v1.8h,  v2.8h, #1  // low (x2, accessing short)
+add v3.8h,  v1.8h, v20.8h  // +1 access upper short
+sli v1.8h,  v3.8h, #8  // shift insert index to 
upper byte
+tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table
+add v1.8h,  v0.8h, v2.8h   // src[x] + table
+sqxtun  v4.8b,  v1.8h  // clip + narrow
+st1 {v4.8b}, [x0]  // store
+subsw8, w8,  #8// done 8 pixels
 bne 2b
-// finished line
-subsw7, w7,  #1
-add x0, x0,  x2 // dst += stride_dst
-add x1, x1,  x3 // src += stride_src
+subsw7, w7,  #1// finished line, prep. new
+add x0, x0,  x2// dst += stride_dst
+add x1, x1,  x3// src += stride_src
 bne 1b
 ret
 endfunc
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3 6/6] lavc/aarch64: add hevc sao band 8x8 tiling

2022-01-03 Thread J. Dekker

--bench on AWS Graviton:

hevc_sao_band_8x8_8_c: 317.5
hevc_sao_band_8x8_8_neon: 97.5
hevc_sao_band_16x16_8_c: 1115.0
hevc_sao_band_16x16_8_neon: 322.7
hevc_sao_band_32x32_8_c: 4599.2
hevc_sao_band_32x32_8_neon: 1246.2
hevc_sao_band_48x48_8_c: 10021.7
hevc_sao_band_48x48_8_neon: 2740.5
hevc_sao_band_64x64_8_c: 17635.0
hevc_sao_band_64x64_8_neon: 4875.7

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  6 +-
 libavcodec/aarch64/hevcdsp_sao_neon.S | 11 +++
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b93cec9e44..2002530266 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -77,7 +77,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[1]  = ff_hevc_idct_8x8_dc_8_neon;
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
-c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_band_filter[0]  =
+c->sao_band_filter[1]  =
+c->sao_band_filter[2]  =
+c->sao_band_filter[3]  =
+c->sao_band_filter[4]  = ff_hevc_sao_band_filter_8x8_8_neon;
 c->sao_edge_filter[0]  = ff_hevc_sao_edge_filter_8x8_8_neon;
 c->sao_edge_filter[1]  =
 c->sao_edge_filter[2]  =
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index 73b0b3b056..d524323fe8 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -3,7 +3,7 @@
  *
  * AArch64 NEON optimised SAO functions for HEVC decoding
  *
- * Copyright (c) 2020 Josh Dekker 
+ * Copyright (c) 2020-2021  J. Dekker 
  *
  * This file is part of FFmpeg.
  *
@@ -35,6 +35,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 stp xzr, xzr, [sp, #32]
 stp xzr, xzr, [sp, #48]
 mov w8,  #4
+sxtwx6,  w6
 0:  ldrsh   x9, [x4,  x8, lsl #1]  // sao_offset_val[k+1]
 subsw8,  w8,  #1
 add w10, w8,  w5   // k + sao_left_class
@@ -43,7 +44,9 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 bne 0b
 ld1 {v16.16b-v19.16b}, [sp], #64
 moviv20.8h,   #1
-1:  mov w8,  w6// beginning of line
+sub x2,  x2, x6// stride_dst - width
+sub x3,  x3, x6// stride_src - width
+1:  mov x8,  x6// beginning of line
 2:  // Simple layout for accessing 16bit values
 // with 8bit LUT.
 //
@@ -52,7 +55,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|
 // +--->
 //i-0 i-1 i-2 i-3
-ld1 {v2.8b}, [x1]  // dst[x] = 
av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+ld1 {v2.8b}, [x1], #8  // dst[x] = 
av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 uxtlv0.8h,  v2.8b  // load src[x]
 ushrv2.8h,  v0.8h, #3  // >> BIT_DEPTH - 3
 shl v1.8h,  v2.8h, #1  // low (x2, accessing short)
@@ -61,7 +64,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table
 add v1.8h,  v0.8h, v2.8h   // src[x] + table
 sqxtun  v4.8b,  v1.8h  // clip + narrow
-st1 {v4.8b}, [x0]  // store
+st1 {v4.8b}, [x0], #8  // store
 subsw8, w8,  #8// done 8 pixels
 bne 2b
 subsw7, w7,  #1// finished line, prep. new
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 6/6] lavc/aarch64: add hevc sao band 8x8 tiling

2022-01-04 Thread J. Dekker

On 4 Jan 2022, at 10:41, Martin Storsjö wrote:

> On Tue, 4 Jan 2022, J. Dekker wrote:
>
> [...]
>
> LGTM, please push.
>
> // Martin
>

Thanks, pushed.

-- 
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] lavc/aarch64: add hevc qpel assembly

2022-01-20 Thread J. Dekker

Based on patch by: Rafal Dabrowa 
---
 libavcodec/aarch64/Makefile   |1 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   69 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S| 2799 +
 3 files changed, 2869 insertions(+)

 Some changes since last time it was submitted, namely: split, macro'd
 and some scheduling and other improvements.

create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..8592692479 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -63,4 +63,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9mc_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
aarch64/hevcdsp_init_aarch64.o  
\
+   aarch64/hevcdsp_qpel_neon.o 
\
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..69f0d9bc6f 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,63 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   int16_t *sao_offset_val, int sao_left_class,
   int width, int height);
 
+#define NEON8_FNPROTO(fn, args) \
+void ff_hevc_put_hevc_##fn##4_8_neon args; \
+void ff_hevc_put_hevc_##fn##6_8_neon args; \
+void ff_hevc_put_hevc_##fn##8_8_neon args; \
+void ff_hevc_put_hevc_##fn##12_8_neon args; \
+void ff_hevc_put_hevc_##fn##16_8_neon args; \
+void ff_hevc_put_hevc_##fn##24_8_neon args; \
+void ff_hevc_put_hevc_##fn##32_8_neon args; \
+void ff_hevc_put_hevc_##fn##48_8_neon args; \
+void ff_hevc_put_hevc_##fn##64_8_neon args; \
 
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst,  ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst,  ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+int height, intptr_t mx, intptr_t my, int width));
+
+#define NEON8_FNASSIGN(member, v, h, fn) \
+member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon;  \
+member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon;  \
+member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon;  \
+member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -80,6 +136,19 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 // for the current size, but if enabled for bigger sizes, the cases
 // of non-multiple of 8 seem to arise.
 //c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+
+NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels);
+NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+NEON8_FNASSIGN(c

[FFmpeg-devel] [PATCH v2 1/2] lavc/aarch64: add hevc qpel assembly

2022-02-03 Thread J. Dekker

Thanks: Rafal Dabrowa 
---
 libavcodec/aarch64/Makefile   |1 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   67 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S| 2799 +
 3 files changed, 2867 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

 Had trouble testing on a Linux machine as well, but have a workflow
 set up for that now so should be easier in the future. Passes FATE on
 both macOS and Linux.

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..8592692479 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -63,4 +63,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9mc_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
aarch64/hevcdsp_init_aarch64.o  
\
+   aarch64/hevcdsp_qpel_neon.o 
\
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..3e5d85247e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,63 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   int16_t *sao_offset_val, int sao_left_class,
   int width, int height);
 
+#define NEON8_FNPROTO(fn, args) \
+void ff_hevc_put_hevc_##fn##4_8_neon args; \
+void ff_hevc_put_hevc_##fn##6_8_neon args; \
+void ff_hevc_put_hevc_##fn##8_8_neon args; \
+void ff_hevc_put_hevc_##fn##12_8_neon args; \
+void ff_hevc_put_hevc_##fn##16_8_neon args; \
+void ff_hevc_put_hevc_##fn##24_8_neon args; \
+void ff_hevc_put_hevc_##fn##32_8_neon args; \
+void ff_hevc_put_hevc_##fn##48_8_neon args; \
+void ff_hevc_put_hevc_##fn##64_8_neon args; \
 
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst,  ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst,  ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+int height, intptr_t mx, intptr_t my, int width));
+
+#define NEON8_FNASSIGN(member, v, h, fn) \
+member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon;  \
+member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon;  \
+member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon;  \
+member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -80,6 +136,17 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 // for the current size, but if enabled for bigger sizes, the cases
 // of non-multiple of 8 seem to arise.
 //c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+
+NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1,

[FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc epel assembly

2022-02-03 Thread J. Dekker

Thanks: Rafal Dabrowa 
---
 libavcodec/aarch64/Makefile   |3 +-
 libavcodec/aarch64/hevcdsp_epel_neon.S| 2501 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   52 +
 3 files changed, 2555 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 8592692479..ebedc03bfa 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -61,7 +61,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9lpf_neon.o   
\
aarch64/vp9mc_16bpp_neon.o  
\
aarch64/vp9mc_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
+NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_epel_neon.o 
\
+   aarch64/hevcdsp_idct_neon.o 
\
aarch64/hevcdsp_init_aarch64.o  
\
aarch64/hevcdsp_qpel_neon.o 
\
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S
new file mode 100644
index 00..bbf93c3d6a
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -0,0 +1,2501 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
+mov x7, #(MAX_PB_SIZE * 2)
+1:  ld1{v0.s}[0], [x1], x2
+ushll   v4.8h, v0.8b, #6
+subsw3, w3, #1
+st1{v4.d}[0], [x0], x7
+b.ne1b
+ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
+mov x7, #(MAX_PB_SIZE * 2 - 8)
+1:  ld1{v0.8b}, [x1], x2
+ushll   v4.8h, v0.8b, #6
+st1{v4.d}[0], [x0], #8
+subsw3, w3, #1
+st1{v4.s}[2], [x0], x7
+b.ne1b
+ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
+mov x7, #(MAX_PB_SIZE * 2)
+1:  ld1{v0.8b}, [x1], x2
+ushll   v4.8h, v0.8b, #6
+subsw3, w3, #1
+st1{v4.8h}, [x0], x7
+b.ne1b
+ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
+mov x7, #(MAX_PB_SIZE * 2 - 16)
+1:  ld1{v0.8b, v1.8b}, [x1], x2
+ushll   v4.8h, v0.8b, #6
+st1{v4.8h}, [x0], #16
+ushll   v5.8h, v1.8b, #6
+subsw3, w3, #1
+st1{v5.d}[0], [x0], x7
+b.ne1b
+ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
+mov x7, #(MAX_PB_SIZE * 2)
+1:  ld1{v0.8b, v1.8b}, [x1], x2
+ushll   v4.8h, v0.8b, #6
+ushll   v5.8h, v1.8b, #6
+subsw3, w3, #1
+st1{v4.8h, v5.8h}, [x0], x7
+b.ne1b
+ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
+mov x7, #(MAX_PB_SIZE * 2)
+1:  ld1{v0.8b-v2.8b}, [x1], x2
+ushll   v4.8h, v0.8b, #6
+ushll   v5.8h, v1.8b, #6
+ushll   v6.8h, v2.8b, #6
+subsw3, w3, #1
+st1{v4.8h-v6.8h}, [x0], x7
+b.ne1b
+ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
+mov x7, #(MAX_PB_SIZE * 2)
+1:  ld1{v0.8b-v3.8b}, [x1], x2
+ushll   v4.8h, v0.8b, #6
+ushll   v5.8h, v1.8b, #6
+ushll   v6.8h, v2.8b, #6
+ushll   v7.8h, v3.8b, #6
+subsw3, w3, #1
+st

[FFmpeg-devel] [PATCH] tools: add general_assembly.pl

2022-02-15 Thread J. Dekker

This script generates the current general assembly voters according to
the criteria of '20 code commits in the last 36 months'.

Signed-off-by: J. Dekker 
---

 This was rejected last time, but I would really like to get this into the
 tools, or at least publicly recorded on the mailing list, since the script
 was updated.

 tools/general_assembly.pl | 45 +++
 1 file changed, 45 insertions(+)
 create mode 100644 tools/general_assembly.pl

diff --git a/tools/general_assembly.pl b/tools/general_assembly.pl
new file mode 100644
index 00..c3aea14d79
--- /dev/null
+++ b/tools/general_assembly.pl
@@ -0,0 +1,45 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+use POSIX qw(strftime);
+use Encode qw(decode);
+use Data::Dumper;
+
+sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s };
+
+my @shortlog = split /\n/, decode('UTF-8', `git log --pretty=format:"%aN 
<%aE>" --since="last 36 months" | sort | uniq -c | sort -r`, Encode::FB_CROAK);
+my %assembly = ();
+
+foreach my $line (@shortlog) {
+my ($count, $name, $email) = $line =~ m/^ *(\d+) *(.*?) <(.*?)>/;
+
+if ($count < 20) {
+next;
+}
+
+$name = trim $name;
+
+# assume people with 50 commits have at least 20 source commits
+if ($count < 50) {
+my $true = 0;
+my @commits = split /(^|\n)commit [a-z0-9]{40}(\n|$)/, decode('UTF-8', 
`git log --name-only --use-mailmap --author="$email" --since="last 36 months"`, 
Encode::FB_CROAK);
+foreach my $commit (@commits) {
+if ($commit =~ /\n[\w\/]+\.(c|h|S|asm)/) {
+$true++;
+}
+}
+
+if ($true < 20) {
+next;
+}
+}
+
+$assembly{$name} = $email;
+}
+
+printf("# %s %s", strftime("%Y-%m-%d", localtime), decode('UTF-8', `git 
rev-parse HEAD`, Encode::FB_CROAK));
+foreach my $email (sort values %assembly) {
+printf("%s\n", $email);
+}
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] tools: add general_assembly.pl

2022-02-15 Thread J, Dekker


On 15/02/2022 17:46, Jean-Baptiste Kempf wrote:

Hello,

On Tue, 15 Feb 2022, at 12:50, J. Dekker wrote:

  This was rejected last time, but I would really like to get this into the
  tools, or at least publicly recorded on the mailing list, since the script
  was updated.


Why was this rejected?


It was blocked by Nicolas as the procedure to determine the general 
assembly was not fully decided.



+foreach my $commit (@commits) {
+if ($commit =~ /\n[\w\/]+\.(c|h|S|asm)/) {
+$true++;
+}


Why do we filter on those file types? .md, .pl and other things for docs are 
active in the community.


It was just discussed at the in-person meeting; our documentation states 
otherwise, and I think we should stick to the documentation here. It 
doesn't make much sense to filter out people whose commits are mostly 
documentation either -- 20 documentation commits is still a 
significant enough contribution.


See updated patch.

--
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 1/2] tools: add general_assembly.pl

2022-02-15 Thread J. Dekker

This script generates the current general assembly voters according to
the criteria of '20 commits in the last 36 months'.

Signed-off-by: J. Dekker 
---
 doc/dev_community/community.md |  3 +++
 tools/general_assembly.pl  | 40 ++
 2 files changed, 43 insertions(+)
 create mode 100644 tools/general_assembly.pl

diff --git a/doc/dev_community/community.md b/doc/dev_community/community.md
index 21e08e20e3..516ca5c05e 100644
--- a/doc/dev_community/community.md
+++ b/doc/dev_community/community.md
@@ -25,6 +25,9 @@ proposal by a member of the General Assembly.
 They are part of the GA for two years, after which they need a confirmation by
 the GA.
 
+A script to generate the current members of the general assembly (minus members
+voted in) can be found in `tools/general_assembly.pl`.
+
 ## Voting
 
 Voting is done using a ranked voting system, currently running on 
https://vote.ffmpeg.org/ .
diff --git a/tools/general_assembly.pl b/tools/general_assembly.pl
new file mode 100644
index 00..898a6262ef
--- /dev/null
+++ b/tools/general_assembly.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+use POSIX qw(strftime);
+use Encode qw(decode);
+use Data::Dumper;
+
+sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s };
+
+my @shortlog = split /\n/, decode('UTF-8', `git log --pretty=format:"%aN 
<%aE>" --since="last 36 months" | sort | uniq -c | sort -r`, Encode::FB_CROAK);
+my %assembly = ();
+
+foreach my $line (@shortlog) {
+my ($count, $name, $email) = $line =~ m/^ *(\d+) *(.*?) <(.*?)>/;
+if ($count < 20) {
+next;
+}
+
+$name = trim $name;
+if ($count < 50) {
+my $true = 0;
+my @commits = split /(^|\n)commit [a-z0-9]{40}(\n|$)/, decode('UTF-8', 
`git log --name-only --use-mailmap --author="$email" --since="last 36 months"`, 
Encode::FB_CROAK);
+foreach my $commit (@commits) {
+$true++; # if ($commit =~ /\n[\w\/]+\.(c|h|S|asm|texi)/);
+}
+
+if ($true < 20) {
+next;
+}
+}
+
+$assembly{$name} = $email;
+}
+
+printf("# %s %s", strftime("%Y-%m-%d", localtime), decode('UTF-8', `git 
rev-parse HEAD`, Encode::FB_CROAK));
+foreach my $email (sort values %assembly) {
+printf("%s\n", $email);
+}
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] mailmap: update entry

2022-02-15 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 .mailmap | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.mailmap b/.mailmap
index ba072f38c8..5544fc5b5c 100644
--- a/.mailmap
+++ b/.mailmap
@@ -8,7 +8,8 @@
  
  
  
- 
+ 
+J. Dekker  
  
  
  
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 1/2] tools: add general_assembly.pl

2022-03-01 Thread J. Dekker

On 24 Feb 2022, at 14:46, Nicolas George wrote:

> Thilo Borgman (12022-02-24):
>> Both LGTM.
>
> Does it?

An earlier reply would have been helpful here, i.e. before an ACK rather than 
after.

> The way I remember it, this condition was for the initial assembly, as a
> temporary measure. I might have missed some steps, but I do not remember
> we adopted this as a constant rule.
>
> I see several flaws with it:
>
> - Documentation patches were not counted, now they are.
>
> - Cosmetic patches are counted.
>
> - It does not take the size of the changes.
>
> Using an imperfect solution for bootstrap is one thing, using it
> permanently is another.
>

It is only supposed to match the documentation. Of course, nothing says it 
can't be changed later when the 'bootstrap' process is completed (and let's 
consider that the last time this was actually discussed was in Tokyo; I'm not 
sure what good delaying it further does).

You are welcome to send a patch to our documentation to clarify the rules if 
you think they are not correct. However, I personally think that any 
contribution to the project is useful and documentation & cosmetic improvements 
should not be penalized needlessly.

-- 
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] lavu/tests/opts: add tests for filepath options

2022-03-07 Thread J. Dekker

On 5 Mar 2022, at 20:16, Michael Niedermayer wrote:

> On Fri, Mar 04, 2022 at 04:03:07PM +0100, Niklas Haas wrote:
>> From: Niklas Haas 
>>
>> Using the venerable HEADER.txt as a small file to load.
>> ---
>>  libavutil/tests/opt.c| 38 +-
>>  tests/fate/libavutil.mak |  2 +-
>>  tests/ref/fate/opt   |  4 
>>  3 files changed, 42 insertions(+), 2 deletions(-)
>
> Please add tests which tries to load
> id_rsa
> ~/.ssh/id_rsa
> shadow
> /etc/shadow
> .bash_history
> ...
>
> The idea here is of course that such attempts fail

There is absolutely no way we can or should try to implement a path based 
blacklist. Untrusted inputs should be sanitised externally by whichever script 
is being used to call ffmpeg.

> Also document the security implications of this feature in
> doc/APIchanges / release notes if there is a security implication
>
> Adjusting the parameters of most components could previously
> not read arbitrary files so an application could previously
> pass a string from an untrusted user to it.
> If this changes it needs to be justified and documented
> If it doesn't change and it's still safe that should be documented.
> If it depends on whitelists and callbacks that should be actually implemented
> in ffmpeg and the relevant examples
>
> And i do like this feature, if it can be done without security issues

There aren't any extra security implications here: if a user is allowed to 
specify filter arguments themselves, then they can already use the movie/amovie 
filters etc. This new option is just a way to unify how filters that already 
need (or will need) to load files can do so.

-- 
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/3] checkasm: collapse hevc pel tests

2021-08-05 Thread J. Dekker

Also add to `make fate-checkasm' target.

Signed-off-by: J. Dekker 
---
 tests/checkasm/checkasm.c | 11 +--
 tests/checkasm/checkasm.h | 11 +--
 tests/checkasm/hevc_pel.c | 34 --
 tests/fate/checkasm.mak   |  1 +
 4 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index be5c17cd2a..b1353f7cbe 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -116,16 +116,7 @@ static const struct {
 #if CONFIG_HEVC_DECODER
 { "hevc_add_res", checkasm_check_hevc_add_res },
 { "hevc_idct", checkasm_check_hevc_idct },
-{ "hevc_qpel", checkasm_check_hevc_qpel },
-{ "hevc_qpel_uni", checkasm_check_hevc_qpel_uni },
-{ "hevc_qpel_uni_w", checkasm_check_hevc_qpel_uni_w },
-{ "hevc_qpel_bi", checkasm_check_hevc_qpel_bi },
-{ "hevc_qpel_bi_w", checkasm_check_hevc_qpel_bi_w },
-{ "hevc_epel", checkasm_check_hevc_epel },
-{ "hevc_epel_uni", checkasm_check_hevc_epel_uni },
-{ "hevc_epel_uni_w", checkasm_check_hevc_epel_uni_w },
-{ "hevc_epel_bi", checkasm_check_hevc_epel_bi },
-{ "hevc_epel_bi_w", checkasm_check_hevc_epel_bi_w },
+{ "hevc_pel", checkasm_check_hevc_pel },
 { "hevc_sao", checkasm_check_hevc_sao },
 #endif
 #if CONFIG_HUFFYUV_DECODER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index b747ed1986..68b0697d3e 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -61,16 +61,7 @@ void checkasm_check_h264pred(void);
 void checkasm_check_h264qpel(void);
 void checkasm_check_hevc_add_res(void);
 void checkasm_check_hevc_idct(void);
-void checkasm_check_hevc_qpel(void);
-void checkasm_check_hevc_qpel_uni(void);
-void checkasm_check_hevc_qpel_uni_w(void);
-void checkasm_check_hevc_qpel_bi(void);
-void checkasm_check_hevc_qpel_bi_w(void);
-void checkasm_check_hevc_epel(void);
-void checkasm_check_hevc_epel_uni(void);
-void checkasm_check_hevc_epel_uni_w(void);
-void checkasm_check_hevc_epel_bi(void);
-void checkasm_check_hevc_epel_bi_w(void);
+void checkasm_check_hevc_pel(void);
 void checkasm_check_hevc_sao(void);
 void checkasm_check_huffyuvdsp(void);
 void checkasm_check_jpeg2000dsp(void);
diff --git a/tests/checkasm/hevc_pel.c b/tests/checkasm/hevc_pel.c
index 4d1545e467..ec24309081 100644
--- a/tests/checkasm/hevc_pel.c
+++ b/tests/checkasm/hevc_pel.c
@@ -65,7 +65,7 @@ static const int offsets[] = {0, 255, -1 };
 #define src0 (buf0 + 2 * 4 * MAX_PB_SIZE) /* hevc qpel functions read data 
from negative src pointer offsets */
 #define src1 (buf1 + 2 * 4 * MAX_PB_SIZE)
 
-void checkasm_check_hevc_qpel(void)
+static void checkasm_check_hevc_qpel(void)
 {
 LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
 LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
@@ -109,7 +109,7 @@ void checkasm_check_hevc_qpel(void)
 report("qpel");
 }
 
-void checkasm_check_hevc_qpel_uni(void)
+static void checkasm_check_hevc_qpel_uni(void)
 {
 LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
 LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
@@ -150,7 +150,7 @@ void checkasm_check_hevc_qpel_uni(void)
 report("qpel_uni");
 }
 
-void checkasm_check_hevc_qpel_uni_w(void)
+static void checkasm_check_hevc_qpel_uni_w(void)
 {
 LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
 LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
@@ -198,7 +198,7 @@ void checkasm_check_hevc_qpel_uni_w(void)
 report("qpel_uni_w");
 }
 
-void checkasm_check_hevc_qpel_bi(void)
+static void checkasm_check_hevc_qpel_bi(void)
 {
 LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
 LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
@@ -242,7 +242,7 @@ void checkasm_check_hevc_qpel_bi(void)
 report("qpel_bi");
 }
 
-void checkasm_check_hevc_qpel_bi_w(void)
+static void checkasm_check_hevc_qpel_bi_w(void)
 {
 LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
 LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
@@ -294,7 +294,7 @@ void checkasm_check_hevc_qpel_bi_w(void)
 report("qpel_bi_w");
 }
 
-void checkasm_check_hevc_epel(void)
+static void checkasm_check_hevc_epel(void)
 {
 LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
 LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
@@ -338,7 +338,7 @@ void checkasm_check_hevc_epel(void)
 report("epel");
 }
 
-void checkasm_check_hevc_epel_uni(void)
+static void checkasm_check_hevc_epel_uni(void)
 {
 LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
 LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
@@ -379,7 +379,7 @@ void checkasm_check_hevc_epel_uni(void)
 report("epel_uni");
 }
 
-void checkasm_check_hevc_epel_uni_w(void)
+static void checkasm_check_hevc_epel_uni_w(void)
 {
 LOCAL_ALIGNED_32(uint8_t,

[FFmpeg-devel] [PATCH 2/3] checkasm: add h264 chroma test

2021-08-05 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 tests/checkasm/Makefile |   1 +
 tests/checkasm/checkasm.c   |   3 +
 tests/checkasm/checkasm.h   |   1 +
 tests/checkasm/h264chroma.c | 109 
 tests/fate/checkasm.mak |   1 +
 5 files changed, 115 insertions(+)
 create mode 100644 tests/checkasm/h264chroma.c

What should I do for other codecs here, or just ignore non-h264?

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 4ef5fa87da..41222c3827 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -6,6 +6,7 @@ AVCODECOBJS-$(CONFIG_BSWAPDSP)  += bswapdsp.o
 AVCODECOBJS-$(CONFIG_FLACDSP)   += flacdsp.o
 AVCODECOBJS-$(CONFIG_FMTCONVERT)+= fmtconvert.o
 AVCODECOBJS-$(CONFIG_G722DSP)   += g722dsp.o
+AVCODECOBJS-$(CONFIG_H264CHROMA)+= h264chroma.o
 AVCODECOBJS-$(CONFIG_H264DSP)   += h264dsp.o
 AVCODECOBJS-$(CONFIG_H264PRED)  += h264pred.o
 AVCODECOBJS-$(CONFIG_H264QPEL)  += h264qpel.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index b1353f7cbe..154c4a5c01 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -104,6 +104,9 @@ static const struct {
 #if CONFIG_G722DSP
 { "g722dsp", checkasm_check_g722dsp },
 #endif
+#if CONFIG_H264CHROMA
+{ "h264chroma", checkasm_check_h264chroma },
+#endif
 #if CONFIG_H264DSP
 { "h264dsp", checkasm_check_h264dsp },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 68b0697d3e..ac2f22af05 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -56,6 +56,7 @@ void checkasm_check_flacdsp(void);
 void checkasm_check_float_dsp(void);
 void checkasm_check_fmtconvert(void);
 void checkasm_check_g722dsp(void);
+void checkasm_check_h264chroma(void);
 void checkasm_check_h264dsp(void);
 void checkasm_check_h264pred(void);
 void checkasm_check_h264qpel(void);
diff --git a/tests/checkasm/h264chroma.c b/tests/checkasm/h264chroma.c
new file mode 100644
index 00..0bd27d
--- /dev/null
+++ b/tests/checkasm/h264chroma.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2015 Henrik Gramner
+ * Copyright (c) 2021 J. Dekker
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264chroma.h"
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+// static const int codec_ids[4] = { AV_CODEC_ID_H264, AV_CODEC_ID_VP8, 
AV_CODEC_ID_RV40, AV_CODEC_ID_SVQ3 };
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x01ff01ff, 0x03ff03ff };
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define BUF_SIZE (3 * 16 * 17)
+
+#define randomize_buffers()\
+do {   \
+uint32_t mask = pixel_mask[bit_depth - 8]; \
+int i; \
+for (i = 0; i < BUF_SIZE; i += 4) {\
+uint32_t r = rnd() & mask; \
+AV_WN32A(buf0 + i, r); \
+AV_WN32A(buf1 + i, r); \
+}  \
+} while (0)
+
+#define src0 (buf0 + 4 * 16) /* Offset to allow room for top and left */
+#define src1 (buf1 + 4 * 16)
+
+static void check_avg()
+{
+int i, bit_depth, x, y;
+LOCAL_ALIGNED_16(uint8_t, buf0, [BUF_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, buf1, [BUF_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, dst0, [BUF_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, dst1, [BUF_SIZE]);
+H264ChromaContext h;
+
+declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, 
ptrdiff_t stride, int h, int x, int y);
+for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
+ff_h264chroma_init(&h, bit_depth);
+for (i = 0; i < 4; i++) {
+if (check_func(h.avg_h264_chroma_pixels_tab[i], 
"avg_chroma_mc%d_%d", 1 << (3 - i), bit_depth)) {
+randomize_buffers();
+x = rnd() & 0x7; y = rnd() & 0x7;

[FFmpeg-devel] [PATCH 3/3] checkasm: add hevc_deblock tests

2021-08-05 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 0001-checkasm-add-hevc_deblock-tests.patch | 186 +
 tests/checkasm/Makefile|   2 +-
 tests/checkasm/checkasm.c  |   1 +
 tests/checkasm/checkasm.h  |   1 +
 tests/checkasm/hevc_deblock.c  | 126 ++
 tests/fate/checkasm.mak|   1 +
 6 files changed, 316 insertions(+), 1 deletion(-)
 create mode 100644 0001-checkasm-add-hevc_deblock-tests.patch
 create mode 100644 tests/checkasm/hevc_deblock.c

diff --git a/0001-checkasm-add-hevc_deblock-tests.patch 
b/0001-checkasm-add-hevc_deblock-tests.patch
new file mode 100644
index 00..29441e53f6
--- /dev/null
+++ b/0001-checkasm-add-hevc_deblock-tests.patch
@@ -0,0 +1,186 @@
+From a8b2d2259cb8ca5e30b2efae8aa8813eaa768615 Mon Sep 17 00:00:00 2001
+From: "J. Dekker" 
+Date: Mon, 28 Jun 2021 04:46:49 +0200
+Subject: [PATCH] checkasm: add hevc_deblock tests
+
+Signed-off-by: J. Dekker 
+---
+ tests/checkasm/Makefile   |   2 +-
+ tests/checkasm/checkasm.c |   1 +
+ tests/checkasm/checkasm.h |   1 +
+ tests/checkasm/hevc_deblock.c | 126 ++
+ 4 files changed, 129 insertions(+), 1 deletion(-)
+ create mode 100644 tests/checkasm/hevc_deblock.c
+
+diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
+index 4ef5fa87da..481a96e36e 100644
+--- a/tests/checkasm/Makefile
 b/tests/checkasm/Makefile
+@@ -24,7 +24,7 @@ AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER)   += huffyuvdsp.o
+ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
+ AVCODECOBJS-$(CONFIG_OPUS_DECODER)  += opusdsp.o
+ AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
+-AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o hevc_pel.o
++AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_deblock.o 
hevc_idct.o hevc_sao.o hevc_pel.o
+ AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
+ AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
+ AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
+diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
+index eb9e2cd677..1fe67007cc 100644
+--- a/tests/checkasm/checkasm.c
 b/tests/checkasm/checkasm.c
+@@ -115,6 +115,7 @@ static const struct {
+ #endif
+ #if CONFIG_HEVC_DECODER
+ { "hevc_add_res", checkasm_check_hevc_add_res },
++{ "hevc_deblock", checkasm_check_hevc_deblock },
+ { "hevc_idct", checkasm_check_hevc_idct },
+ { "hevc_qpel", checkasm_check_hevc_qpel },
+ { "hevc_qpel_uni", checkasm_check_hevc_qpel_uni },
+diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
+index b747ed1986..8ecb4f0cf0 100644
+--- a/tests/checkasm/checkasm.h
 b/tests/checkasm/checkasm.h
+@@ -60,6 +60,7 @@ void checkasm_check_h264dsp(void);
+ void checkasm_check_h264pred(void);
+ void checkasm_check_h264qpel(void);
+ void checkasm_check_hevc_add_res(void);
++void checkasm_check_hevc_deblock(void);
+ void checkasm_check_hevc_idct(void);
+ void checkasm_check_hevc_qpel(void);
+ void checkasm_check_hevc_qpel_uni(void);
+diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
+new file mode 100644
+index 00..98f612921b
+--- /dev/null
 b/tests/checkasm/hevc_deblock.c
+@@ -0,0 +1,126 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#include "libavcodec/avcodec.h"
++#include "libavcodec/hevcdsp.h"
++
++#include "checkasm.h"
++
++static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
++
++#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
++#define BUF_STRIDE (8 * 2)
++#define BUF_LINES (8)
++#define BUF_OFFSET (BUF_STRIDE * BUF_LINES)
++#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
++
++#define randomize_buffers(buf0, buf1, size) \
++do {\
++uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
++int k;  \
++for (k = 0; k < size; k += 4)

[FFmpeg-devel] [PATCH v2 2/3] checkasm: add h264 chroma test

2021-08-05 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 tests/checkasm/Makefile |   1 +
 tests/checkasm/checkasm.c   |   3 +
 tests/checkasm/checkasm.h   |   1 +
 tests/checkasm/h264chroma.c | 109 
 tests/fate/checkasm.mak |   1 +
 5 files changed, 115 insertions(+)
 create mode 100644 tests/checkasm/h264chroma.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 4ef5fa87da..41222c3827 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -6,6 +6,7 @@ AVCODECOBJS-$(CONFIG_BSWAPDSP)  += bswapdsp.o
 AVCODECOBJS-$(CONFIG_FLACDSP)   += flacdsp.o
 AVCODECOBJS-$(CONFIG_FMTCONVERT)+= fmtconvert.o
 AVCODECOBJS-$(CONFIG_G722DSP)   += g722dsp.o
+AVCODECOBJS-$(CONFIG_H264CHROMA)+= h264chroma.o
 AVCODECOBJS-$(CONFIG_H264DSP)   += h264dsp.o
 AVCODECOBJS-$(CONFIG_H264PRED)  += h264pred.o
 AVCODECOBJS-$(CONFIG_H264QPEL)  += h264qpel.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index b1353f7cbe..154c4a5c01 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -104,6 +104,9 @@ static const struct {
 #if CONFIG_G722DSP
 { "g722dsp", checkasm_check_g722dsp },
 #endif
+#if CONFIG_H264CHROMA
+{ "h264chroma", checkasm_check_h264chroma },
+#endif
 #if CONFIG_H264DSP
 { "h264dsp", checkasm_check_h264dsp },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 68b0697d3e..ac2f22af05 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -56,6 +56,7 @@ void checkasm_check_flacdsp(void);
 void checkasm_check_float_dsp(void);
 void checkasm_check_fmtconvert(void);
 void checkasm_check_g722dsp(void);
+void checkasm_check_h264chroma(void);
 void checkasm_check_h264dsp(void);
 void checkasm_check_h264pred(void);
 void checkasm_check_h264qpel(void);
diff --git a/tests/checkasm/h264chroma.c b/tests/checkasm/h264chroma.c
new file mode 100644
index 0000000000..1f9773e345
--- /dev/null
+++ b/tests/checkasm/h264chroma.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2015 Henrik Gramner
+ * Copyright (c) 2021 J. Dekker
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264chroma.h"
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+// static const int codec_ids[4] = { AV_CODEC_ID_H264, AV_CODEC_ID_VP8, 
AV_CODEC_ID_RV40, AV_CODEC_ID_SVQ3 };
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x01ff01ff, 0x03ff03ff };
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define BUF_SIZE (3 * 16 * 17)
+
+#define randomize_buffers()\
+do {   \
+uint32_t mask = pixel_mask[bit_depth - 8]; \
+int i; \
+for (i = 0; i < BUF_SIZE; i += 4) {\
+uint32_t r = rnd() & mask; \
+AV_WN32A(buf0 + i, r); \
+AV_WN32A(buf1 + i, r); \
+}  \
+} while (0)
+
+#define src0 (buf0 + 4 * 16) /* Offset to allow room for top and left */
+#define src1 (buf1 + 4 * 16)
+
+static void check_avg(void)
+{
+int i, bit_depth, x, y;
+LOCAL_ALIGNED_16(uint8_t, buf0, [BUF_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, buf1, [BUF_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, dst0, [BUF_SIZE]);
+LOCAL_ALIGNED_16(uint8_t, dst1, [BUF_SIZE]);
+H264ChromaContext h;
+
+declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, 
ptrdiff_t stride, int h, int x, int y);
+for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
+ff_h264chroma_init(&h, bit_depth);
+for (i = 0; i < 4; i++) {
+if (check_func(h.avg_h264_chroma_pixels_tab[i], 
"avg_chroma_mc%d_%d", 1 << (3 - i), bit_depth)) {
+randomize_buffers();
+x = rnd() & 0x7; y = rnd() & 0x7;
+call_ref(dst0, src0, 8, 4, x, y);
+

[FFmpeg-devel] [PATCH v2 3/3] checkasm: add hevc_deblock tests

2021-08-05 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 tests/checkasm/Makefile   |   2 +-
 tests/checkasm/checkasm.c |   1 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/hevc_deblock.c | 126 ++
 tests/fate/checkasm.mak   |   1 +
 5 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 tests/checkasm/hevc_deblock.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 41222c3827..862142d8e6 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -25,7 +25,7 @@ AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER)   += huffyuvdsp.o
 AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_OPUS_DECODER)  += opusdsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
-AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o hevc_pel.o
+AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_deblock.o 
hevc_idct.o hevc_sao.o hevc_pel.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
 AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 154c4a5c01..a1e8c4d92e 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -118,6 +118,7 @@ static const struct {
 #endif
 #if CONFIG_HEVC_DECODER
 { "hevc_add_res", checkasm_check_hevc_add_res },
+{ "hevc_deblock", checkasm_check_hevc_deblock },
 { "hevc_idct", checkasm_check_hevc_idct },
 { "hevc_pel", checkasm_check_hevc_pel },
 { "hevc_sao", checkasm_check_hevc_sao },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index ac2f22af05..386ecbf69a 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -61,6 +61,7 @@ void checkasm_check_h264dsp(void);
 void checkasm_check_h264pred(void);
 void checkasm_check_h264qpel(void);
 void checkasm_check_hevc_add_res(void);
+void checkasm_check_hevc_deblock(void);
 void checkasm_check_hevc_idct(void);
 void checkasm_check_hevc_pel(void);
 void checkasm_check_hevc_sao(void);
diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
new file mode 100644
index 0000000000..98f612921b
--- /dev/null
+++ b/tests/checkasm/hevc_deblock.c
@@ -0,0 +1,126 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/hevcdsp.h"
+
+#include "checkasm.h"
+
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define BUF_STRIDE (8 * 2)
+#define BUF_LINES (8)
+#define BUF_OFFSET (BUF_STRIDE * BUF_LINES)
+#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
+
+#define randomize_buffers(buf0, buf1, size) \
+do {\
+uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+int k;  \
+for (k = 0; k < size; k += 4) { \
+uint32_t r = rnd() & mask;  \
+AV_WN32A(buf0 + k, r);  \
+AV_WN32A(buf1 + k, r);  \
+}   \
+} while (0)
+
+
+static void check_deblock_luma(HEVCDSPContext h, int bit_depth)
+{
+int32_t tc[2] = { 1, 1 };
+uint8_t no_p[2] = { 0, 0 };
+uint8_t no_q[2] = { 0, 0 };
+int beta = rnd() & (0x40 - 1);
+LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
+
+declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, ptrdiff_t stride, 
int beta, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
+
+randomize_buffers(buf0, buf1, BUF_SIZE);
+if (check_func(h.hevc_h_loop_filter_luma, "hevc_h_loop_filter_luma_%d", 
bit_depth)) {
+call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, beta, tc, no_p, no_q);
+call_new(buf1 + BUF_OFFSET, BUF_STRIDE, beta, tc, no_p, no_q);
+

[FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter

2022-04-28 Thread J. Dekker

The SAO band filter can be called with widths that are not multiples of 8;
we round the width up to the nearest multiple of 8 to account for this.

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 10 +-
 libavcodec/aarch64/hevcdsp_sao_neon.S |  8 ++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..c8963e6104 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -75,11 +75,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->idct_dc[1]  = ff_hevc_idct_8x8_dc_8_neon;
 c->idct_dc[2]  = ff_hevc_idct_16x16_dc_8_neon;
 c->idct_dc[3]  = ff_hevc_idct_32x32_dc_8_neon;
-// This function is disabled, as it doesn't handle widths that aren't
-// an even multiple of 8 correctly. fate-hevc doesn't exercise that
-// for the current size, but if enabled for bigger sizes, the cases
-// of non-multiple of 8 seem to arise.
-//c->sao_band_filter[0]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_band_filter[0]  =
+c->sao_band_filter[1]  =
+c->sao_band_filter[2]  =
+c->sao_band_filter[3]  =
+c->sao_band_filter[4]  = ff_hevc_sao_band_filter_8x8_8_neon;
 }
 if (bit_depth == 10) {
 c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index d523bf584d..e07e0cea2d 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -41,7 +41,11 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 and w10, w10, #0x1F
 strhw9, [sp, x10, lsl #1]
 bne 0b
+add w6,  w6,  #7
+bic w6,  w6,  #7
 ld1 {v16.16b-v19.16b}, [sp], #64
+sub x2,  x2,  x6
+sub x3,  x3,  x6
 moviv20.8h,   #1
 1:  mov w8,  w6// beginning of line
 2:  // Simple layout for accessing 16bit values
@@ -52,7 +56,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|
 // +--->
 //i-0 i-1 i-2 i-3
-ld1 {v2.8b}, [x1]  // dst[x] = 
av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+ld1 {v2.8b}, [x1], #8  // dst[x] = 
av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 uxtlv0.8h,  v2.8b  // load src[x]
 ushrv2.8h,  v0.8h, #3  // >> BIT_DEPTH - 3
 shl v1.8h,  v2.8h, #1  // low (x2, accessing short)
@@ -61,7 +65,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table
 add v1.8h,  v0.8h, v2.8h   // src[x] + table
 sqxtun  v4.8b,  v1.8h  // clip + narrow
-st1 {v4.8b}, [x0]  // store
+st1 {v4.8b}, [x0], #8  // store
 subsw8, w8,  #8// done 8 pixels
 bne 2b
 subsw7, w7,  #1// finished line, prep. new
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/3] lavc/aarch64: add hevc sao edge 16x16

2022-04-28 Thread J. Dekker

bench on AWS Graviton:

hevc_sao_edge_16x16_8_c: 1857.0
hevc_sao_edge_16x16_8_neon: 211.0
hevc_sao_edge_32x32_8_c: 7802.2
hevc_sao_edge_32x32_8_neon: 808.2
hevc_sao_edge_48x48_8_c: 16764.2
hevc_sao_edge_48x48_8_neon: 1796.5
hevc_sao_edge_64x64_8_c: 32647.5
hevc_sao_edge_64x64_8_neon: 3118.5

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  8 ++-
 libavcodec/aarch64/hevcdsp_sao_neon.S | 66 +++
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c8963e6104..df521bb083 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   ptrdiff_t stride_dst, ptrdiff_t stride_src,
   int16_t *sao_offset_val, int sao_left_class,
   int width, int height);
-
-
+void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, 
ptrdiff_t stride_dst,
+  int16_t *sao_offset_val, int eo, int 
width, int height);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -80,6 +80,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->sao_band_filter[2]  =
 c->sao_band_filter[3]  =
 c->sao_band_filter[4]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_edge_filter[1]  =
+c->sao_edge_filter[2]  =
+c->sao_edge_filter[3]  =
+c->sao_edge_filter[4]  = ff_hevc_sao_edge_filter_16x16_8_neon;
 }
 if (bit_depth == 10) {
 c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index e07e0cea2d..0315c479df 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -74,3 +74,69 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 bne 1b
 ret
 endfunc
+
+// ASSUMES STRIDE_SRC = 192
+.Lsao_edge_pos:
+.word 1 // horizontal
+.word 192 // vertical
+.word 192 + 1 // 45 degree
+.word 192 - 1 // 135 degree
+
+// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff 
stride_dst,
+//  int16 *sao_offset_val, int eo, int 
width, int height)
+function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
+adr x7, .Lsao_edge_pos
+ld1 {v3.8h}, [x3]  // load sao_offset_val
+add w5,  w5,  #0xF
+bic w5,  w5,  #0xF
+ldr w4, [x7, w4, uxtw #2]  // stride_src
+mov v3.h[7], v3.h[0]   // reorder to [1,2,0,3,4]
+mov v3.h[0], v3.h[1]
+mov v3.h[1], v3.h[2]
+mov v3.h[2], v3.h[7]
+// split 16bit values into two tables
+uzp2v1.16b, v3.16b, v3.16b // sao_offset_val -> upper
+uzp1v0.16b, v3.16b, v3.16b // sao_offset_val -> lower
+moviv2.16b, #2
+mov x15, #192
+// strides between end of line and next src/dst
+sub x15, x15, x5   // stride_src - width
+sub x16, x2, x5// stride_dst - width
+mov x11, x1// copy base src
+1:  // new line
+mov x14, x5// copy width
+sub x12, x11, x4   // src_a (prev) = src - 
sao_edge_pos
+add x13, x11, x4   // src_b (next) = src + 
sao_edge_pos
+2:  // process 16 bytes
+ld1 {v3.16b}, [x11], #16   // load src
+ld1 {v4.16b}, [x12], #16   // load src_a (prev)
+ld1 {v5.16b}, [x13], #16   // load src_b (next)
+cmhiv16.16b, v4.16b, v3.16b// (prev > cur)
+cmhiv17.16b, v3.16b, v4.16b// (cur > prev)
+cmhiv18.16b, v5.16b, v3.16b// (next > cur)
+cmhiv19.16b, v3.16b, v5.16b// (cur > next)
+sub v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev) = 
(cur > prev) - (cur < prev)
+sub v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next) = 
(cur > next) - (cur < next)
+add v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1
+add v20.16b, v20.16b, v2.16b   // offset_val = diff + 2
+tbl v16.16b, {v0.16b}, v20.16b
+tbl v17.16b, {v1.16b}, v20.16b
+uxtlv20.8h, v3.8b

[FFmpeg-devel] [PATCH 3/3] lavc/aarch64: add hevc sao edge 8x8

2022-04-28 Thread J. Dekker

bench on AWS Graviton:

hevc_sao_edge_8x8_8_c: 516.0
hevc_sao_edge_8x8_8_neon: 81.0

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  3 ++
 libavcodec/aarch64/hevcdsp_sao_neon.S | 51 +++
 2 files changed, 54 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index df521bb083..2002530266 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -59,6 +59,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   int width, int height);
 void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, 
ptrdiff_t stride_dst,
   int16_t *sao_offset_val, int eo, int 
width, int height);
+void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t 
stride_dst,
+  int16_t *sao_offset_val, int eo, int 
width, int height);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -80,6 +82,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
 c->sao_band_filter[2]  =
 c->sao_band_filter[3]  =
 c->sao_band_filter[4]  = ff_hevc_sao_band_filter_8x8_8_neon;
+c->sao_edge_filter[0]  = ff_hevc_sao_edge_filter_8x8_8_neon;
 c->sao_edge_filter[1]  =
 c->sao_edge_filter[2]  =
 c->sao_edge_filter[3]  =
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index 0315c479df..efd8112af4 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -140,3 +140,54 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
 // no lines to filter
 ret
 endfunc
+
+// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
+//int16 *sao_offset_val, int eo, int 
width, int height)
+function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
+adr x7, .Lsao_edge_pos
+ldr w4, [x7, w4, uxtw #2]
+ld1 {v3.8h}, [x3]
+mov v3.h[7], v3.h[0]
+mov v3.h[0], v3.h[1]
+mov v3.h[1], v3.h[2]
+mov v3.h[2], v3.h[7]
+uzp2v1.16b, v3.16b, v3.16b
+uzp1v0.16b, v3.16b, v3.16b
+moviv2.16b, #2
+add x16, x0, x2
+lsl x2,  x2, #1
+mov x15, #192
+mov x8,  x1
+sub x9,  x1, x4
+add x10, x1, x4
+lsr w17, w6, #1
+1:  ld1 {v3.d}[0], [ x8], x15
+ld1 {v4.d}[0], [ x9], x15
+ld1 {v5.d}[0], [x10], x15
+ld1 {v3.d}[1], [ x8], x15
+ld1 {v4.d}[1], [ x9], x15
+ld1 {v5.d}[1], [x10], x15
+cmhiv16.16b, v4.16b, v3.16b
+cmhiv17.16b, v3.16b, v4.16b
+cmhiv18.16b, v5.16b, v3.16b
+cmhiv19.16b, v3.16b, v5.16b
+sub v20.16b, v16.16b, v17.16b
+sub v21.16b, v18.16b, v19.16b
+add v20.16b, v20.16b, v21.16b
+add v20.16b, v20.16b, v2.16b
+tbl v16.16b, {v0.16b}, v20.16b
+tbl v17.16b, {v1.16b}, v20.16b
+uxtlv20.8h, v3.8b
+uxtl2   v21.8h, v3.16b
+zip1v18.16b, v16.16b, v17.16b
+zip2v19.16b, v16.16b, v17.16b
+sqadd   v20.8h, v18.8h, v20.8h
+sqadd   v21.8h, v19.8h, v21.8h
+sqxtun  v6.8b, v20.8h
+sqxtun  v7.8b, v21.8h
+st1 {v6.8b}, [ x0], x2
+st1 {v7.8b}, [x16], x2
+subsx17, x17, #1
+b.ne1b
+ret
+endfunc
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter

2022-04-28 Thread J. Dekker

On 28 Apr 2022, at 15:42, J. Dekker wrote:

> The SAO band filter can be called with non-multiples of 8, we round up
> to the nearest multiple of 8 to account for this.

Martin mentioned he wanted checkasm to check these extra cases. Still working 
on the best way to do this, will post soon.

-- 
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: add hevc sao edge 8x8

2022-05-17 Thread J. Dekker

On 28 Apr 2022, at 21:50, Martin Storsjö wrote:

> [...]
> Compared with the previously applied (and reverted) patch, here, you 
> previously had "mov x17, #4". I guess that'd mean the function only ever 
> produced 8 output rows, while it now uses the real height parameter? Was this 
> change a no-op (height is always 8?) or was this another hidden bug in the 
> previous implementation?
>

Yes, this was another bug in a previous implementation which I've fixed in both 
of the newer versions.

>> [...]
>> +sqxtun  v6.8b, v20.8h
>> +sqxtun  v7.8b, v21.8h
>> +st1 {v6.8b}, [ x0], x2
>> +st1 {v7.8b}, [x16], x2
>> +subsx17, x17, #1
>
> This could be "subs w6, w6, #2" and you wouldn't need the lsr instruction at 
> all. And you could place the subs before the two st1 instructions to reduce 
> latency between them a little. (The same thing goes for moving subs further 
> away from the branch that uses its outcome in the previous patch too.) But as 
> this is just a reapply of a previously committed and reverted patch, I guess 
> it's fine this way too...

Will do before applying if you're fine with it; it's not too complex a change.

> The patchset otherwise looks good to me, modulo the question about the 
> difference to the previous patchset above.

--
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test

2022-05-17 Thread J. Dekker

The HEVC decoder can call these functions with smaller widths than the
functions themselves are designed to operate on, so we should only check
the relevant output.

Signed-off-by: J. Dekker 
---
 tests/checkasm/hevc_sao.c | 51 ---
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
index 6b750758e2..72cdb87dd1 100644
--- a/tests/checkasm/hevc_sao.c
+++ b/tests/checkasm/hevc_sao.c
@@ -78,20 +78,26 @@ static void check_sao_band(HEVCDSPContext h, int bit_depth)
 
 for (i = 0; i <= 4; i++) {
 int block_size = sao_size[i];
+int prev_size = i > 0 ? sao_size[i - 1] : 0;
 ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
 declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, 
ptrdiff_t dst_stride, ptrdiff_t src_stride,
   int16_t *sao_offset_val, int sao_left_class, int 
width, int height);
 
-randomize_buffers(src0, src1, BUF_SIZE);
-randomize_buffers2(offset_val, OFFSET_LENGTH);
-memset(dst0, 0, BUF_SIZE);
-memset(dst1, 0, BUF_SIZE);
-
-if (check_func(h.sao_band_filter[i], "hevc_sao_band_%dx%d_%d", 
block_size, block_size, bit_depth)) {
-call_ref(dst0, src0, stride, stride, offset_val, left_class, 
block_size, block_size);
-call_new(dst1, src1, stride, stride, offset_val, left_class, 
block_size, block_size);
-if (memcmp(dst0, dst1, BUF_SIZE))
-fail();
+if (check_func(h.sao_band_filter[i], "hevc_sao_band_%d_%d", 
block_size, bit_depth)) {
+
+for (int w = prev_size + 4; w <= block_size; w += 4) {
+randomize_buffers(src0, src1, BUF_SIZE);
+randomize_buffers2(offset_val, OFFSET_LENGTH);
+memset(dst0, 0, BUF_SIZE);
+memset(dst1, 0, BUF_SIZE);
+
+call_ref(dst0, src0, stride, stride, offset_val, left_class, 
w, block_size);
+call_new(dst1, src1, stride, stride, offset_val, left_class, 
w, block_size);
+for (int j = 0; j < block_size; j++) {
+if (memcmp(dst0 + j*MAX_PB_SIZE*2, dst1 + j*MAX_PB_SIZE*2, 
w))
+fail();
+}
+}
 bench_new(dst1, src1, stride, stride, offset_val, left_class, 
block_size, block_size);
 }
 }
@@ -109,21 +115,26 @@ static void check_sao_edge(HEVCDSPContext h, int 
bit_depth)
 
 for (i = 0; i <= 4; i++) {
 int block_size = sao_size[i];
+int prev_size = i > 0 ? sao_size[i - 1] : 0;
 ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
 int offset = (AV_INPUT_BUFFER_PADDING_SIZE + 
PIXEL_STRIDE)*SIZEOF_PIXEL;
 declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, 
ptrdiff_t stride_dst,
   int16_t *sao_offset_val, int eo, int width, int 
height);
 
-randomize_buffers(src0, src1, BUF_SIZE);
-randomize_buffers2(offset_val, OFFSET_LENGTH);
-memset(dst0, 0, BUF_SIZE);
-memset(dst1, 0, BUF_SIZE);
-
-if (check_func(h.sao_edge_filter[i], "hevc_sao_edge_%dx%d_%d", 
block_size, block_size, bit_depth)) {
-call_ref(dst0, src0 + offset, stride, offset_val, eo, block_size, 
block_size);
-call_new(dst1, src1 + offset, stride, offset_val, eo, block_size, 
block_size);
-if (memcmp(dst0, dst1, BUF_SIZE))
-fail();
+for (int w = prev_size + 4; w <= block_size; w += 4) {
+randomize_buffers(src0, src1, BUF_SIZE);
+randomize_buffers2(offset_val, OFFSET_LENGTH);
+memset(dst0, 0, BUF_SIZE);
+memset(dst1, 0, BUF_SIZE);
+
+if (check_func(h.sao_edge_filter[i], "hevc_sao_edge_%d_%d", 
block_size, bit_depth)) {
+call_ref(dst0, src0 + offset, stride, offset_val, eo, w, 
block_size);
+call_new(dst1, src1 + offset, stride, offset_val, eo, w, 
block_size);
+for (int j = 0; j < block_size; j++) {
+if (memcmp(dst0 + j*MAX_PB_SIZE*2, dst1 + j*MAX_PB_SIZE*2, 
w))
+fail();
+}
+}
 bench_new(dst1, src1 + offset, stride, offset_val, eo, block_size, 
block_size);
 }
 }
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavc/aarch64: add hevc horizontal qpel/uni/bi

2022-05-24 Thread J. Dekker

checkasm --benchmark on Ampere Altra (Neoverse N1):

put_hevc_qpel_bi_h4_8_c: 173.7
put_hevc_qpel_bi_h4_8_neon: 77.0
put_hevc_qpel_bi_h6_8_c: 385.7
put_hevc_qpel_bi_h6_8_neon: 125.7
put_hevc_qpel_bi_h8_8_c: 680.7
put_hevc_qpel_bi_h8_8_neon: 137.5
put_hevc_qpel_bi_h12_8_c: 1480.0
put_hevc_qpel_bi_h12_8_neon: 438.5
put_hevc_qpel_bi_h16_8_c: 2663.2
put_hevc_qpel_bi_h16_8_neon: 561.5
put_hevc_qpel_bi_h24_8_c: 6039.0
put_hevc_qpel_bi_h24_8_neon: 1717.5
put_hevc_qpel_bi_h32_8_c: 11104.2
put_hevc_qpel_bi_h32_8_neon: .0
put_hevc_qpel_bi_h48_8_c: 25175.2
put_hevc_qpel_bi_h48_8_neon: 4983.7
put_hevc_qpel_bi_h64_8_c: 42806.5
put_hevc_qpel_bi_h64_8_neon: 8848.5
put_hevc_qpel_h4_8_c: 149.7
put_hevc_qpel_h4_8_neon: 68.2
put_hevc_qpel_h6_8_c: 318.5
put_hevc_qpel_h6_8_neon: 105.2
put_hevc_qpel_h8_8_c: 577.0
put_hevc_qpel_h8_8_neon: 133.2
put_hevc_qpel_h12_8_c: 1276.0
put_hevc_qpel_h12_8_neon: 394.5
put_hevc_qpel_h16_8_c: 2278.2
put_hevc_qpel_h16_8_neon: 517.5
put_hevc_qpel_h24_8_c: 5081.7
put_hevc_qpel_h24_8_neon: 1546.5
put_hevc_qpel_h32_8_c: 9081.0
put_hevc_qpel_h32_8_neon: 2054.0
put_hevc_qpel_h48_8_c: 20280.7
put_hevc_qpel_h48_8_neon: 4615.5
put_hevc_qpel_h64_8_c: 36042.0
put_hevc_qpel_h64_8_neon: 8197.5
put_hevc_qpel_uni_h4_8_c: 165.5
put_hevc_qpel_uni_h4_8_neon: 73.5
put_hevc_qpel_uni_h6_8_c: 366.5
put_hevc_qpel_uni_h6_8_neon: 118.5
put_hevc_qpel_uni_h8_8_c: 661.7
put_hevc_qpel_uni_h8_8_neon: 138.2
put_hevc_qpel_uni_h12_8_c: 1440.5
put_hevc_qpel_uni_h12_8_neon: 399.5
put_hevc_qpel_uni_h16_8_c: 2489.0
put_hevc_qpel_uni_h16_8_neon: 532.2
put_hevc_qpel_uni_h24_8_c: 5896.5
put_hevc_qpel_uni_h24_8_neon: 1558.5
put_hevc_qpel_uni_h32_8_c: 10675.5
put_hevc_qpel_uni_h32_8_neon: 2092.2
put_hevc_qpel_uni_h48_8_c: 24103.0
put_hevc_qpel_uni_h48_8_neon: 4680.2
put_hevc_qpel_uni_h64_8_c: 42789.2
put_hevc_qpel_uni_h64_8_neon: 8330.0

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/Makefile   |   1 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  43 +-
 libavcodec/aarch64/hevcdsp_qpel_neon.S| 520 ++
 3 files changed, 563 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index c8935f205e..2f95649c66 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -65,4 +65,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9mc_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
aarch64/hevcdsp_init_aarch64.o  
\
+   aarch64/hevcdsp_qpel_neon.o 
\
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..ca2cb7cf97 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,21 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
uint8_t *_src,
   int16_t *sao_offset_val, int sao_left_class,
   int width, int height);
 
-
+void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, 
uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int 
width);
+void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, 
uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int 
width);
+void ff_hevc_put_hevc_qpel_uni_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, 
uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int 
width);
+void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, 
uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int 
width);
+void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, 
uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int 
width);
+void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, 
uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, 
intptr_t my

Re: [FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test

2022-05-25 Thread J. Dekker



On 24 May 2022, at 22:27, Martin Storsjö wrote:

> On Tue, 17 May 2022, J. Dekker wrote:
>
>> The HEVC decoder can call these functions with smaller widths than the
>> functions themselves are designed to operate on so we should only check
>> the relevant output
>>
>> Signed-off-by: J. Dekker 
>> ---
>> tests/checkasm/hevc_sao.c | 51 ---
>> 1 file changed, 31 insertions(+), 20 deletions(-)
>>
>> diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
>> index 6b750758e2..72cdb87dd1 100644
>> --- a/tests/checkasm/hevc_sao.c
>> +++ b/tests/checkasm/hevc_sao.c
>> @@ -78,20 +78,26 @@ static void check_sao_band(HEVCDSPContext h, int 
>> bit_depth)
>>
>> for (i = 0; i <= 4; i++) {
>> int block_size = sao_size[i];
>> +int prev_size = i > 0 ? sao_size[i - 1] : 0;
>> ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
>> declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, 
>> ptrdiff_t dst_stride, ptrdiff_t src_stride,
>>   int16_t *sao_offset_val, int sao_left_class, int 
>> width, int height);
>>
>> -randomize_buffers(src0, src1, BUF_SIZE);
>> -randomize_buffers2(offset_val, OFFSET_LENGTH);
>> -memset(dst0, 0, BUF_SIZE);
>> -memset(dst1, 0, BUF_SIZE);
>> -
>> -if (check_func(h.sao_band_filter[i], "hevc_sao_band_%dx%d_%d", 
>> block_size, block_size, bit_depth)) {
>> -call_ref(dst0, src0, stride, stride, offset_val, left_class, 
>> block_size, block_size);
>> -call_new(dst1, src1, stride, stride, offset_val, left_class, 
>> block_size, block_size);
>> -if (memcmp(dst0, dst1, BUF_SIZE))
>> -fail();
>> +if (check_func(h.sao_band_filter[i], "hevc_sao_band_%d_%d", 
>> block_size, bit_depth)) {
>> +
>> +for (int w = prev_size + 4; w <= block_size; w += 4) {
>> +randomize_buffers(src0, src1, BUF_SIZE);
>> +randomize_buffers2(offset_val, OFFSET_LENGTH);
>> +memset(dst0, 0, BUF_SIZE);
>> +memset(dst1, 0, BUF_SIZE);
>> +
>> +call_ref(dst0, src0, stride, stride, offset_val, 
>> left_class, w, block_size);
>> +call_new(dst1, src1, stride, stride, offset_val, 
>> left_class, w, block_size);
>> +for (int j = 0; j < block_size; j++) {
>> +if (memcmp(dst0 + j*MAX_PB_SIZE*2, dst1 + 
>> j*MAX_PB_SIZE*2, w))
>
> I'm not quite sure about the MAX_PB_SIZE*2 part here - shouldn't that be just 
> the 'stride' variable instead? And for the compared length ('w'), shouldn't 
> that be multiplied by SIZEOF_PIXEL?
>
> Other than that, this looks good to me!

Pushed with this fix. Rest of the set as-is. I have an extra SAO patch but 
don't want to delay this set further.

Thanks,

-- 
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavc/aarch64: hevc_sao reschedule slightly

2022-05-25 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_sao_neon.S | 30 +++
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
b/libavcodec/aarch64/hevcdsp_sao_neon.S
index efd8112af4..39056d76ee 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -3,7 +3,7 @@
  *
  * AArch64 NEON optimised SAO functions for HEVC decoding
  *
- * Copyright (c) 2020 Josh Dekker 
+ * Copyright (c) 2022 J. Dekker 
  *
  * This file is part of FFmpeg.
  *
@@ -24,6 +24,10 @@
 
 #include "libavutil/aarch64/asm.S"
 
+#define MAX_PB_SIZE 64
+#define AV_INPUT_BUFFER_PADDING_SIZE 64
+#define SAO_STRIDE (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE)
+
 // void sao_band_filter(uint8_t *_dst, uint8_t *_src,
 //  ptrdiff_t stride_dst, ptrdiff_t stride_src,
 //  int16_t *sao_offset_val, int sao_left_class,
@@ -56,6 +60,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|
 // +--->
 //i-0 i-1 i-2 i-3
+subsw8, w8,  #8
 ld1 {v2.8b}, [x1], #8  // dst[x] = 
av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 uxtlv0.8h,  v2.8b  // load src[x]
 ushrv2.8h,  v0.8h, #3  // >> BIT_DEPTH - 3
@@ -66,7 +71,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 add v1.8h,  v0.8h, v2.8h   // src[x] + table
 sqxtun  v4.8b,  v1.8h  // clip + narrow
 st1 {v4.8b}, [x0], #8  // store
-subsw8, w8,  #8// done 8 pixels
+// done 8 pixels
 bne 2b
 subsw7, w7,  #1// finished line, prep. new
 add x0, x0,  x2// dst += stride_dst
@@ -75,12 +80,11 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 ret
 endfunc
 
-// ASSUMES STRIDE_SRC = 192
 .Lsao_edge_pos:
 .word 1 // horizontal
-.word 192 // vertical
-.word 192 + 1 // 45 degree
-.word 192 - 1 // 135 degree
+.word SAO_STRIDE // vertical
+.word SAO_STRIDE + 1 // 45 degree
+.word SAO_STRIDE - 1 // 135 degree
 
 // ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff 
stride_dst,
 //  int16 *sao_offset_val, int eo, int 
width, int height)
@@ -98,7 +102,7 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
 uzp2v1.16b, v3.16b, v3.16b // sao_offset_val -> upper
 uzp1v0.16b, v3.16b, v3.16b // sao_offset_val -> lower
 moviv2.16b, #2
-mov x15, #192
+mov x15, #SAO_STRIDE
 // strides between end of line and next src/dst
 sub x15, x15, x5   // stride_src - width
 sub x16, x2, x5// stride_dst - width
@@ -108,6 +112,7 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
 sub x12, x11, x4   // src_a (prev) = src - 
sao_edge_pos
 add x13, x11, x4   // src_b (next) = src + 
sao_edge_pos
 2:  // process 16 bytes
+subsx14, x14, #16
 ld1 {v3.16b}, [x11], #16   // load src
 ld1 {v4.16b}, [x12], #16   // load src_a (prev)
 ld1 {v5.16b}, [x13], #16   // load src_b (next)
@@ -130,12 +135,12 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
 sqxtun  v3.8b, v20.8h
 sqxtun2 v3.16b, v21.8h
 st1 {v3.16b}, [x0], #16
-subsx14, x14, #16  // filtered 16 bytes
+// filtered 16 bytes
 b.ne2b // do we have width to 
filter?
 // no width to filter, setup next line
+subsw6, w6, #1 // filtered line
 add x11, x11, x15  // stride src to next line
 add x0, x0, x16// stride dst to next line
-subsw6, w6, #1 // filtered line
 b.ne1b // do we have lines to 
process?
 // no lines to filter
 ret
@@ -156,12 +161,12 @@ function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
 moviv2.16b, #2
 add x16, x0, x2
 lsl x2,  x2, #1
-mov x15, #192
+mov x15, #SAO_STRIDE
 mov x8,  x1
 sub x9,  x1, x4
 add x10, x1, x4
-lsr w17, w6, #1
-1:  ld1 {v3.d}[0], [ x8], x15
+1:  subs

Re: [FFmpeg-devel] [PATCH] lavc/aarch64: hevc_sao reschedule slightly

2022-05-25 Thread J. Dekker



On 25 May 2022, at 12:23, Martin Storsjö wrote:

> On Wed, 25 May 2022, J. Dekker wrote:
>
>> Signed-off-by: J. Dekker 
>> ---
>> libavcodec/aarch64/hevcdsp_sao_neon.S | 30 +++
>> 1 file changed, 17 insertions(+), 13 deletions(-)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S 
>> b/libavcodec/aarch64/hevcdsp_sao_neon.S
>> index efd8112af4..39056d76ee 100644
>> --- a/libavcodec/aarch64/hevcdsp_sao_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
>> @@ -24,6 +24,10 @@
>>
>> #include "libavutil/aarch64/asm.S"
>>
>> +#define MAX_PB_SIZE 64
>> +#define AV_INPUT_BUFFER_PADDING_SIZE 64
>> +#define SAO_STRIDE (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE)
>> +
>> // void sao_band_filter(uint8_t *_dst, uint8_t *_src,
>> //  ptrdiff_t stride_dst, ptrdiff_t stride_src,
>> //  int16_t *sao_offset_val, int sao_left_class,
>> @@ -56,6 +60,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
>> // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|
>> // +--->
>> //i-0 i-1 i-2 i-3
>> +subsw8, w8,  #8
>> ld1 {v2.8b}, [x1], #8  // dst[x] = 
>> av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
>
> For cases like this, it's usually better to place the subs after the ld1, 
> because the ld1 will take a couple cycles before the result is available 
> (which the next instruction needs).
>
>> uxtlv0.8h,  v2.8b  // load src[x]
>> ushrv2.8h,  v0.8h, #3  // >> BIT_DEPTH - 3
>> @@ -66,7 +71,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
>> add v1.8h,  v0.8h, v2.8h   // src[x] + table
>> sqxtun  v4.8b,  v1.8h  // clip + narrow
>> st1 {v4.8b}, [x0], #8  // store
>> -subsw8, w8,  #8// done 8 pixels
>> +// done 8 pixels
>> bne 2b
>> subsw7, w7,  #1// finished line, prep. 
>> new
>> add x0, x0,  x2// dst += stride_dst
>> @@ -75,12 +80,11 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
>> ret
>> endfunc
>>
>> -// ASSUMES STRIDE_SRC = 192
>> .Lsao_edge_pos:
>> .word 1 // horizontal
>> -.word 192 // vertical
>> -.word 192 + 1 // 45 degree
>> -.word 192 - 1 // 135 degree
>> +.word SAO_STRIDE // vertical
>> +.word SAO_STRIDE + 1 // 45 degree
>> +.word SAO_STRIDE - 1 // 135 degree
>>
>> // ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff 
>> stride_dst,
>> //  int16 *sao_offset_val, int eo, int 
>> width, int height)
>> @@ -98,7 +102,7 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
>> uzp2v1.16b, v3.16b, v3.16b // sao_offset_val -> upper
>> uzp1v0.16b, v3.16b, v3.16b // sao_offset_val -> lower
>> moviv2.16b, #2
>> -mov x15, #192
>> +mov x15, #SAO_STRIDE
>> // strides between end of line and next src/dst
>> sub x15, x15, x5   // stride_src - width
>> sub x16, x2, x5// stride_dst - width
>> @@ -108,6 +112,7 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
>> sub x12, x11, x4   // src_a (prev) = src - 
>> sao_edge_pos
>> add x13, x11, x4   // src_b (next) = src + 
>> sao_edge_pos
>> 2:  // process 16 bytes
>> +subsx14, x14, #16
>> ld1 {v3.16b}, [x11], #16   // load src
>> ld1 {v4.16b}, [x12], #16   // load src_a (prev)
>> ld1 {v5.16b}, [x13], #16   // load src_b (next)
>
> Same thing here, it's better to do the subs after firing off all loads.
>
>> @@ -130,12 +135,12 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
>> sqxtun  v3.8b, v20.8h
>> sqxtun2 v3.16b, v21.8h
>> st1 {v3.16b}, [x0], #16
>> -subsx14, x14, #16  // filtered 16 bytes
>> +// filtered 16 bytes
>> b.ne2b // do we have width to 
>> filter?
>>

Re: [FFmpeg-devel] [PATCH v3 2/2] swscale/aarch64: add hscale specializations

2022-05-26 Thread J. Dekker

ions of 
> src[...] * filter[...]
> +addvs0, v0.4S   // add up products 
> of src and filter values
> +sqshrn  h0, s0, #7  // shift and clip 
> the 2x16-bit final value
> +st1 {v0.H}[0], [x1], #2 // dst[i] = ...
> +sub w2, w2, #1  // dstW--
> +cbnzw2, 2b
> +
> +add sp, sp, #32 // clean up stack
> +ret
> +endfunc
> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> index 09d0a7130e..ab28be4da6 100644
> --- a/libswscale/aarch64/swscale.c
> +++ b/libswscale/aarch64/swscale.c
> @@ -22,25 +22,46 @@
>  #include "libswscale/swscale_internal.h"
>  #include "libavutil/aarch64/cpu.h"
>
> -void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
> -const uint8_t *src, const int16_t *filter,
> -const int32_t *filterPos, int filterSize);
> +#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
> +void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
> +SwsContext *c, int16_t 
> *data, \
> +int dstW, const uint8_t 
> *src, \
> +const int16_t *filter, \
> +const int32_t *filterPos, 
> int filterSize)
> +#define SCALE_FUNCS(filter_n, opt) \
> +SCALE_FUNC(filter_n,  8, 15, opt);
> +#define ALL_SCALE_FUNCS(opt) \
> +SCALE_FUNCS(4, opt); \
> +SCALE_FUNCS(X8, opt)
> +
> +ALL_SCALE_FUNCS(neon);
>
>  void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
>const int16_t **src, uint8_t *dest, int dstW,
>const uint8_t *dither, int offset);
>
> +#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do {  \
> +if (c->srcBpc == 8 && c->dstBpc <= 14) {\
> +  hscalefn =\
> +ff_hscale8to15_ ## filtersize ## _ ## opt;  \
> +}   \
> +} while (0)
> +
> +#define ASSIGN_SCALE_FUNC(hscalefn, filtersize, opt)\
> +  switch (filtersize) { \
> +  case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt); break; \
> +  default: if (filtersize % 8 == 0) \
> +   ASSIGN_SCALE_FUNC2(hscalefn, X8, opt);   \
> +   break;   \
> +  }
> +
>  av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>  {
>  int cpu_flags = av_get_cpu_flags();
>
>  if (have_neon(cpu_flags)) {
> -if (c->srcBpc == 8 && c->dstBpc <= 14 &&
> -(c->hLumFilterSize % 8) == 0 &&
> -(c->hChrFilterSize % 8) == 0)
> -{
> -c->hyScale = c->hcScale = ff_hscale_8_to_15_neon;
> -}
> +ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon);
> +ASSIGN_SCALE_FUNC(c->hcScale, c->hChrFilterSize, neon);
>  if (c->dstBpc == 8) {
>  c->yuv2planeX = ff_yuv2planeX_8_neon;
>  }
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index ffa130524a..105781c4f4 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -1820,7 +1820,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
> *srcFilter,
>  {
>  const int filterAlign = X86_MMX(cpu_flags) ? 4 :
>  PPC_ALTIVEC(cpu_flags) ? 8 :
> -have_neon(cpu_flags)   ? 8 : 1;
> +have_neon(cpu_flags)   ? 4 : 1;
>
>  if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
> &c->hLumFilterSize, c->lumXInc,
> -- 
> 2.32.0

-- 
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavc/aarch64: add hevc chroma loop filter 8-12bit

2022-06-23 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/Makefile   |   3 +-
 libavcodec/aarch64/hevcdsp_deblock_neon.S | 168 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  14 ++
 3 files changed, 184 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S

 Passes FATE. I never completed the checkasm test for loop_filter; I am
 working on that currently, alongside the luma loop filter. This asm can
 also go into hevcdsp_sao_neon.S if you would prefer not to create an
 extra file for it.

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index c8935f205e..66bd8b596c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -63,6 +63,7 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9lpf_neon.o   
\
aarch64/vp9mc_16bpp_neon.o  
\
aarch64/vp9mc_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
+NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_deblock_neon.o  
\
+   aarch64/hevcdsp_idct_neon.o 
\
aarch64/hevcdsp_init_aarch64.o  
\
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S 
b/libavcodec/aarch64/hevcdsp_deblock_neon.S
new file mode 100644
index 00..d21ad0a54f
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -0,0 +1,168 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * Copyright (c) 2014 Seppo Tomperi 
+ * Copyright (c) 2022 J. Dekker 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_chroma_start bitdepth
+ldr w14, [x2]
+ldr w15, [x2, #4]
+.if \bitdepth > 8
+lsl w14, w14, #(\bitdepth - 8)
+lsl w15, w15, #(\bitdepth - 8)
+.endif
+addsw2, w14, w15
+b.eq1f
+dup v16.4h, w14
+dup v17.4h, w15
+trn1v16.2d, v16.2d, v17.2d
+.if \bitdepth == 10
+mvniv19.8h, #0xFC, lsl #8 // movi #0x03FF
+.endif
+.if \bitdepth == 12
+mvniv19.8h, #0xF0, lsl #8 // movi #0x0FFF
+.endif
+.if \bitdepth > 8
+moviv18.8h, #0
+.endif
+neg v17.8h, v16.8h
+.endm
+
+.macro hevc_loop_filter_chroma_body bitdepth
+.if \bitdepth <= 8
+uxtlv0.8h, v0.8b // p1
+uxtlv1.8h, v1.8b // p0
+uxtlv2.8h, v2.8b // q0
+uxtlv3.8h, v3.8b // q1
+.endif
+sub v5.8h, v2.8h, v1.8h // q0 - p0
+sub v6.8h, v0.8h, v3.8h // p1 - q1
+shl v5.8h, v5.8h, #2
+add v5.8h, v6.8h, v5.8h
+srshr   v5.8h, v5.8h, #3
+sminv5.8h, v5.8h, v16.8h
+smaxv5.8h, v5.8h, v17.8h
+sqadd   v1.8h, v1.8h, v5.8h // p0 + delta
+sqsub   v2.8h, v2.8h, v5.8h // q0 - delta
+.if \bitdepth <= 8
+sqxtun  v1.8b, v1.8h
+sqxtun  v2.8b, v2.8h
+.else
+sminv1.8h, v1.8h, v19.8h
+sminv2.8h, v2.8h, v19.8h
+smaxv1.8h, v1.8h, v18.8h
+smaxv2.8h, v2.8h, v18.8h
+.endif
+.endm
+
+.macro hevc_h_loop_filter_chroma bitdepth
+function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1
+hevc_loop_filter_chroma_start \bitdepth
+sub x0, x0, x1, lsl #1
+.if \bitdepth > 8
+ld1 {v0.8h}, [x0], x1
+ld1 {v1.8h}, [x0], x1
+ld1 {v2.8h}, [x0], x1
+ld1 {v3.8h}, [x0]
+.else
+ld1 {v0.8b}, [x0], x1
+ld1 {v1.8b}, [x0], x1
+ld1 {v2.8b}, [x0], x1
+ld1 {v3.8h}, [x0]
+.endif
+

Re: [FFmpeg-devel] [PATCH] lavc/aarch64: add hevc chroma loop filter 8-12bit

2022-06-23 Thread J. Dekker




On 23 Jun 2022, at 12:52, J. Dekker wrote:

> Signed-off-by: J. Dekker 
> ---
>  libavcodec/aarch64/Makefile   |   3 +-
>  libavcodec/aarch64/hevcdsp_deblock_neon.S | 168 ++
>  libavcodec/aarch64/hevcdsp_init_aarch64.c |  14 ++
>  3 files changed, 184 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S
>
>  Passes FATE, I never completed the checkasm for loop_filter, working on
>  that currently alongside the luma loop filter. This asm can also go
>  into hevcdsp_sao_neon.S if you would prefer not creating an extra file
>  for it.
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index c8935f205e..66bd8b596c 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -63,6 +63,7 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
> aarch64/vp9itxfm_16bpp_neon.o   \
> aarch64/vp9lpf_neon.o 
>   \
> aarch64/vp9mc_16bpp_neon.o
>   \
> aarch64/vp9mc_neon.o
> -NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o   
>   \
> +NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_deblock_neon.o
>   \
> +   aarch64/hevcdsp_idct_neon.o   
>   \
> aarch64/hevcdsp_init_aarch64.o
>   \
> aarch64/hevcdsp_sao_neon.o
> diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S 
> b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> new file mode 100644
> index 00..d21ad0a54f
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> @@ -0,0 +1,168 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * Copyright (c) 2014 Seppo Tomperi 
> + * Copyright (c) 2022 J. Dekker 
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> + */
> +
> +
> +#include "libavutil/aarch64/asm.S"
> +#include "neon.S"
> +
> +.macro hevc_loop_filter_chroma_start bitdepth
> +ldr w14, [x2]
> +ldr w15, [x2, #4]
> +.if \bitdepth > 8
> +lsl w14, w14, #(\bitdepth - 8)
> +lsl w15, w15, #(\bitdepth - 8)
> +.endif
> +addsw2, w14, w15
> +b.eq1f
> +dup v16.4h, w14
> +dup v17.4h, w15
> +trn1v16.2d, v16.2d, v17.2d
> +.if \bitdepth == 10
> +mvniv19.8h, #0xFC, lsl #8 // movi #0x03FF
> +.endif
> +.if \bitdepth == 12
> +mvniv19.8h, #0xF0, lsl #8 // movi #0x0FFF
> +.endif
> +.if \bitdepth > 8
> +moviv18.8h, #0
> +.endif
> +neg v17.8h, v16.8h
> +.endm
> +
> +.macro hevc_loop_filter_chroma_body bitdepth
> +.if \bitdepth <= 8
> +uxtlv0.8h, v0.8b // p1
> +uxtlv1.8h, v1.8b // p0
> +uxtlv2.8h, v2.8b // q0
> +uxtlv3.8h, v3.8b // q1
> +.endif
> +sub v5.8h, v2.8h, v1.8h // q0 - p0
> +sub v6.8h, v0.8h, v3.8h // p1 - q1
> +shl v5.8h, v5.8h, #2
> +add v5.8h, v6.8h, v5.8h
> +srshr   v5.8h, v5.8h, #3
> +sminv5.8h, v5.8h, v16.8h
> +smaxv5.8h, v5.8h, v17.8h
> +sqadd   v1.8h, v1.8h, v5.8h // p0 + delta
> +sqsub   v2.8h, v2.8h, v5.8h // q0 - delta
> +.if \bitdepth <= 8
> +sqxtun  v1.8b, v1.8h
> +sqxtun  v2.8b, v2.8h
> +.else
> +sminv1.8h, v1.8h, v19.8h
> +sminv2.8h, v2.8h, v19.8h
> +smaxv1.8h, v1.8h, v18.8h
> +s

[FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct

2022-06-23 Thread J. Dekker

old:
hevc_idct_16x16_8_c: 5366.2
hevc_idct_16x16_8_neon: 1493.2

new:
hevc_idct_16x16_8_c: 5363.2
hevc_idct_16x16_8_neon: 943.5

Co-developed-by: Rafal Dabrowa 
Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_idct_neon.S| 666 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   3 +-
 2 files changed, 668 insertions(+), 1 deletion(-)

 This idct is significantly faster than the one we currently have. I
 suspect it's for a couple of reasons: 1) it's only written for 8-bit, 2) it's
 unrolled significantly more. It comes at a hefty cost of roughly 2.25x
 the object size. I'm wondering if this idct is salvageable, or whether the
 one we have should just be improved instead.

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..784bae33b3 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -618,3 +618,669 @@ idct_dc 16, 10
 
 idct_dc 32, 8
 idct_dc 32, 10
+
+// WIP
+
+.Lo0_coeff: .hword  83, 36, 0, 0, 0, 0, 0, 0
+.Lo8transform0: .hword  89,  75,  50,  18   // 
transform[4,12,20,28][0]
+.Lo8transform1: .hword  75, -18, -89, -50
+.Lo8transform2: .hword  50, -89,  18,  75
+.Lo8transform3: .hword  18, -50,  75, -89
+
+.LimitMask:
+.hword  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0
+.hword  0xffff,  0,  0,  0,  0, 0, 0, 0
+
+.Leo_coeff:
+.hword  64,  64,  64,  64,  83,  36, -36, -83
+.hword  64, -64, -64,  64,  36, -83,  83, -36
+.hword  89,  75,  50,  18,  75, -18, -89, -50   // 
transform[4,12][0-3]
+.hword  50, -89,  18,  75,  18, -50,  75, -89   // 
transform[20,28][0-3]
+.Lo16transform0: .hword 90,  87,  80,  70,  57,  43,  25,   9   // 
transform[2][0-7], also transform[2,6,10..][0]
+.Lo16transform1: .hword 87,  57,   9, -43, -80, -90, -70, -25   // 
transform[6][0-7]
+.Lo16transform2: .hword 80,   9, -70, -87, -25,  57,  90,  43   // 
transform[10][0-7]
+.Lo16transform3: .hword 70, -43, -87,   9,  90,  25, -80, -57   // 
transform[14][0-7]
+.Lo16transform4: .hword 57, -80, -25,  90,  -9, -87,  43,  70   // 
transform[18][0-7]
+.Lo16transform5: .hword 43, -90,  57,  25, -87,  70,   9, -80   // 
transform[22][0-7]
+.Lo16transform6: .hword 25, -70,  90, -80,  43,   9, -57,  87   // 
transform[26][0-7]
+.Lo16transform7: .hword  9, -25,  43, -57,  70, -80,  87, -90   // 
transform[30][0-7]
+
+// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
+function ff_hevc_idct_16x16_8_neon_new, export=1
+sub sp, sp, 64
+st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+sub sp, sp, 32
+st1 {v14.16b, v15.16b}, [sp]
+mov x3, 0
+mov x2, x0
+1:  mov x4, x2
+mov x5, 32
+ld1 {v16.8h}, [x4], x5
+ld1 {v17.8h}, [x4], x5
+ld1 {v18.8h}, [x4], x5
+ld1 {v19.8h}, [x4], x5
+ld1 {v20.8h}, [x4], x5
+ld1 {v21.8h}, [x4], x5
+ld1 {v22.8h}, [x4], x5
+ld1 {v23.8h}, [x4], x5
+ld1 {v24.8h}, [x4], x5
+ld1 {v25.8h}, [x4], x5
+ld1 {v26.8h}, [x4], x5
+ld1 {v27.8h}, [x4], x5
+ld1 {v28.8h}, [x4], x5
+ld1 {v29.8h}, [x4], x5
+ld1 {v30.8h}, [x4], x5
+ld1 {v31.8h}, [x4], x5
+cmp x1, 12
+b.hs5f
+// limit2 below 16
+bic x4, x1, 1
+adr x5, .LimitMask
+cbnzx3, 3f
+// columns 0 .. 7 - cleanup of indexes 5 .. 7
+ld1 {v0.8h}, [x5]
+adr x5, 2f
+add x5, x5, x4, lsl 2
+add x5, x5, x4, lsl 1
+br  x5
+2:  and v17.16b, v17.16b, v0.16b// col_limit 0..1 -> 
limit2 == 4..5
+and v19.16b, v19.16b, v0.16b
+b   5f
+and v19.16b, v19.16b, v0.16b// col_limit 2..3 -> 
limit2 == 6..7
+and v21.16b, v21.16b, v0.16b
+b   5f
+and v21.16b, v21.16b, v0.16b// col_limit 4..5 -> 
limit2 == 8..9
+and v23.16b, v23.16b, v0.16b
+b   5f
+and v23.16b, v23.16b, v0.16b// col_limit 6..7 -> 
limit2 == 10..11
+and v25.16b, v25.16b, v0.16b
+b   5f
+and v25.16b, v25.16b, v0.16b// col_limit 8..9 -> 
limit2 == 12..13
+and v27.16b, v27.16b, v0.16b
+b   5f
+and v27.16b, v27.16b, v0.16b// col_limit 10..11 -&g

[FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add 8-bit hevc 32x32 idct

2022-06-23 Thread J. Dekker

hevc_idct_32x32_8_c: 40128.5
hevc_idct_32x32_8_neon: 7102.0

Co-developed-by: Rafal Dabrowa 
Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_idct_neon.S| 1265 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |2 +
 2 files changed, 1267 insertions(+)

 Written by the same author as the new 16x16 idct in patch 1/2. The same
 concern raised for that patch applies here.

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 784bae33b3..3b6e95153f 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -644,6 +644,40 @@ idct_dc 32, 10
 .Lo16transform5: .hword 43, -90,  57,  25, -87,  70,   9, -80   // 
transform[22][0-7]
 .Lo16transform6: .hword 25, -70,  90, -80,  43,   9, -57,  87   // 
transform[26][0-7]
 .Lo16transform7: .hword  9, -25,  43, -57,  70, -80,  87, -90   // 
transform[30][0-7]
+.Lo32transform:
+.hword  90,  90,  88,  85,  82,  78,  73,  67   // 
transform[1,3,5,7..15][1]
+.hword  61,  54,  46,  38,  31,  22,  13,   4   // 
transform[17,19,21..31][1]
+.hword  90,  82,  67,  46,  22,  -4, -31, -54   // 
transform[1,3,5,7..15][3]
+.hword -73, -85, -90, -88, -78, -61, -38, -13   // 
transform[17,19,21..31][3]
+.hword  88,  67,  31, -13, -54, -82, -90, -78   // ..
+.hword -46, -4,   38,  73,  90,  85,  61,  22
+.hword  85,  46, -13, -67, -90, -73, -22,  38
+.hword  82,  88,  54,  -4, -61, -90, -78, -31
+.Lo32transform9_31:
+.hword  82,  22, -54, -90, -61,  13,  78,  85
+.hword  31, -46, -90, -67,   4,  73,  88,  38
+.hword  78,  -4, -82, -73,  13,  85,  67, -22
+.hword -88, -61,  31,  90,  54, -38, -90, -46
+.hword  73, -31, -90, -22,  78,  67, -38, -90
+.hword -13,  82,  61, -46, -88,  -4,  85,  54
+.hword  67, -54, -78,  38,  85, -22, -90,   4
+.hword  90,  13, -88, -31,  82,  46, -73, -61
+.hword  61, -73, -46,  82,  31, -88, -13,  90
+.hword  -4, -90,  22,  85, -38, -78,  54,  67
+.hword  54, -85,  -4,  88, -46, -61,  82,  13
+.hword -90,  38,  67, -78, -22,  90, -31, -73
+.hword  46, -90,  38,  54, -90,  31,  61, -88
+.hword  22,  67, -85,  13,  73, -82,   4,  78
+.hword  38, -88,  73,  -4, -67,  90, -46, -31
+.hword  85, -78,  13,  61, -90,  54,  22, -82
+.hword  31, -78,  90, -61,   4,  54, -88,  82
+.hword -38, -22,  73, -90,  67, -13, -46,  85
+.hword  22, -61,  85, -90,  73, -38,  -4,  46
+.hword -78,  90, -82,  54, -13, -31,  67, -88
+.hword  13, -38,  61, -78,  88, -90,  85, -73
+.hword  54, -31,   4,  22, -46,  67, -82,  90
+.hword   4, -13,  22, -31,  38, -46,  54, -61   // 
transform[1,3,5,7..15][31]
+.hword  67, -73,  78, -82,  85, -88,  90, -90   // 
transform[17,19,21..31][31]
 
 // void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
 function ff_hevc_idct_16x16_8_neon_new, export=1
@@ -1284,3 +1318,1234 @@ function ff_hevc_idct_16x16_8_neon_new, export=1
 ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
 ret
 endfunc
+
+function ff_hevc_idct_32x32_8_neon, export=1
+sub sp, sp, 64
+st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+sub sp, sp, 64
+st1 {v12.16b, v13.16b, v14.16b, v15.16b}, [sp]
+sub sp, sp, 16 * 32 * 4 // room for o_32: 16 * 32 
values
+mov x3, 0   // loop counter
+mov x2, x0
+mov x7, 83
+add x7, x7, 36 * 65536  // o0, o1 coeff. factors
+1:  mov x9, 128
+// loading odd lines
+add x4, x2, 64  // odd lines
+ld1 {v16.8h}, [x4], x9// line 1
+ld1 {v17.8h}, [x4], x9// line 3
+ld1 {v18.8h}, [x4], x9// line 5
+ld1 {v19.8h}, [x4], x9// line 7
+ld1 {v20.8h}, [x4], x9// line 9
+ld1 {v21.8h}, [x4], x9// line 11
+ld1 {v22.8h}, [x4], x9// line 13
+ld1 {v23.8h}, [x4], x9// line 15
+ld1 {v24.8h}, [x4], x9// line 17
+ld1 {v25.8h}, [x4], x9// line 19
+ld1 {v26.8h}, [x4], x9// line 21
+ld1 {v27.8h}, [x4], x9// line 23
+ld1 {v28.8h}, [x4], x9// line 25
+ld1 {v29.8h}, [x4], x9// line 27
+ld1 {v30.8h}, [x4], x9// line 29
+ld1 {v31.8h

[FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test

2022-06-23 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 tests/checkasm/hevc_add_res.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/checkasm/hevc_add_res.c b/tests/checkasm/hevc_add_res.c
index 0c896adaca..f17d121939 100644
--- a/tests/checkasm/hevc_add_res.c
+++ b/tests/checkasm/hevc_add_res.c
@@ -36,14 +36,14 @@
 }   \
 } while (0)
 
-#define randomize_buffers2(buf, size) \
+#define randomize_buffers2(buf, size, mask)   \
 do {  \
 int j;\
 for (j = 0; j < size; j++)\
-AV_WN16A(buf + j * 2, rnd() & 0x3FF); \
+AV_WN16A(buf + j * 2, rnd() & mask); \
 } while (0)
 
-static void compare_add_res(int size, ptrdiff_t stride, int overflow_test)
+static void compare_add_res(int size, ptrdiff_t stride, int overflow_test, int 
mask)
 {
 LOCAL_ALIGNED_32(int16_t, res0, [32 * 32]);
 LOCAL_ALIGNED_32(int16_t, res1, [32 * 32]);
@@ -53,7 +53,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int 
overflow_test)
 declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *res, 
ptrdiff_t stride);
 
 randomize_buffers(res0, size);
-randomize_buffers2(dst0, size);
+randomize_buffers2(dst0, size, mask);
 if (overflow_test)
 res0[0] = 0x8000;
 memcpy(res1, res0, sizeof(*res0) * size);
@@ -69,6 +69,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int 
overflow_test)
 static void check_add_res(HEVCDSPContext h, int bit_depth)
 {
 int i;
+int mask = bit_depth == 8 ? 0xFFFF : bit_depth == 10 ? 0x03FF : 0x07FF;
 
 for (i = 2; i <= 5; i++) {
 int block_size = 1 << i;
@@ -76,9 +77,9 @@ static void check_add_res(HEVCDSPContext h, int bit_depth)
 ptrdiff_t stride = block_size << (bit_depth > 8);
 
 if (check_func(h.add_residual[i - 2], "hevc_add_res_%dx%d_%d", 
block_size, block_size, bit_depth)) {
-compare_add_res(size, stride, 0);
+compare_add_res(size, stride, 0, mask);
 // overflow test for res = -32768
-compare_add_res(size, stride, 1);
+compare_add_res(size, stride, 1, mask);
 }
 }
 }
@@ -87,7 +88,7 @@ void checkasm_check_hevc_add_res(void)
 {
 int bit_depth;
 
-for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
+for (bit_depth = 8; bit_depth <= 12; bit_depth++) {
 HEVCDSPContext h;
 
 ff_hevc_dsp_init(&h, bit_depth);
-- 
2.32.0 (Apple Git-132)

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs

2022-06-23 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_idct_neon.S | 216 -
 1 file changed, 108 insertions(+), 108 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..484eea8437 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -27,21 +27,21 @@
 #include "libavutil/aarch64/asm.S"
 
 const trans, align=4
-.short 64, 83, 64, 36
-.short 89, 75, 50, 18
-.short 90, 87, 80, 70
-.short 57, 43, 25, 9
-.short 90, 90, 88, 85
-.short 82, 78, 73, 67
-.short 61, 54, 46, 38
-.short 31, 22, 13, 4
+.short  64, 83, 64, 36
+.short  89, 75, 50, 18
+.short  90, 87, 80, 70
+.short  57, 43, 25, 9
+.short  90, 90, 88, 85
+.short  82, 78, 73, 67
+.short  61, 54, 46, 38
+.short  31, 22, 13, 4
 endconst
 
 .macro clip10 in1, in2, c1, c2
-smax\in1, \in1, \c1
-smax\in2, \in2, \c1
-smin\in1, \in1, \c2
-smin\in2, \in2, \c2
+smax\in1, \in1, \c1
+smax\in2, \in2, \c1
+smin\in1, \in1, \c2
+smin\in2, \in2, \c2
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -50,13 +50,13 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
 ld1 {v2.s}[1], [x0], x2
 ld1 {v2.s}[2], [x0], x2
 ld1 {v2.s}[3], [x0], x2
-sub  x0,  x0,  x2, lsl #2
-uxtl v6.8h,  v2.8b
-uxtl2v7.8h,  v2.16b
-sqaddv0.8h,  v0.8h, v6.8h
-sqaddv1.8h,  v1.8h, v7.8h
-sqxtun   v0.8b,  v0.8h
-sqxtun2  v0.16b, v1.8h
+sub x0, x0, x2, lsl #2
+uxtlv6.8h,  v2.8b
+uxtl2   v7.8h,  v2.16b
+sqadd   v0.8h,  v0.8h, v6.8h
+sqadd   v1.8h,  v1.8h, v7.8h
+sqxtun  v0.8b,  v0.8h
+sqxtun2 v0.16b, v1.8h
 st1 {v0.s}[0], [x0], x2
 st1 {v0.s}[1], [x0], x2
 st1 {v0.s}[2], [x0], x2
@@ -70,63 +70,63 @@ function ff_hevc_add_residual_4x4_10_neon, export=1
 ld1 {v2.d}[0], [x12], x2
 ld1 {v2.d}[1], [x12], x2
 ld1 {v3.d}[0], [x12], x2
-sqaddv0.8h, v0.8h, v2.8h
+sqadd   v0.8h, v0.8h, v2.8h
 ld1 {v3.d}[1], [x12], x2
-movi v4.8h, #0
-sqaddv1.8h, v1.8h, v3.8h
-mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
-clip10   v0.8h, v1.8h, v4.8h, v5.8h
-st1 {v0.d}[0],  [x0], x2
-st1 {v0.d}[1],  [x0], x2
-st1 {v1.d}[0],  [x0], x2
-st1 {v1.d}[1],  [x0], x2
+moviv4.8h, #0
+sqadd   v1.8h, v1.8h, v3.8h
+mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF
+clip10  v0.8h, v1.8h, v4.8h, v5.8h
+st1 {v0.d}[0], [x0],  x2
+st1 {v0.d}[1], [x0],  x2
+st1 {v1.d}[0], [x0],  x2
+st1 {v1.d}[1], [x0],  x2
 ret
 endfunc
 
 function ff_hevc_add_residual_8x8_8_neon, export=1
-add x12,  x0, x2
-add  x2,  x2, x2
-mov  x3,  #8
-1:  subs x3,  x3, #2
-ld1 {v2.d}[0], [x0]
-ld1 {v2.d}[1],[x12]
-uxtl v3.8h,  v2.8b
+add x12, x0, x2
+add x2, x2, x2
+mov x3, #8
+1:  subsx3, x3, #2
+ld1 {v2.d}[0], [x0]
+ld1 {v2.d}[1], [x12]
+uxtlv3.8h,  v2.8b
 ld1 {v0.8h-v1.8h}, [x1], #32
-uxtl2v2.8h,  v2.16b
-sqaddv0.8h,  v0.8h,   v3.8h
-sqaddv1.8h,  v1.8h,   v2.8h
-sqxtun   v0.8b,  v0.8h
-sqxtun2  v0.16b, v1.8h
-st1 {v0.d}[0], [x0], x2
-st1 {v0.d}[1],[x12], x2
-bne  1b
+uxtl2   v2.8h,  v2.16b
+sqadd   v0.8h,  v0.8h, v3.8h
+sqadd   v1.8h,  v1.8h, v2.8h
+sqxtun  v0.8b,  v0.8h
+sqxtun2 v0.16b, v1.8h
+st1 {v0.d}[0], [x0],  x2
+st1 {v0.d}[1], [x12], x2
+bne 1b
 ret
 endfunc
 
 function ff_hevc_add_residual_8x8_10_neon, export=1
-add x12,  x0, x2
-add  x2,  x2, x2
-

[FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants

2022-06-23 Thread J. Dekker

hevc_add_res_4x4_12_c: 46.0
hevc_add_res_4x4_12_neon: 18.7
hevc_add_res_8x8_12_c: 194.7
hevc_add_res_8x8_12_neon: 25.2
hevc_add_res_16x16_12_c: 716.0
hevc_add_res_16x16_12_neon: 69.7
hevc_add_res_32x32_12_c: 3820.7
hevc_add_res_32x32_12_neon: 261.0

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_idct_neon.S| 148 --
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
 2 files changed, 97 insertions(+), 85 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 484eea8437..413e225218 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -37,11 +37,11 @@ const trans, align=4
 .short  31, 22, 13, 4
 endconst
 
-.macro clip10 in1, in2, c1, c2
-smax\in1, \in1, \c1
-smax\in2, \in2, \c1
-smin\in1, \in1, \c2
-smin\in2, \in2, \c2
+.macro clip2 in1, in2, min, max
+smax\in1, \in1, \min
+smax\in2, \in2, \min
+smin\in1, \in1, \max
+smin\in2, \in2, \max
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_4x4_10_neon, export=1
-mov x12,  x0
-ld1 {v0.8h-v1.8h}, [x1]
-ld1 {v2.d}[0], [x12], x2
-ld1 {v2.d}[1], [x12], x2
-ld1 {v3.d}[0], [x12], x2
-sqadd   v0.8h, v0.8h, v2.8h
-ld1 {v3.d}[1], [x12], x2
-moviv4.8h, #0
-sqadd   v1.8h, v1.8h, v3.8h
-mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF
-clip10  v0.8h, v1.8h, v4.8h, v5.8h
-st1 {v0.d}[0], [x0],  x2
-st1 {v0.d}[1], [x0],  x2
-st1 {v1.d}[0], [x0],  x2
-st1 {v1.d}[1], [x0],  x2
-ret
-endfunc
-
 function ff_hevc_add_residual_8x8_8_neon, export=1
 add x12, x0, x2
 add x2, x2, x2
@@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_8x8_10_neon, export=1
-add x12, x0, x2
-add x2,  x2, x2
-mov x3,  #8
-moviv4.8h, #0
-mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:  subsx3,  x3, #2
-ld1 {v0.8h-v1.8h}, [x1], #32
-ld1 {v2.8h}, [x0]
-sqadd   v0.8h, v0.8h, v2.8h
-ld1 {v3.8h}, [x12]
-sqadd   v1.8h, v1.8h, v3.8h
-clip10  v0.8h, v1.8h, v4.8h, v5.8h
-st1 {v0.8h}, [x0],  x2
-st1 {v1.8h}, [x12], x2
-bne 1b
-ret
-endfunc
-
 function ff_hevc_add_residual_16x16_8_neon, export=1
 mov x3,  #16
 add x12, x0, x2
@@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_16x16_10_neon, export=1
-mov x3,  #16
-moviv20.8h, #0
-mvniv21.8h, #0xFC, lsl #8 // movi #0x3FF
-add x12,  x0, x2
-add x2,  x2, x2
-1:  subsx3,  x3, #2
-ld1 {v16.8h-v17.8h}, [x0]
-ld1 {v0.8h-v3.8h},   [x1], #64
-sqadd   v0.8h, v0.8h, v16.8h
-ld1 {v18.8h-v19.8h}, [x12]
-sqadd   v1.8h, v1.8h, v17.8h
-sqadd   v2.8h, v2.8h, v18.8h
-sqadd   v3.8h, v3.8h, v19.8h
-clip10  v0.8h, v1.8h, v20.8h, v21.8h
-clip10  v2.8h, v3.8h, v20.8h, v21.8h
-st1 {v0.8h-v1.8h}, [x0],  x2
-st1 {v2.8h-v3.8h}, [x12], x2
-bne 1b
-ret
-endfunc
-
 function ff_hevc_add_residual_32x32_8_neon, export=1
 add x12,  x0, x2
 add x2,  x2, x2
@@ -209,10 +149,76 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_32x32_10_neon, export=1
+.macro add_res bitdepth
+.if \bitdepth == 10
+.set mask, 0xFC
+.else
+.set mask, 0xF0
+.endif
+function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
+mov x12,  x0
+ld1 {v0.8h-v1.8h}, [x1]
+ld1 {v2.d}[0], [x12], x2
+ld1 {v2.d}[1], [x12], x2
+ld1 {v3.d}[0], [x12], x2
+sqadd   v0.8h, v0.8h, v2.8h
+ld1 {v3.d}[1], [x12], x2
+moviv4.8h, #0
+sqadd   v1.8h, v1.8h, v3.8h
+mvniv5.8h, mask, lsl #8

Re: [FFmpeg-devel] [PATCH] lavu: always provide symbols from hwcontext_vulkan.h

2022-07-05 Thread J. Dekker

On 5 Jul 2022, at 2:11, Niklas Haas wrote:

> From: Niklas Haas 
>
> This header is unconditionally installed, even though the utility
> functions defined by it may be missing from the built library.
>
> A precedent set by e.g. libavcodec/qsv.h (and others) is to always
> provide these functions by compiling stub functions in the absence of
> CONFIG_*. Make hwcontext_vulkan.h match this convention.
>
> Fixes downstream issues, e.g.
> https://github.com/haasn/libplacebo/issues/120
>
> Signed-off-by: Niklas Haas 
> ---
>  libavutil/Makefile   |  2 +-
>  libavutil/hwcontext_vulkan.c | 26 --
>  2 files changed, 25 insertions(+), 3 deletions(-)
>
> [...]

Public API symbols (av_*) shouldn't completely disappear based on configure 
options.

LGTM.

-- 
J. Dekker
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2] lavc/aarch64: hevc_add_res add 12bit variants

2022-08-15 Thread J. Dekker

hevc_add_res_4x4_12_c: 46.0
hevc_add_res_4x4_12_neon: 18.7
hevc_add_res_8x8_12_c: 194.7
hevc_add_res_8x8_12_neon: 25.2
hevc_add_res_16x16_12_c: 716.0
hevc_add_res_16x16_12_neon: 69.7
hevc_add_res_32x32_12_c: 3820.7
hevc_add_res_32x32_12_neon: 261.0

Signed-off-by: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_idct_neon.S| 156 --
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
 2 files changed, 105 insertions(+), 85 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 484eea8437..5fb5990f3d 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -37,11 +37,11 @@ const trans, align=4
 .short  31, 22, 13, 4
 endconst
 
-.macro clip10 in1, in2, c1, c2
-smax\in1, \in1, \c1
-smax\in2, \in2, \c1
-smin\in1, \in1, \c2
-smin\in2, \in2, \c2
+.macro clip2 in1, in2, min, max
+smax\in1, \in1, \min
+smax\in2, \in2, \min
+smin\in1, \in1, \max
+smin\in2, \in2, \max
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_4x4_10_neon, export=1
-mov x12,  x0
-ld1 {v0.8h-v1.8h}, [x1]
-ld1 {v2.d}[0], [x12], x2
-ld1 {v2.d}[1], [x12], x2
-ld1 {v3.d}[0], [x12], x2
-sqadd   v0.8h, v0.8h, v2.8h
-ld1 {v3.d}[1], [x12], x2
-moviv4.8h, #0
-sqadd   v1.8h, v1.8h, v3.8h
-mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF
-clip10  v0.8h, v1.8h, v4.8h, v5.8h
-st1 {v0.d}[0], [x0],  x2
-st1 {v0.d}[1], [x0],  x2
-st1 {v1.d}[0], [x0],  x2
-st1 {v1.d}[1], [x0],  x2
-ret
-endfunc
-
 function ff_hevc_add_residual_8x8_8_neon, export=1
 add x12, x0, x2
 add x2, x2, x2
@@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_8x8_10_neon, export=1
-add x12, x0, x2
-add x2,  x2, x2
-mov x3,  #8
-moviv4.8h, #0
-mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:  subsx3,  x3, #2
-ld1 {v0.8h-v1.8h}, [x1], #32
-ld1 {v2.8h}, [x0]
-sqadd   v0.8h, v0.8h, v2.8h
-ld1 {v3.8h}, [x12]
-sqadd   v1.8h, v1.8h, v3.8h
-clip10  v0.8h, v1.8h, v4.8h, v5.8h
-st1 {v0.8h}, [x0],  x2
-st1 {v1.8h}, [x12], x2
-bne 1b
-ret
-endfunc
-
 function ff_hevc_add_residual_16x16_8_neon, export=1
 mov x3,  #16
 add x12, x0, x2
@@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_16x16_10_neon, export=1
-mov x3,  #16
-moviv20.8h, #0
-mvniv21.8h, #0xFC, lsl #8 // movi #0x3FF
-add x12,  x0, x2
-add x2,  x2, x2
-1:  subsx3,  x3, #2
-ld1 {v16.8h-v17.8h}, [x0]
-ld1 {v0.8h-v3.8h},   [x1], #64
-sqadd   v0.8h, v0.8h, v16.8h
-ld1 {v18.8h-v19.8h}, [x12]
-sqadd   v1.8h, v1.8h, v17.8h
-sqadd   v2.8h, v2.8h, v18.8h
-sqadd   v3.8h, v3.8h, v19.8h
-clip10  v0.8h, v1.8h, v20.8h, v21.8h
-clip10  v2.8h, v3.8h, v20.8h, v21.8h
-st1 {v0.8h-v1.8h}, [x0],  x2
-st1 {v2.8h-v3.8h}, [x12], x2
-bne 1b
-ret
-endfunc
-
 function ff_hevc_add_residual_32x32_8_neon, export=1
 add x12,  x0, x2
 add x2,  x2, x2
@@ -209,10 +149,88 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_32x32_10_neon, export=1
+.macro add_res bitdepth
+function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
+mvniv21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+b   X(ff_hevc_add_residual_4x4_16_neon)
+endfunc
+function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1
+mvniv21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+b   X(ff_hevc_add_residual_8x8_16_neon)
+endfunc
+function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1
+mvniv21.8h, #((0xFF << (\bitd

[FFmpeg-devel] [PATCH v3] lavc/aarch64: hevc_add_res add 12bit variants

2022-08-16 Thread J. Dekker

hevc_add_res_4x4_12_c: 46.0
hevc_add_res_4x4_12_neon: 18.7
hevc_add_res_8x8_12_c: 194.7
hevc_add_res_8x8_12_neon: 25.2
hevc_add_res_16x16_12_c: 716.0
hevc_add_res_16x16_12_neon: 69.7
hevc_add_res_32x32_12_c: 3820.7
hevc_add_res_32x32_12_neon: 261.0

Signed-off-by: J. Dekker 
---

 libavcodec/aarch64/hevcdsp_idct_neon.S| 156 --
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
 2 files changed, 105 insertions(+), 85 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S 
b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 484eea8437..97c51e06e3 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -37,11 +37,11 @@ const trans, align=4
 .short  31, 22, 13, 4
 endconst
 
-.macro clip10 in1, in2, c1, c2
-smax\in1, \in1, \c1
-smax\in2, \in2, \c1
-smin\in1, \in1, \c2
-smin\in2, \in2, \c2
+.macro clip2 in1, in2, min, max
+smax\in1, \in1, \min
+smax\in2, \in2, \min
+smin\in1, \in1, \max
+smin\in2, \in2, \max
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_4x4_10_neon, export=1
-mov x12,  x0
-ld1 {v0.8h-v1.8h}, [x1]
-ld1 {v2.d}[0], [x12], x2
-ld1 {v2.d}[1], [x12], x2
-ld1 {v3.d}[0], [x12], x2
-sqadd   v0.8h, v0.8h, v2.8h
-ld1 {v3.d}[1], [x12], x2
-moviv4.8h, #0
-sqadd   v1.8h, v1.8h, v3.8h
-mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF
-clip10  v0.8h, v1.8h, v4.8h, v5.8h
-st1 {v0.d}[0], [x0],  x2
-st1 {v0.d}[1], [x0],  x2
-st1 {v1.d}[0], [x0],  x2
-st1 {v1.d}[1], [x0],  x2
-ret
-endfunc
-
 function ff_hevc_add_residual_8x8_8_neon, export=1
 add x12, x0, x2
 add x2, x2, x2
@@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_8x8_10_neon, export=1
-add x12, x0, x2
-add x2,  x2, x2
-mov x3,  #8
-moviv4.8h, #0
-mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:  subsx3,  x3, #2
-ld1 {v0.8h-v1.8h}, [x1], #32
-ld1 {v2.8h}, [x0]
-sqadd   v0.8h, v0.8h, v2.8h
-ld1 {v3.8h}, [x12]
-sqadd   v1.8h, v1.8h, v3.8h
-clip10  v0.8h, v1.8h, v4.8h, v5.8h
-st1 {v0.8h}, [x0],  x2
-st1 {v1.8h}, [x12], x2
-bne 1b
-ret
-endfunc
-
 function ff_hevc_add_residual_16x16_8_neon, export=1
 mov x3,  #16
 add x12, x0, x2
@@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_16x16_10_neon, export=1
-mov x3,  #16
-moviv20.8h, #0
-mvniv21.8h, #0xFC, lsl #8 // movi #0x3FF
-add x12,  x0, x2
-add x2,  x2, x2
-1:  subsx3,  x3, #2
-ld1 {v16.8h-v17.8h}, [x0]
-ld1 {v0.8h-v3.8h},   [x1], #64
-sqadd   v0.8h, v0.8h, v16.8h
-ld1 {v18.8h-v19.8h}, [x12]
-sqadd   v1.8h, v1.8h, v17.8h
-sqadd   v2.8h, v2.8h, v18.8h
-sqadd   v3.8h, v3.8h, v19.8h
-clip10  v0.8h, v1.8h, v20.8h, v21.8h
-clip10  v2.8h, v3.8h, v20.8h, v21.8h
-st1 {v0.8h-v1.8h}, [x0],  x2
-st1 {v2.8h-v3.8h}, [x12], x2
-bne 1b
-ret
-endfunc
-
 function ff_hevc_add_residual_32x32_8_neon, export=1
 add x12,  x0, x2
 add x2,  x2, x2
@@ -209,10 +149,88 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
 ret
 endfunc
 
-function ff_hevc_add_residual_32x32_10_neon, export=1
+.macro add_res bitdepth
+function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
+mvniv21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+b   hevc_add_residual_4x4_16_neon
+endfunc
+function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1
+mvniv21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+b   hevc_add_residual_8x8_16_neon
+endfunc
+function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1
+mvniv21.8h, #((0xFF << (\bitdepth - 8))

Re: [FFmpeg-devel] [PATCH v3] lavc/aarch64: hevc_add_res add 12bit variants

2022-08-18 Thread J. Dekker

On 16 Aug 2022, at 14:46, Martin Storsjö wrote:

> On Tue, 16 Aug 2022, J. Dekker wrote:
>
>> hevc_add_res_4x4_12_c: 46.0
>> hevc_add_res_4x4_12_neon: 18.7
>> hevc_add_res_8x8_12_c: 194.7
>> hevc_add_res_8x8_12_neon: 25.2
>> hevc_add_res_16x16_12_c: 716.0
>> hevc_add_res_16x16_12_neon: 69.7
>> hevc_add_res_32x32_12_c: 3820.7
>> hevc_add_res_32x32_12_neon: 261.0
>>
>> Signed-off-by: J. Dekker 
>> ---
>>
>> libavcodec/aarch64/hevcdsp_idct_neon.S| 156 --
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
>> 2 files changed, 105 insertions(+), 85 deletions(-)
>
> Thanks, this version seems fine to me.
>
>> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
>> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> index 9cbe983870..b6d5efb77f 100644
>> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> @@ -25,22 +25,18 @@
>> #include "libavutil/aarch64/cpu.h"
>> #include "libavcodec/hevcdsp.h"
>>
>> -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
>> - ptrdiff_t stride);
>> +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, 
>> ptrdiff_t stride);
>
> The joined forms of these lines end up a bit long, while they previously did 
> fit below the 80 column soft-limit, so IMO I'd prefer to keep them wrapped - 
> but it's not a big deal. (I guess it made more sense to join the lines before 
> the 'const' was added.)
>
> // Martin

Pushed with these changes (entire set now).

Thanks,
-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavfi/Makefile: add missing folder to clean target

2023-07-21 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 libavfilter/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 9b7813575a..312e4c145b 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -656,8 +656,8 @@ TESTPROGS = drawutils filtfmts formats integral
 TOOLS-$(CONFIG_LIBZMQ) += zmqsend
 
 clean::
-   $(RM) $(CLEANSUFFIXES:%=libavfilter/dnn/%) 
$(CLEANSUFFIXES:%=libavfilter/opencl/%) \
-  $(CLEANSUFFIXES:%=libavfilter/vulkan/%)
+   $(RM) $(CLEANSUFFIXES:%=libavfilter/dnn/%) 
$(CLEANSUFFIXES:%=libavfilter/metal/%) \
+ $(CLEANSUFFIXES:%=libavfilter/opencl/%) 
$(CLEANSUFFIXES:%=libavfilter/vulkan/%)
 
 OPENCL = $(subst $(SRC_PATH)/,,$(wildcard $(SRC_PATH)/libavfilter/opencl/*.cl))
 .SECONDARY: $(OPENCL:.cl=.c)
-- 
2.41.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/7] lavc: add AV_CODEC_EXPORT_DATA_ERROR

2023-07-21 Thread J. Dekker

Add an option to avcodec to allow supported decoders to optionally
output information about error resilience as frame side data.

Co-Authored-By: Thomas Guillem 
Signed-off-by: J. Dekker 
---
 libavcodec/avcodec.h   | 5 +
 libavcodec/options_table.h | 1 +
 libavcodec/version.h   | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index fe41ecc3c9..7400604fc5 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -410,6 +410,11 @@ typedef struct RcOverride{
  * Do not apply film grain, export it instead.
  */
 #define AV_CODEC_EXPORT_DATA_FILM_GRAIN (1 << 3)
+/**
+ * Decoding only.
+ * Export the AVECInfo structure through frame side data.
+ */
+#define AV_CODEC_EXPORT_DATA_ERROR (1 << 4)
 
 /**
  * The decoder will keep a reference to the frame and may reuse it later.
diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
index bb4b894b06..0a1dc4b062 100644
--- a/libavcodec/options_table.h
+++ b/libavcodec/options_table.h
@@ -91,6 +91,7 @@ static const AVOption avcodec_options[] = {
 {"prft", "export Producer Reference Time through packet side data", 0, 
AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_EXPORT_DATA_PRFT}, INT_MIN, INT_MAX, 
A|V|S|E, "export_side_data"},
 {"venc_params", "export video encoding parameters through frame side data", 0, 
AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_EXPORT_DATA_VIDEO_ENC_PARAMS}, INT_MIN, 
INT_MAX, V|D, "export_side_data"},
 {"film_grain", "export film grain parameters through frame side data", 0, 
AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_EXPORT_DATA_FILM_GRAIN}, INT_MIN, INT_MAX, 
V|D, "export_side_data"},
+{"error_info", "export error info through frame side data", 0, 
AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_EXPORT_DATA_ERROR}, INT_MIN, INT_MAX, V|D, 
"export_side_data"},
 {"time_base", NULL, OFFSET(time_base), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, 
INT_MAX},
 {"g", "set the group of picture (GOP) size", OFFSET(gop_size), 
AV_OPT_TYPE_INT, {.i64 = 12 }, INT_MIN, INT_MAX, V|E},
 {"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), 
AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E},
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 9411511e04..728ab8839d 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -29,7 +29,7 @@
 
 #include "version_major.h"
 
-#define LIBAVCODEC_VERSION_MINOR  22
+#define LIBAVCODEC_VERSION_MINOR  23
 #define LIBAVCODEC_VERSION_MICRO 100
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
-- 
2.41.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/7] lavu: add ecinfo sidedata

2023-07-21 Thread J. Dekker

Add side data for passing basic error concealment information that helps
a renderer or end user filter or conceal video decoding errors and
artifacts.

Co-Authored-By: Thomas Guillem 
Signed-off-by: J. Dekker 
---
 libavutil/Makefile  |  2 ++
 libavutil/ec.c  | 42 +
 libavutil/ec.h  | 66 +
 libavutil/frame.c   |  1 +
 libavutil/frame.h   |  6 +
 libavutil/version.h |  2 +-
 6 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 libavutil/ec.c
 create mode 100644 libavutil/ec.h

diff --git a/libavutil/Makefile b/libavutil/Makefile
index bd9c6f9e32..81b6b1fb8a 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -28,6 +28,7 @@ HEADERS = adler32.h   
  \
   display.h \
   dovi_meta.h   \
   downmix_info.h\
+  ec.h  \
   encryption_info.h \
   error.h   \
   eval.h\
@@ -124,6 +125,7 @@ OBJS = adler32.o
\
dovi_meta.o  \
downmix_info.o   \
encryption_info.o\
+   ec.o \
error.o  \
eval.o   \
fifo.o   \
diff --git a/libavutil/ec.c b/libavutil/ec.c
new file mode 100644
index 00..762accd0a6
--- /dev/null
+++ b/libavutil/ec.c
@@ -0,0 +1,42 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "ec.h"
+
+AVECInfo *av_eci_create_side_data(AVFrame *f)
+{
+AVBufferRef *buf = NULL;
+AVECInfo *eci = av_mallocz(sizeof(AVECInfo));
+
+if (!eci)
+return NULL;
+
+buf = av_buffer_create((uint8_t *)eci, sizeof(AVECInfo), NULL, NULL, 0);
+if (!buf) {
+av_freep(&eci);
+return NULL;
+}
+
+if (!av_frame_new_side_data_from_buf(f, AV_FRAME_DATA_EC_INFO, buf)) {
+av_buffer_unref(&buf);
+return NULL;
+}
+
+return eci;
+}
diff --git a/libavutil/ec.h b/libavutil/ec.h
new file mode 100644
index 00..439fe876a2
--- /dev/null
+++ b/libavutil/ec.h
@@ -0,0 +1,66 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_EC_H
+#define AVUTIL_EC_H
+
+#include 
+#include 
+#include 
+
+#include "libavutil/mem.h"
+#include "libavutil/frame.h"
+
+/**
+ * Error Concealment information helpful to a renderer or end user
+ * attempting to filter or conceal video decoding errors and artifacts.
+ */
+typedef struct AVECInfo {
+/**
+ * Integer estimating how many pixels of the video frame had decoding
+ * errors.
+ */
+uint64_t error;
+/**
+ * Integer estimating how many pixels of the video frame decoded
+ * without error.
+ */
+uint64_t ok;
+/**
+ *

[FFmpeg-devel] [PATCH 3/7] lavc/error_resilience: fill ecinfo

2023-07-21 Thread J. Dekker

Fill ECInfo inside error resilience using references set by the decoder.

Co-Authored-By: Thomas Guillem 
Signed-off-by: J. Dekker 
---
 libavcodec/error_resilience.c | 91 +--
 libavcodec/error_resilience.h |  4 +-
 2 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/libavcodec/error_resilience.c b/libavcodec/error_resilience.c
index 2aa6f1d864..c1417ecf07 100644
--- a/libavcodec/error_resilience.c
+++ b/libavcodec/error_resilience.c
@@ -28,6 +28,7 @@
 #include 
 
 #include "libavutil/internal.h"
+#include "libavutil/ec.h"
 #include "avcodec.h"
 #include "error_resilience.h"
 #include "me_cmp.h"
@@ -411,7 +412,7 @@ static void guess_mv(ERContext *s)
 
 num_avail = 0;
 if (s->last_pic.motion_val[0])
-ff_thread_await_progress(s->last_pic.tf, mb_height-1, 0);
+ff_thread_await_progress(s->last_pic.tf, INT_MAX, 0);
 for (i = 0; i < mb_width * mb_height; i++) {
 const int mb_xy = s->mb_index2xy[i];
 int f = 0;
@@ -889,25 +890,61 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
 }
 }
 
-void ff_er_frame_end(ERContext *s)
+static void
+er_fill_info_ref(ERContext *s)
+{
+uint64_t acc_ok = 0, acc_err = 0;
+av_assert0(s->cur_pic.info);
+
+if (s->cur_pic.f->pict_type != AV_PICTURE_TYPE_I) {
+ERPicture *reffs[2] = {&s->last_pic, &s->next_pic};
+int i, nb_ref_pics = s->cur_pic.f->pict_type == AV_PICTURE_TYPE_B ? 2 
: 1;
+
+for (i = 0; i < nb_ref_pics; i++) {
+ERPicture *reff = reffs[i];
+
+if (reff->info == NULL)
+continue;
+
+ff_thread_await_progress(reff->tf, INT_MAX, 0);
+
+/* should check more accurately how refs are used */
+if (reff->info->error == 0 && reff->info->ref_error == 0)
+continue;
+
+if (acc_err < (reff->info->error + reff->info->ref_error)){
+acc_err = reff->info->error + reff->info->ref_error;
+acc_ok  = reff->info->ok + reff->info->ref_ok;
+}
+}
+}
+
+s->cur_pic.info->ref_error = acc_err;
+s->cur_pic.info->ref_ok = acc_ok;
+}
+
+int ff_er_frame_end(ERContext *s)
 {
 int *linesize = NULL;
-int i, mb_x, mb_y, error, error_type, dc_error, mv_error, ac_error;
+int i, mb_x, mb_y, error, error_type, dc_error, mv_error, ac_error, terror;
 int distance;
 int threshold_part[4] = { 100, 100, 100 };
 int threshold = 50;
 int is_intra_likely;
 int size = s->b8_stride * 2 * s->mb_height;
 
+
 /* We do not support ER of field pictures yet,
  * though it should not crash if enabled. */
-if (!s->avctx->error_concealment || !atomic_load(&s->error_count)  ||
-s->avctx->lowres   ||
-!er_supported(s)   ||
-atomic_load(&s->error_count) == 3 * s->mb_width *
-  (s->avctx->skip_top + s->avctx->skip_bottom)) {
-return;
+if (!s->avctx->error_concealment || s->avctx->lowres || !er_supported(s))
+return 0;
+
+if (!atomic_load(&s->error_count) ||
+ atomic_load(&s->error_count) == 3 * s->mb_width * (s->avctx->skip_top 
+ s->avctx->skip_bottom))
+{
+goto end_find_ref_errs;
 }
+
 linesize = s->cur_pic.f->linesize;
 
 if (   s->avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO
@@ -921,7 +958,7 @@ void ff_er_frame_end(ERContext *s)
 
 if (mb_x == s->mb_width) {
 av_log(s->avctx, AV_LOG_DEBUG, "ignoring last missing slice\n");
-return;
+goto end_find_ref_errs;
 }
 }
 
@@ -960,7 +997,7 @@ void ff_er_frame_end(ERContext *s)
 s->cur_pic.ref_index[i]  = NULL;
 s->cur_pic.motion_val[i] = NULL;
 }
-return;
+goto end_find_ref_errs;
 }
 }
 
@@ -1100,10 +1137,12 @@ void ff_er_frame_end(ERContext *s)
 }
 #endif
 
-dc_error = ac_error = mv_error = 0;
+terror = dc_error = ac_error = mv_error = 0;
 for (i = 0; i < s->mb_num; i++) {
 const int mb_xy = s->mb_index2xy[i];
 int error = s->error_status_table[mb_xy];
+if (error)
+terror++;
 if (error & ER_DC_ERROR)
 dc_error++;
 if (error & ER_AC_ERROR)
@@ -,10 +1150,9 @@ void ff_er_frame_end(ERContext *s)
 if (error & ER_MV_ERROR)
 mv_error++;
 }
-av_log(s->avctx, AV_LOG_INFO, "concealing %d DC, %d AC, %d MV errors in %c 
frame\n",
-   dc_error, ac_error, mv_error, 
av_get_pi

[FFmpeg-devel] [PATCH 4/7] lavc: set decode_error_flags when ec active

2023-07-21 Thread J. Dekker

FF_DECODE_ERROR_CONCEALMENT_ACTIVE should be set in decode_error_flags
when error concealment is active on supported decoders.

Co-Authored-By: Thomas Guillem 
Signed-off-by: J. Dekker 
---
 libavcodec/h263dec.c   |  6 --
 libavcodec/mpeg12dec.c |  3 ++-
 libavcodec/mss2.c  |  8 +---
 libavcodec/rv10.c  | 10 --
 libavcodec/rv34.c  | 12 +---
 libavcodec/vc1dec.c|  6 --
 6 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/libavcodec/h263dec.c b/libavcodec/h263dec.c
index 68a618a7ed..f3b9f09303 100644
--- a/libavcodec/h263dec.c
+++ b/libavcodec/h263dec.c
@@ -620,8 +620,10 @@ retry:
 
 av_assert1(s->bitstream_buffer_size == 0);
 frame_end:
-if (!s->studio_profile)
-ff_er_frame_end(&s->er);
+if (!s->studio_profile) {
+if (ff_er_frame_end(&s->er) > 0)
+s->current_picture.f->decode_error_flags |= 
FF_DECODE_ERROR_CONCEALMENT_ACTIVE;
+}
 
 if (avctx->hwaccel) {
 ret = avctx->hwaccel->end_frame(avctx);
diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c
index 27b45c6fc4..82a1e56b67 100644
--- a/libavcodec/mpeg12dec.c
+++ b/libavcodec/mpeg12dec.c
@@ -2038,7 +2038,8 @@ static int slice_end(AVCodecContext *avctx, AVFrame *pict)
 if (/* s->mb_y << field_pic == s->mb_height && */ !s->first_field && 
!s1->first_slice) {
 /* end of image */
 
-ff_er_frame_end(&s->er);
+if (ff_er_frame_end(&s->er) > 0)
+s->current_picture_ptr->f->decode_error_flags |= 
FF_DECODE_ERROR_CONCEALMENT_ACTIVE;
 
 ff_mpv_frame_end(s);
 
diff --git a/libavcodec/mss2.c b/libavcodec/mss2.c
index 98103f7fed..851346b0ad 100644
--- a/libavcodec/mss2.c
+++ b/libavcodec/mss2.c
@@ -421,8 +421,12 @@ static int decode_wmv9(AVCodecContext *avctx, const 
uint8_t *buf, int buf_size,
 
 ff_vc1_decode_blocks(v);
 
+f = s->current_picture.f;
+
 if (v->end_mb_x == s->mb_width && s->end_mb_y == s->mb_height) {
-ff_er_frame_end(&s->er);
+if (ff_er_frame_end(&s->er) > 0)
+ f->decode_error_flags |= FF_DECODE_ERROR_CONCEALMENT_ACTIVE;
+
 } else {
 av_log(v->s.avctx, AV_LOG_WARNING,
"disabling error correction due to block count mismatch %dx%d 
!= %dx%d\n",
@@ -431,8 +435,6 @@ static int decode_wmv9(AVCodecContext *avctx, const uint8_t 
*buf, int buf_size,
 
 ff_mpv_frame_end(s);
 
-f = s->current_picture.f;
-
 if (v->respic == 3) {
 ctx->dsp.upsample_plane(f->data[0], f->linesize[0], w,  h);
 ctx->dsp.upsample_plane(f->data[1], f->linesize[1], w+1 >> 1, h+1 >> 
1);
diff --git a/libavcodec/rv10.c b/libavcodec/rv10.c
index bb1ead5002..09a1d4dba6 100644
--- a/libavcodec/rv10.c
+++ b/libavcodec/rv10.c
@@ -440,6 +440,12 @@ static av_cold int rv10_decode_end(AVCodecContext *avctx)
 return 0;
 }
 
+static void rv10_er_frame_end(MpegEncContext *s)
+{
+if (ff_er_frame_end(&s->er) > 0)
+s->current_picture_ptr->f->decode_error_flags |= 
FF_DECODE_ERROR_CONCEALMENT_ACTIVE;
+}
+
 static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
   int buf_size, int buf_size2, int whole_size)
 {
@@ -477,7 +483,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, const 
uint8_t *buf,
 if ((s->mb_x == 0 && s->mb_y == 0) || !s->current_picture_ptr) {
 // FIXME write parser so we always have complete frames?
 if (s->current_picture_ptr) {
-ff_er_frame_end(&s->er);
+rv10_er_frame_end(s);
 ff_mpv_frame_end(s);
 s->mb_x = s->mb_y = s->resync_mb_x = s->resync_mb_y = 0;
 }
@@ -649,7 +655,7 @@ static int rv10_decode_frame(AVCodecContext *avctx, AVFrame 
*pict,
 }
 
 if (s->current_picture_ptr && s->mb_y >= s->mb_height) {
-ff_er_frame_end(&s->er);
+rv10_er_frame_end(s);
 ff_mpv_frame_end(s);
 
 if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index 51f18147af..c6231adf5f 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -1554,13 +1554,19 @@ static int get_slice_offset(AVCodecContext *avctx, 
const uint8_t *buf, int n, in
 return buf_size;
 }
 
+static void rv34_er_frame_end(MpegEncContext *s)
+{
+if (ff_er_frame_end(&s->er) > 0)
+s->current_picture_ptr->f->decode_error_flags |= 
FF_DECODE_ERROR_CONCEALMENT_ACTIVE;
+}
+
 static int finish_frame(AVCodecContext *avctx, AVFrame *pict)
 {
 RV34DecContext *r = avctx->priv_data;
 MpegEncContext *s = &r->s;
 int got_picture = 0, ret;
 
-ff_er_frame_end(&s->er);
+rv34_er_frame_end(s);
 ff_mpv_frame_end(s);
 s->mb_num_left

[FFmpeg-devel] [PATCH 5/7] lavc/h264: export ecinfo

2023-07-21 Thread J. Dekker

Export ecinfo to the user when AV_CODEC_EXPORT_DATA_ERROR is set.

Co-Authored-By: Thomas Guillem 
Signed-off-by: J. Dekker 
---
 libavcodec/h264_picture.c | 10 --
 libavcodec/h264_slice.c   | 16 +++-
 libavcodec/h264dec.c  | 33 +++--
 libavcodec/h264dec.h  |  8 +++-
 4 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/libavcodec/h264_picture.c b/libavcodec/h264_picture.c
index dcaf0fdb0a..aa7c5ac673 100644
--- a/libavcodec/h264_picture.c
+++ b/libavcodec/h264_picture.c
@@ -48,6 +48,7 @@ void ff_h264_unref_picture(H264Context *h, H264Picture *pic)
 av_buffer_unref(&pic->qscale_table_buf);
 av_buffer_unref(&pic->mb_type_buf);
 av_buffer_unref(&pic->pps_buf);
+av_buffer_unref(&pic->ec_info_buf);
 for (i = 0; i < 2; i++) {
 av_buffer_unref(&pic->motion_val_buf[i]);
 av_buffer_unref(&pic->ref_index_buf[i]);
@@ -61,6 +62,7 @@ static void h264_copy_picture_params(H264Picture *dst, const 
H264Picture *src)
 dst->qscale_table = src->qscale_table;
 dst->mb_type  = src->mb_type;
 dst->pps  = src->pps;
+dst->ec_info  = src->ec_info;
 
 for (int i = 0; i < 2; i++) {
 dst->motion_val[i] = src->motion_val[i];
@@ -111,7 +113,9 @@ int ff_h264_ref_picture(H264Context *h, H264Picture *dst, 
H264Picture *src)
 dst->qscale_table_buf = av_buffer_ref(src->qscale_table_buf);
 dst->mb_type_buf  = av_buffer_ref(src->mb_type_buf);
 dst->pps_buf  = av_buffer_ref(src->pps_buf);
-if (!dst->qscale_table_buf || !dst->mb_type_buf || !dst->pps_buf) {
+dst->ec_info_buf  = av_buffer_ref(src->ec_info_buf);
+if (!dst->qscale_table_buf || !dst->mb_type_buf || !dst->pps_buf
+ || !dst->ec_info_buf) {
 ret = AVERROR(ENOMEM);
 goto fail;
 }
@@ -168,6 +172,7 @@ int ff_h264_replace_picture(H264Context *h, H264Picture 
*dst, const H264Picture
 ret  = av_buffer_replace(&dst->qscale_table_buf, src->qscale_table_buf);
 ret |= av_buffer_replace(&dst->mb_type_buf, src->mb_type_buf);
 ret |= av_buffer_replace(&dst->pps_buf, src->pps_buf);
+ret |= av_buffer_replace(&dst->ec_info_buf, src->ec_info_buf);
 if (ret < 0)
 goto fail;
 
@@ -192,7 +197,7 @@ fail:
 return ret;
 }
 
-void ff_h264_set_erpic(ERPicture *dst, H264Picture *src)
+void ff_h264_set_erpic(ERPicture *dst, H264Picture *src, bool export_error)
 {
 #if CONFIG_ERROR_RESILIENCE
 int i;
@@ -202,6 +207,7 @@ void ff_h264_set_erpic(ERPicture *dst, H264Picture *src)
 if (!src)
 return;
 
+dst->info = export_error ? src->ec_info : NULL;
 dst->f = src->f;
 dst->tf = &src->tf;
 
diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c
index 41bf30eefc..b291405c2a 100644
--- a/libavcodec/h264_slice.c
+++ b/libavcodec/h264_slice.c
@@ -176,12 +176,15 @@ static int init_table_pools(H264Context *h)
sizeof(int16_t), 
av_buffer_allocz);
 h->ref_index_pool= av_buffer_pool_init(4 * mb_array_size, 
av_buffer_allocz);
 
+h->ec_info_pool  = av_buffer_pool_init(sizeof(AVECInfo), 
av_buffer_allocz);
+
 if (!h->qscale_table_pool || !h->mb_type_pool || !h->motion_val_pool ||
-!h->ref_index_pool) {
+!h->ref_index_pool || !h->ec_info_pool) {
 av_buffer_pool_uninit(&h->qscale_table_pool);
 av_buffer_pool_uninit(&h->mb_type_pool);
 av_buffer_pool_uninit(&h->motion_val_pool);
 av_buffer_pool_uninit(&h->ref_index_pool);
+av_buffer_pool_uninit(&h->ec_info_pool);
 return AVERROR(ENOMEM);
 }
 
@@ -240,11 +243,13 @@ static int alloc_picture(H264Context *h, H264Picture *pic)
 
 pic->qscale_table_buf = av_buffer_pool_get(h->qscale_table_pool);
 pic->mb_type_buf  = av_buffer_pool_get(h->mb_type_pool);
-if (!pic->qscale_table_buf || !pic->mb_type_buf)
+pic->ec_info_buf  = av_buffer_pool_get(h->ec_info_pool);
+if (!pic->qscale_table_buf || !pic->mb_type_buf || !pic->ec_info_buf)
 goto fail;
 
 pic->mb_type  = (uint32_t*)pic->mb_type_buf->data + 2 * h->mb_stride + 
1;
 pic->qscale_table = pic->qscale_table_buf->data + 2 * h->mb_stride + 1;
+pic->ec_info = (AVECInfo*) pic->ec_info_buf->data;
 
 for (i = 0; i < 2; i++) {
 pic->motion_val_buf[i] = av_buffer_pool_get(h->motion_val_pool);
@@ -514,6 +519,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 pic->f->crop_right  = h->crop_right;
 pic->f->crop_top= h->crop_top;
 pic->f->crop_bottom = h->crop_bottom;
+pic->error_decode_slices = 0;
 
 pic->needs_fg =

[FFmpeg-devel] [PATCH 7/7] fate: add ecinfo sidedata test

2023-07-21 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 tests/Makefile  |   1 +
 tests/fate/ec.mak   |   6 +
 tests/ref/fate/ec-sidedata-h264 | 332 
 3 files changed, 339 insertions(+)
 create mode 100644 tests/fate/ec.mak
 create mode 100644 tests/ref/fate/ec-sidedata-h264

 ec/basic.h264 is available here: https://0x1.st/bP.h264

diff --git a/tests/Makefile b/tests/Makefile
index e09f30a0fc..f108a2460b 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -176,6 +176,7 @@ include $(SRC_PATH)/tests/fate/dnxhd.mak
 include $(SRC_PATH)/tests/fate/dpcm.mak
 include $(SRC_PATH)/tests/fate/dvvideo.mak
 include $(SRC_PATH)/tests/fate/ea.mak
+include $(SRC_PATH)/tests/fate/ec.mak
 include $(SRC_PATH)/tests/fate/exif.mak
 include $(SRC_PATH)/tests/fate/enc_external.mak
 # Must be included after lavf-video.mak
diff --git a/tests/fate/ec.mak b/tests/fate/ec.mak
new file mode 100644
index 00..fe46b7f8f8
--- /dev/null
+++ b/tests/fate/ec.mak
@@ -0,0 +1,6 @@
+FATE_EC-$(call ALLYES, ERROR_RESILIENCE H264_PARSER H264_DECODER) += 
fate-ec-sidedata-h264
+fate-ec-sidedata-h264: SRC = $(TARGET_SAMPLES)/ec/basic.h264
+fate-ec-sidedata-h264: CMD = run ffprobe$(PROGSSUF)$(EXESUF) -export_side_data 
+error_info -show_entries side_data -print_format default -bitexact -v 0 -i 
"$(SRC)"
+
+FATE_SAMPLES_FFMPEG+=$(FATE_EC-yes)
+fate-ec: $(FATE_EC-yes)
diff --git a/tests/ref/fate/ec-sidedata-h264 b/tests/ref/fate/ec-sidedata-h264
new file mode 100644
index 00..e00d2fc89c
--- /dev/null
+++ b/tests/ref/fate/ec-sidedata-h264
@@ -0,0 +1,332 @@
+[PACKET]
+[/PACKET]
+[PACKET]
+[/PACKET]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=414720
+error=142336
+ref_ok=0
+ref_error=0
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=414720
+ref_error=142336
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=414720
+ref_error=142336
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=414720
+ref_error=142336
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=414720
+ref_error=142336
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=414720
+ref_error=142336
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=414720
+ref_error=142336
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=414720
+ref_error=142336
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=414720
+ref_error=142336
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=1007360
+ref_error=663808
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=290816
+error=266240
+ref_ok=716544
+ref_error=397568
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=1007360
+ref_error=663808
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=301824
+error=255232
+ref_ok=414720
+ref_error=142336
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=716544
+ref_error=397568
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=716544
+ref_error=397568
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=716544
+ref_error=397568
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=716544
+ref_error=397568
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=716544
+ref_error=397568
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=716544
+ref_error=397568
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref_ok=716544
+ref_error=397568
+[/SIDE_DATA]
+[/FRAME]
+[PACKET]
+[/PACKET]
+[FRAME]
+[SIDE_DATA]
+side_data_type=Error Concealment Information
+ok=0
+error=0
+ref

[FFmpeg-devel] [PATCH 6/7] tools/ffprobe: add ecinfo frame side data

2023-07-21 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 fftools/ffprobe.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/fftools/ffprobe.c b/fftools/ffprobe.c
index a39185f6fe..9b1f04e1be 100644
--- a/fftools/ffprobe.c
+++ b/fftools/ffprobe.c
@@ -39,6 +39,7 @@
 #include "libavutil/bprint.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/display.h"
+#include "libavutil/ec.h"
 #include "libavutil/hash.h"
 #include "libavutil/hdr_dynamic_metadata.h"
 #include "libavutil/mastering_display_metadata.h"
@@ -2277,6 +2278,17 @@ static void 
print_ambient_viewing_environment(WriterContext *w,
 print_q("ambient_light_y", env->ambient_light_y, '/');
 }
 
+/**
+ * Print the fields of an error concealment info side-data payload.
+ *
+ * Emits the "ok", "error", "ref_ok" and "ref_error" members of ecinfo, in
+ * that order, through print_int(). Nothing is printed when ecinfo is NULL.
+ */
+static void print_ecinfo(WriterContext *w, const AVECInfo *ecinfo)
+{
+    if (ecinfo) {
+        print_int("ok",        ecinfo->ok);
+        print_int("error",     ecinfo->error);
+        print_int("ref_ok",    ecinfo->ref_ok);
+        print_int("ref_error", ecinfo->ref_error);
+    }
+}
+
 static void print_pkt_side_data(WriterContext *w,
 AVCodecParameters *par,
 const AVPacketSideData *side_data,
@@ -2741,6 +2753,8 @@ static void show_frame(WriterContext *w, AVFrame *frame, 
AVStream *stream,
 } else if (sd->type == AV_FRAME_DATA_AMBIENT_VIEWING_ENVIRONMENT) {
 print_ambient_viewing_environment(
 w, (const AVAmbientViewingEnvironment *)sd->data);
+} else if (sd->type = AV_FRAME_DATA_EC_INFO) {
+print_ecinfo(w, (AVECInfo*)sd->data);
 }
 writer_print_section_footer(w);
 }
-- 
2.41.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 6/7 v1.1] tools/ffprobe: add ecinfo frame side data

2023-07-21 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 fftools/ffprobe.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/fftools/ffprobe.c b/fftools/ffprobe.c
index a39185f6fe..572a5150cf 100644
--- a/fftools/ffprobe.c
+++ b/fftools/ffprobe.c
@@ -39,6 +39,7 @@
 #include "libavutil/bprint.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/display.h"
+#include "libavutil/ec.h"
 #include "libavutil/hash.h"
 #include "libavutil/hdr_dynamic_metadata.h"
 #include "libavutil/mastering_display_metadata.h"
@@ -2277,6 +2278,17 @@ static void 
print_ambient_viewing_environment(WriterContext *w,
 print_q("ambient_light_y", env->ambient_light_y, '/');
 }
 
+/**
+ * Print the fields of an error concealment info side-data payload.
+ *
+ * Emits the "ok", "error", "ref_ok" and "ref_error" members of ecinfo, in
+ * that order, through print_int(). Nothing is printed when ecinfo is NULL.
+ */
+static void print_ecinfo(WriterContext *w, const AVECInfo *ecinfo)
+{
+    if (ecinfo) {
+        print_int("ok",        ecinfo->ok);
+        print_int("error",     ecinfo->error);
+        print_int("ref_ok",    ecinfo->ref_ok);
+        print_int("ref_error", ecinfo->ref_error);
+    }
+}
+
 static void print_pkt_side_data(WriterContext *w,
 AVCodecParameters *par,
 const AVPacketSideData *side_data,
@@ -2741,6 +2753,8 @@ static void show_frame(WriterContext *w, AVFrame *frame, 
AVStream *stream,
 } else if (sd->type == AV_FRAME_DATA_AMBIENT_VIEWING_ENVIRONMENT) {
 print_ambient_viewing_environment(
 w, (const AVAmbientViewingEnvironment *)sd->data);
+} else if (sd->type == AV_FRAME_DATA_EC_INFO) {
+print_ecinfo(w, (AVECInfo*)sd->data);
 }
 writer_print_section_footer(w);
 }
-- 
2.41.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/7] lavu: add ecinfo sidedata

2023-07-25 Thread J. Dekker

Hi Devin,

Devin Heitmueller  writes:
> On Fri, Jul 21, 2023 at 9:38 AM J. Dekker  wrote:
>
> I appreciate the value of stats so I can tell that the stream had
> errors, but how is this side data "helpful to a renderer attempting to
> filter or conceal video decoding errors and artifacts" if there is no
> information relating to the region of the picture where the errors are
> present?  Is the assumption that an application will simply have some
> threshold at which it decides to duplicate the previous frame rather
> than showing the current one?

Yes, this is primarily the use case here initially. At some point a
renderer would rather duplicate the previous frame than rely on
reconstruction. This could be done within EC itself, but letting the
decoder make this choice rather than exporting the information to a
renderer and doing it there seems incorrect to me.

The intention is to expand the metadata to provide information about
specific regions in the frame in the future; at the moment this mainly
just provides the framework for exporting this sort of information in
the first place.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3] lavc/aarch64: add hevc horizontal qpel/uni/bi

2022-10-11 Thread J. Dekker

checkasm benchmark on Ampere Altra (Neoverse N1):

put_hevc_qpel_bi_h4_8_c: 170.7
put_hevc_qpel_bi_h4_8_neon: 64.5
put_hevc_qpel_bi_h6_8_c: 373.7
put_hevc_qpel_bi_h6_8_neon: 130.2
put_hevc_qpel_bi_h8_8_c: 662.0
put_hevc_qpel_bi_h8_8_neon: 138.5
put_hevc_qpel_bi_h12_8_c: 1529.5
put_hevc_qpel_bi_h12_8_neon: 422.0
put_hevc_qpel_bi_h16_8_c: 2735.5
put_hevc_qpel_bi_h16_8_neon: 560.5
put_hevc_qpel_bi_h24_8_c: 6015.7
put_hevc_qpel_bi_h24_8_neon: 1636.0
put_hevc_qpel_bi_h32_8_c: 10779.0
put_hevc_qpel_bi_h32_8_neon: 2204.5
put_hevc_qpel_bi_h48_8_c: 24375.0
put_hevc_qpel_bi_h48_8_neon: 4984.0
put_hevc_qpel_bi_h64_8_c: 42768.0
put_hevc_qpel_bi_h64_8_neon: 8795.7
put_hevc_qpel_h4_8_c: 149.0
put_hevc_qpel_h4_8_neon: 55.7
put_hevc_qpel_h6_8_c: 321.2
put_hevc_qpel_h6_8_neon: 106.0
put_hevc_qpel_h8_8_c: 578.7
put_hevc_qpel_h8_8_neon: 133.2
put_hevc_qpel_h12_8_c: 1279.0
put_hevc_qpel_h12_8_neon: 391.7
put_hevc_qpel_h16_8_c: 2286.2
put_hevc_qpel_h16_8_neon: 519.7
put_hevc_qpel_h24_8_c: 5100.7
put_hevc_qpel_h24_8_neon: 1546.2
put_hevc_qpel_h32_8_c: 9022.0
put_hevc_qpel_h32_8_neon: 2060.2
put_hevc_qpel_h48_8_c: 20293.5
put_hevc_qpel_h48_8_neon: 4656.7
put_hevc_qpel_h64_8_c: 36037.0
put_hevc_qpel_h64_8_neon: 8262.7
put_hevc_qpel_uni_h4_8_c: 162.2
put_hevc_qpel_uni_h4_8_neon: 61.7
put_hevc_qpel_uni_h6_8_c: 355.2
put_hevc_qpel_uni_h6_8_neon: 114.2
put_hevc_qpel_uni_h8_8_c: 651.0
put_hevc_qpel_uni_h8_8_neon: 135.7
put_hevc_qpel_uni_h12_8_c: 1412.5
put_hevc_qpel_uni_h12_8_neon: 402.7
put_hevc_qpel_uni_h16_8_c: 2551.0
put_hevc_qpel_uni_h16_8_neon: 533.5
put_hevc_qpel_uni_h24_8_c: 5782.2
put_hevc_qpel_uni_h24_8_neon: 1578.7
put_hevc_qpel_uni_h32_8_c: 10586.5
put_hevc_qpel_uni_h32_8_neon: 2102.2
put_hevc_qpel_uni_h48_8_c: 23812.0
put_hevc_qpel_uni_h48_8_neon: 4739.5
put_hevc_qpel_uni_h64_8_c: 42958.7
put_hevc_qpel_uni_h64_8_neon: 8366.5

Signed-off-by: J. Dekker 
---

 Summary of changes since last iteration:
 - Interleaved stores
 - Changed tiling to loop more naturally
 - Increased code reuse (.text reduction by ~60%)
 - Simplified function variations through .req

 libavcodec/aarch64/Makefile   |   1 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  67 +++
 libavcodec/aarch64/hevcdsp_qpel_neon.S| 484 ++
 3 files changed, 552 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 9ce21566c6..02fb51c3ab 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -67,4 +67,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += 
aarch64/vp9itxfm_16bpp_neon.o   \
aarch64/vp9mc_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o 
\
aarch64/hevcdsp_init_aarch64.o  
\
+   aarch64/hevcdsp_qpel_neon.o 
\
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 644cc17715..44399b05d8 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -69,6 +69,46 @@ void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, 
const uint8_t *src, ptrd
   const int16_t *sao_offset_val, int 
eo, int width, int height);
 void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, 
ptrdiff_t stride_dst,
 const int16_t *sao_offset_val, int eo, 
int width, int height);
+void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height,
+  intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
_srcstride, int height,
+  intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, 
uint8_t *_src,
+ ptrdiff_t _srcstride, int height, 
intptr_t mx, intptr_t my,
+ int width);
+void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, 
uint8_t *_src,
+ ptrdiff_t _srcstride, int height, 
intptr_t mx, intptr_t my

Re: [FFmpeg-devel] [PATCH] avcodec/svq1enc: Workaround GCC bug 102513

2022-10-25 Thread J. Dekker

On 25 Oct 2022, at 14:48, Andreas Rheinhardt wrote:

> GCC 11 has a bug: When it creates clones of recursive functions
> (to inline some parameters), it clones a recursive function
> eight times by default, even when this exceeds the recursion
> depth. This happens with encode_block() in libavcodec/svq1enc.c
> where a parameter level is always in the range 0..5;
> but GCC 11 also creates functions corresponding to level UINT_MAX
> and UINT_MAX - 1 (on -O3; -O2 is fine).
>
> Using such levels would produce undefined behaviour and because
> of this GCC emits bogus -Warray-bounds warnings for these clones.
>
> Since commit d08b2900a9f0935959303da668cb00a8a7245228, certain
> symbols that are accessed like ff_svq1_inter_multistage_vlc[level]
> are declared with hidden visibility, which allows compilers
> to bake the offset implied by level into the instructions
> if level is a compile-time constant as it is in the clones.
> Yet this leads to insane offsets for level == UINT_MAX which
> can be incompatible with the supported offset ranges of relocations.
> This happens in the small code model (the default code model for
> AArch64).
>
> This commit therefore works around this bug by disabling cloning
> recursive functions for GCC 10 and 11. GCC 10 is affected by the
> underlying bug (see
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102513), so the workaround
> also targets it, although it only produces three versions of
> encode_block(), so it does not seem to trigger the actual issue here.
>
> The issue has been mitigated in GCC 12.1 (it no longer creates clones
> for impossible values; see also commit
> 1cb7fd317c84117bbb13b14851d62f77f57bb9ce), so the workaround
> does not target it.
>
> Reported-by: Josh Dekker 
> Signed-off-by: Andreas Rheinhardt 
> ---
>  libavcodec/svq1enc.c | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c
> index 75adbe7ea0..7c9430a137 100644
> --- a/libavcodec/svq1enc.c
> +++ b/libavcodec/svq1enc.c
> @@ -46,6 +46,12 @@
>  #include "libavutil/frame.h"
>  #include "libavutil/mem_internal.h"
>
> +// Workaround for GCC bug 102513
> +#if AV_GCC_VERSION_AT_LEAST(10, 0) && AV_GCC_VERSION_AT_MOST(12, 0) \
> +&& !defined(__clang__) && !defined(__INTEL_COMPILER)
> +#pragma GCC optimize ("no-ipa-cp-clone")
> +#endif
> +
>  typedef struct SVQ1EncContext {
>  /* FIXME: Needed for motion estimation, should not be used for anything
>   * else, the idea is to make the motion estimation eventually independent

Discussed on IRC, LGTM & pushed.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3] lavc/aarch64: add hevc horizontal qpel/uni/bi

2022-10-25 Thread J. Dekker



On 24 Oct 2022, at 14:01, Martin Storsjö wrote:

> On Tue, 11 Oct 2022, J. Dekker wrote:
>
>> [...]
>> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
>> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> index 644cc17715..44399b05d8 100644
>> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> @@ -69,6 +69,46 @@ void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, 
>> const uint8_t *src, ptrd
>>   const int16_t *sao_offset_val, int 
>> eo, int width, int height);
>> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, 
>> ptrdiff_t stride_dst,
>> const int16_t *sao_offset_val, int 
>> eo, int width, int height);
>> +void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t 
>> _srcstride, int height,
>> + intptr_t mx, intptr_t my, int width);
>
> The function pointers in the dsp context have gotten 'const' on the source 
> pointers now, which makes it emit a lot of warnings with GCC, and fail with 
> latest Clang. Please rebase and check that it builds without warnings.
>

Fixed.

>> [...]
>> +.ifc \type, qpel
>> +function ff_hevc_put_hevc_h4_8_neon, export=0
>> +uxtlv16.8h,  v16.8b
>> +uxtlv17.8h,  v17.8b
>> +uxtlv18.8h,  v18.8b
>> +uxtlv19.8h,  v19.8b
>> +
>> +mul v23.4h,  v16.4h, v0.h[0]
>> +mul v24.4h,  v18.4h, v0.h[0]
>> +
>> +.irpc i, 1234567
>> +ext v20.16b, v16.16b, v17.16b, #(2*\i)
>> +ext v21.16b, v18.16b, v19.16b, #(2*\i)
>> +mla v23.4h,  v20.4h, v0.h[\i]
>> +mla v24.4h,  v21.4h, v0.h[\i]
>> +.endr
>> +ret
>> +endfunc
>> +.endif
>> +
>> +function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
>> +load_filter mx
>> +.ifc \type, qpel_bi
>> +mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
>> +add x15, x4, #(MAX_PB_SIZE << 1) // src2b
>
> Beware that you can't in general rely on x16/x17 keeping their values for 
> long. If you branch to a function which is implemented in a different object 
> file, it may end up linked at a place in the address space which is too far 
> away for a regular 'bl' branch, so the linker has to insert a range extension 
> thunk, which clobbers x16/x17. But as long as everything here is branched 
> within the same object file, it should be ok.
>
> In general, if you need to use x16/x17, use it only for very short-lived 
> temporaries.

Alright, thanks for the consideration. Left as is since as you said we're not 
branching anywhere outside this file.

>> +.endif
>> +sub src, src, #3
>> +mov mx, lr
>
> Please use literal 'x30' instead of 'lr' - older binutils don't support the 
> 'lr' register name alias.

Fixed.

> Other than that, the code seems to run correctly, and the code looks mostly 
> reasonable now. (I didn't do a very deep read-through this time, but it looks 
> like you've addressed my earlier concerns.)

Thanks for the review, pushed with above fixes.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Fwd: Poll: Repeat vote: GA voters list updates

2023-11-12 Thread J. Dekker

In my eyes, the matter of how we should update the voters list was already 
resolved.

> https://vote.ffmpeg.org/cgi-bin/civs/vote.pl?id=E_07e9c717f7820201&key=240812a0c2e373ac
> This is your private URL. Do not give it to anyone else, because they could 
> use it to vote for you.

This vote seems to have been created invalidly; as such, my voting URL is shared here too.

-- 
jd

On Sun, Nov 12, 2023, at 16:40, Derek Buitenhuis wrote:
> This is an invalid do-over and I refuse to take part.
>
> Please enjoy my private URL.
>
> - Derek
>
>
>  Forwarded Message 
> Subject:  Poll: Repeat vote: GA voters list updates
> Date: Sun, 12 Nov 2023 15:00:41 +0100 (CET)
> From: Thilo Borgmann (CIVS poll supervisor) 
> Reply-To: thilo.borgm...@mail.de
> To:   derek.buitenh...@gmail.com
>
>
>
> A Condorcet Internet Voting Service poll named *Repeat vote: GA voters 
> list updates* has been created. You have been designated as a voter by 
> the poll supervisor, Thilo Borgmann (thilo.borgm...@mail.de 
> ).
>
> *Description of poll:* How do we update the list of active members of 
> the General Assembly?
>
> If you would like to vote, please visit the following URL:
>
> 
> https://vote.ffmpeg.org/cgi-bin/civs/vote.pl?id=E_07e9c717f7820201&key=452818415c5dbe91
>  
> 
>
> This is your private URL. Do not give it to anyone else, because they 
> could use it to vote for you.
>
> Your privacy will not be violated by voting. The voting service has 
> already destroyed the record of your email address and will not release 
> any information about whether or how you have voted.
>
> The poll has been announced to end on Sunday 19th of November, 23:59 
> (GMT+1). To view the results of the poll once it has ended, visit:
>
> https://vote.ffmpeg.org/cgi-bin/civs/results.pl?id=E_07e9c717f7820201
>
> For more information about the Condorcet Internet Voting Service, see
>
> http://civs.cs.cornell.edu
>
> . To control future email sent from CIVS, see
>
> https://vote.ffmpeg.org/cgi-bin/civs/mail_mgmt.pl
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [ANNOUNCE] Repeat vote: GA voters list updates

2023-11-12 Thread J. Dekker

On Sun, Nov 12, 2023, at 18:31, Michael Niedermayer wrote:
> On Sun, Nov 12, 2023 at 11:03:21AM -0300, James Almer wrote:
>> On 11/12/2023 10:59 AM, Thilo Borgmann via ffmpeg-devel wrote:
>> > I will also start the repeat vote now and everybody can hold their
>> > horses before going to flamewar. Depending on JB's explanations, he
>> > might still prove that the old vote is valid and this repeat vote
>> > becomes void.
>> 
>> Or you could have waited for his answer before doing the vote, don't you
>> think? Now people got a new vote email that may or may not be valid at all.
>
> I hope this new vote will result in the same winner as the last
> because otherwise we will have more questions and accusations

Multiple people (including myself) have already posted their voting URLs 
publicly
so this 'vote' outcome will be extremely questionable at best.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [ANNOUNCE] Repeat vote: GA voters list updates

2023-11-20 Thread J. Dekker

Derek Buitenhuis  writes:

> On 11/20/2023 3:44 PM, Thilo Borgmann via ffmpeg-devel wrote:
>> the results are available at [1]. As they confirm the just updated GA list 
>> as 
>> well as the update procedure twice a year on Jan 1st & Jul 1st, I think the 
>> upcoming votes (extra GA members, TC/CC elections) can then proceed as 
>> announced.
>
> The vote was not valid and announcing them like this is misleading.
>
> - Derek

Here's a link to the original vote results:

https://vote.ffmpeg.org/cgi-bin/civs/results.pl?id=E_029f7195fed7aadf

Yes, 'redoing' the vote is completely questionable; yes, even I objected
to it personally. And whilst we don't know how reliable the 2nd vote
was, the outcome is still the same as the original vote: this means that
it is possible to both discard the 2nd vote and prefer it (depending on
your perspective).

Some previous mails quoted me on IRC but seemed to omit essential
information, so I will repeat it here on the mailing list. The original
voter list was made in a few minutes by myself and the exact copy was
lost to time; I found different versions looking through my sent mails
and it's all pretty unreliable. Saving the git hash and a date was
something added later.
However, none of this is really relevant since all we really need is a
general consensus to update the GA list using the script--from which we
are able to have a traceable list of voters and stop worrying about
this.

In short: We should be able to move forward now.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] doc/git-howto: use less weird username for git URL

2023-11-22 Thread J. Dekker

Michael Niedermayer  writes:

> Signed-off-by: Michael Niedermayer 
> ---
>  doc/git-howto.texi | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Most people probably use git@ already; it seems to be the more common
practice in general. Both users map to the same id on the server, so it
makes sense to update the documentation to recommend the more
conventional one.

Pushed.

-- 
jd

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] checkasm/hevc_deblock: add luma test

2023-12-05 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 tests/checkasm/hevc_deblock.c | 110 --
 1 file changed, 106 insertions(+), 4 deletions(-)

 Yes, this only supports 8bit. 10/12bit should be trivial, will add if this
 looks reasonable (I checked code paths using gdb, and as far as I can tell it
 does test all three). Tested on known good x86 asm.

diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
index 66fc8d5646..3f970a470a 100644
--- a/tests/checkasm/hevc_deblock.c
+++ b/tests/checkasm/hevc_deblock.c
@@ -29,8 +29,8 @@
 static const uint32_t pixel_mask[3] = { 0x, 0x03ff03ff, 0x0fff0fff };
 
 #define SIZEOF_PIXEL ((bit_depth + 7) / 8)
-#define BUF_STRIDE (8 * 2)
-#define BUF_LINES (8)
+#define BUF_STRIDE (16 * 2)
+#define BUF_LINES (16)
 #define BUF_OFFSET (BUF_STRIDE * BUF_LINES)
 #define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
 
@@ -88,14 +88,116 @@ static void check_deblock_chroma(HEVCDSPContext *h, int 
bit_depth)
 }
 }
 
+// line zero
+#define P3 buf[-4 * xstride]
+#define P2 buf[-3 * xstride]
+#define P1 buf[-2 * xstride]
+#define P0 buf[-1 * xstride]
+#define Q0 buf[0 * xstride]
+#define Q1 buf[1 * xstride]
+#define Q2 buf[2 * xstride]
+#define Q3 buf[3 * xstride]
+
+// line three. used only for deblocking decision
+#define TP3 buf[-4 * xstride + 3 * ystride]
+#define TP2 buf[-3 * xstride + 3 * ystride]
+#define TP1 buf[-2 * xstride + 3 * ystride]
+#define TP0 buf[-1 * xstride + 3 * ystride]
+#define TQ0 buf[0  * xstride + 3 * ystride]
+#define TQ1 buf[1  * xstride + 3 * ystride]
+#define TQ2 buf[2  * xstride + 3 * ystride]
+#define TQ3 buf[3  * xstride + 3 * ystride]
+
+/*
+ * Fill the test area of buf (starting BUF_OFFSET bytes in) with the sample
+ * pattern selected by type:
+ *   0 (strong): 16 lines, offsets -4..-1 set to 64 and offsets 0..3 set to 80
+ *   1 (weak):   16 lines, offsets -4..3 set to 60,60,64,64,74,74,80,80
+ *   2 (none):   16 lines, offsets -8..7 filled from rnd()
+ * Any other type leaves buf untouched.
+ *
+ * Offsets are multiplied by xstride; successive lines are ystride apart.
+ */
+static void randomize_luma_buffers(int type, uint8_t *buf, ptrdiff_t xstride,
+                                   ptrdiff_t ystride)
+{
+    // values for P3, P2, P1, P0, Q0, Q1, Q2, Q3 of the two fixed patterns
+    static const uint8_t fixed[2][8] = {
+        { 64, 64, 64, 64, 80, 80, 80, 80 }, // strong
+        { 60, 60, 64, 64, 74, 74, 80, 80 }, // weak
+    };
+    uint8_t *line = buf + BUF_OFFSET;
+
+    if (type == 0 || type == 1) {
+        for (int row = 0; row < 16; row++, line += ystride)
+            for (int k = 0; k < 8; k++)
+                line[(k - 4) * xstride] = fixed[type][k];
+    } else if (type == 2) { // none
+        // same rnd() call order as a row-major walk: line first, then offset
+        for (int row = 0; row < 16; row++)
+            for (int col = -8; col < 8; col++)
+                line[col * xstride + row * ystride] = rnd();
+    }
+}
+
+/*
+ * Check the horizontal and vertical luma loop filters of h against the
+ * reference implementation for three input classes ("strong", "normal",
+ * "skip"). Each class is run four times on freshly generated buffers and
+ * the complete buffers are compared afterwards, so writes outside the
+ * filtered samples are detected as well.
+ *
+ * bit_depth is only used to build the reported function name.
+ */
+static void check_deblock_luma(HEVCDSPContext *h, int bit_depth)
+{
+    const char *type;
+    const char *types[3] = { "strong", "normal", "skip" };
+    int beta;
+    int32_t tc[2] = { 0, 0 };
+    // no_p, no_q can only be { 0,0 } for the simpler assembly (non *_c
+    // variant) functions, see deblocking_filter_CTB() in hevc_filter.c
+    uint8_t no_p[2] = { 0, 0 };
+    uint8_t no_q[2] = { 0, 0 };
+    LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
+
+    declare_func(void, uint8_t *pix, ptrdiff_t stride, int beta, int32_t *tc,
+                 uint8_t *no_p, uint8_t *no_q);
+
+    for (int j = 0; j < 3; j++) {
+        // beta = 0 is an easy way to turn off filtering. j only takes the
+        // values 0..2, so the "skip" class is j == 2 (a test against 3 would
+        // never fire and would leave filtering enabled for all classes).
+        beta = (j == 2) ? 0 : 32;
+        type = types[j];
+
+        // see betatable[] in hevc_filter.c
+        tc[0] = (rnd() & 63) + (rnd() & 1);
+        tc[1] = (rnd() & 63) + (rnd() & 1);
+
+        if (check_func(h->hevc_h_loop_filter_luma,
+                       "hevc_h_loop_filter_luma%d_%s", bit_depth, type)) {
+            for (int i = 0; i < 4; i++) {
+                randomize_luma_buffers(j, buf0, 16, 1);
+                memcpy(buf1, buf0, BUF_SIZE);
+
+                call_ref(buf0 + BUF_OFFSET, 16, beta, tc, no_p, no_q);
+                call_new(buf1 + BUF_OFFSET, 16, beta, tc, no_p, no_q);
+                if (memcmp(buf0, buf1, BUF_SIZE))
+                    fail();
+            }
+            bench_new(buf1 + BUF_OFFSET, 16, beta, tc, no_p, no_q);
+        }
+
+        if (check_func(h->hevc_v_loop_filter_luma,
+                       "hevc_v_loop_filter_luma%d_%s", bit_depth, type)) {
+            for (int i = 0; i < 4; i++) {
+                randomize_luma_buffers(j, buf0, 1, 16);
+                memcpy(buf1, buf0, BUF_SIZE);
+
+                call_ref(buf0 + BUF_OFFSET, 16, beta, tc, no_p, no_q);
+                call_new(buf1 + BUF_OFFSET, 16, beta, tc, no_p, no_q);
+                if (memcmp(buf0, buf1, BUF_SIZE))
+                    fail();
+            }
+            bench_new(buf1 + BUF_OFFSET, 16, beta, tc, no_p, no_q);
+        }
+    }
+}
+
 void checkasm_check_hevc_deblock(void)
 {
+HEVCDSPContext h;
 int bit_depth;
-
 for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
-HEVCDSPContext h;
 ff_hevc_dsp_init(&h, bit_depth);
 check_deblock_chroma(&h, bit_depth);
 }
 report("chroma");
+ff_hevc_dsp_init(&h, 8);
+check_deblock_luma(&h, 8);
+report("luma");
 }
--

Re: [FFmpeg-devel] [PATCH] doc: mention that for RISC-V, we prefer .S files

2023-12-05 Thread J. Dekker

"Jean-Baptiste Kempf"  writes:

> $subject
>
> See attachment.

Pushed with slight commit rewording for clarity.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2] checkasm/hevc_deblock: add luma and chroma full

2024-01-24 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 tests/checkasm/hevc_deblock.c | 225 +-
 1 file changed, 195 insertions(+), 30 deletions(-)

- added luma 10/12 bit
- supporting full (*_c) luma & chroma functions
- dynamically generating all test data

Appears to work for me. Testing on x86, hits the filtering decisions correctly.
x86 doesn't have the full asm functions though, need to check a platform which
has them (though the difference is minor, not sure why it wouldn't work).

-- 
jd

diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
index 66fc8d5646..dfe7fc8e97 100644
--- a/tests/checkasm/hevc_deblock.c
+++ b/tests/checkasm/hevc_deblock.c
@@ -19,6 +19,7 @@
 #include <string.h>
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/macros.h"
 #include "libavutil/mem_internal.h"
 
 #include "libavcodec/avcodec.h"
@@ -29,10 +30,11 @@
 static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
 
 #define SIZEOF_PIXEL ((bit_depth + 7) / 8)
-#define BUF_STRIDE (8 * 2)
-#define BUF_LINES (8)
-#define BUF_OFFSET (BUF_STRIDE * BUF_LINES)
-#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
+#define BUF_STRIDE (16 * 2)
+#define BUF_LINES (16)
+// large buffer sizes based on high bit depth
+#define BUF_OFFSET (2 * BUF_STRIDE * BUF_LINES)
+#define BUF_SIZE (2 * BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
 
 #define randomize_buffers(buf0, buf1, size) \
 do {\
@@ -45,57 +47,220 @@ static const uint32_t pixel_mask[3] = { 0xffffffff, 
0x03ff03ff, 0x0fff0fff };
 }   \
 } while (0)
 
-static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth)
+static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth, int c)
 {
-int32_t tc[2] = { 0, 0 };
+// see tctable[] in hevc_filter.c, we check full range
+int32_t tc[2] = { rnd() % 25, rnd() % 25 };
 // no_p, no_q can only be { 0,0 } for the simpler assembly (non *_c
 // variant) functions, see deblocking_filter_CTB() in hevc_filter.c
-uint8_t no_p[2] = { 0, 0 };
-uint8_t no_q[2] = { 0, 0 };
+uint8_t no_p[2] = { rnd() & c, rnd() & c };
+uint8_t no_q[2] = { rnd() & c, rnd() & c };
 LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
 LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
 
 declare_func(void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t 
*no_p, uint8_t *no_q);
 
-if (check_func(h->hevc_h_loop_filter_chroma, 
"hevc_h_loop_filter_chroma%d", bit_depth)) {
-for (int i = 0; i < 4; i++) {
-randomize_buffers(buf0, buf1, BUF_SIZE);
-// see betatable[] in hevc_filter.c
-tc[0] = (rnd() & 63) + (rnd() & 1);
-tc[1] = (rnd() & 63) + (rnd() & 1);
+if (check_func(c ? h->hevc_h_loop_filter_chroma_c :
+   h->hevc_h_loop_filter_chroma, 
"hevc_h_loop_filter_chroma%d%s", bit_depth, c ? "_full" : "")) {
+randomize_buffers(buf0, buf1, BUF_SIZE);
 
-call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
-call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+if (memcmp(buf0, buf1, BUF_SIZE))
+fail();
+bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+}
+
+if (check_func(c ? h->hevc_v_loop_filter_chroma_c :
+   h->hevc_v_loop_filter_chroma, 
"hevc_v_loop_filter_chroma%d%s", bit_depth, c ? "_full" : "")) {
+randomize_buffers(buf0, buf1, BUF_SIZE);
+
+call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+if (memcmp(buf0, buf1, BUF_SIZE))
+fail();
+bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+}
+}
+
+#define P3 buf[-4 * xstride]
+#define P2 buf[-3 * xstride]
+#define P1 buf[-2 * xstride]
+#define P0 buf[-1 * xstride]
+#define Q0 buf[0 * xstride]
+#define Q1 buf[1 * xstride]
+#define Q2 buf[2 * xstride]
+#define Q3 buf[3 * xstride]
+
+#define EQU(x, y) do { \
+uint16_t z = (uint16_t)(y & ((1 << (bit_depth)) - 1)); \
+if (SIZEOF_PIXEL == 1) { \
+*(uint8_t*)(&x) = (uint8_t)z; \
+} else if (SIZEOF_PIXEL == 2) { \
+*(uint16_t*)(&x) = z; \
+} \
+} while(0)
+
+#define RNDDIFF(val, diff) av_clip(((SIZEOF_PIXEL == 1) ? \
+*(uint8_t*)(&val) : *(uint16_t*)(&val)) - (diff), 0, \
+(1 << (bit_depth)) - 1) + rnd() % FFMAX(2 * (diff), 1)
+
+#define TC25(x) ((tc[x] * 5 + 1) >> 1);
+
+static void randomize_luma_buffers(int type, int *beta, int32_t tc[2], uint8_t 
*buf, ptrdiff_

[FFmpeg-devel] [PATCH 0/2] Remove SDL2 output devices

2024-02-04 Thread J. Dekker

With the addition of threading in ffmpeg.c, the SDL2 devices no longer have the
'main' thread. This means that both the SDL2 and OpenGL output devices are broken
in master. Rather than attempting to fix it, they should be removed instead as
there are better alternatives for debugging or viewing streams.

The 'pipe:' output can be used with a real video player such as mpv, vlc, or
even ffplay. For cases where the user was an application using the API they
should supply their own renderer.

J. Dekker (2):
  avdevice: remove sdl2 outdev
  avdevice: remove OpenGL device

 MAINTAINERS  |3 -
 configure|   16 -
 doc/outdevs.texi |  105 ---
 libavdevice/Makefile |2 -
 libavdevice/alldevices.c |2 -
 libavdevice/opengl_enc.c | 1313 --
 libavdevice/sdl2.c   |  370 ---
 7 files changed, 1811 deletions(-)
 delete mode 100644 libavdevice/opengl_enc.c
 delete mode 100644 libavdevice/sdl2.c

-- 
2.43.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] avdevice: remove sdl2 outdev

2024-02-04 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 MAINTAINERS  |   2 -
 configure|   3 +-
 doc/outdevs.texi |  66 ---
 libavdevice/Makefile |   1 -
 libavdevice/alldevices.c |   1 -
 libavdevice/sdl2.c   | 370 ---
 6 files changed, 1 insertion(+), 442 deletions(-)
 delete mode 100644 libavdevice/sdl2.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 4677931211..baead5d270 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -292,8 +292,6 @@ libavdevice
   libdc1394.c   Roman Shaposhnik
   opengl_enc.c  Lukasz Marek
   pulse_audio_enc.c Lukasz Marek
-  sdl   Stefano Sabatini
-  sdl2.cJosh de Kock
   v4l2.cGiorgio Vazzana
   vfwcap.c  Ramiro Polla
   xv.c  Lukasz Marek
diff --git a/configure b/configure
index 68f675a4bc..c4eebab14f 100755
--- a/configure
+++ b/configure
@@ -3635,7 +3635,6 @@ oss_indev_deps_any="sys_soundcard_h"
 oss_outdev_deps_any="sys_soundcard_h"
 pulse_indev_deps="libpulse"
 pulse_outdev_deps="libpulse"
-sdl2_outdev_deps="sdl2"
 sndio_indev_deps="sndio"
 sndio_outdev_deps="sndio"
 v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
@@ -7710,7 +7709,7 @@ enabled zoompan_filter  && prepend avfilter_deps 
"swscale"
 enabled lavfi_indev && prepend avdevice_deps "avfilter"
 
 #FIXME
-enabled_any sdl2_outdev opengl_outdev && enabled sdl2 &&
+enabled_any opengl_outdev && enabled sdl2 &&
 add_cflags $(filter_out '-Dmain=SDL_main' $sdl2_cflags)
 
 enabled opus_decoder&& prepend avcodec_deps "swresample"
diff --git a/doc/outdevs.texi b/doc/outdevs.texi
index f0484bbf8f..d3b9199463 100644
--- a/doc/outdevs.texi
+++ b/doc/outdevs.texi
@@ -406,72 +406,6 @@ Play a file on default device on default server:
 ffmpeg  -i INPUT -f pulse "stream name"
 @end example
 
-@section sdl
-
-SDL (Simple DirectMedia Layer) output device.
-
-"sdl2" can be used as alias for "sdl".
-
-This output device allows one to show a video stream in an SDL
-window. Only one SDL window is allowed per application, so you can
-have only one instance of this output device in an application.
-
-To enable this output device you need libsdl installed on your system
-when configuring your build.
-
-For more information about SDL, check:
-@url{http://www.libsdl.org/}
-
-@subsection Options
-
-@table @option
-
-@item window_borderless
-Set SDL window border off.
-Default value is 0 (enable window border).
-
-@item window_enable_quit
-Enable quit action (using window button or keyboard key)
-when non-zero value is provided.
-Default value is 1 (enable quit action).
-
-@item window_fullscreen
-Set fullscreen mode when non-zero value is provided.
-Default value is zero.
-
-@item window_size
-Set the SDL window size, can be a string of the form
-@var{width}x@var{height} or a video size abbreviation.
-If not specified it defaults to the size of the input video,
-downscaled according to the aspect ratio.
-
-@item window_title
-Set the SDL window title, if not specified default to the filename
-specified for the output device.
-
-@item window_x
-@item window_y
-Set the position of the window on the screen.
-@end table
-
-@subsection Interactive commands
-
-The window created by the device can be controlled through the
-following interactive commands.
-
-@table @key
-@item q, ESC
-Quit the device immediately.
-@end table
-
-@subsection Examples
-
-The following command shows the @command{ffmpeg} output is an
-SDL window, forcing its size to the qcif format:
-@example
-ffmpeg -i INPUT -c:v rawvideo -pix_fmt yuv420p -window_size qcif -f sdl "SDL 
output"
-@end example
-
 @section sndio
 
 sndio audio output device.
diff --git a/libavdevice/Makefile b/libavdevice/Makefile
index c30449201d..26b2339ae1 100644
--- a/libavdevice/Makefile
+++ b/libavdevice/Makefile
@@ -42,7 +42,6 @@ OBJS-$(CONFIG_PULSE_INDEV)   += pulse_audio_dec.o 
\
 pulse_audio_common.o timefilter.o
 OBJS-$(CONFIG_PULSE_OUTDEV)  += pulse_audio_enc.o \
 pulse_audio_common.o
-OBJS-$(CONFIG_SDL2_OUTDEV)   += sdl2.o
 OBJS-$(CONFIG_SNDIO_INDEV)   += sndio_dec.o sndio.o
 OBJS-$(CONFIG_SNDIO_OUTDEV)  += sndio_enc.o sndio.o
 OBJS-$(CONFIG_V4L2_INDEV)+= v4l2.o v4l2-common.o timefilter.o
diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c
index 8a90fcb5d7..9215be7214 100644
--- a/libavdevice/alldevices.c
+++ b/libavdevice/alldevices.c
@@ -46,7 +46,6 @@ extern const AVInputFormat  ff_oss_demuxer;
 extern

[FFmpeg-devel] [PATCH 2/2] avdevice: remove OpenGL device

2024-02-04 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 MAINTAINERS  |1 -
 configure|   15 -
 doc/outdevs.texi |   39 --
 libavdevice/Makefile |1 -
 libavdevice/alldevices.c |1 -
 libavdevice/opengl_enc.c | 1313 --
 6 files changed, 1370 deletions(-)
 delete mode 100644 libavdevice/opengl_enc.c

diff --git a/MAINTAINERS b/MAINTAINERS
index baead5d270..c816be66ed 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -290,7 +290,6 @@ libavdevice
   iec61883.cGeorg Lippitsch
   lavfi Stefano Sabatini
   libdc1394.c   Roman Shaposhnik
-  opengl_enc.c  Lukasz Marek
   pulse_audio_enc.c Lukasz Marek
   v4l2.cGiorgio Vazzana
   vfwcap.c  Ramiro Polla
diff --git a/configure b/configure
index c4eebab14f..d97b9eedcc 100755
--- a/configure
+++ b/configure
@@ -317,7 +317,6 @@ External library support:
   --enable-libmysofa   enable libmysofa, needed for sofalizer filter [no]
   --enable-openal  enable OpenAL 1.1 capture support [no]
   --enable-opencl  enable OpenCL processing [no]
-  --enable-opengl  enable OpenGL rendering [no]
   --enable-openssl enable openssl, needed for https support
if gnutls, libtls or mbedtls is not used [no]
   --enable-pocketsphinxenable PocketSphinx, needed for asr filter [no]
@@ -1917,7 +1916,6 @@ EXTERNAL_LIBRARY_LIST="
 lv2
 mediacodec
 openal
-opengl
 openssl
 pocketsphinx
 vapoursynth
@@ -2237,7 +2235,6 @@ HEADERS_LIST="
 machine_ioctl_meteor_h
 malloc_h
 opencv2_core_core_c_h
-OpenGL_gl3_h
 poll_h
 sys_param_h
 sys_resource_h
@@ -3629,8 +3626,6 @@ lavfi_indev_deps="avfilter"
 libcdio_indev_deps="libcdio"
 libdc1394_indev_deps="libdc1394"
 openal_indev_deps="openal"
-opengl_outdev_deps="opengl"
-opengl_outdev_suggest="sdl2"
 oss_indev_deps_any="sys_soundcard_h"
 oss_outdev_deps_any="sys_soundcard_h"
 pulse_indev_deps="libpulse"
@@ -6952,12 +6947,6 @@ enabled opencl&& { check_pkg_config opencl 
OpenCL CL/cl.h clEnqueueN
  { test_cpp_condition "OpenCL/cl.h" 
"defined(CL_VERSION_1_2)" ||
test_cpp_condition "CL/cl.h" 
"defined(CL_VERSION_1_2)" ||
die "ERROR: opencl must be installed and 
version must be 1.2 or compatible"; }
-enabled opengl&& { check_lib opengl GL/glx.h glXGetProcAddress 
"-lGL" ||
-   check_lib opengl windows.h wglGetProcAddress 
"-lopengl32 -lgdi32" ||
-   check_lib opengl OpenGL/gl3.h glGetError 
"-Wl,-framework,OpenGL" ||
-   check_lib opengl ES2/gl.h glGetError 
"-isysroot=${sysroot} -Wl,-framework,OpenGLES" ||
-   die "ERROR: opengl not found."
- }
 enabled omx_rpi   && { test_code cc OMX_Core.h 
OMX_IndexConfigBrcmVideoRequestIFrame ||
{ ! enabled cross_compile &&
  add_cflags -isystem/opt/vc/include/IL &&
@@ -7708,10 +7697,6 @@ enabled zoompan_filter  && prepend avfilter_deps 
"swscale"
 
 enabled lavfi_indev && prepend avdevice_deps "avfilter"
 
-#FIXME
-enabled_any opengl_outdev && enabled sdl2 &&
-add_cflags $(filter_out '-Dmain=SDL_main' $sdl2_cflags)
-
 enabled opus_decoder&& prepend avcodec_deps "swresample"
 
 # reorder the items at var $1 to align with the items order at var $2 .
diff --git a/doc/outdevs.texi b/doc/outdevs.texi
index d3b9199463..86c78f31b7 100644
--- a/doc/outdevs.texi
+++ b/doc/outdevs.texi
@@ -301,45 +301,6 @@ ffmpeg -re -i INPUT -c:v rawvideo -pix_fmt bgra -f fbdev 
/dev/fb0
 
 See also @url{http://linux-fbdev.sourceforge.net/}, and fbset(1).
 
-@section opengl
-OpenGL output device.
-
-To enable this output device you need to configure FFmpeg with 
@code{--enable-opengl}.
-
-This output device allows one to render to OpenGL context.
-Context may be provided by application or default SDL window is created.
-
-When device renders to external context, application must implement handlers 
for following messages:
-@code{AV_DEV_TO_APP_CREATE_WINDOW_BUFFER} - create OpenGL context on current 
thread.
-@code{AV_DEV_TO_APP_PREPARE_WINDOW_BUFFER} - make OpenGL context current.
-@code{AV_DEV_TO_APP_DISPLAY_WINDOW_BUFFER} - swap buffers.
-@code{AV_DEV_TO_APP_DESTROY_WINDOW_BUFFER} - destroy OpenGL context.
-Application is also req

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-02-04 Thread J. Dekker



On Sun, Feb 4, 2024, at 10:49, Rémi Denis-Courmont wrote:
> Hi,
>
> I don't believe it is appropriate to hold the vote before Derek's 
> question is addressed.
>
> We don't really know what we're voting on here.
>
> Le 1 février 2024 20:22:14 GMT+01:00, Derek Buitenhuis 
>  a écrit :
>>On 1/31/2024 9:44 PM, Derek Buitenhuis wrote:
>>> On 1/30/2024 1:48 AM, Michael Niedermayer wrote:
 https://trac.ffmpeg.org/wiki/SponsoringPrograms/STF/2024
>>> 
>>> Not to derail this fine thread, but what forks does the Merge Forks
>>> project refer to?
>>
>>I do not believe this has been answered.
>>
>>- Derek


The vote is unclear to me, and it was also never explained who ‘the same person 
as before’ is; there has been no reply or answer to this either. I hope Michael 
can clear this up.

- jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] configure,etc: unify shebang usage

2024-03-28 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 configure | 3 ++-
 doc/texidep.pl| 2 +-
 ffbuild/libversion.sh | 1 +
 tests/fate-run.sh | 2 +-
 tests/fate.sh | 2 +-
 5 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/configure b/configure
index 2d46ef0b9c..d332f18e3d 100755
--- a/configure
+++ b/configure
@@ -4725,7 +4725,7 @@ chmod +x $TMPE
 
 # make sure we can execute files in $TMPDIR
 cat > $TMPSH 2>> $logfile <> $logfile 2>&1
 if ! $TMPSH >> $logfile 2>&1; then
@@ -8270,6 +8270,7 @@ print_enabled_components libavformat/protocol_list.c 
URLProtocol url_protocols $
 # Settings for pkg-config files
 
 cat > $TMPH <  
diff --git a/ffbuild/libversion.sh b/ffbuild/libversion.sh
index a94ab58057..ecaa90cde6 100755
--- a/ffbuild/libversion.sh
+++ b/ffbuild/libversion.sh
@@ -1,3 +1,4 @@
+#!/bin/sh
 toupper(){
 echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
 }
diff --git a/tests/fate-run.sh b/tests/fate-run.sh
index 9863e4f2d9..6ae0320c60 100755
--- a/tests/fate-run.sh
+++ b/tests/fate-run.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#!/bin/sh
 
 export LC_ALL=C
 
diff --git a/tests/fate.sh b/tests/fate.sh
index 07908be3a5..c5ee18de80 100755
--- a/tests/fate.sh
+++ b/tests/fate.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#!/bin/sh
 
 config=$1
 
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] configure: simplify bigendian check

2024-03-28 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 configure | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/configure b/configure
index d332f18e3d..0ff1ff0335 100755
--- a/configure
+++ b/configure
@@ -6108,11 +6108,7 @@ extern_prefix=${sym%%ff_extern*}
 
 check_cc pragma_deprecated "" '_Pragma("GCC diagnostic push") _Pragma("GCC 
diagnostic ignored \"-Wdeprecated-declarations\"")'
 
-# The global variable ensures the bits appear unchanged in the object file.
-test_cc <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 1/2] configure,etc: unify shebang usage

2024-04-08 Thread J. Dekker

In some cases, these scripts can be called directly by packagers, and
some systems require the interpreter to be explicit.

Signed-off-by: J. Dekker 
---
 configure | 3 ++-
 doc/texidep.pl| 2 +-
 ffbuild/libversion.sh | 1 +
 tests/fate-run.sh | 2 +-
 tests/fate.sh | 2 +-
 5 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/configure b/configure
index f511fbae49..7b6f48e631 100755
--- a/configure
+++ b/configure
@@ -4737,7 +4737,7 @@ chmod +x $TMPE
 
 # make sure we can execute files in $TMPDIR
 cat > $TMPSH 2>> $logfile <> $logfile 2>&1
 if ! $TMPSH >> $logfile 2>&1; then
@@ -8283,6 +8283,7 @@ print_enabled_components libavformat/protocol_list.c 
URLProtocol url_protocols $
 # Settings for pkg-config files
 
 cat > $TMPH <  
diff --git a/ffbuild/libversion.sh b/ffbuild/libversion.sh
index a94ab58057..ecaa90cde6 100755
--- a/ffbuild/libversion.sh
+++ b/ffbuild/libversion.sh
@@ -1,3 +1,4 @@
+#!/bin/sh
 toupper(){
 echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
 }
diff --git a/tests/fate-run.sh b/tests/fate-run.sh
index 9863e4f2d9..6ae0320c60 100755
--- a/tests/fate-run.sh
+++ b/tests/fate-run.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#!/bin/sh
 
 export LC_ALL=C
 
diff --git a/tests/fate.sh b/tests/fate.sh
index 07908be3a5..c5ee18de80 100755
--- a/tests/fate.sh
+++ b/tests/fate.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#!/bin/sh
 
 config=$1
 
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 2/2] configure: simplify bigendian check

2024-04-08 Thread J. Dekker

The preferred way to use LTO is --enable-lto but often times packagers
still end up with -flto in cflags for various reasons. Using grep
on binary object files is brittle and relies on specific object
representation, which in the case of LLVM bitcode, debug-info or other
intermediary formats can fail silently.

This patch changes the check to a more commonly used define for
big-endian systems. More checks may need to be added in the future to
cover legacy machines.

Signed-off-by: J. Dekker 
---
 configure | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/configure b/configure
index 7b6f48e631..e787f13e0b 100755
--- a/configure
+++ b/configure
@@ -6120,11 +6120,7 @@ extern_prefix=${sym%%ff_extern*}
 
 check_cc pragma_deprecated "" '_Pragma("GCC diagnostic push") _Pragma("GCC 
diagnostic ignored \"-Wdeprecated-declarations\"")'
 
-# The global variable ensures the bits appear unchanged in the object file.
-test_cc <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3 1/5] configure: simplify bigendian check

2024-04-09 Thread J. Dekker

The preferred way to use LTO is --enable-lto but often times packagers
still end up with -flto in cflags for various reasons. Using grep
on binary object files is brittle and relies on specific object
representation, which in the case of LLVM bitcode, debug information or
other intermediary formats can fail silently.

This patch changes the check to a more commonly used define for GCC
style compilers. More checks may be needed to cover other potential
compilers that don't use the __BYTE_ORDER__ define.

Signed-off-by: J. Dekker 
---
 configure | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/configure b/configure
index f511fbae49..7c22772485 100755
--- a/configure
+++ b/configure
@@ -6120,11 +6120,7 @@ extern_prefix=${sym%%ff_extern*}
 
 check_cc pragma_deprecated "" '_Pragma("GCC diagnostic push") _Pragma("GCC 
diagnostic ignored \"-Wdeprecated-declarations\"")'
 
-# The global variable ensures the bits appear unchanged in the object file.
-test_cc <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3 2/5] ffbuild/libversion.sh: add shebang

2024-04-09 Thread J. Dekker

The implicit interpreter is dependent on the environment, and isn't
guaranteed to be /bin/sh. Some packagers call this script directly, and
in certain environments such as containers using qemu-user through
binfmt_misc emulation on Linux it doesn't fall back to /bin/sh.

To fix these cases we add the interpreter explicitly.

Signed-off-by: J. Dekker 
---
 ffbuild/libversion.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ffbuild/libversion.sh b/ffbuild/libversion.sh
index a94ab58057..ecaa90cde6 100755
--- a/ffbuild/libversion.sh
+++ b/ffbuild/libversion.sh
@@ -1,3 +1,4 @@
+#!/bin/sh
 toupper(){
 echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3 3/5] configure: switch to shebang without space

2024-04-09 Thread J. Dekker

Note that the config.sh file is left without a shebang, this file is
supposed to be sourced into the current environment.

This commit is purely cosmetic.

Signed-off-by: J. Dekker 
---
 configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index 7c22772485..55f1fc354d 100755
--- a/configure
+++ b/configure
@@ -4737,7 +4737,7 @@ chmod +x $TMPE
 
 # make sure we can execute files in $TMPDIR
 cat > $TMPSH 2>> $logfile <> $logfile 2>&1
 if ! $TMPSH >> $logfile 2>&1; then
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3 4/5] tests/fate.sh: switch to shebang without space

2024-04-09 Thread J. Dekker

This commit is purely cosmetic.

Signed-off-by: J. Dekker 
---
 tests/fate-run.sh | 2 +-
 tests/fate.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/fate-run.sh b/tests/fate-run.sh
index 9863e4f2d9..6ae0320c60 100755
--- a/tests/fate-run.sh
+++ b/tests/fate-run.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#!/bin/sh
 
 export LC_ALL=C
 
diff --git a/tests/fate.sh b/tests/fate.sh
index 07908be3a5..c5ee18de80 100755
--- a/tests/fate.sh
+++ b/tests/fate.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#!/bin/sh
 
 config=$1
 
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v3 5/5] doc/texidep: switch to shebang without space

2024-04-09 Thread J. Dekker

This commit is purely cosmetic.

Signed-off-by: J. Dekker 
---
 doc/texidep.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/texidep.pl b/doc/texidep.pl
index 099690378e..33e6c7c53e 100644
--- a/doc/texidep.pl
+++ b/doc/texidep.pl
@@ -1,4 +1,4 @@
-#! /usr/bin/env perl
+#!/usr/bin/env perl
 
 # This script will print the dependency of a Texinfo file to stdout.
 # texidep.pl   
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 3/5] configure: switch to shebang without space

2024-04-09 Thread J. Dekker


Martin Storsjö  writes:

> On Tue, 9 Apr 2024, J. Dekker wrote:
>
>> Note that the config.sh file is left without a shebang, this file is
>> supposed to be sourced into the current environment.
>>
>> This commit is purely cosmetic.
>>
>> Signed-off-by: J. Dekker 
>> ---
>> configure | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> Thanks, this set seems fine to me - the explanations seem good now. (I'd
> consider merging patches 3-5 though, but keeping the full commit message from
> patch 3.)
>

Thanks for the review, pushed with 3-5 squashed.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] tests/checkasm: add exclude_guest for non-x86 linux perf

2024-04-09 Thread J. Dekker

exclude_guest is currently only supported on x86. However, not
specifying 'exclude_guest' implies that you can count guest events
should you run one. This creates an ABI issue whereby some non-x86
kernels require specifying exclude_guest = 1 explicitly.

Signed-off-by: J. Dekker 
---
 tests/checkasm/checkasm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index dcd2fd6957..8be6cb0f55 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -742,6 +742,9 @@ static int bench_init_linux(void)
 .disabled   = 1, // start counting only on demand
 .exclude_kernel = 1,
 .exclude_hv = 1,
+#if !ARCH_X86
+.exclude_guest  = 1,
+#endif
 };
 
 printf("benchmarking with Linux Perf Monitoring API\n");
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] aarch64: ac3dsp: Simplify the end of ff_ac3_sum_square_butterfly_float_neon

2024-04-09 Thread J. Dekker


Martin Storsjö  writes:

> Before:   Cortex A53 A72 A78
> ac3_sum_square_bufferfly_float_neon:  1005.7   516.5   224.5
> After:
> ac3_sum_square_bufferfly_float_neon:   981.7   504.5   223.2
> ---
>  libavcodec/aarch64/ac3dsp_neon.S | 16 
>  1 file changed, 4 insertions(+), 12 deletions(-)
>
> diff --git a/libavcodec/aarch64/ac3dsp_neon.S 
> b/libavcodec/aarch64/ac3dsp_neon.S
> index 20beb6cc50..7e97cc39f7 100644
> --- a/libavcodec/aarch64/ac3dsp_neon.S
> +++ b/libavcodec/aarch64/ac3dsp_neon.S
> @@ -103,17 +103,9 @@ function ff_ac3_sum_square_butterfly_float_neon, export=1
>  fmlav3.4s, v17.4s, v17.4s
>  subsw3, w3, #4
>  b.gt1b
> -faddp   v0.4s, v0.4s, v0.4s
> -faddp   v0.2s, v0.2s, v0.2s
> -st1 {v0.s}[0], [x0], #4
> -faddp   v1.4s, v1.4s, v1.4s
> -faddp   v1.2s, v1.2s, v1.2s
> -st1 {v1.s}[0], [x0], #4
> -faddp   v2.4s, v2.4s, v2.4s
> -faddp   v2.2s, v2.2s, v2.2s
> -st1 {v2.s}[0], [x0], #4
> -faddp   v3.4s, v3.4s, v3.4s
> -faddp   v3.2s, v3.2s, v3.2s
> -st1 {v3.s}[0], [x0]
> +faddp   v0.4s, v0.4s, v1.4s
> +faddp   v2.4s, v2.4s, v3.4s
> +faddp   v0.4s, v0.4s, v2.4s
> +st1 {v0.4s}, [x0]
>  ret
>  endfunc

Thanks, LGTM. Pushed with M1 benchmark on Linux.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2] tests/checkasm: add exclude_guest for non-x86 linux perf

2024-04-10 Thread J. Dekker

The exclude_guest option only has an effect on x86. Omitting
'exclude_guest' defaults to zero which implies that you can count guest
events should you run one. Some non-x86 kernels just ignore it, while
others (e.g. the Asahi Linux kernels) require the user to explicitly set
the option to 1, i.e. the only behaviour that makes sense when counting
guest events isn't supported.

Signed-off-by: J. Dekker 
---

 Made commit message clearer, no functional change since v1.

 tests/checkasm/checkasm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index dcd2fd6957..8be6cb0f55 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -742,6 +742,9 @@ static int bench_init_linux(void)
 .disabled   = 1, // start counting only on demand
 .exclude_kernel = 1,
 .exclude_hv = 1,
+#if !ARCH_X86
+.exclude_guest  = 1,
+#endif
 };
 
 printf("benchmarking with Linux Perf Monitoring API\n");
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2] tests/checkasm: add exclude_guest for non-x86 linux perf

2024-04-10 Thread J. Dekker


Martin Storsjö  writes:

> On Wed, 10 Apr 2024, J. Dekker wrote:
>
>> The exclude_guest option only has an effect on x86. Omitting
>> 'exclude_guest' defaults to zero which implies that you can count guest
>> events should you run one. Some non-x86 kernels just ignore it, while
>> others (e.g. the Asahi Linux kernels) require the user to explicitly set
>> the option to 1, i.e. the only behaviour that makes sense when counting
>> guest events isn't supported.
>>
>> Signed-off-by: J. Dekker 
>> ---
>>
>> Made commit message clearer, no functional change since v1.
>>
>> tests/checkasm/checkasm.c | 3 +++
>> 1 file changed, 3 insertions(+)
>>
>> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
>> index dcd2fd6957..8be6cb0f55 100644
>> --- a/tests/checkasm/checkasm.c
>> +++ b/tests/checkasm/checkasm.c
>> @@ -742,6 +742,9 @@ static int bench_init_linux(void)
>> .disabled   = 1, // start counting only on demand
>> .exclude_kernel = 1,
>> .exclude_hv = 1,
>> +#if !ARCH_X86
>> +.exclude_guest  = 1,
>> +#endif
>> };
>>
>> printf("benchmarking with Linux Perf Monitoring API\n");
>> -- 2.44.0
>
> Thanks, the updated commit message feels more readable to me at least.
>
> I'm not familiar with the perf API, but I tested perf on an aarch64 machine
> where perf benchmarking previously worked, and it still works after this
> change, so it seems ok.

Thanks, pushed.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] avfilter/riscv: build afir only if required

2024-04-24 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 libavfilter/riscv/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
index 0b968a9c0d..277dde2aed 100644
--- a/libavfilter/riscv/Makefile
+++ b/libavfilter/riscv/Makefile
@@ -1,2 +1,2 @@
-OBJS += riscv/af_afir_init.o
-RVV-OBJS += riscv/af_afir_rvv.o
+OBJS-$(CONFIG_AFIR_FILTER)   += riscv/af_afir_init.o
+RVV-OBJS-$(CONFIG_AFIR_FILTER)   += riscv/af_afir_rvv.o
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] checkasm/h264dsp: support checking more idct depths

2024-04-24 Thread J. Dekker

Signed-off-by: J. Dekker 
---
 tests/checkasm/h264dsp.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c
index 0f484e3f43..5cb646ae49 100644
--- a/tests/checkasm/h264dsp.c
+++ b/tests/checkasm/h264dsp.c
@@ -173,6 +173,7 @@ static void dct8x8(int16_t *coef, int bit_depth)
 
 static void check_idct(void)
 {
+static const int depths[5] = { 8, 9, 10, 12, 14 };
 LOCAL_ALIGNED_16(uint8_t, src,  [8 * 8 * 2]);
 LOCAL_ALIGNED_16(uint8_t, dst,  [8 * 8 * 2]);
 LOCAL_ALIGNED_16(uint8_t, dst0, [8 * 8 * 2]);
@@ -181,10 +182,11 @@ static void check_idct(void)
 LOCAL_ALIGNED_16(int16_t, subcoef0, [8 * 8 * 2]);
 LOCAL_ALIGNED_16(int16_t, subcoef1, [8 * 8 * 2]);
 H264DSPContext h;
-int bit_depth, sz, align, dc;
+int bit_depth, sz, align, dc, i;
 declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *block, int 
stride);
 
-for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
+for (i = 0; i < 5; i++) {
+bit_depth = depths[i];
 ff_h264dsp_init(&h, bit_depth, 1);
 for (sz = 4; sz <= 8; sz += 4) {
 randomize_buffers();
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

1 2 >

1 - 100 of 164 matches

Mail list logo