Re: [FFmpeg-devel] [PATCH 10/11] avcodec/blockdsp: add AVX-512 version of clear_block(s)

2017-11-10 Thread James Almer
On 11/10/2017 10:28 AM, James Darnley wrote:
> On 2017-11-09 20:35, Martin Vignali wrote:
>> 2017-11-09 12:58 GMT+01:00 James Darnley :
>>
>>> From: James Darnley 
>>>
>>> Also adjust alignment requirements where nessecary.
>>> ---
>>> Whether this patch is committed or not the change to 4xm.c should be
>>> picked to
>>> master because the alignment is wrong for the AVX version of this
>>> function.  I
>>> assume it hasn't been noticed yet because it manages to be 32-byte aligned
>>> without intervention.
>>>
>>>
>> Thanks for fixing, the 4xm, i miss it in the avx patch
>>
>> Just by curiosity : can you post the checkasm result (i can't test AVX512) ?
> 
> I certainly can.
> 
>> $ ./tests/checkasm/checkasm --bench --test=blockdsp
>> benchmarking with native FFmpeg timers
>> nop: 26.0
>> checkasm: using random seed 402373647
>> MMX:
>>  - blockdsp.blockdsp [OK]
>> SSE:
>>  - blockdsp.blockdsp [OK]
>> AVX:
>>  - blockdsp.blockdsp [OK]
>> AVX-512:
>>  - blockdsp.blockdsp [OK]
>> checkasm: all 8 tests passed
>> blockdsp.clear_block_c: 23.5
>> blockdsp.clear_block_mmx: 11.5
>> blockdsp.clear_block_sse: 5.5
>> blockdsp.clear_block_avx: 3.0
>> blockdsp.clear_block_avx512: 5.0

This sounds like it's not worth adding.

>> blockdsp.clear_blocks_c: 48.0
>> blockdsp.clear_blocks_mmx: 77.0
>> blockdsp.clear_blocks_sse: 38.0
>> blockdsp.clear_blocks_avx: 18.5
>> blockdsp.clear_blocks_avx512: 11.0

This one is better, but a perf run to check how much CPU time is spent
in this function is needed, because I'm not sure it's important enough
to justify having the CPU throttled just to run avx512 code...
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 10/11] avcodec/blockdsp: add AVX-512 version of clear_block(s)

2017-11-10 Thread James Darnley
On 2017-11-09 20:35, Martin Vignali wrote:
> 2017-11-09 12:58 GMT+01:00 James Darnley :
> 
>> From: James Darnley 
>>
>> Also adjust alignment requirements where nessecary.
>> ---
>> Whether this patch is committed or not the change to 4xm.c should be
>> picked to
>> master because the alignment is wrong for the AVX version of this
>> function.  I
>> assume it hasn't been noticed yet because it manages to be 32-byte aligned
>> without intervention.
>>
>>
> Thanks for fixing, the 4xm, i miss it in the avx patch
> 
> Just by curiosity : can you post the checkasm result (i can't test AVX512) ?

I certainly can.

> $ ./tests/checkasm/checkasm --bench --test=blockdsp
> benchmarking with native FFmpeg timers
> nop: 26.0
> checkasm: using random seed 402373647
> MMX:
>  - blockdsp.blockdsp [OK]
> SSE:
>  - blockdsp.blockdsp [OK]
> AVX:
>  - blockdsp.blockdsp [OK]
> AVX-512:
>  - blockdsp.blockdsp [OK]
> checkasm: all 8 tests passed
> blockdsp.clear_block_c: 23.5
> blockdsp.clear_block_mmx: 11.5
> blockdsp.clear_block_sse: 5.5
> blockdsp.clear_block_avx: 3.0
> blockdsp.clear_block_avx512: 5.0
> blockdsp.clear_blocks_c: 48.0
> blockdsp.clear_blocks_mmx: 77.0
> blockdsp.clear_blocks_sse: 38.0
> blockdsp.clear_blocks_avx: 18.5
> blockdsp.clear_blocks_avx512: 11.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 10/11] avcodec/blockdsp: add AVX-512 version of clear_block(s)

2017-11-09 Thread Martin Vignali
2017-11-09 12:58 GMT+01:00 James Darnley :

> From: James Darnley 
>
> Also adjust alignment requirements where nessecary.
> ---
> Whether this patch is committed or not the change to 4xm.c should be
> picked to
> master because the alignment is wrong for the AVX version of this
> function.  I
> assume it hasn't been noticed yet because it manages to be 32-byte aligned
> without intervention.
>
>
Thanks for fixing, the 4xm, i miss it in the avx patch

Just by curiosity : can you post the checkasm result (i can't test AVX512) ?

lgtm

Martin
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 10/11] avcodec/blockdsp: add AVX-512 version of clear_block(s)

2017-11-09 Thread James Darnley
From: James Darnley 

Also adjust alignment requirements where nessecary.
---
Whether this patch is committed or not the change to 4xm.c should be picked to
master because the alignment is wrong for the AVX version of this function.  I
assume it hasn't been noticed yet because it manages to be 32-byte aligned
without intervention.

 libavcodec/4xm.c   |  2 +-
 libavcodec/asv.h   |  3 ++-
 libavcodec/bink.c  |  4 ++--
 libavcodec/dnxhdenc.h  |  2 +-
 libavcodec/eamad.c |  2 +-
 libavcodec/eatqi.c |  2 +-
 libavcodec/g2meet.c|  2 +-
 libavcodec/ituh263dec.c|  2 +-
 libavcodec/mdec.c  |  2 +-
 libavcodec/mimic.c |  2 +-
 libavcodec/mjpegdec.h  |  3 ++-
 libavcodec/proresdec2.c|  6 +++---
 libavcodec/speedhq.c   |  2 +-
 libavcodec/wmv2.h  |  2 +-
 libavcodec/x86/blockdsp.asm| 14 ++
 libavcodec/x86/blockdsp_init.c |  8 
 libavutil/internal.h   |  6 ++
 tests/checkasm/blockdsp.c  |  4 ++--
 18 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/libavcodec/4xm.c b/libavcodec/4xm.c
index 5547dfd87f..a97f051232 100644
--- a/libavcodec/4xm.c
+++ b/libavcodec/4xm.c
@@ -145,7 +145,7 @@ typedef struct FourXContext {
 int mv[256];
 VLC pre_vlc;
 int last_dc;
-DECLARE_ALIGNED(16, int16_t, block)[6][64];
+DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
 void *bitstream_buffer;
 unsigned int bitstream_buffer_size;
 int version;
diff --git a/libavcodec/asv.h b/libavcodec/asv.h
index a1366b6fe4..024ffa349a 100644
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -35,6 +35,7 @@
 #include "bswapdsp.h"
 #include "fdctdsp.h"
 #include "idctdsp.h"
+#include "internal.h"
 #include "get_bits.h"
 #include "pixblockdsp.h"
 #include "put_bits.h"
@@ -54,7 +55,7 @@ typedef struct ASV1Context {
 int mb_height;
 int mb_width2;
 int mb_height2;
-DECLARE_ALIGNED(32, int16_t, block)[6][64];
+DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
 uint16_t intra_matrix[64];
 int q_intra_matrix[64];
 uint8_t *bitstream_buffer;
diff --git a/libavcodec/bink.c b/libavcodec/bink.c
index c4cf617a8b..eee19512f7 100644
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -818,7 +818,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame 
*frame, GetBitContext *gb,
 int v, col[2];
 const uint8_t *scan;
 int xoff, yoff;
-LOCAL_ALIGNED_32(int16_t, block, [64]);
+LOCAL_ALIGNED_64(int16_t, block, [64]);
 LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
 int coordmap[64];
 int ybias = is_key ? -15 : 0;
@@ -985,7 +985,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame 
*frame, GetBitContext *gb,
 uint8_t *dst, *prev, *ref_start, *ref_end;
 int v, col[2];
 const uint8_t *scan;
-LOCAL_ALIGNED_32(int16_t, block, [64]);
+LOCAL_ALIGNED_64(int16_t, block, [64]);
 LOCAL_ALIGNED_16(uint8_t, ublock, [64]);
 LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
 int coordmap[64], quant_idx, coef_count, coef_idx[64];
diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h
index 963821ac81..4567a4aa8a 100644
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -74,7 +74,7 @@ typedef struct DNXHDEncContext {
 unsigned min_padding;
 int intra_quant_bias;
 
-DECLARE_ALIGNED(32, int16_t, blocks)[12][64];
+DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, blocks)[12][64];
 DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 
when depth=10
 DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 
uint16_t when depth=10
 
diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c
index 7f28abbafe..0f638b43f6 100644
--- a/libavcodec/eamad.c
+++ b/libavcodec/eamad.c
@@ -54,7 +54,7 @@ typedef struct MadContext {
 GetBitContext gb;
 void *bitstream_buf;
 unsigned int bitstream_buf_size;
-DECLARE_ALIGNED(32, int16_t, block)[64];
+DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[64];
 ScanTable scantable;
 uint16_t quant_matrix[64];
 int mb_x;
diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c
index 1a847a35da..9527bc2713 100644
--- a/libavcodec/eatqi.c
+++ b/libavcodec/eatqi.c
@@ -51,7 +51,7 @@ typedef struct TqiContext {
 uint16_t intra_matrix[64];
 int last_dc[3];
 
-DECLARE_ALIGNED(32, int16_t, block)[6][64];
+DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
 } TqiContext;
 
 static av_cold int tqi_decode_init(AVCodecContext *avctx)
diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c
index 842095ba3b..57ff92c002 100644
--- a/libavcodec/g2meet.c
+++ b/libavcodec/g2meet.c
@@ -122,7 +122,7 @@ typedef struct JPGContext {
 
 VLCdc_vlc[2], ac_vlc[2];
 intprev_dc[3];
-DECLARE_ALIGNED(32, int16_t, block)[6][64];
+DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
 
 uint8_t*buf;
 }