The primary objective of the Video Complexity Analyzer (VCA) is to provide a fast spatial and temporal complexity predictor for every frame, video segment, or whole video, in order to improve the prediction of encoding parameters for applications such as online per-title encoding.
Example: ./ffmpeg -i input.y4m -vf vca=file=vca.csv -f null - The `vca.csv` file contains the "E" and "h" features, which correspond to the spatial and temporal complexity of each frame. For example, for the video https://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4 we expect the following E and h values for the first frames (POC 0 to 10): POC,E,h 0,0,0.000000 1,0,0.000000 2,0,0.000000 3,0,0.000000 4,0,1.390741 5,0,1.788889 6,0,1.670370 7,0,1.768519 8,0,3.114815 9,0,2.290741 10,0,2.459259 Signed-off-by: Hadi Amirpour <[email protected]> Signed-off-by: mrcybercat <[email protected]> --- Changelog | 1 + libavfilter/Makefile | 1 + libavfilter/allfilters.c | 1 + libavfilter/vca_dct.c | 598 +++++++++++++++++++++++ libavfilter/vca_dct.h | 131 +++++ libavfilter/vf_vca.c | 546 +++++++++++++++++++++ libavfilter/x86/Makefile | 1 + libavfilter/x86/vf_vca.asm | 877 ++++++++++++++++++++++++++++++++++ libavfilter/x86/vf_vca_init.c | 210 ++++++++ 9 files changed, 2366 insertions(+) create mode 100644 libavfilter/vca_dct.c create mode 100644 libavfilter/vca_dct.h create mode 100644 libavfilter/vf_vca.c create mode 100644 libavfilter/x86/vf_vca.asm create mode 100644 libavfilter/x86/vf_vca_init.c diff --git a/Changelog b/Changelog index aff0c78153..b26a263d23 100644 --- a/Changelog +++ b/Changelog @@ -47,6 +47,7 @@ version 8.1: - ProRes Vulkan encoder - LCEVC parser - LCEVC enhancement layer exporting in MPEG-TS +- Add VCA filter version 8.0: diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 03bf51d3fd..18421b4801 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -564,6 +564,7 @@ OBJS-$(CONFIG_V360_FILTER) += vf_v360.o OBJS-$(CONFIG_V360_VULKAN_FILTER) += vf_v360_vulkan.o OBJS-$(CONFIG_VAGUEDENOISER_FILTER) += vf_vaguedenoiser.o OBJS-$(CONFIG_VARBLUR_FILTER) += vf_varblur.o framesync.o +OBJS-$(CONFIG_VCA_FILTER) += vf_vca.o vca_dct.o OBJS-$(CONFIG_VECTORSCOPE_FILTER) += vf_vectorscope.o 
OBJS-$(CONFIG_VFLIP_FILTER) += vf_vflip.o OBJS-$(CONFIG_VFLIP_VULKAN_FILTER) += vf_flip_vulkan.o vulkan.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 66c49d453b..ab81ec6414 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -530,6 +530,7 @@ extern const FFFilter ff_vf_v360; extern const FFFilter ff_vf_v360_vulkan; extern const FFFilter ff_vf_vaguedenoiser; extern const FFFilter ff_vf_varblur; +extern const FFFilter ff_vf_vca; extern const FFFilter ff_vf_vectorscope; extern const FFFilter ff_vf_vflip; extern const FFFilter ff_vf_vflip_vulkan; diff --git a/libavfilter/vca_dct.c b/libavfilter/vca_dct.c new file mode 100644 index 0000000000..6d9a4f79f9 --- /dev/null +++ b/libavfilter/vca_dct.c @@ -0,0 +1,598 @@ +/* + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mem.h" + +#include "vca_dct.h" + +#define safe_abs(n) _Generic((n), \ + signed char: abs(n), short: abs(n), int: abs(n), long: labs(n), long long: llabs(n)) + + +static const int16_t weights_dct8[64] = { + 0, 27, 94, 94, 94, 94, 94, 95, 27, 94, 94, 95, 96, 97, 98, 99, + 94, 94, 95, 97, 99, 101, 104, 107, 94, 95, 97, 99, 103, 107, 113, 120, + 94, 96, 99, 103, 109, 116, 126, 138, 94, 97, 101, 107, 116, 128, 144, 164, + 94, 98, 104, 113, 126, 144, 168, 201, 95, 99, 107, 120, 138, 164, 201, 255, +}; + +static const int16_t weights_dct16[256] = { + 0, 27, 93, 93, 93, 93, 93, 93, 93, 93, 93, 94, 94, 94, 94, 94, 27, 93, 93, + 93, 93, 94, 94, 94, 94, 94, 94, 94, 94, 94, 95, 95, 93, 93, 93, 94, 94, 94, + 94, 94, 94, 95, 95, 95, 96, 96, 96, 97, 93, 93, 94, 94, 94, 94, 94, 95, 95, + 96, 96, 97, 97, 98, 99, 99, 93, 93, 94, 94, 94, 95, 95, 96, 96, 97, 98, 99, + 100, 101, 102, 103, 93, 94, 94, 94, 95, 95, 96, 97, 98, 99, 100, 101, 102, 104, 106, + 107, 93, 94, 94, 94, 95, 96, 97, 98, 99, 101, 102, 104, 106, 108, 110, 113, 93, 94, + 94, 95, 96, 97, 98, 99, 101, 103, 105, 107, 110, 113, 116, 120, 93, 94, 94, 95, 96, + 98, 99, 101, 103, 106, 108, 112, 115, 119, 123, 128, 93, 94, 95, 96, 97, 99, 101, 103, + 106, 109, 112, 116, 121, 126, 132, 138, 93, 94, 95, 96, 98, 100, 102, 105, 108, 112, 117, + 122, 128, 134, 142, 150, 94, 94, 95, 97, 99, 101, 104, 107, 112, 116, 122, 128, 135, 144, + 153, 164, 94, 94, 96, 97, 100, 102, 106, 110, 115, 121, 128, 135, 145, 155, 167, 181, 94, + 94, 96, 98, 101, 104, 108, 113, 119, 126, 134, 144, 155, 168, 183, 201, 94, 95, 96, 99, + 102, 106, 110, 116, 123, 132, 142, 153, 167, 183, 203, 225, 94, 95, 97, 99, 103, 107, 113, + 120, 128, 138, 150, 164, 181, 201, 225, 255, +}; + +static const int16_t 
weights_dct32[1024] = { + 0, 27, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, + 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 27, 93, 93, 93, 93, 93, + 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, + 93, 93, 93, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, + 94, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 94, 94, 94, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 95, 95, 95, 95, 93, 93, 93, 93, 93, + 93, 93, 93, 93, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 95, + 95, 95, 95, 95, 95, 95, 95, 96, 93, 93, 93, 93, 93, 93, 93, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, 95, 95, 95, 95, 95, 95, 96, 96, 96, 96, 96, + 96, 97, 93, 93, 93, 93, 93, 93, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 95, + 95, 95, 95, 95, 95, 96, 96, 96, 96, 97, 97, 97, 97, 98, 98, 93, 93, 93, 93, + 93, 94, 94, 94, 94, 94, 94, 94, 94, 94, 95, 95, 95, 95, 95, 96, 96, 96, 96, + 97, 97, 97, 98, 98, 98, 99, 99, 99, 93, 93, 93, 93, 93, 94, 94, 94, 94, 94, + 94, 94, 95, 95, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 100, + 100, 101, 101, 93, 93, 93, 93, 94, 94, 94, 94, 94, 94, 94, 95, 95, 95, 95, 96, + 96, 96, 97, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 93, 93, 93, + 93, 94, 94, 94, 94, 94, 94, 95, 95, 95, 95, 96, 96, 96, 97, 97, 98, 98, 99, + 99, 100, 100, 101, 102, 102, 103, 104, 104, 105, 93, 93, 93, 94, 94, 94, 94, 94, 94, + 95, 95, 95, 96, 96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 102, 102, 103, 104, + 105, 106, 107, 107, 93, 93, 93, 94, 94, 94, 94, 94, 95, 95, 95, 96, 96, 96, 97, + 97, 98, 98, 99, 100, 100, 101, 102, 102, 103, 104, 105, 106, 107, 108, 109, 110, 93, 93, + 93, 94, 94, 94, 94, 94, 95, 95, 95, 96, 96, 97, 97, 98, 99, 99, 100, 101, 101, + 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 93, 93, 93, 94, 94, 94, 94, 95, + 95, 95, 96, 96, 97, 97, 98, 99, 99, 100, 
101, 102, 103, 104, 105, 106, 107, 108, 109, + 110, 112, 113, 115, 116, 93, 93, 94, 94, 94, 94, 94, 95, 95, 96, 96, 97, 97, 98, + 99, 99, 100, 101, 102, 103, 104, 105, 106, 107, 109, 110, 112, 113, 115, 116, 118, 120, 93, + 93, 94, 94, 94, 94, 95, 95, 95, 96, 96, 97, 98, 99, 99, 100, 101, 102, 103, 104, + 105, 107, 108, 109, 111, 113, 114, 116, 118, 120, 122, 124, 93, 93, 94, 94, 94, 94, 95, + 95, 96, 96, 97, 98, 98, 99, 100, 101, 102, 103, 104, 106, 107, 108, 110, 112, 113, 115, + 117, 119, 121, 123, 126, 128, 93, 93, 94, 94, 94, 94, 95, 95, 96, 97, 97, 98, 99, + 100, 101, 102, 103, 104, 106, 107, 109, 110, 112, 114, 116, 118, 120, 122, 125, 127, 130, 133, + 93, 93, 94, 94, 94, 95, 95, 96, 96, 97, 98, 99, 100, 101, 102, 103, 104, 106, 107, + 109, 110, 112, 114, 116, 119, 121, 123, 126, 129, 132, 135, 138, 93, 93, 94, 94, 94, 95, + 95, 96, 97, 97, 98, 99, 100, 101, 103, 104, 105, 107, 109, 110, 112, 114, 117, 119, 122, + 124, 127, 130, 133, 136, 140, 144, 93, 93, 94, 94, 94, 95, 95, 96, 97, 98, 99, 100, + 101, 102, 104, 105, 107, 108, 110, 112, 114, 117, 119, 122, 125, 128, 131, 134, 138, 142, 146, + 150, 93, 93, 94, 94, 94, 95, 96, 96, 97, 98, 99, 100, 102, 103, 105, 106, 108, 110, + 112, 114, 117, 119, 122, 125, 128, 131, 135, 139, 143, 147, 152, 157, 93, 94, 94, 94, 95, + 95, 96, 97, 98, 99, 100, 101, 102, 104, 106, 107, 109, 112, 114, 116, 119, 122, 125, 128, + 132, 135, 140, 144, 148, 153, 159, 164, 93, 94, 94, 94, 95, 95, 96, 97, 98, 99, 100, + 102, 103, 105, 107, 109, 111, 113, 116, 119, 122, 125, 128, 132, 136, 140, 144, 149, 154, 160, + 166, 172, 93, 94, 94, 94, 95, 96, 96, 97, 98, 100, 101, 102, 104, 106, 108, 110, 113, + 115, 118, 121, 124, 128, 131, 135, 140, 145, 150, 155, 161, 167, 174, 181, 93, 94, 94, 94, + 95, 96, 97, 98, 99, 100, 102, 103, 105, 107, 109, 112, 114, 117, 120, 123, 127, 131, 135, + 140, 144, 150, 155, 161, 168, 175, 182, 191, 93, 94, 94, 94, 95, 96, 97, 98, 99, 101, + 102, 104, 106, 108, 110, 113, 116, 119, 122, 126, 130, 134, 139, 
144, 149, 155, 161, 168, 175, + 183, 192, 201, 93, 94, 94, 95, 95, 96, 97, 98, 100, 101, 103, 105, 107, 109, 112, 115, + 118, 121, 125, 129, 133, 138, 143, 148, 154, 161, 168, 175, 184, 193, 202, 213, 93, 94, 94, + 95, 95, 96, 97, 99, 100, 102, 104, 106, 108, 110, 113, 116, 120, 123, 127, 132, 136, 142, + 147, 153, 160, 167, 175, 183, 193, 203, 214, 225, 93, 94, 94, 95, 95, 96, 98, 99, 101, + 102, 104, 107, 109, 112, 115, 118, 122, 126, 130, 135, 140, 146, 152, 159, 166, 174, 182, 192, + 202, 214, 226, 239, 93, 94, 94, 95, 96, 97, 98, 99, 101, 103, 105, 107, 110, 113, 116, + 120, 124, 128, 133, 138, 144, 150, 157, 164, 172, 181, 191, 201, 213, 225, 239, 255, +}; + +static const int16_t g_t4[4][4] = +{ + { 64, 64, 64, 64 }, + { 83, 36, -36, -83 }, + { 64, -64, -64, 64 }, + { 36, -83, 83, -36 } +}; + +static const int16_t g_t8[8][8] = +{ + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 89, 75, 50, 18, -18, -50, -75, -89 }, + { 83, 36, -36, -83, -83, -36, 36, 83 }, + { 75, -18, -89, -50, 50, 89, 18, -75 }, + { 64, -64, -64, 64, 64, -64, -64, 64 }, + { 50, -89, 18, 75, -75, -18, 89, -50 }, + { 36, -83, 83, -36, -36, 83, -83, 36 }, + { 18, -50, 75, -89, 89, -75, 50, -18 } +}; + +static const int16_t g_t16[16][16] = +{ + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 }, + { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, + { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87 }, + { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, + { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80 }, + { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, + { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70 }, + { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, + { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57 }, + { 50, 
-89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, + { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43 }, + { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, + { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25 }, + { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, + { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 } +}; + +static const int16_t g_t32[32][32] = +{ + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, + { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, + { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, + { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, + { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, + { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, + { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, + { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, + { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, + { 80, 9, -70, -87, -25, 57, 90, 
43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, + { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, + { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, + { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, + { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, + { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, + { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, + { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, + { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, + { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, + { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, + { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, + { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 }, + { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54, 90, -61, -13, 78, -85, 
31, 46, -90, 67, 4, -73, 88, -38 }, + { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, + { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, + { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, + { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, + { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, + { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, + { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, + { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 } +}; + +static void partial_butterfly4(const int16_t* src, int16_t* dst, int shift, int line) +{ + int j; + int E[2], O[2]; + int add = 1 << (shift - 1); + + for (j = 0; j < line; j++) + { + /* E and O */ + E[0] = src[0] + src[3]; + O[0] = src[0] - src[3]; + E[1] = src[1] + src[2]; + O[1] = src[1] - src[2]; + + dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift); + dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift); + dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift); + dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift); + + src += 4; + dst++; + } +} + +static void partial_butterfly8(const int16_t* src, int16_t* dst, int shift, int line) +{ + int j, k; + int 
E[4], O[4]; + int EE[2], EO[2]; + int add = 1 << (shift - 1); + + for (j = 0; j < line; j++) + { + /* E and O*/ + for (k = 0; k < 4; k++) + { + E[k] = src[k] + src[7 - k]; + O[k] = src[k] - src[7 - k]; + } + + /* EE and EO */ + EE[0] = E[0] + E[3]; + EO[0] = E[0] - E[3]; + EE[1] = E[1] + E[2]; + EO[1] = E[1] - E[2]; + + dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift); + dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift); + dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift); + dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift); + + dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift); + dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift); + dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift); + dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift); + + src += 8; + dst++; + } +} + +static void partial_butterfly16(const int16_t* src, int16_t* dst, int shift, int line) +{ + int j, k; + int E[8], O[8]; + int EE[4], EO[4]; + int EEE[2], EEO[2]; + int add = 1 << (shift - 1); + + for (j = 0; j < line; j++) + { + /* E and O */ + for (k = 0; k < 8; k++) + { + E[k] = src[k] + src[15 - k]; + O[k] = src[k] - src[15 - k]; + } + + /* EE and EO */ + for (k = 0; k < 4; k++) + { + EE[k] = E[k] + E[7 - k]; + EO[k] = E[k] - E[7 - k]; + } + + /* EEE and EEO */ + EEE[0] = EE[0] + EE[3]; + EEO[0] = EE[0] - EE[3]; + EEE[1] = EE[1] + EE[2]; + EEO[1] = EE[1] - EE[2]; + + dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift); + dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift); + dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + 
g_t16[4][1] * EEO[1] + add) >> shift); + dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift); + + for (k = 2; k < 16; k += 4) + { + dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] + + g_t16[k][3] * EO[3] + add) >> shift); + } + + for (k = 1; k < 16; k += 2) + { + dst[k * line] = (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] + + g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] + + add) >> shift); + } + + src += 16; + dst++; + } +} + +static void partial_butterfly32(const int16_t* src, int16_t* dst, int shift, int line) +{ + int j, k; + int E[16], O[16]; + int EE[8], EO[8]; + int EEE[4], EEO[4]; + int EEEE[2], EEEO[2]; + int add = 1 << (shift - 1); + + for (j = 0; j < line; j++) + { + /* E and O*/ + for (k = 0; k < 16; k++) + { + E[k] = src[k] + src[31 - k]; + O[k] = src[k] - src[31 - k]; + } + + /* EE and EO */ + for (k = 0; k < 8; k++) + { + EE[k] = E[k] + E[15 - k]; + EO[k] = E[k] - E[15 - k]; + } + + /* EEE and EEO */ + for (k = 0; k < 4; k++) + { + EEE[k] = EE[k] + EE[7 - k]; + EEO[k] = EE[k] - EE[7 - k]; + } + + /* EEEE and EEEO */ + EEEE[0] = EEE[0] + EEE[3]; + EEEO[0] = EEE[0] - EEE[3]; + EEEE[1] = EEE[1] + EEE[2]; + EEEO[1] = EEE[1] - EEE[2]; + + dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift); + dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift); + dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift); + dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift); + for (k = 4; k < 32; k += 8) + { + dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] + + g_t32[k][3] * EEO[3] + add) >> shift); + } + + for (k = 2; k < 32; k += 4) + { + dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + 
g_t32[k][2] * EO[2] + + g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] + + g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift); + } + + for (k = 1; k < 32; k += 2) + { + dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] + + g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] + + g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] * + O[11] + g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] + + g_t32[k][15] * O[15] + add) >> shift); + } + + src += 32; + dst++; + } +} + +void ff_vca_dct4_c(const int16_t* block, int16_t* dst, int bit_depth) +{ + const int shift_1st = 1 + bit_depth - 8; + const int shift_2nd = 8; + + DECLARE_ALIGNED_32(int16_t, coef[4 * 4]); + + partial_butterfly4(block, coef, shift_1st, 4); + partial_butterfly4(coef, dst, shift_2nd, 4); +} + +void ff_vca_dct8_c(const int16_t* block, int16_t* dst, int bit_depth) +{ + const int shift_1st = 2 + bit_depth - 8; + const int shift_2nd = 9; + + DECLARE_ALIGNED_32(int16_t, coef[8 * 8]); + + partial_butterfly8(block, coef, shift_1st, 8); + partial_butterfly8(coef, dst, shift_2nd, 8); +} + +void ff_vca_dct16_c(const int16_t* block, int16_t* dst, int bit_depth) +{ + const int shift_1st = 3 + bit_depth - 8; + const int shift_2nd = 10; + + DECLARE_ALIGNED_32(int16_t, coef[16 * 16]); + + partial_butterfly16(block, coef, shift_1st, 16); + partial_butterfly16(coef, dst, shift_2nd, 16); +} + +void ff_vca_dct32_c(const int16_t* block, int16_t* dst, int bit_depth) +{ + const int shift_1st = 4 + bit_depth - 8; + const int shift_2nd = 11; + + DECLARE_ALIGNED_32(int16_t, coef[32 * 32]); + + partial_butterfly32(block, coef, shift_1st, 32); + partial_butterfly32(coef, dst, shift_2nd, 32); +} + +void ff_vca_lowpass_dct8_c(const int16_t* src, int16_t* dst, int bit_depth) +{ + DECLARE_ALIGNED_32(int16_t, coef[4 * 4]); + DECLARE_ALIGNED_32(int16_t, avg_block[4 * 4]); + + 
int32_t totalSum = 0; /* 32-bit like the 16x16/32x32 variants: 64 samples can exceed INT16_MAX above 8-bit depth */ + int16_t sum = 0; + for (int i = 0; i < 4; i++) + for (int j =0; j < 4; j++) + { + // Calculate average of 2x2 cells + sum = src[2*i*8 + 2*j] + src[2*i*8 + 2*j + 1] + + src[(2*i+1)*8 + 2*j] + src[(2*i+1)*8 + 2*j + 1]; + avg_block[i*4 + j] = sum >> 2; + + totalSum += sum; // running sum of all 64 input samples + } + + ff_vca_dct4_c(avg_block, coef, bit_depth); + + memset(dst, 0, 64 * sizeof(int16_t)); + for (int i = 0; i < 4; i++) + { + memcpy(&dst[i * 8], &coef[i * 4], 4 * sizeof(int16_t)); + } + + // replace the DC coefficient with the scaled sum of all 64 samples + dst[0] = (int16_t)(totalSum << 1); /* NOTE(review): unlike the DCT path this is not scaled by bit_depth - confirm the result fits int16_t for >8-bit input */ +} + +void ff_vca_lowpass_dct16_c(const int16_t* src, int16_t* dst, int bit_depth) +{ + DECLARE_ALIGNED_32(int16_t, coef[8 * 8]); + DECLARE_ALIGNED_32(int16_t, avg_block[8 * 8]); + + int32_t totalSum = 0; + int16_t sum = 0; + for (int i = 0; i < 8; i++) + for (int j =0; j < 8; j++) + { + sum = src[2*i*16 + 2*j] + src[2*i*16 + 2*j + 1] + + src[(2*i+1)*16 + 2*j] + src[(2*i+1)*16 + 2*j + 1]; + avg_block[i*8 + j] = sum >> 2; + + totalSum += sum; + } + + ff_vca_dct8_c(avg_block, coef, bit_depth); + + memset(dst, 0, 256 * sizeof(int16_t)); + for (int i = 0; i < 8; i++) + { + memcpy(&dst[i * 16], &coef[i * 8], 8 * sizeof(int16_t)); + } + dst[0] = (int16_t)(totalSum >> 1); +} + +void ff_vca_lowpass_dct32_c(const int16_t* src, int16_t* dst, int bit_depth) +{ + DECLARE_ALIGNED_32(int16_t, coef[16 * 16]); + DECLARE_ALIGNED_32(int16_t, avg_block[16 * 16]); + + int32_t totalSum = 0; + int16_t sum = 0; + for (int i = 0; i < 16; i++) + for (int j =0; j < 16; j++) + { + sum = src[2*i*32 + 2*j] + src[2*i*32 + 2*j + 1] + + src[(2*i+1)*32 + 2*j] + src[(2*i+1)*32 + 2*j + 1]; + avg_block[i*16 + j] = sum >> 2; + + totalSum += sum; + } + + ff_vca_dct16_c(avg_block, coef, bit_depth); + + memset(dst, 0, 1024 * sizeof(int16_t)); + for (int i = 0; i < 16; i++) + { + memcpy(&dst[i * 32], &coef[i * 16], 16 * sizeof(int16_t)); + } + dst[0] = (int16_t)(totalSum >> 3); +} + + +uint32_t 
ff_calc_weighted_coeff(unsigned blocksize, int16_t *coeff_buffer, int enable_lowpass) +{ + uint32_t weighted_sum = 0; + + const int16_t *weights_matrix = weights_dct32; /* tables are const int16_t; the 32x32 table is kept when blocksize is not 8, 16 or 32 */ + switch (blocksize) + { + case 32: + weights_matrix = weights_dct32; + break; + case 16: + weights_matrix = weights_dct16; + break; + case 8: + weights_matrix = weights_dct8; + break; + } + + for (unsigned i = 0; i < blocksize * blocksize; i++) + { + uint32_t weighted_coeff = (uint32_t)((weights_matrix[i] * safe_abs(coeff_buffer[i])) >> 8); + weighted_sum += weighted_coeff; + } + + if (blocksize >= 16 && enable_lowpass) + weighted_sum *= 2; + + return weighted_sum; +} + + +static void copy_vals_wo_padding(unsigned pxl_depth, unsigned blocksize, uint8_t *src, unsigned stride, int16_t *buffer) +{ + if (pxl_depth == 1) + { + uint8_t *srcptr = src; + for (unsigned y = 0; y < blocksize; y++) + for (unsigned x = 0; x < blocksize; x++) + *(buffer++) = (int16_t)srcptr[x + stride*y]; + } else { + uint16_t *srcptr = (uint16_t *) src; + const unsigned bytes_per_line = blocksize * 2; + for (unsigned y = 0; y < blocksize; ++y) + { + memcpy(buffer, srcptr, blocksize * sizeof(uint16_t)); + srcptr += stride / 2; + buffer += blocksize; + } + } +} + +static void copy_vals_w_padding(unsigned pxl_depth, unsigned blocksize, uint8_t *src, unsigned stride, int16_t *buffer, unsigned padding_r, unsigned padding_b) +{ + unsigned y = 0; + int16_t *buffer_last_line = buffer; + + if (pxl_depth == 1) { + for (; y < blocksize - padding_b; y++, src += stride) { + unsigned x = 0; + buffer_last_line = buffer; + for (; x < blocksize - padding_r; x++) + *(buffer++) = (int16_t)(src[x]); + const int16_t last = (int16_t)(src[x ? x - 1 : 0]); /* replicate the last valid sample, as the 16-bit path does; src[x] is one past it */ + for (; x < blocksize; x++) + *(buffer++) = last; + } + for (; y < blocksize; y++) { + for (unsigned x = 0; x < blocksize; x++) + *(buffer++) = (buffer_last_line[x]); + } + } else { + uint16_t *srcptr = (uint16_t*)(src); + for (; y < blocksize - padding_b; y++) { + unsigned x = 0; + buffer_last_line = buffer; + + 
const unsigned nr_vals_copy = blocksize - padding_r; + memcpy(buffer, srcptr, nr_vals_copy * sizeof(uint16_t)); + + const uint16_t last = srcptr[nr_vals_copy - 1]; + for (unsigned x = nr_vals_copy; x < blocksize; x++) + buffer[x] = last; + + buffer += blocksize; + srcptr += stride / 2; + } + for (; y < blocksize; y++) { + const unsigned nr_bytes_copy = blocksize * 2; + memcpy(buffer, buffer_last_line, blocksize * sizeof(uint16_t)); + buffer += blocksize; + } + } +} + +void ff_copy_vals_buffer(unsigned pxl_depth, unsigned offset, unsigned blocksize, uint8_t *src, unsigned stride, int16_t *buffer, unsigned padding_r, unsigned padding_b) +{ + src += offset; + if (padding_r == 0 && padding_b == 0) + copy_vals_wo_padding(pxl_depth, blocksize, src, stride, buffer); + else + copy_vals_w_padding(pxl_depth, blocksize, src, stride, buffer, padding_r, padding_b); +} \ No newline at end of file diff --git a/libavfilter/vca_dct.h b/libavfilter/vca_dct.h new file mode 100644 index 0000000000..faa106c89a --- /dev/null +++ b/libavfilter/vca_dct.h @@ -0,0 +1,131 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * functions and constants for descrete cosine transform for VCA + */ + +#include "avfilter.h" +#include "libavutil/eval.h" +#include "libavutil/mem_internal.h" +#include "libavformat/avio.h" + +#ifndef AVFILTER_VCADCT_H +#define AVFILTER_VCADCT_H + +#if defined(__GNUC__) +#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32))) +#elif defined(_MSC_VER) +#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var +#endif + +typedef struct VCAPlaneInfo { + int pxl_depth; + int bit_depth; + + int w_pxls_src; + int h_pxls_src; + + int n_blocks; + + int w_blocks; + int h_blocks; + + int w_pxls; + int h_pxls; +} VCAPlaneInfo; + +typedef struct VCAResults { + // globals + uint32_t *energy; + uint32_t *energy_prev; + uint32_t *brightness; + double *energy_dif; +} VCAResults; + +typedef struct ResultSums{ + uint32_t E; + uint32_t L; + double h; +} ResultSums; + +typedef struct VCAContext { + const AVClass *class; + AVIOContext *avio_context; + void (*print)(AVFilterContext *ctx, int lvl, const char *msg, ...); // av_printf_format(2, 3); + void (*perform_dct)(const int16_t* block, int16_t* dst, int bit_depth); + + void (*calc_vca_slice_isnf0)(int stride, uint8_t *src, VCAPlaneInfo *plane, VCAResults *result, + int enable_lowpass, int slice_start, int slice_end, ResultSums *partial_sum, + void (*perform_dct)(const int16_t* block, int16_t* dst, int bit_depth)); + + void (*calc_vca_slice_isnf1)(int stride, uint8_t *src, VCAPlaneInfo *plane, VCAResults *result, + int enable_lowpass, int slice_start, int slice_end, ResultSums *partial_sum, + void (*perform_dct)(const int16_t* block, int16_t* dst, int bit_depth)); + + + // options + unsigned blocksize; + int enable_lowpass; + int enable_chroma; + int enable_brightness; + int enable_simd; 
+ int yuview; + int n_frames; + char *file_str; + + // video frame properties + VCAPlaneInfo **plane; + int n_frames_processed; + + // results + VCAResults **result; +} VCAContext; + + +static const int16_t weights_dct8[64]; +static const int16_t weights_dct16[256]; +static const int16_t weights_dct32[1024]; + +static const int16_t g_t4[4][4]; +static const int16_t g_t8[8][8]; +static const int16_t g_t16[16][16]; +static const int16_t g_t32[32][32]; + +uint32_t ff_calc_weighted_coeff(unsigned blocksize, int16_t *coeff_buffer, int enable_lowpass); + +void ff_copy_vals_buffer(unsigned pxl_depth, unsigned offset, unsigned blocksize, uint8_t *src, unsigned stride, int16_t *buffer, unsigned padding_r, unsigned padding_b); + +void ff_vca_dct4_c(const int16_t* src, int16_t* dst, int bit_depth); + +void ff_vca_dct8_c(const int16_t* src, int16_t* dst, int bit_depth); + +void ff_vca_dct16_c(const int16_t* src, int16_t* dst, int bit_depth); + +void ff_vca_dct32_c(const int16_t* src, int16_t* dst, int bit_depth); + +void ff_vca_lowpass_dct8_c(const int16_t* src, int16_t* dst, int bit_depth); + +void ff_vca_lowpass_dct16_c(const int16_t* src, int16_t* dst, int bit_depth); + +void ff_vca_lowpass_dct32_c(const int16_t* src, int16_t* dst, int bit_depth); + +int ff_vca_dct_init_x86(VCAContext *v); + +#endif \ No newline at end of file diff --git a/libavfilter/vf_vca.c b/libavfilter/vf_vca.c new file mode 100644 index 0000000000..8c345aafa5 --- /dev/null +++ b/libavfilter/vf_vca.c @@ -0,0 +1,546 @@ +/* + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Calculate frame scores using Video Complexity Analyzer (VCA) + */ + + +#include "libavutil/timestamp.h" +#include "libavutil/mathematics.h" + +#include "libavutil/avassert.h" +#include "libavutil/imgutils.h" +#include "libavutil/mem.h" +#include "libavutil/opt.h" + + +#include "avfilter.h" +#include "filters.h" +#include "formats.h" +#include "video.h" + +#include "vca_dct.h" + +typedef struct ThreadData { + void (*perform_dct)(const int16_t* block, int16_t* dst, int bit_depth); + void (*calc_vca_slice)(int stride, uint8_t *src, VCAPlaneInfo *plane, VCAResults *result, + int enable_lowpass, int slice_start, int slice_end, ResultSums *partial_sum, + void (*perform_dct)(const int16_t* block, int16_t* dst, int bit_depth)); + int stride; + int blocksize; + + int enable_lowpass; + + uint8_t *src; + + VCAPlaneInfo *plane; + VCAResults *result; + + ResultSums **partial_sums; +} ThreadData; + +#define OFFSET(x) offsetof(VCAContext, x) +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM + +static const AVOption vca_options[] = { + // Analysis config + { "blocksize", "Set size of block", OFFSET(blocksize), AV_OPT_TYPE_INT, {.i64=32}, 8, 32, FLAGS }, + { "n", "Set the frames batch size, -1 to process all", OFFSET(n_frames), AV_OPT_TYPE_INT, {.i64=-1}, -1, INT_MAX, FLAGS }, + // Performance + { "lowpass", "Enable low-pass DCT", OFFSET(enable_lowpass), AV_OPT_TYPE_BOOL, { .i64=1 }, 0, 1, FLAGS }, + { "simd", "Enable hardware acceralation with SIMD", OFFSET(enable_simd), AV_OPT_TYPE_BOOL, { .i64=1 }, 0, 1, FLAGS }, + { "brightness", "Enable brightness infomation", OFFSET(enable_brightness), AV_OPT_TYPE_BOOL, { .i64=0 }, 0, 1, FLAGS }, + { "chroma", "Enable 
analysis of chroma channels", OFFSET(enable_chroma), AV_OPT_TYPE_BOOL, { .i64=0 }, 0, 1, FLAGS }, + // Output + { "file", "Set file where to print analysis information", OFFSET(file_str), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS }, + { "yuview", "Produce a detailed blockwise output for YUView", OFFSET(yuview), AV_OPT_TYPE_BOOL, { .i64=0 }, 0, 1, FLAGS }, + { NULL } +}; + +static const double E_norm_factor = 90; +static const double h_norm_factor = 18; + +AVFILTER_DEFINE_CLASS(vca); + +static const enum AVPixelFormat pxl_fmts[] = { + AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P, + AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV410P, + AV_PIX_FMT_YUVJ411P, AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, + AV_PIX_FMT_YUVJ440P, AV_PIX_FMT_YUVJ444P, + #define PF(suf) AV_PIX_FMT_YUV420##suf, AV_PIX_FMT_YUV422##suf, AV_PIX_FMT_YUV444##suf + PF(P10), PF(P12), + AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, + AV_PIX_FMT_NONE +}; + +#define WRITE_YUVIEW_BRIGHTNESS_1_1 \ + result->brightness[block_i] = (uint32_t)sqrt(out_buffer[0]); + +#define WRITE_YUVIEW_BRIGHTNESS_1_0 +#define WRITE_YUVIEW_BRIGHTNESS_0_1 +#define WRITE_YUVIEW_BRIGHTNESS_0_0 + +#define WRITE_YUVIEW_BRIGHTNESS(IS_BRIGHTNESS, IS_YUVIEW) WRITE_YUVIEW_BRIGHTNESS_##IS_BRIGHTNESS##_##IS_YUVIEW + +#define WRITE_YUVIEW_ENERGY_DIF_1_1 \ + result->energy_dif[block_i] = abs((int)result->energy[block_i] - (int)result->energy_prev[block_i]); + +#define WRITE_YUVIEW_ENERGY_DIF_1_0 +#define WRITE_YUVIEW_ENERGY_DIF_0_1 +#define WRITE_YUVIEW_ENERGY_DIF_0_0 + +#define WRITE_YUVIEW_ENERGY_DIF(IS_NOT_FIRST, IS_YUVIEW) WRITE_YUVIEW_ENERGY_DIF_##IS_NOT_FIRST##_##IS_YUVIEW + +#define ENERGY_DIF_1 \ + partial_sum->h += abs((int)result->energy[block_i] - (int)result->energy_prev[block_i]); + +#define ENERGY_DIF_0 + +#define ENERGY_DIF(IS_NOT_FIRST) ENERGY_DIF_##IS_NOT_FIRST + +#define BRIGHTNESS_1 \ + partial_sum->L += (uint32_t)sqrt(out_buffer[0]); + +#define BRIGHTNESS_0 + +#define BRIGHTNESS(IS_BRIGHTNESS) 
BRIGHTNESS_##IS_BRIGHTNESS + +#define DEFINE_CALC_ENERGY_SLICE(BLOCKSIZE, IS_NOT_FIRST, IS_YUVIEW, IS_BRIGHNTESS) \ +static void calc_vca_##BLOCKSIZE##_isnf##IS_NOT_FIRST##_brig##IS_BRIGHNTESS##_yuview##IS_YUVIEW##_slice(\ + int stride, uint8_t *src, VCAPlaneInfo *plane, VCAResults *result, \ + int enable_lowpass, int slice_start, int slice_end, ResultSums *partial_sum, \ + void (*perform_dct)(const int16_t* block, int16_t* dst, int bit_depth)) { \ + int block_i = (slice_start / BLOCKSIZE) * plane->w_blocks; \ + DECLARE_ALIGNED_32(int16_t, block_buffer[BLOCKSIZE * BLOCKSIZE]); \ + DECLARE_ALIGNED_32(int16_t, out_buffer[BLOCKSIZE * BLOCKSIZE]); \ + const unsigned bit_depth = plane->bit_depth; \ + for (unsigned blockY = slice_start; blockY < slice_end; blockY += BLOCKSIZE) { \ + int padding_b = FFMAX(((int)(blockY + BLOCKSIZE) - (int)(plane->h_pxls_src)), 0); \ + for (unsigned blockX = 0; blockX < plane->w_pxls; blockX += BLOCKSIZE) { \ + int offset = blockX * plane->pxl_depth + (blockY * stride); \ + int padding_r = FFMAX((int)(blockX + BLOCKSIZE) - (int)(plane->w_pxls_src), 0); \ + /* Copy values to block buffer */ \ + ff_copy_vals_buffer(plane->pxl_depth, offset, BLOCKSIZE, src, stride, \ + block_buffer, padding_r, padding_b); \ + perform_dct(block_buffer, out_buffer, bit_depth); \ + /* Calculate energy */ \ + result->energy[block_i] = ff_calc_weighted_coeff(BLOCKSIZE, out_buffer, enable_lowpass);\ + partial_sum->E += result->energy[block_i]; \ + BRIGHTNESS(IS_BRIGHNTESS) \ + ENERGY_DIF(IS_NOT_FIRST) \ + WRITE_YUVIEW_BRIGHTNESS(IS_BRIGHNTESS, IS_YUVIEW) \ + WRITE_YUVIEW_ENERGY_DIF(IS_NOT_FIRST, IS_YUVIEW) \ + block_i++; \ + } \ + } \ +} + +#define FUNCTION_LIST(X) \ + X(8,0,0,0) X(16,0,0,0) X(32,0,0,0) X(8,1,0,0) X(16,1,0,0) X(32,1,0,0) \ + X(8,0,1,0) X(16,0,1,0) X(32,0,1,0) X(8,1,1,0) X(16,1,1,0) X(32,1,1,0) \ + X(8,0,0,1) X(16,0,0,1) X(32,0,0,1) X(8,1,0,1) X(16,1,0,1) X(32,1,0,1) \ + X(8,0,1,1) X(16,0,1,1) X(32,0,1,1) X(8,1,1,1) X(16,1,1,1) X(32,1,1,1) + 
+FUNCTION_LIST(DEFINE_CALC_ENERGY_SLICE) + +#define FN(blsz, isnf, brig, yuv) \ + calc_vca_##blsz##_isnf##isnf##_brig##brig##_yuview##yuv##_slice + +#define ISNF(blsz, brig, yuv) \ + { FN(blsz,0,brig,yuv), FN(blsz,1,brig,yuv) } + +#define YUVIEW(blsz, brig) \ + { ISNF(blsz, brig, 0), ISNF(blsz, brig, 1) } + +#define BRIGHT(blsz) \ + { YUVIEW(blsz, 0), YUVIEW(blsz, 1) } + +static void* calc_fn_table[3][2][2][2] = { + BRIGHT(8), + BRIGHT(16), + BRIGHT(32) +}; + + +static void print_log(AVFilterContext *ctx, int lvl, const char *msg, ...) +{ + va_list argument_list; + + va_start(argument_list, msg); + if (msg) + av_vlog(ctx, lvl, msg, argument_list); + va_end(argument_list); +} + +static void print_file(AVFilterContext *ctx, int lvl, const char *msg, ...) +{ + VCAContext *v = ctx->priv; + va_list argument_list; + + va_start(argument_list, msg); + if (msg) { + char buf[128]; + int ret = vsnprintf(buf, sizeof(buf), msg, argument_list); + avio_write(v->avio_context, buf, ret); + } + va_end(argument_list); +} + +static int calc_energy_filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs){ + ThreadData *th = arg; + + int block_row_start = (th->plane->h_blocks * job) / nb_jobs; + int block_row_end = (th->plane->h_blocks * (job+1)) / nb_jobs; + + int slice_start = block_row_start * th->blocksize; + int slice_end = block_row_end * th->blocksize; + + th->calc_vca_slice( + th->stride, th->src, th->plane, th->result, + th->enable_lowpass, slice_start, slice_end, + th->partial_sums[job], th->perform_dct + ); + + return 0; +} + +static void perform_vca(AVFilterContext *ctx, AVFilterLink *inlink, AVFrame *in, FilterLink *inl , + VCAContext *v, int plane_i, double* h, uint32_t* E, uint32_t* L){ + VCAPlaneInfo* plane = v->plane[plane_i]; + + int stride = in->linesize[plane_i] / plane->pxl_depth; + int nb_threads = ff_filter_get_nb_threads(ctx); + void* calc_vca_slice; + + ResultSums** partial_sums = av_calloc(nb_threads, sizeof(ResultSums*)); + for(int j = 0; j < 
nb_threads; j++) + partial_sums[j] = av_calloc(1, sizeof(ResultSums)); + + if (v->n_frames_processed == 0) + calc_vca_slice = v->calc_vca_slice_isnf0; + else + calc_vca_slice = v->calc_vca_slice_isnf1; + + ThreadData th = { + .stride = stride, + .blocksize = v->blocksize, + .enable_lowpass = v->enable_lowpass, + .src = in->data[plane_i], + .plane = plane, + .result = v->result[plane_i], + .partial_sums = partial_sums, + .perform_dct = v->perform_dct, + .calc_vca_slice = calc_vca_slice, + }; + + ff_filter_execute(ctx, calc_energy_filter_slice, &th, NULL, FFMIN(plane->h_blocks, nb_threads)); + + for (int i = 0; i < nb_threads; i++){ + E[plane_i] += th.partial_sums[i]->E; + L[plane_i] += th.partial_sums[i]->L; + h[plane_i] += th.partial_sums[i]->h; + } + + E[plane_i] /= (plane->n_blocks * E_norm_factor); + L[plane_i] /= (plane->n_blocks); + h[plane_i] /= (plane->n_blocks * h_norm_factor); + + av_free(th.partial_sums); + + // At the end copy current energy to the previous + memcpy(v->result[plane_i]->energy_prev ,v->result[plane_i]->energy, v->plane[plane_i]->n_blocks * sizeof(uint32_t)); +} + +static int filter_frame(AVFilterLink *inlink, AVFrame *in) +{ + AVFilterContext *ctx = inlink->dst; + VCAContext *v = ctx->priv; + FilterLink *inl = ff_filter_link(inlink); + int planes = v->enable_chroma ? 
3 : 1; + + if (v->n_frames_processed >= v->n_frames && v->n_frames != -1) + return ff_filter_frame(inlink->dst->outputs[0], in); + + uint32_t E[3] = {0,0,0}; + uint32_t L[3] = {0.0,0.0,0.0}; + double h[3] = {0,0,0}; + + for(int i = 0; i < planes; i++){ + if (v->plane[i]->bit_depth != 8 + && v->plane[i]->bit_depth != 10 + && v->plane[i]->bit_depth != 12) + return AVERROR(AVERROR_INVALIDDATA); + perform_vca(ctx, inlink, in, inl, v, i, h, E, L); + } + + v->n_frames_processed++; + + // Dump info; + if (v->yuview) { + int block_i = 0; + for (unsigned y = 0; y < v->plane[0]->h_blocks; y ++) { + for (unsigned x = 0; x < v->plane[0]->w_blocks; x ++) { + v->print(ctx, AV_LOG_INFO, "%d;%d;%d;%d;%d;%d;%d\n", inl->frame_count_out, + x * v->blocksize, y * v->blocksize, v->blocksize, v->blocksize, + 0, v->result[0]->energy_prev[block_i]); + block_i++; + } + } + block_i = 0; + for (unsigned y = 0; y < v->plane[0]->h_blocks; y ++) { + for (unsigned x = 0; x < v->plane[0]->w_blocks; x ++) { + v->print(ctx, AV_LOG_INFO, "%d;%d;%d;%d;%d;%d;%.0f\n", inl->frame_count_out, + x * v->blocksize, y * v->blocksize, v->blocksize, v->blocksize, + 1, v->result[0]->energy_dif[block_i]); + block_i++; + } + } + } else { + v->print(ctx, AV_LOG_INFO, + "%4"PRId64, + inl->frame_count_out); + v->print(ctx, AV_LOG_INFO, + ",%d,%f", + E[0], h[0]); + if(v->enable_brightness) + v->print(ctx, AV_LOG_INFO,",%d",L[0]); + if (v->enable_chroma) { + v->print(ctx, AV_LOG_INFO, + ",%d,%f", + E[1], h[1]); + v->print(ctx, AV_LOG_INFO, + ",%d,%f", + E[2], h[2]); + if(v->enable_brightness) + v->print(ctx, AV_LOG_INFO,",%d,%d",L[1],L[2]); + } + } + + v->print(ctx, AV_LOG_INFO, "\n"); + return ff_filter_frame(inlink->dst->outputs[0], in); +} + +static int config_input(AVFilterLink *inlink) +{ + AVFilterContext *ctx = inlink->dst; + VCAContext *v = ctx->priv; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format); + int max_pixsteps[4]; + int planes; + + v->plane[0]->w_pxls_src = inlink->w; + 
v->plane[0]->h_pxls_src = inlink->h; + + if (v->enable_chroma){ + v->plane[1]->w_pxls_src = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w); + v->plane[1]->h_pxls_src = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h); + + v->plane[2]->w_pxls_src = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w); + v->plane[2]->h_pxls_src = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h); + + planes = 3; + } else + planes = 1; + + for(int i = 0; i < planes; i++){ + // Inference of bit depth + v->plane[i]->bit_depth = desc->comp[i].depth; + // Inference of pixel depth + av_image_fill_max_pixsteps(max_pixsteps, NULL, desc); + v->plane[i]->pxl_depth = max_pixsteps[i]; + + v->plane[i]->w_blocks = (v->plane[i]->w_pxls_src + v->blocksize - 1) / v->blocksize; + v->plane[i]->h_blocks = (v->plane[i]->h_pxls_src + v->blocksize - 1) / v->blocksize; + + v->plane[i]->n_blocks = v->plane[i]->w_blocks * v->plane[i]->h_blocks; + + v->plane[i]->w_pxls = v->plane[i]->w_blocks * v->blocksize; + v->plane[i]->h_pxls = v->plane[i]->h_blocks * v->blocksize; + + // Free previous buffers in case they are allocated already + av_freep(&v->result[i]->energy_prev); + av_freep(&v->result[i]->energy_dif); + av_freep(&v->result[i]->energy); + av_freep(&v->result[i]->brightness); + + v->result[i]->energy = av_malloc(v->plane[i]->n_blocks * sizeof(uint32_t)); + v->result[i]->energy_prev = av_malloc(v->plane[i]->n_blocks * sizeof(uint32_t)); + if (!v->result[i]->energy || ! 
v->result[i]->energy_prev) + return AVERROR(ENOMEM); + + if(v->yuview){ + v->result[i]->energy_dif = av_malloc(v->plane[i]->n_blocks * sizeof(double)); + v->result[i]->brightness = av_malloc(v->plane[i]->n_blocks * sizeof(uint32_t)); + if (!v->result[i]->energy_dif || !v->result[i]->brightness) + return AVERROR(ENOMEM); + } + } + + if (v->yuview) { + v->print(ctx, AV_LOG_INFO, "%%;%%;Written by VCA for YUView\n"); + v->print(ctx, AV_LOG_INFO, "%%;syntax-version;v1.22\n"); + v->print(ctx, AV_LOG_INFO, "%%;%%;POC;X-position of the left top pixel in the block;Y-position of the left top pixel in the block;"); + v->print(ctx, AV_LOG_INFO, "Width of the block;Height of the block; Type-ID;Type specific value\n"); + v->print(ctx, AV_LOG_INFO, "%%;seq-specs;%s;%s;%d;%d;%d\n", "file", "layer0", v->plane[0]->w_pxls, v->plane[0]->h_pxls, 24); + v->print(ctx, AV_LOG_INFO, "%%;type;0;BlockEnergy;range\n"); + v->print(ctx, AV_LOG_INFO, "%%;defaultRange;0;10000;heat\n"); + v->print(ctx, AV_LOG_INFO, "%%;type;1;TempEnergyDiff;range\n"); + v->print(ctx, AV_LOG_INFO, "%%;defaultRange;0;3000;heat\n"); + } else { + v->print(ctx, AV_LOG_INFO, "POC,E,h"); + if(v->enable_brightness) + v->print(ctx, AV_LOG_INFO, ",L"); + if (v->enable_chroma) + v->print(ctx, AV_LOG_INFO, ",EV,LV,hV,EU,LU,hE"); + if(v->enable_brightness && v->enable_chroma) + v->print(ctx, AV_LOG_INFO, ",LV,LU"); + + v->print(ctx, AV_LOG_INFO, "\n"); + } + + av_log(ctx, AV_LOG_INFO, "threads: %d\n", ff_filter_get_nb_threads(ctx)); + + return 0; +} + +static av_cold int init(AVFilterContext *ctx) +{ + // User options but no input data + VCAContext *v = ctx->priv; + int ret; + int planes = v->enable_chroma ? 
3 : 1; + + // allocate arrays of pointers + v->result = av_calloc(planes, sizeof(*v->result)); + v->plane = av_calloc(planes, sizeof(*v->plane)); + if (!v->result || !v->plane) + return AVERROR(ENOMEM); + + // allocate each plane/result struct + for (int i = 0; i < planes; i++) { + v->result[i] = av_mallocz(sizeof(*v->result[i])); + v->plane[i] = av_mallocz(sizeof(*v->plane[i])); + + if (!v->result[i] || !v->plane[i]) + return AVERROR(ENOMEM); + } + + v->n_frames_processed = 0; + + if (v->file_str) { + v->print = print_file; + } else { + v->print = print_log; + } + + if (v->enable_lowpass) { + switch (v->blocksize) { + case 32: v->perform_dct = ff_vca_lowpass_dct32_c; break; + case 16: v->perform_dct = ff_vca_lowpass_dct16_c; break; + case 8: v->perform_dct = ff_vca_lowpass_dct8_c; break; + default: + av_log(ctx, AV_LOG_ERROR, "Unallowed blocksize: %d\n", v->blocksize); + return AVERROR(AVERROR_INVALIDDATA); + } + } else { + switch (v->blocksize) { + case 32: v->perform_dct = ff_vca_dct32_c; break; + case 16: v->perform_dct = ff_vca_dct16_c; break; + case 8: v->perform_dct = ff_vca_dct8_c; break; + default: + av_log(ctx, AV_LOG_ERROR, "Unallowed blocksize: %d\n", v->blocksize); + return AVERROR(AVERROR_INVALIDDATA); + } + } + + if (v->enable_simd) { + #if ARCH_X86 && HAVE_X86ASM + ret = ff_vca_dct_init_x86(v); + if (ret != 0) { + return ret; + } + #endif + } + + int b = 0; + switch (v->blocksize) { + case 8: b = 0; break; + case 16: b = 1; break; + case 32: b = 2; break; + default: + av_log(ctx, AV_LOG_ERROR, "Unallowed blocksize: %d\n", v->blocksize); + return AVERROR(AVERROR_INVALIDDATA); + } + + v->calc_vca_slice_isnf0 = calc_fn_table[b][v->enable_brightness][v->yuview][0]; + v->calc_vca_slice_isnf1 = calc_fn_table[b][v->enable_brightness][v->yuview][1]; + + v->avio_context = NULL; + if (v->file_str) { + ret = avio_open(&v->avio_context, v->file_str, AVIO_FLAG_WRITE); + + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n", + v->file_str, 
av_err2str(ret)); + return ret; + } + } + return 0; +} + +static av_cold void uninit(AVFilterContext *ctx) +{ + VCAContext *v = ctx->priv; + int planes = v->enable_chroma ? 3 : 1; + + for(int plane = 0; plane < planes; plane++) { + av_freep(&v->result[plane]->energy); + av_freep(&v->result[plane]->energy_prev); + av_freep(&v->result[plane]->energy_dif); + av_freep(&v->result[plane]->brightness); + } + + if (v->avio_context) { + avio_closep(&v->avio_context); + } +} + +static const AVFilterPad avfilter_vf_vca_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = filter_frame, + .config_props = config_input, + }, +}; + +const FFFilter ff_vf_vca = { + .p.name = "vca", + .p.description = NULL_IF_CONFIG_SMALL("Perform VCA analysis."), + .p.priv_class = &vca_class, + .p.flags = AVFILTER_FLAG_SLICE_THREADS, + .priv_size = sizeof(VCAContext), + .init = init, + .uninit = uninit, + FILTER_PIXFMTS_ARRAY(pxl_fmts), + FILTER_INPUTS(avfilter_vf_vca_inputs), + FILTER_OUTPUTS(ff_video_default_filterpad), +}; \ No newline at end of file diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index ade0efc9ae..7711da416b 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -64,6 +64,7 @@ X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o \ x86/vf_tinterlace_init.o X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose.o \ x86/vf_transpose_init.o +X86ASM-OBJS-$(CONFIG_VCA_FILTER) += x86/vf_vca.o x86/vf_vca_init.o X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o x86/af_volume_init.o X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o x86/vf_v360_init.o X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o x86/vf_w3fdif_init.o diff --git a/libavfilter/x86/vf_vca.asm b/libavfilter/x86/vf_vca.asm new file mode 100644 index 0000000000..c121f2680c --- /dev/null +++ b/libavfilter/x86/vf_vca.asm @@ -0,0 +1,877 @@ +;***************************************************************************** +;* x86-optimized 
functions for DCT of VCA filter +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + + +;TO-DO : Further optimize the routines. + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 64 + +dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 + +tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64 + dw 89, 75, 50, 18, -18, -50, -75, -89 + dw 83, 36, -36, -83, -83, -36, 36, 83 + dw 75, -18, -89, -50, 50, 89, 18, -75 + dw 64, -64, -64, 64, 64, -64, -64, 64 + dw 50, -89, 18, 75, -75, -18, 89, -50 + dw 36, -83, 83, -36, -36, 83, -83, 36 + dw 18, -50, 75, -89, 89, -75, 50, -18 + +tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64 + dw 90, 87, 80, 70, 57, 43, 25, 9 + dw 89, 75, 50, 18, -18, -50, -75, -89 + dw 87, 57, 9, -43, -80, -90, -70, -25 + dw 83, 36, -36, -83, -83, -36, 36, 83 + dw 80, 9, -70, -87, -25, 57, 90, 43 + dw 75, -18, -89, -50, 50, 89, 18, -75 + dw 70, -43, -87, 9, 90, 25, -80, -57 + dw 64, -64, -64, 64, 64, -64, -64, 64 + dw 57, -80, -25, 90, -9, -87, 43, 70 + dw 50, -89, 18, 75, -75, -18, 89, -50 + dw 43, -90, 57, 25, -87, 70, 9, -80 + dw 36, -83, 83, -36, -36, 83, -83, 36 + dw 25, -70, 90, -80, 43, 9, -57, 87 + dw 18, -50, 75, -89, 89, -75, 50, -18 + dw 
9, -25, 43, -57, 70, -80, 87, -90 + +tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64 + dw -9, -25, -43, -57, -70, -80, -87, -90 + dw -89, -75, -50, -18, 18, 50, 75, 89 + dw 25, 70, 90, 80, 43, -9, -57, -87 + dw 83, 36, -36, -83, -83, -36, 36, 83 + dw -43, -90, -57, 25, 87, 70, -9, -80 + dw -75, 18, 89, 50, -50, -89, -18, 75 + dw 57, 80, -25, -90, -9, 87, 43, -70 + dw 64, -64, -64, 64, 64, -64, -64, 64 + dw -70, -43, 87, 9, -90, 25, 80, -57 + dw -50, 89, -18, -75, 75, 18, -89, 50 + dw 80, -9, -70, 87, -25, -57, 90, -43 + dw 36, -83, 83, -36, -36, 83, -83, 36 + dw -87, 57, -9, -43, 80, -90, 70, -25 + dw -18, 50, -75, 89, -89, 75, -50, 18 + dw 90, -87, 80, -70, 57, -43, 25, -9 + +dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 + +dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9 + +tab_dct32_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 + dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 + dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 + dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 + dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 + dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87 + dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31 + dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 + dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38 + dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80 + dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46 + dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 + dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54 + dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70 + dw 67, -54, -78, 38, 85, 
-22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61 + dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 + dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67 + dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57 + dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73 + dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 + dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78 + dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43 + dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82 + dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 + dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85 + dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25 + dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88 + dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 + dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90 + dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 + dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 + +tab_dct32_2: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 + dw -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 + dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 + dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 + dw -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 + dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 + dw 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 + dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 + dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, 
-22, -82 + dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 + dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 + dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 + dw -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 + dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 + dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 + dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 + dw -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 + dw -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 + dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 + dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 + dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 + dw -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 + dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 + dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 + dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 + dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 + dw 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 + dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 + dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 + dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 + dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 + +tab_dct4: times 4 dw 64, 64 + times 4 dw 83, 36 + times 4 dw 64, -64 + times 4 dw 36, -83 + +tab_dct8_1: times 2 dw 89, 50, 75, 18 + times 2 dw 75, -89, -18, -50 + times 2 dw 50, 18, -89, 75 + times 2 dw 18, 75, -50, -89 + +tab_dct8_2: times 2 dd 83, 36 + times 2 dd 36, 83 + times 1 dd 89, 75, 50, 18 + times 1 dd 75, -18, -89, -50 + 
times 1 dd 50, -89, 18, 75 + times 1 dd 18, -50, 75, -89 + +pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15 + +SECTION .text +const pd_2, times 8 dd 2 +const pd_4, times 4 dd 4 +const pd_8, times 4 dd 8 +const pd_128, times 4 dd 128 +const pd_256, times 4 dd 256 +const pd_512, times 4 dd 512 +const pd_1024, times 4 dd 1024 +const pw_ppppmmmm, times 1 dw 1, 1, 1, 1, -1, -1, -1, -1 +const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7 + +cextern pd_16 +cextern pd_32 +cextern pd_64 + +%macro DCT_CONSTS 1 +%if %1 == 12 + %define DCT4_SHIFT 5 + %define DCT4_ROUND 16 + %define IDCT_SHIFT 8 + %define IDCT_ROUND 128 + %define DST4_SHIFT 5 + %define DST4_ROUND 16 + %define DCT8_SHIFT1 6 + %define DCT8_ROUND1 32 +%elif %1 == 10 + %define DCT4_SHIFT 3 + %define DCT4_ROUND 4 + %define IDCT_SHIFT 10 + %define IDCT_ROUND 512 + %define DST4_SHIFT 3 + %define DST4_ROUND 4 + %define DCT8_SHIFT1 4 + %define DCT8_ROUND1 8 +%elif %1 == 8 + %define DCT4_SHIFT 1 + %define DCT4_ROUND 1 + %define IDCT_SHIFT 12 + %define IDCT_ROUND 2048 + %define DST4_SHIFT 1 + %define DST4_ROUND 1 + %define DCT8_SHIFT1 2 + %define DCT8_ROUND1 2 +%else + %error Unsupported BIT_DEPTH! 
+%endif +%endmacro + +%define DCT8_ROUND2 256 +%define DCT8_SHIFT2 9 + +%if ARCH_X86_64 == 1 +%macro DCT8_PASS_1 4 + vpbroadcastq m0, [r6 + %1] + pmaddwd m2, m%3, m0 + pmaddwd m0, m%4 + phaddd m2, m0 + paddd m2, m5 + psrad m2, DCT8_SHIFT1 + packssdw m2, m2 + vpermq m2, m2, 0x08 + mova [r5 + %2], xm2 +%endmacro + +%macro DCT8_PASS_2 2 + vbroadcasti128 m4, [r6 + %1] + pmaddwd m6, m0, m4 + pmaddwd m7, m1, m4 + pmaddwd m8, m2, m4 + pmaddwd m9, m3, m4 + phaddd m6, m7 + phaddd m8, m9 + phaddd m6, m8 + paddd m6, m5 + psrad m6, DCT8_SHIFT2 + + vbroadcasti128 m4, [r6 + %2] + pmaddwd m10, m0, m4 + pmaddwd m7, m1, m4 + pmaddwd m8, m2, m4 + pmaddwd m9, m3, m4 + phaddd m10, m7 + phaddd m8, m9 + phaddd m10, m8 + paddd m10, m5 + psrad m10, DCT8_SHIFT2 + + packssdw m6, m10 + vpermq m10, m6, 0xD8 + +%endmacro + +%macro VCA_DCT8 1 + +INIT_YMM avx2 +%if %1 == 12 +cglobal dct8_12bit, 3, 7, 11, 0-8*16 +%elif %1 == 10 +cglobal dct8_10bit, 3, 7, 11, 0-8*16 +%elif %1 == 8 +cglobal dct8_8bit, 3, 7, 11, 0-8*16 +%else + %error Unsupported BIT_DEPTH! 
+%endif + +vbroadcasti128 m5, [pd_ %+ DCT8_ROUND1] +%define DCT_SHIFT2 9 + + add r2d, r2d + lea r3, [r2 * 3] + lea r4, [r0 + r2 * 4] + mov r5, rsp + lea r6, [tab_dct8] + mova m6, [dct8_shuf] + + ;pass1 + mova xm0, [r0] + vinserti128 m0, m0, [r4], 1 + mova xm1, [r0 + r2] + vinserti128 m1, m1, [r4 + r2], 1 + mova xm2, [r0 + r2 * 2] + vinserti128 m2, m2, [r4 + r2 * 2], 1 + mova xm3, [r0 + r3] + vinserti128 m3, m3, [r4 + r3], 1 + + punpcklqdq m4, m0, m1 + punpckhqdq m0, m1 + punpcklqdq m1, m2, m3 + punpckhqdq m2, m3 + + pshufb m0, m6 + pshufb m2, m6 + + paddw m3, m4, m0 + paddw m7, m1, m2 + + psubw m4, m0 + psubw m1, m2 + + DCT8_PASS_1 0 * 16, 0 * 16, 3, 7 + DCT8_PASS_1 1 * 16, 2 * 16, 4, 1 + DCT8_PASS_1 2 * 16, 4 * 16, 3, 7 + DCT8_PASS_1 3 * 16, 6 * 16, 4, 1 + DCT8_PASS_1 4 * 16, 1 * 16, 3, 7 + DCT8_PASS_1 5 * 16, 3 * 16, 4, 1 + DCT8_PASS_1 6 * 16, 5 * 16, 3, 7 + DCT8_PASS_1 7 * 16, 7 * 16, 4, 1 + + ;pass2 + vbroadcasti128 m5, [pd_ %+ DCT8_ROUND2] + + mova m0, [r5] + mova m1, [r5 + 32] + mova m2, [r5 + 64] + mova m3, [r5 + 96] + + DCT8_PASS_2 0 * 16, 1 * 16 + movu [r1], m10 + DCT8_PASS_2 2 * 16, 3 * 16 + movu [r1 + 32], m10 + DCT8_PASS_2 4 * 16, 5 * 16 + movu [r1 + 64], m10 + DCT8_PASS_2 6 * 16, 7 * 16 + movu [r1 + 96], m10 + RET +%endmacro + +%macro DCT16_PASS_1_E 2 + vpbroadcastq m7, [r7 + %1] + + pmaddwd m4, m0, m7 + pmaddwd m6, m2, m7 + phaddd m4, m6 + + paddd m4, m9 + psrad m4, DCT_SHIFT + + packssdw m4, m4 + vpermq m4, m4, 0x08 + + mova [r5 + %2], xm4 +%endmacro + +%macro DCT16_PASS_1_O 2 + vbroadcasti128 m7, [r7 + %1] + + pmaddwd m10, m0, m7 + pmaddwd m11, m2, m7 + phaddd m10, m11 ; [d0 d0 d1 d1 d4 d4 d5 d5] + + pmaddwd m11, m4, m7 + pmaddwd m12, m6, m7 + phaddd m11, m12 ; [d2 d2 d3 d3 d6 d6 d7 d7] + + phaddd m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7] + + paddd m10, m9 + psrad m10, DCT_SHIFT + + packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -] + vpermq m10, m10, 0x08 + + mova [r5 + %2], xm10 +%endmacro + +%macro DCT16_PASS_2 2 + vbroadcasti128 m8, [r7 + 
%1] + vbroadcasti128 m13, [r8 + %1] + + pmaddwd m10, m0, m8 + pmaddwd m11, m1, m13 + paddd m10, m11 + + pmaddwd m11, m2, m8 + pmaddwd m12, m3, m13 + paddd m11, m12 + phaddd m10, m11 + + pmaddwd m11, m4, m8 + pmaddwd m12, m5, m13 + paddd m11, m12 + + pmaddwd m12, m6, m8 + pmaddwd m13, m7, m13 + paddd m12, m13 + phaddd m11, m12 + + phaddd m10, m11 + paddd m10, m9 + psrad m10, DCT_SHIFT2 + + + vbroadcasti128 m8, [r7 + %2] + vbroadcasti128 m13, [r8 + %2] + + pmaddwd m14, m0, m8 + pmaddwd m11, m1, m13 + paddd m14, m11 + + pmaddwd m11, m2, m8 + pmaddwd m12, m3, m13 + paddd m11, m12 + phaddd m14, m11 + + pmaddwd m11, m4, m8 + pmaddwd m12, m5, m13 + paddd m11, m12 + + pmaddwd m12, m6, m8 + pmaddwd m13, m7, m13 + paddd m12, m13 + phaddd m11, m12 + + phaddd m14, m11 + paddd m14, m9 + psrad m14, DCT_SHIFT2 + + packssdw m10, m14 + vextracti128 xm14, m10, 1 + movlhps xm15, xm10, xm14 + movhlps xm14, xm10 +%endmacro + +%macro VCA_DCT16 1 + +INIT_YMM avx2 +%if %1 == 12 +cglobal dct16_12bit, 3, 9, 16, 0-16*mmsize + %define DCT_SHIFT 7 + vbroadcasti128 m9, [pd_64] +%elif %1 == 10 +cglobal dct16_10bit, 3, 9, 16, 0-16*mmsize + %define DCT_SHIFT 5 + vbroadcasti128 m9, [pd_16] +%elif %1 == 8 +cglobal dct16_8bit, 3, 9, 16, 0-16*mmsize + %define DCT_SHIFT 3 + vbroadcasti128 m9, [pd_4] +%else + %error Unsupported BIT_DEPTH! 
+%endif + +%define DCT_SHIFT2 10 + + add r2d, r2d + + mova m13, [dct16_shuf1] + mova m14, [dct16_shuf2] + lea r7, [tab_dct16_1 + 8 * 16] + lea r8, [tab_dct16_2 + 8 * 16] + lea r3, [r2 * 3] + mov r5, rsp + mov r4d, 2 ; Each iteration process 8 rows, so 16/8 iterations + +.pass1: + lea r6, [r0 + r2 * 4] + + movu m2, [r0] + movu m1, [r6] + vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo] + vperm2i128 m1, m2, m1, 0x31 ; [row0hi row4hi] + + movu m4, [r0 + r2] + movu m3, [r6 + r2] + vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo] + vperm2i128 m3, m4, m3, 0x31 ; [row1hi row5hi] + + movu m6, [r0 + r2 * 2] + movu m5, [r6 + r2 * 2] + vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo] + vperm2i128 m5, m6, m5, 0x31 ; [row2hi row6hi] + + movu m8, [r0 + r3] + movu m7, [r6 + r3] + vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo] + vperm2i128 m7, m8, m7, 0x31 ; [row3hi row7hi] + + pshufb m1, m13 + pshufb m3, m13 + pshufb m5, m13 + pshufb m7, m13 + + paddw m8, m0, m1 ;E + psubw m0, m1 ;O + + paddw m1, m2, m3 ;E + psubw m2, m3 ;O + + paddw m3, m4, m5 ;E + psubw m4, m5 ;O + + paddw m5, m6, m7 ;E + psubw m6, m7 ;O + + DCT16_PASS_1_O -7 * 16, 1 * 32 + DCT16_PASS_1_O -5 * 16, 3 * 32 + DCT16_PASS_1_O -3 * 16, 1 * 32 + 16 + DCT16_PASS_1_O -1 * 16, 3 * 32 + 16 + DCT16_PASS_1_O 1 * 16, 5 * 32 + DCT16_PASS_1_O 3 * 16, 7 * 32 + DCT16_PASS_1_O 5 * 16, 5 * 32 + 16 + DCT16_PASS_1_O 7 * 16, 7 * 32 + 16 + + pshufb m8, m14 + pshufb m1, m14 + phaddw m0, m8, m1 + + pshufb m3, m14 + pshufb m5, m14 + phaddw m2, m3, m5 + + DCT16_PASS_1_E -8 * 16, 0 * 32 + DCT16_PASS_1_E -4 * 16, 0 * 32 + 16 + DCT16_PASS_1_E 0 * 16, 4 * 32 + DCT16_PASS_1_E 4 * 16, 4 * 32 + 16 + + phsubw m0, m8, m1 + phsubw m2, m3, m5 + + DCT16_PASS_1_E -6 * 16, 2 * 32 + DCT16_PASS_1_E -2 * 16, 2 * 32 + 16 + DCT16_PASS_1_E 2 * 16, 6 * 32 + DCT16_PASS_1_E 6 * 16, 6 * 32 + 16 + + lea r0, [r0 + 8 * r2] + add r5, 256 + + dec r4d + jnz .pass1 + + mov r5, rsp + mov r4d, 2 + mov r2d, 32 + lea r3, [r2 * 3] + vbroadcasti128 m9, [pd_512] + +.pass2: + mova m0, 
[r5 + 0 * 32] ; [row0lo row4lo] + mova m1, [r5 + 8 * 32] ; [row0hi row4hi] + + mova m2, [r5 + 1 * 32] ; [row1lo row5lo] + mova m3, [r5 + 9 * 32] ; [row1hi row5hi] + + mova m4, [r5 + 2 * 32] ; [row2lo row6lo] + mova m5, [r5 + 10 * 32] ; [row2hi row6hi] + + mova m6, [r5 + 3 * 32] ; [row3lo row7lo] + mova m7, [r5 + 11 * 32] ; [row3hi row7hi] + + DCT16_PASS_2 -8 * 16, -7 * 16 + movu [r1], xm15 + movu [r1 + r2], xm14 + + DCT16_PASS_2 -6 * 16, -5 * 16 + movu [r1 + r2 * 2], xm15 + movu [r1 + r3], xm14 + + lea r6, [r1 + r2 * 4] + DCT16_PASS_2 -4 * 16, -3 * 16 + movu [r6], xm15 + movu [r6 + r2], xm14 + + DCT16_PASS_2 -2 * 16, -1 * 16 + movu [r6 + r2 * 2], xm15 + movu [r6 + r3], xm14 + + lea r6, [r6 + r2 * 4] + DCT16_PASS_2 0 * 16, 1 * 16 + movu [r6], xm15 + movu [r6 + r2], xm14 + + DCT16_PASS_2 2 * 16, 3 * 16 + movu [r6 + r2 * 2], xm15 + movu [r6 + r3], xm14 + + lea r6, [r6 + r2 * 4] + DCT16_PASS_2 4 * 16, 5 * 16 + movu [r6], xm15 + movu [r6 + r2], xm14 + + DCT16_PASS_2 6 * 16, 7 * 16 + movu [r6 + r2 * 2], xm15 + movu [r6 + r3], xm14 + + add r1, 16 + add r5, 128 + + dec r4d + jnz .pass2 + RET +%endmacro + +%macro DCT32_PASS_1 4 + vbroadcasti128 m8, [r7 + %1] + pmaddwd m11, m%3, m8 + pmaddwd m12, m%4, m8 + phaddd m11, m12 + + vbroadcasti128 m8, [r7 + %1 + 32] + vbroadcasti128 m10, [r7 + %1 + 48] + pmaddwd m12, m5, m8 + pmaddwd m13, m6, m10 + phaddd m12, m13 + + pmaddwd m13, m4, m8 + pmaddwd m14, m7, m10 + phaddd m13, m14 + + phaddd m12, m13 + + phaddd m11, m12 + paddd m11, m9 + psrad m11, DCT_SHIFT + + vpermq m11, m11, 0xD8 + packssdw m11, m11 + movq [r5 + %2], xm11 + vextracti128 xm10, m11, 1 + movq [r5 + %2 + 64], xm10 +%endmacro + +%macro DCT32_PASS_2 1 + mova m8, [r7 + %1] + mova m10, [r8 + %1] + pmaddwd m11, m0, m8 + pmaddwd m12, m1, m10 + paddd m11, m12 + + pmaddwd m12, m2, m8 + pmaddwd m13, m3, m10 + paddd m12, m13 + + phaddd m11, m12 + + pmaddwd m12, m4, m8 + pmaddwd m13, m5, m10 + paddd m12, m13 + + pmaddwd m13, m6, m8 + pmaddwd m14, m7, m10 + paddd m13, m14 + + 
phaddd m12, m13 + + phaddd m11, m12 + vextracti128 xm10, m11, 1 + paddd xm11, xm10 + + paddd xm11, xm9 + psrad xm11, DCT_SHIFT2 + packssdw xm11, xm11 + +%endmacro + +%macro VCA_DCT32 1 +INIT_YMM avx2 + +%if %1 == 12 +cglobal dct32_12bit, 3, 9, 16, 0-64*mmsize + %define DCT_SHIFT 8 + vpbroadcastq m9, [pd_128] +%elif %1 == 10 +cglobal dct32_10bit, 3, 9, 16, 0-64*mmsize + %define DCT_SHIFT 6 + vpbroadcastq m9, [pd_32] +%elif %1 == 8 +cglobal dct32_8bit, 3, 9, 16, 0-64*mmsize + %define DCT_SHIFT 4 + vpbroadcastq m9, [pd_8] +%else + %error Unsupported BIT_DEPTH! +%endif + +%define DCT_SHIFT2 11 + + add r2d, r2d + + lea r7, [tab_dct32_1] + lea r8, [tab_dct32_2] + lea r3, [r2 * 3] + mov r5, rsp + mov r4d, 8 + mova m15, [dct16_shuf1] + +.pass1: + movu m2, [r0] + movu m1, [r0 + 32] + pshufb m1, m15 + vpermq m1, m1, 0x4E + psubw m7, m2, m1 + paddw m2, m1 + + movu m1, [r0 + r2 * 2] + movu m0, [r0 + r2 * 2 + 32] + pshufb m0, m15 + vpermq m0, m0, 0x4E + psubw m8, m1, m0 + paddw m1, m0 + vperm2i128 m0, m2, m1, 0x20 ; [row0lo row2lo] for E + vperm2i128 m3, m2, m1, 0x31 ; [row0hi row2hi] for E + pshufb m3, m15 + psubw m1, m0, m3 + paddw m0, m3 + + vperm2i128 m5, m7, m8, 0x20 ; [row0lo row2lo] for O + vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O + + + movu m4, [r0 + r2] + movu m2, [r0 + r2 + 32] + pshufb m2, m15 + vpermq m2, m2, 0x4E + psubw m10, m4, m2 + paddw m4, m2 + + movu m3, [r0 + r3] + movu m2, [r0 + r3 + 32] + pshufb m2, m15 + vpermq m2, m2, 0x4E + psubw m11, m3, m2 + paddw m3, m2 + vperm2i128 m2, m4, m3, 0x20 ; [row1lo row3lo] for E + vperm2i128 m8, m4, m3, 0x31 ; [row1hi row3hi] for E + pshufb m8, m15 + psubw m3, m2, m8 + paddw m2, m8 + + vperm2i128 m4, m10, m11, 0x20 ; [row1lo row3lo] for O + vperm2i128 m7, m10, m11, 0x31 ; [row1hi row3hi] for O + + + DCT32_PASS_1 0 * 32, 0 * 64, 0, 2 + DCT32_PASS_1 2 * 32, 2 * 64, 1, 3 + DCT32_PASS_1 4 * 32, 4 * 64, 0, 2 + DCT32_PASS_1 6 * 32, 6 * 64, 1, 3 + DCT32_PASS_1 8 * 32, 8 * 64, 0, 2 + DCT32_PASS_1 10 * 32, 10 * 64, 1, 3 
+ DCT32_PASS_1 12 * 32, 12 * 64, 0, 2 + DCT32_PASS_1 14 * 32, 14 * 64, 1, 3 + DCT32_PASS_1 16 * 32, 16 * 64, 0, 2 + DCT32_PASS_1 18 * 32, 18 * 64, 1, 3 + DCT32_PASS_1 20 * 32, 20 * 64, 0, 2 + DCT32_PASS_1 22 * 32, 22 * 64, 1, 3 + DCT32_PASS_1 24 * 32, 24 * 64, 0, 2 + DCT32_PASS_1 26 * 32, 26 * 64, 1, 3 + DCT32_PASS_1 28 * 32, 28 * 64, 0, 2 + DCT32_PASS_1 30 * 32, 30 * 64, 1, 3 + + add r5, 8 + lea r0, [r0 + r2 * 4] + + dec r4d + jnz .pass1 + + mov r2d, 64 + lea r3, [r2 * 3] + mov r5, rsp + mov r4d, 8 + vpbroadcastq m9, [pd_1024] + +.pass2: + mova m0, [r5 + 0 * 64] + mova m1, [r5 + 0 * 64 + 32] + + mova m2, [r5 + 1 * 64] + mova m3, [r5 + 1 * 64 + 32] + + mova m4, [r5 + 2 * 64] + mova m5, [r5 + 2 * 64 + 32] + + mova m6, [r5 + 3 * 64] + mova m7, [r5 + 3 * 64 + 32] + + DCT32_PASS_2 0 * 32 + movq [r1], xm11 + DCT32_PASS_2 1 * 32 + movq [r1 + r2], xm11 + DCT32_PASS_2 2 * 32 + movq [r1 + r2 * 2], xm11 + DCT32_PASS_2 3 * 32 + movq [r1 + r3], xm11 + + lea r6, [r1 + r2 * 4] + DCT32_PASS_2 4 * 32 + movq [r6], xm11 + DCT32_PASS_2 5 * 32 + movq [r6 + r2], xm11 + DCT32_PASS_2 6 * 32 + movq [r6 + r2 * 2], xm11 + DCT32_PASS_2 7 * 32 + movq [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 8 * 32 + movq [r6], xm11 + DCT32_PASS_2 9 * 32 + movq [r6 + r2], xm11 + DCT32_PASS_2 10 * 32 + movq [r6 + r2 * 2], xm11 + DCT32_PASS_2 11 * 32 + movq [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 12 * 32 + movq [r6], xm11 + DCT32_PASS_2 13 * 32 + movq [r6 + r2], xm11 + DCT32_PASS_2 14 * 32 + movq [r6 + r2 * 2], xm11 + DCT32_PASS_2 15 * 32 + movq [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 16 * 32 + movq [r6], xm11 + DCT32_PASS_2 17 * 32 + movq [r6 + r2], xm11 + DCT32_PASS_2 18 * 32 + movq [r6 + r2 * 2], xm11 + DCT32_PASS_2 19 * 32 + movq [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 20 * 32 + movq [r6], xm11 + DCT32_PASS_2 21 * 32 + movq [r6 + r2], xm11 + DCT32_PASS_2 22 * 32 + movq [r6 + r2 * 2], xm11 + DCT32_PASS_2 23 * 32 + movq [r6 + r3], xm11 + + lea 
r6, [r6 + r2 * 4] + DCT32_PASS_2 24 * 32 + movq [r6], xm11 + DCT32_PASS_2 25 * 32 + movq [r6 + r2], xm11 + DCT32_PASS_2 26 * 32 + movq [r6 + r2 * 2], xm11 + DCT32_PASS_2 27 * 32 + movq [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 28 * 32 + movq [r6], xm11 + DCT32_PASS_2 29 * 32 + movq [r6 + r2], xm11 + DCT32_PASS_2 30 * 32 + movq [r6 + r2 * 2], xm11 + DCT32_PASS_2 31 * 32 + movq [r6 + r3], xm11 + + add r5, 256 + add r1, 8 + + dec r4d + jnz .pass2 + RET +%endmacro + +DCT_CONSTS 8 +VCA_DCT8 8 +VCA_DCT16 8 +VCA_DCT32 8 + +DCT_CONSTS 10 +VCA_DCT8 10 +VCA_DCT16 10 +VCA_DCT32 10 + +DCT_CONSTS 12 +VCA_DCT8 12 +VCA_DCT16 12 +VCA_DCT32 12 + +%endif diff --git a/libavfilter/x86/vf_vca_init.c b/libavfilter/x86/vf_vca_init.c new file mode 100644 index 0000000000..d19bfcc34b --- /dev/null +++ b/libavfilter/x86/vf_vca_init.c @@ -0,0 +1,210 @@ +/* + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "config.h"
+#include "libavcodec/x86/constants.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/error.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vca_dct.h"
+
+/*
+ * AVX2 forward DCT kernels implemented in vf_vca.asm.
+ *
+ * srcStride is expressed in int16_t elements; the assembly doubles it to
+ * obtain the byte stride. The kernels are only assembled for x86-64, so
+ * every reference to them below is guarded by ARCH_X86_64.
+ * The 8x8 kernel reads src with aligned 16-byte loads, so src has to be
+ * 16-byte aligned for it.
+ */
+void ff_dct8_8bit_avx2(const int16_t *src, int16_t *dst, intptr_t srcStride);
+void ff_dct8_10bit_avx2(const int16_t *src, int16_t *dst, intptr_t srcStride);
+void ff_dct8_12bit_avx2(const int16_t *src, int16_t *dst, intptr_t srcStride);
+
+void ff_dct16_8bit_avx2(const int16_t *src, int16_t *dst, intptr_t srcStride);
+void ff_dct16_10bit_avx2(const int16_t *src, int16_t *dst, intptr_t srcStride);
+void ff_dct16_12bit_avx2(const int16_t *src, int16_t *dst, intptr_t srcStride);
+
+void ff_dct32_8bit_avx2(const int16_t *src, int16_t *dst, intptr_t srcStride);
+void ff_dct32_10bit_avx2(const int16_t *src, int16_t *dst, intptr_t srcStride);
+void ff_dct32_12bit_avx2(const int16_t *src, int16_t *dst, intptr_t srcStride);
+
+#if HAVE_X86ASM && ARCH_X86_64
+
+/**
+ * 8x8 forward DCT: pick the AVX2 kernel matching bit_depth.
+ *
+ * @param src       8x8 input block, row stride 8
+ * @param dst       receives the 8x8 coefficients
+ * @param bit_depth 8, 10 or 12 use the assembly; any other value falls back
+ *                  to ff_vca_dct8_c()
+ */
+static void dct8_avx2(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    switch (bit_depth) {
+    case 8:
+        ff_dct8_8bit_avx2(src, dst, 8);
+        break;
+    case 10:
+        ff_dct8_10bit_avx2(src, dst, 8);
+        break;
+    case 12:
+        ff_dct8_12bit_avx2(src, dst, 8);
+        break;
+    default:
+        ff_vca_dct8_c(src, dst, bit_depth);
+        break;
+    }
+}
+
+/**
+ * 16x16 forward DCT: pick the AVX2 kernel matching bit_depth.
+ *
+ * @param src       16x16 input block, row stride 16
+ * @param dst       receives the 16x16 coefficients
+ * @param bit_depth 8, 10 or 12 use the assembly; any other value falls back
+ *                  to ff_vca_dct16_c()
+ */
+static void dct16_avx2(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    switch (bit_depth) {
+    case 8:
+        ff_dct16_8bit_avx2(src, dst, 16);
+        break;
+    case 10:
+        ff_dct16_10bit_avx2(src, dst, 16);
+        break;
+    case 12:
+        ff_dct16_12bit_avx2(src, dst, 16);
+        break;
+    default:
+        ff_vca_dct16_c(src, dst, bit_depth);
+        break;
+    }
+}
+
+/**
+ * 32x32 forward DCT: pick the AVX2 kernel matching bit_depth.
+ *
+ * @param src       32x32 input block, row stride 32
+ * @param dst       receives the 32x32 coefficients
+ * @param bit_depth 8, 10 or 12 use the assembly; any other value falls back
+ *                  to ff_vca_dct32_c()
+ */
+static void dct32_avx2(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    switch (bit_depth) {
+    case 8:
+        ff_dct32_8bit_avx2(src, dst, 32);
+        break;
+    case 10:
+        ff_dct32_10bit_avx2(src, dst, 32);
+        break;
+    case 12:
+        ff_dct32_12bit_avx2(src, dst, 32);
+        break;
+    default:
+        ff_vca_dct32_c(src, dst, bit_depth);
+        break;
+    }
+}
+
+/**
+ * Halve a size x size block in both directions by averaging each 2x2 group.
+ *
+ * @param src  source block, size * size samples, row stride size
+ * @param dst  receives (size / 2) * (size / 2) averages, row stride size / 2
+ * @param size width and height of the source block (16 or 32 here)
+ * @return sum of all source samples
+ */
+static int32_t lowpass_average_2x2(const int16_t *src, int16_t *dst, int size)
+{
+    const int half = size / 2;
+    int32_t total  = 0;
+
+    for (int i = 0; i < half; i++) {
+        const int16_t *top    = src + 2 * i * size;
+        const int16_t *bottom = top + size;
+
+        for (int j = 0; j < half; j++) {
+            /* Accumulate in int: four int16_t samples do not fit in int16_t
+             * in general. */
+            const int sum = top[2 * j]    + top[2 * j + 1] +
+                            bottom[2 * j] + bottom[2 * j + 1];
+
+            dst[i * half + j] = sum >> 2;
+            total            += sum;
+        }
+    }
+
+    return total;
+}
+
+/**
+ * Low-pass 16x16 DCT: transform the 2x2-averaged 8x8 block and store the
+ * result in the top-left 8x8 corner of dst; all other coefficients are zero.
+ *
+ * @param src       16x16 input block, row stride 16
+ * @param dst       receives 16x16 coefficients, row stride 16
+ * @param bit_depth forwarded to the 8x8 DCT dispatcher
+ */
+static void lowpass_dct16_avx2(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[8 * 8]);
+    DECLARE_ALIGNED_32(int16_t, avg_block[8 * 8]);
+    const int32_t total = lowpass_average_2x2(src, avg_block, 16);
+
+    dct8_avx2(avg_block, coef, bit_depth);
+
+    memset(dst, 0, 16 * 16 * sizeof(*dst));
+    for (int i = 0; i < 8; i++)
+        memcpy(&dst[i * 16], &coef[i * 8], 8 * sizeof(*dst));
+
+    /* The DC term is taken from the sum of the full-resolution samples
+     * rather than from the DCT of the averaged block. */
+    dst[0] = (int16_t)(total >> 1);
+}
+
+/**
+ * Low-pass 32x32 DCT: transform the 2x2-averaged 16x16 block and store the
+ * result in the top-left 16x16 corner of dst; all other coefficients are zero.
+ *
+ * @param src       32x32 input block, row stride 32
+ * @param dst       receives 32x32 coefficients, row stride 32
+ * @param bit_depth forwarded to the 16x16 DCT dispatcher
+ */
+static void lowpass_dct32_avx2(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[16 * 16]);
+    DECLARE_ALIGNED_32(int16_t, avg_block[16 * 16]);
+    const int32_t total = lowpass_average_2x2(src, avg_block, 32);
+
+    dct16_avx2(avg_block, coef, bit_depth);
+
+    memset(dst, 0, 32 * 32 * sizeof(*dst));
+    for (int i = 0; i < 16; i++)
+        memcpy(&dst[i * 32], &coef[i * 16], 16 * sizeof(*dst));
+
+    /* The DC term is taken from the sum of the full-resolution samples
+     * rather than from the DCT of the averaged block. */
+    dst[0] = (int16_t)(total >> 3);
+}
+#endif /* HAVE_X86ASM && ARCH_X86_64 */
+
+/**
+ * Install the AVX2 DCT implementation matching the context configuration.
+ *
+ * v->perform_dct is left untouched when the assembly is not built in or the
+ * CPU lacks AVX2. There is no AVX2 kernel for the low-pass 8x8 case, so the
+ * C version is installed for it.
+ *
+ * @param v filter context; blocksize and enable_lowpass are read,
+ *          perform_dct is written
+ * @return 0 on success, AVERROR(EINVAL) if blocksize is not 8, 16 or 32
+ */
+av_cold int ff_vca_dct_init_x86(VCAContext *v)
+{
+#if HAVE_X86ASM && ARCH_X86_64
+    int cpu_flags = av_get_cpu_flags();
+
+    if (!EXTERNAL_AVX2(cpu_flags))
+        return 0;
+
+    switch (v->blocksize) {
+    case 32:
+        if (v->enable_lowpass)
+            v->perform_dct = lowpass_dct32_avx2;
+        else
+            v->perform_dct = dct32_avx2;
+        break;
+    case 16:
+        if (v->enable_lowpass)
+            v->perform_dct = lowpass_dct16_avx2;
+        else
+            v->perform_dct = dct16_avx2;
+        break;
+    case 8:
+        if (v->enable_lowpass)
+            v->perform_dct = ff_vca_lowpass_dct8_c;
+        else
+            v->perform_dct = dct8_avx2;
+        break;
+    default:
+        /* AVERROR_* tags are already negative; wrapping one in AVERROR()
+         * would hand the caller a positive value. */
+        return AVERROR(EINVAL);
+    }
+#endif /* HAVE_X86ASM && ARCH_X86_64 */
+    return 0;
+}
-- 
2.50.1 (Apple Git-155)

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]
