Re: [libav-devel] [PATCH] x86/cpu: add AV_CPU_FLAG_AVXSLOW flag

2015-05-23 Thread Luca Barbato
On 23/05/15 03:13, James Almer wrote:
 Signed-off-by: James Almer jamr...@gmail.com
 ---
  doc/APIchanges  |  3 +++
  libavutil/cpu.c |  3 +++
  libavutil/cpu.h |  1 +
  libavutil/version.h |  4 ++--
  libavutil/x86/cpu.c | 17 ++---
  5 files changed, 23 insertions(+), 5 deletions(-)
 

Looks fine.

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] x86/cpu: add AV_CPU_FLAG_AVXSLOW flag

2015-05-22 Thread James Almer
Signed-off-by: James Almer jamr...@gmail.com
---
 doc/APIchanges  |  3 +++
 libavutil/cpu.c |  3 +++
 libavutil/cpu.h |  1 +
 libavutil/version.h |  4 ++--
 libavutil/x86/cpu.c | 17 ++---
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/doc/APIchanges b/doc/APIchanges
index 5d39ec6..b126364 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -13,6 +13,9 @@ libavutil: 2014-08-09
 
 API changes, most recent first:
 
+2015-xx-xx - xxx - lavu 54.13.0 - cpu.h
+  Add AV_CPU_FLAG_AVXSLOW.
+
 2015-xx-xx - xxx - lavc 56.23.0
   Add av_vda_default_init2.
 
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index 4e8ef61..e24b9dd 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -86,6 +86,7 @@ int av_parse_cpu_flags(const char *s)
 #define CPUFLAG_SSE4 (AV_CPU_FLAG_SSE4 | CPUFLAG_SSSE3)
 #define CPUFLAG_SSE42(AV_CPU_FLAG_SSE42| CPUFLAG_SSE4)
 #define CPUFLAG_AVX  (AV_CPU_FLAG_AVX  | CPUFLAG_SSE42)
+#define CPUFLAG_AVXSLOW  (AV_CPU_FLAG_AVXSLOW  | CPUFLAG_AVX)
 #define CPUFLAG_XOP  (AV_CPU_FLAG_XOP  | CPUFLAG_AVX)
 #define CPUFLAG_FMA3 (AV_CPU_FLAG_FMA3 | CPUFLAG_AVX)
 #define CPUFLAG_FMA4 (AV_CPU_FLAG_FMA4 | CPUFLAG_AVX)
@@ -108,6 +109,7 @@ int av_parse_cpu_flags(const char *s)
{ "sse4.1"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE4    },    .unit = "flags" },
{ "sse4.2"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE42   },    .unit = "flags" },
{ "avx"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX     },    .unit = "flags" },
+{ "avxslow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVXSLOW },    .unit = "flags" },
{ "xop"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_XOP     },    .unit = "flags" },
{ "fma3"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA3    },    .unit = "flags" },
{ "fma4"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA4    },    .unit = "flags" },
@@ -219,6 +221,7 @@ static const struct {
{ AV_CPU_FLAG_SSE4,      "sse4.1"   },
{ AV_CPU_FLAG_SSE42,     "sse4.2"   },
{ AV_CPU_FLAG_AVX,       "avx"      },
+{ AV_CPU_FLAG_AVXSLOW,   "avxslow"  },
{ AV_CPU_FLAG_XOP,       "xop"      },
{ AV_CPU_FLAG_FMA3,      "fma3"     },
{ AV_CPU_FLAG_FMA4,      "fma4"     },
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index 7ce..c9469b3 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -45,6 +45,7 @@
 #define AV_CPU_FLAG_SSE4     0x0100 ///< Penryn SSE4.1 functions
 #define AV_CPU_FLAG_SSE42    0x0200 ///< Nehalem SSE4.2 functions
 #define AV_CPU_FLAG_AVX      0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used
+#define AV_CPU_FLAG_AVXSLOW 0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. Bulldozer)
 #define AV_CPU_FLAG_XOP      0x0400 ///< Bulldozer XOP functions
 #define AV_CPU_FLAG_FMA4     0x0800 ///< Bulldozer FMA4 functions
 #define AV_CPU_FLAG_CMOV     0x1000 ///< i686 cmov
diff --git a/libavutil/version.h b/libavutil/version.h
index 9c45e0e..378f7b7 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -54,8 +54,8 @@
  */
 
 #define LIBAVUTIL_VERSION_MAJOR 54
-#define LIBAVUTIL_VERSION_MINOR 12
-#define LIBAVUTIL_VERSION_MICRO  1
+#define LIBAVUTIL_VERSION_MINOR 13
+#define LIBAVUTIL_VERSION_MICRO  0
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
LIBAVUTIL_VERSION_MINOR, \
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index 8be6d94..098ccf7 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -167,6 +167,7 @@ int ff_get_cpu_flags_x86(void)
 if (ext_caps & (1 << 22))
     rval |= AV_CPU_FLAG_MMXEXT;
 
+if (!strncmp(vendor.c, "AuthenticAMD", 12)) {
 /* Allow for selectively disabling SSE2 functions on AMD processors
with SSE2 support but not SSE4a. This includes Athlon64, some
Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
@@ -174,9 +175,19 @@ int ff_get_cpu_flags_x86(void)
AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
so that SSE2 is used unless explicitly disabled by checking
AV_CPU_FLAG_SSE2SLOW. */
-if (!strncmp(vendor.c, "AuthenticAMD", 12) &&
-    rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x0040)) {
-    rval |= AV_CPU_FLAG_SSE2SLOW;
+if (rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x0040))
+    rval |= AV_CPU_FLAG_SSE2SLOW;
+
+/* Similar to the above but for AVX functions on AMD processors.
+   This is necessary only for functions using YMM registers on 
Bulldozer
based CPUs as they lack 256-bit execution units. SSE/AVX functions
+   using XMM registers are always faster on them.
+   AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
+   used