Broadcasting directly from memory is better than loading a 128-bit vector
and then permuting it into a 256-bit vector.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
gcc/ChangeLog:
* config/i386/predicates.md (avx_vbroadcast128_operand): New
predicate.
* config/i386/sse.md (*avx_vbroadcastf128_<mode>_perm): New
pre_reload splitter.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx_vbroadcastf128.c: New test.
---
gcc/config/i386/predicates.md | 19 +++++++++++++++++++
gcc/config/i386/sse.md | 15 +++++++++++++++
.../gcc.target/i386/avx_vbroadcastf128.c | 17 +++++++++++++++++
3 files changed, 51 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/avx_vbroadcastf128.c
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 5dbe444847f..57950d31878 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -2077,6 +2077,25 @@ (define_predicate "avx_vbroadcast_operand"
return true;
})
+;; Match a PARALLEL of const_ints whose first and second halves both list
+;; 0 .. N/2-1 in order, as needed for a vbroadcastf128 permute.
+(define_predicate "avx_vbroadcast128_operand"
+  (and (match_code "parallel")
+       (match_code "const_int" "a"))
+{
+  const int count = XVECLEN (op, 0);
+  const int lanes = count / 2;
+
+  for (int pos = 0; pos < count; pos++)
+    {
+      /* Position POS must select element POS, wrapped into the low half.  */
+      int want = pos < lanes ? pos : pos - lanes;
+      if (INTVAL (XVECEXP (op, 0, pos)) != want)
+        return false;
+    }
+  return true;
+})
+
;; Return true if OP is a parallel for a palignr permute.
(define_predicate "palignr_operand"
(and (match_code "parallel")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e87c26fcc07..8b28c8edb19 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -28148,6 +28148,21 @@ (define_insn "avx_vbroadcastf128_<mode>"
(set_attr "prefix" "vex,vex,vex,evex,evex,evex,evex")
(set_attr "mode" "<sseinsnmode>")])
+;; The permute only duplicates memory operand 1, so split to a broadcast.
+(define_insn_and_split "*avx_vbroadcastf128_<mode>_perm"
+  [(set (match_operand:V_256 0 "register_operand")
+        (vec_select:V_256
+          (vec_concat:V_256
+            (match_operand:<ssehalfvecmode> 1 "memory_operand")
+            (match_operand:<ssehalfvecmode> 2 "general_operand"))
+          (match_parallel 3 "avx_vbroadcast128_operand"
+            [(match_operand 4 "const_int_operand")])))]
+  "TARGET_AVX && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+        (vec_concat:V_256 (match_dup 1) (match_dup 1)))])
+
;; For broadcast[i|f]32x2. Yes there is no v4sf version, only v4si.
(define_mode_iterator VI4F_BRCST32x2
[V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
diff --git a/gcc/testsuite/gcc.target/i386/avx_vbroadcastf128.c b/gcc/testsuite/gcc.target/i386/avx_vbroadcastf128.c
new file mode 100644
index 00000000000..e0bda7dda10
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx_vbroadcastf128.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O3" } */
+/* { dg-final { scan-assembler-not "vpermpd"} } */
+/* { dg-final { scan-assembler {(?n)vbroadcastf(?:128|64x2)} } } */
+/* b[i], b[i+1] are reused for lanes 2-3; expect a broadcast, not vpermpd.  */
+void
+foo (double* __restrict a, double* b, double* c, int n)
+{
+ for (int i = 0; i != n; i+=4)
+ {
+ a[i] += b[i] * c[i];
+ a[i+1] += b[i+1] * c[i+1];
+ a[i+2] += b[i] * c[i+2]; /* b[i] again, not b[i+2].  */
+ a[i+3] += b[i+1] * c[i+3]; /* b[i+1] again, not b[i+3].  */
+ }
+
+}
--
2.34.1