On Wed, May 7, 2025 at 9:25 AM H.J. Lu <[email protected]> wrote:
>
> On Wed, May 7, 2025 at 9:17 AM Hongtao Liu <[email protected]> wrote:
> >
> > On Wed, May 7, 2025 at 9:06 AM H.J. Lu <[email protected]> wrote:
> > >
> > > On Tue, May 6, 2025 at 3:35 PM Hongtao Liu <[email protected]> wrote:
> > > >
> > > > On Tue, May 6, 2025 at 3:06 PM H.J. Lu <[email protected]> wrote:
> > > > >
> > > > > On Tue, May 6, 2025 at 2:30 PM Liu, Hongtao <[email protected]>
> > > > > wrote:
> > > > > >
> > > > > >
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: H.J. Lu <[email protected]>
> > > > > > > Sent: Tuesday, May 6, 2025 2:16 PM
> > > > > > > To: Liu, Hongtao <[email protected]>
> > > > > > > Cc: GCC Patches <[email protected]>; Uros Bizjak
> > > > > > > <[email protected]>
> > > > > > > Subject: Re: [PATCH] x86: Skip if the mode size is smaller than
> > > > > > > its natural size
> > > > > > >
> > > > > > > On Tue, May 6, 2025 at 10:54 AM Liu, Hongtao
> > > > > > > <[email protected]>
> > > > > > > wrote:
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > -----Original Message-----
> > > > > > > > > From: H.J. Lu <[email protected]>
> > > > > > > > > Sent: Thursday, May 1, 2025 6:39 AM
> > > > > > > > > To: GCC Patches <[email protected]>; Uros Bizjak
> > > > > > > > > <[email protected]>; Liu, Hongtao <[email protected]>
> > > > > > > > > Subject: [PATCH] x86: Skip if the mode size is smaller than
> > > > > > > > > its
> > > > > > > > > natural size
> > > > > > > > >
> > > > > > > > > When generating a SUBREG from V16QI to V2HF, validate_subreg
> > > > > > > > > fails
> > > > > > > > > since the V2HF size (4 bytes) is smaller than its natural
> > > > > > > > > size (word size).
> > > > > > > > > Update remove_redundant_vector_load to skip if the mode size
> > > > > > > > > is
> > > > > > > > > smaller than its natural size.
> > > > > > > > I think we can also handle it in replace_vector_const by
> > > > > > > > inserting an
> > > > > > > > extra move with (Set (reg:v4qi) (subreg:v4qi (v16qi const0_rtx)
> > > > > > > > 0))
> > > > > > > > And then use subreg with same vector size (v2hf<->v4qi) (set
> > > > > > > > (reg:v2hf) (subreg:v2hf (reg:v4qi) 0))
> > > > > > >
> > > > > > > What is the advantage of this approach? My patch uses a single
> > > > > > > instruction to
> > > > > > > write 4 bytes of 0s and 1s. Your suggestion needs at least one
> > > > > > > more
> > > > > > > instruction.
> > > > > > I'm not asking to do it for all the cases, just to handle those
> > > > > > cases with invalid subreg
> > > > > >
> > > > > > @@ -3334,8 +3334,11 @@ replace_vector_const (machine_mode
> > > > > > vector_mode, rtx vector_const,
> > > > > > machine_mode mode = GET_MODE (dest);
> > > > > >
> > > > > > rtx replace;
> > > > > > + if (!validate_subreg (mode, vector_mode, vector_const, 0))
> > > > > > + /* Insert an extra move to avoid invalid subreg. */
> > > > > > + .........
> > > > > > /* Replace the source operand with VECTOR_CONST. */
> > > > > > - if (SUBREG_P (dest) || mode == vector_mode)
> > > > > > + else if (SUBREG_P (dest) || mode == vector_mode)
> > > > > > replace = vector_const;
> > > > > > else
> > > > > > replace = gen_rtx_SUBREG (mode, vector_const, 0);
> > > > > >
> > > > > > For valid subreg, no need for extra instruction.
> > > > > > I think RA can eliminate the extra move, then the optimization is
> > > > > > not limited to "the mode size is smaller than its natural size".
> > > > >
> > > > > The only "the mode size is smaller than its natural size" cases are 4
> > > > > bytes (64-bit mode)
> > > > > or 2 bytes (32-bit mode). In both cases, there is no need for vector
> > > > > write with subreg.
> > > > It depends on the use, if the use must be put into the vector
> > > > register, .i.e below testcase, I assume your patch will restore the
> > > > extra pxor?
> > > >
> > > > typedef char v4qi __attribute__((vector_size(4)));
> > > > typedef char v16qi __attribute__((vector_size(16)));
> > > >
> > > >
> > > > v4qi a;
> > > > v16qi b;
> > > > void
> > > > foo (v4qi* c, v16qi* d)
> > > > {
> > > > v4qi sum = __extension__(v4qi){0, 0, 0, 0};
> > > > v16qi sum2 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0, 0, 0, 0};
> > > > for (int i = 0; i != 100; i++)
> > > > sum += c[i];
> > > > for (int i = 0 ; i != 100; i++)
> > > > sum2 += d[i];
> > > > a = sum;
> > > > b = sum2;
> > > > }
> > > >
> > > > And since this patch is to solve the issue of invalid subreg, why
> > > > don't we just use validate_subreg to guard in
> > > > remove_redundant_vector_load.
> > > > >
> > >
> > > Fixed in the v2 patch with 2 new tests.
> > >
> >
> > /* NB: Don't run recog_memoized here since vector SUBREG may not
> > be valid. Let LRA handle vector SUBREG. */
> >
> > I guess this comment can be removed?
>
> I will verify it.
>
> > Otherwise LGTM.
Here is the v3 patch with
- /* NB: Don't run recog_memoized here since vector SUBREG may not
- be valid. Let LRA handle vector SUBREG. */
SET_SRC (set) = replace;
/* Drop possible dead definitions. */
PATTERN (insn) = set;
+ INSN_CODE (insn) = -1;
+ recog_memoized (insn);
df_insn_rescan (insn);
There is no regression. I will check it in.
Thanks.
--
H.J.
From 47976925f3a03dee9da2b99457f83218471ef28e Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <[email protected]>
Date: Thu, 1 May 2025 06:30:41 +0800
Subject: [PATCH v3] x86: Insert extra move for mode size smaller than natural
size
When generating a SUBREG from V16QI to V2HF, validate_subreg fails since
V2HF is a floating point vector and its size (4 bytes) is smaller than its
natural size (word size). Insert an extra move with a QI vector SUBREG of
the same size to avoid validate_subreg failure.
gcc/
PR target/120036
* config/i386/i386-features.cc (ix86_get_vector_load_mode):
Handle 8/4/2 bytes.
(remove_redundant_vector_load): If the mode size is smaller than
its natural size, first insert an extra move with a QI vector
SUBREG of the same size to avoid validate_subreg failure.
gcc/testsuite/
PR target/120036
* g++.target/i386/pr120036.C: New test.
* gcc.target/i386/pr117839-3a.c: Likewise.
* gcc.target/i386/pr117839-3b.c: Likewise.
Signed-off-by: H.J. Lu <[email protected]>
---
gcc/config/i386/i386-features.cc | 39 ++++++-
gcc/testsuite/g++.target/i386/pr120036.C | 113 ++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr117839-3a.c | 22 ++++
gcc/testsuite/gcc.target/i386/pr117839-3b.c | 5 +
4 files changed, 175 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/g++.target/i386/pr120036.C
create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-3a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-3b.c
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 31f3ee2ef17..1ba5ac4faa4 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3309,8 +3309,16 @@ ix86_get_vector_load_mode (unsigned int size)
mode = V64QImode;
else if (size == 32)
mode = V32QImode;
- else
+ else if (size == 16)
mode = V16QImode;
+ else if (size == 8)
+ mode = V8QImode;
+ else if (size == 4)
+ mode = V4QImode;
+ else if (size == 2)
+ mode = V2QImode;
+ else
+ gcc_unreachable ();
return mode;
}
@@ -3338,13 +3346,36 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
if (SUBREG_P (dest) || mode == vector_mode)
replace = vector_const;
else
- replace = gen_rtx_SUBREG (mode, vector_const, 0);
+ {
+ unsigned int size = GET_MODE_SIZE (mode);
+ if (size < ix86_regmode_natural_size (mode))
+ {
+ /* If the mode size is smaller than its natural size,
+ first insert an extra move with a QI vector SUBREG
+ of the same size to avoid validate_subreg failure. */
+ machine_mode vmode = ix86_get_vector_load_mode (size);
+ rtx vreg;
+ if (mode == vmode)
+ vreg = vector_const;
+ else
+ {
+ vreg = gen_reg_rtx (vmode);
+ rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
+ rtx pat = gen_rtx_SET (vreg, vsubreg);
+ rtx_insn *vinsn = emit_insn_before (pat, insn);
+ df_insn_rescan (vinsn);
+ }
+ replace = gen_rtx_SUBREG (mode, vreg, 0);
+ }
+ else
+ replace = gen_rtx_SUBREG (mode, vector_const, 0);
+ }
- /* NB: Don't run recog_memoized here since vector SUBREG may not
- be valid. Let LRA handle vector SUBREG. */
SET_SRC (set) = replace;
/* Drop possible dead definitions. */
PATTERN (insn) = set;
+ INSN_CODE (insn) = -1;
+ recog_memoized (insn);
df_insn_rescan (insn);
}
}
diff --git a/gcc/testsuite/g++.target/i386/pr120036.C b/gcc/testsuite/g++.target/i386/pr120036.C
new file mode 100644
index 00000000000..a2fc24f1286
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr120036.C
@@ -0,0 +1,113 @@
+/* { dg-do compile { target fpic } } */
+/* { dg-options "-O2 -std=c++11 -march=sapphirerapids -fPIC" } */
+
+typedef _Float16 Native;
+struct float16_t
+{
+ Native native;
+ float16_t ();
+ float16_t (Native arg) : native (arg) {}
+ operator Native ();
+ float16_t
+ operator+ (float16_t rhs)
+ {
+ return native + rhs.native;
+ }
+ float16_t
+ operator* (float16_t)
+ {
+ return native * native;
+ }
+};
+template <int N> struct Simd
+{
+ static constexpr int kPrivateLanes = N;
+};
+template <int N> struct ClampNAndPow2
+{
+ using type = Simd<N>;
+};
+template <int kLimit> struct CappedTagChecker
+{
+ static constexpr int N = sizeof (int) ? kLimit : 0;
+ using type = typename ClampNAndPow2<N>::type;
+};
+template <typename, int kLimit, int>
+using CappedTag = typename CappedTagChecker<kLimit>::type;
+template <class D>
+int
+Lanes (D)
+{
+ return D::kPrivateLanes;
+}
+template <class D> int Zero (D);
+template <class D> using VFromD = decltype (Zero (D ()));
+struct Vec512
+{
+ __attribute__ ((__vector_size__ (16))) _Float16 raw;
+};
+Vec512 Zero (Simd<2>);
+template <class D> void ReduceSum (D, VFromD<D>);
+struct Dot
+{
+ template <int, class D, typename T>
+ static T
+ Compute (D d, T *pa, int num_elements)
+ {
+ T *pb;
+ int N = Lanes (d), i = 0;
+ if (__builtin_expect (num_elements < N, 0))
+ {
+ T sum0 = 0, sum1 = 0;
+ for (; i + 2 <= num_elements; i += 2)
+ {
+ float16_t __trans_tmp_6 = pa[i] * pb[i],
+ __trans_tmp_5 = sum0 + __trans_tmp_6,
+ __trans_tmp_8 = pa[i + 1] * pb[1],
+ __trans_tmp_7 = sum1 + __trans_tmp_8;
+ sum0 = __trans_tmp_5;
+ sum1 = __trans_tmp_7;
+ }
+ float16_t __trans_tmp_9 = sum0 + sum1;
+ return __trans_tmp_9;
+ }
+ decltype (Zero (d)) sum0;
+ ReduceSum (d, sum0);
+ __builtin_trap ();
+ }
+};
+template <int kMul, class Test, int kPow2> struct ForeachCappedR
+{
+ static void
+ Do (int min_lanes, int max_lanes)
+ {
+ CappedTag<int, kMul, kPow2> d;
+ Test () (int (), d);
+ ForeachCappedR<kMul / 2, Test, kPow2>::Do (min_lanes, max_lanes);
+ }
+};
+template <class Test, int kPow2> struct ForeachCappedR<0, Test, kPow2>
+{
+ static void Do (int, int);
+};
+struct TestDot
+{
+ template <class T, class D>
+ void
+ operator() (T, D d)
+ {
+ int counts[]{ 1, 3 };
+ for (int num : counts)
+ {
+ float16_t a;
+ T __trans_tmp_4 = Dot::Compute<0> (d, &a, num);
+ }
+ }
+};
+int DotTest_TestAllDot_TestTestBody_max_lanes;
+void
+DotTest_TestAllDot_TestTestBody ()
+{
+ ForeachCappedR<64, TestDot, 0>::Do (
+ 1, DotTest_TestAllDot_TestTestBody_max_lanes);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-3a.c b/gcc/testsuite/gcc.target/i386/pr117839-3a.c
new file mode 100644
index 00000000000..81afa9d156c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-3a.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+typedef char v4qi __attribute__((vector_size(4)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+v4qi a;
+v16qi b;
+void
+foo (v4qi* c, v16qi* d)
+{
+ v4qi sum = __extension__(v4qi){0, 0, 0, 0};
+ v16qi sum2 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0};
+ for (int i = 0; i != 100; i++)
+ sum += c[i];
+ for (int i = 0 ; i != 100; i++)
+ sum2 += d[i];
+ a = sum;
+ b = sum2;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-3b.c b/gcc/testsuite/gcc.target/i386/pr117839-3b.c
new file mode 100644
index 00000000000..a599c28752c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-3b.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include "pr117839-3a.c"
--
2.49.0