On Tue, 19 Mar 2013, Richard Henderson wrote:
I'm not fond of this, primarily because I believe the pattern should
not exist at all.
One year later, new try. Tweaking the pattern, I ended up with a copy of
the mov pattern (the subreg is generated automatically when the modes
don't match), so I just removed it. I know the comment in emit-rtl.c says
splitters are a better way forward than subregs, but I haven't managed
with splitters while the subreg patch is very simple :-) I added a -O0
testcase because when I was experimenting I had many versions that worked
for -O2 but ICEd at -O0 (and vice versa), but it might be redundant with
some other tests.
Bootstrap+testsuite on x86_64-linux-gnu.
2014-06-10 Marc Glisse <marc.gli...@inria.fr>
PR target/50829
gcc/
* config/i386/sse.md (enum unspec): Remove UNSPEC_CAST.
(avx_<castmode><avxsizesuffix>_<castmode>): Remove.
* config/i386/i386.c (builtin_description) [__builtin_ia32_si256_si,
__builtin_ia32_ps256_ps, __builtin_ia32_pd256_pd]: Replace the
removed insn with mov.
* emit-rtl.c (validate_subreg): Allow vector-vector subregs.
gcc/testsuite/
* gcc.target/i386/pr50829-1.c: New file.
* gcc.target/i386/pr50829-2.c: New file.
--
Marc Glisse
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c (revision 211397)
+++ gcc/config/i386/i386.c (working copy)
@@ -29793,23 +29793,23 @@ static const struct builtin_description
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256,
"__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code)
ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
{ OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256",
IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix,
"__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN,
(int) V8SI_FTYPE_V8SF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256,
"__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int)
V4DF_FTYPE_V4DF_V4DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256,
"__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int)
V4DF_FTYPE_V4DF_V4DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256,
"__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int)
V8SF_FTYPE_V8SF_V8SF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256,
"__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int)
V8SF_FTYPE_V8SF_V8SF },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_movv8si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_movv8sf, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_movv4df, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si,
"__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int)
V4SI_FTYPE_V8SI },
{ OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf,
"__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int)
V4SF_FTYPE_V8SF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df,
"__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int)
V2DF_FTYPE_V4DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd",
IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd",
IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd",
IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps",
IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps",
IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps",
IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
Index: gcc/config/i386/sse.md
===================================================================
--- gcc/config/i386/sse.md (revision 211397)
+++ gcc/config/i386/sse.md (working copy)
@@ -66,21 +66,20 @@
UNSPEC_AESKEYGENASSIST
;; For PCLMUL support
UNSPEC_PCLMUL
;; For AVX support
UNSPEC_PCMP
UNSPEC_VPERMIL
UNSPEC_VPERMIL2
UNSPEC_VPERMIL2F128
- UNSPEC_CAST
UNSPEC_VTESTP
UNSPEC_VCVTPH2PS
UNSPEC_VCVTPS2PH
;; For AVX2 support
UNSPEC_VPERMVAR
UNSPEC_VPERMTI
UNSPEC_GATHER
UNSPEC_VSIBADDR
@@ -14816,40 +14815,20 @@
(define_expand "maskstore<mode>"
[(set (match_operand:V48_AVX2 0 "memory_operand")
(unspec:V48_AVX2
[(match_operand:<sseintvecmode> 2 "register_operand")
(match_operand:V48_AVX2 1 "register_operand")
(match_dup 0)]
UNSPEC_MASKMOV))]
"TARGET_AVX")
-(define_insn_and_split "avx_<castmode><avxsizesuffix>_<castmode>"
- [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m")
- (unspec:AVX256MODE2P
- [(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "xm,x")]
- UNSPEC_CAST))]
- "TARGET_AVX"
- "#"
- "&& reload_completed"
- [(const_int 0)]
-{
- rtx op0 = operands[0];
- rtx op1 = operands[1];
- if (REG_P (op0))
- op0 = gen_rtx_REG (<ssehalfvecmode>mode, REGNO (op0));
- else
- op1 = gen_rtx_REG (<MODE>mode, REGNO (op1));
- emit_move_insn (op0, op1);
- DONE;
-})
-
(define_expand "vec_init<mode>"
[(match_operand:V_256 0 "register_operand")
(match_operand 1)]
"TARGET_AVX"
{
ix86_expand_vector_init (false, operands[0], operands[1]);
DONE;
})
(define_expand "vec_init<mode>"
Index: gcc/emit-rtl.c
===================================================================
--- gcc/emit-rtl.c (revision 211397)
+++ gcc/emit-rtl.c (working copy)
@@ -775,20 +775,23 @@ validate_subreg (enum machine_mode omode
else if ((COMPLEX_MODE_P (imode) || VECTOR_MODE_P (imode))
&& GET_MODE_INNER (imode) == omode)
;
/* ??? x86 sse code makes heavy use of *paradoxical* vector subregs,
i.e. (subreg:V4SF (reg:SF) 0). This surely isn't the cleanest way to
represent this. It's questionable if this ought to be represented at
all -- why can't this all be hidden in post-reload splitters that make
arbitrarily mode changes to the registers themselves. */
else if (VECTOR_MODE_P (omode) && GET_MODE_INNER (omode) == imode)
;
+ else if (VECTOR_MODE_P (omode) && VECTOR_MODE_P (imode)
+ && GET_MODE_INNER (omode) == GET_MODE_INNER (imode))
+ ;
/* Subregs involving floating point modes are not allowed to
change size. Therefore (subreg:DI (reg:DF) 0) is fine, but
(subreg:SI (reg:DF) 0) isn't. */
else if (FLOAT_MODE_P (imode) || FLOAT_MODE_P (omode))
{
if (! (isize == osize
/* LRA can use subreg to store a floating point value in
an integer mode. Although the floating point and the
integer modes need the same number of hard registers,
the size of floating point mode can be less than the
Index: gcc/testsuite/gcc.target/i386/pr50829-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr50829-1.c (revision 0)
+++ gcc/testsuite/gcc.target/i386/pr50829-1.c (working copy)
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+#include <x86intrin.h>
+
+__m256d
+concat (__m128d x)
+{
+ __m256d z = _mm256_castpd128_pd256 (x);
+ return _mm256_insertf128_pd (z, x, 1);
+}
+
+/* { dg-final { scan-assembler-not "vmov" } } */
Index: gcc/testsuite/gcc.target/i386/pr50829-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr50829-2.c (revision 0)
+++ gcc/testsuite/gcc.target/i386/pr50829-2.c (working copy)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx" } */
+
+#include <x86intrin.h>
+
+__m256d
+concat (__m128d x)
+{
+ __m256d z = _mm256_castpd128_pd256 (x);
+ return _mm256_insertf128_pd (z, x, 1);
+}