On Sat, May 16, 2020 at 8:13 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > On Fri, May 15, 2020 at 11:21:30AM +0200, Uros Bizjak wrote: > > On Wed, May 13, 2020 at 5:58 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > > > The question is, why STV pass creates its funny sequence? The > > > > > > original > > > > > > sequence should be easily solved by storing DImode from XMM register > > > > > > and (with patched gcc) pushing DImode value from the same XMM > > > > > > register. > > > > > > > > > > STV pass is mostly OK since it has to use XMM to move upper and lower > > > > > 32 bits of a 64-bit integer. The problem is that push XMM becomes > > > > > very > > > > > expensive later. > > > > > > > > As shown in the patch, XMM push should be just decrement of SP reg and > > > > move to the created stack slot. > > > > > > OK for master if there are no regressions? > > > > diff --git a/gcc/config/i386/i386-features.c > > b/gcc/config/i386/i386-features.c > > index 78fb373db6e..6cad125fa83 100644 > > --- a/gcc/config/i386/i386-features.c > > +++ b/gcc/config/i386/i386-features.c > > @@ -1264,7 +1264,8 @@ has_non_address_hard_reg (rtx_insn *insn) > > FOR_EACH_INSN_DEF (ref, insn) > > if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) > > && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) > > - && DF_REF_REGNO (ref) != FLAGS_REG) > > + && DF_REF_REGNO (ref) != FLAGS_REG > > + && DF_REF_REGNO (ref) != SP_REG) > > return true; > > > > I don't think this part is correct. The function comment says: > > > > /* Return 1 if INSN uses or defines a hard register. > > Hard register uses in a memory address are ignored. > > Clobbers and flags definitions are ignored. */ > > > > and you are putting SP_REG into clobber part. > > > > OTOH, SP_REG in: > > > > (insn 28 27 29 3 (set (mem:DI (pre_dec:SI (reg/f:SI 7 sp)) [2 S8 A64]) > > (reg/v:DI 85 [ target ])) "x.i":19:5 40 {*pushdi2} > > > > is inside memory, so REG_SP should already be ignored. Please > > investigate, why it is not the case. > > Push isn't a simple memory access since it also updates stack pointer. > Fixed to handle pseudo register push. > > > > > +(define_insn "*pushv1ti2" > > + [(set (match_operand:V1TI 0 "push_operand" "=<") > > + (match_operand:V1TI 1 "general_no_elim_operand" "v"))] > > + "" > > + "#" > > + [(set_attr "type" "multi") > > + (set_attr "mode" "TI")]) > > + > > +(define_split > > + [(set (match_operand:V1TI 0 "push_operand" "") > > + (match_operand:V1TI 1 "sse_reg_operand" ""))] > > + "TARGET_64BIT && reload_completed" > > + [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2))) > > + (set (match_dup 0) (match_dup 1))] > > +{ > > + operands[2] = GEN_INT (-PUSH_ROUNDING (16)); > > + /* Preserve memory attributes. */ > > + operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx); > > +}) > > > > The above part is wrong on many levels, e.g. using wrong predicate, > > unnecessary rounding, it should be defined as define_insn_and_split, > > conditionalized on TARGET_64BIT && TARGET_STV and put nearby existing > > pushti patterns. > > Fixed. > > > > > I will implement push changes (modulo V1T1mode) by myself, since they > > are independent of STV stuff. > > > > Here is the updated patch. Tested on Linux/x86 and Linux/x86-64. OK > for master? > > Thanks. > > H.J. > --- > Add V1TI vector register push and split it after reload to a sequence > of: > > (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) > (set (match_dup 0) (match_dup 1)) > > so that STV pass can convert TI mode integer push to V1TI vector register > push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls > of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore > pseudo register push. > > Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with > -mpreferred-stack-boundary=2 and leads to segfault: > > Dump of assembler code for function __bid_nesd2: > 0x08049210 <+0>: endbr32 > 0x08049214 <+4>: push %esi > 0x08049215 <+5>: push %ebx > 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> > 0x0804921b <+11>: add $0x8de5,%ebx > 0x08049221 <+17>: sub $0x20,%esp > 0x08049224 <+20>: mov 0x30(%esp),%esi > 0x08049228 <+24>: pushl 0x2c(%esp) > 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> > 0x08049231 <+33>: mov %esi,(%esp) > 0x08049234 <+36>: movd %edx,%xmm1 > 0x08049238 <+40>: movd %eax,%xmm0 > 0x0804923c <+44>: punpckldq %xmm1,%xmm0 > => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) > 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> > 0x0804924a <+58>: push %edx > 0x0804924b <+59>: push %eax > 0x0804924c <+60>: pushl 0x1c(%esp) > 0x08049250 <+64>: pushl 0x1c(%esp) > 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> > 0x08049259 <+73>: add $0x34,%esp > 0x0804925c <+76>: pop %ebx > 0x0804925d <+77>: pop %esi > 0x0804925e <+78>: ret > > when libgcc is compiled with -msse2. According to GCC manual: > > '-mpreferred-stack-boundary=NUM' > Attempt to keep the stack boundary aligned to a 2 raised to NUM > byte boundary. If '-mpreferred-stack-boundary' is not specified, > the default is 4 (16 bytes or 128-bits). > > *Warning:* If you use this switch, then you must build all modules > with the same value, including any libraries. This includes the > system libraries and startup modules. > > c-c++-common/dfp/func-vararg-mixed-2.c, which was added by > > commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f > Author: H.J. Lu <hongjiu...@intel.com> > Date: Wed Jul 30 19:24:02 2008 +0000 > > func-vararg-alternate-d128-2.c: New. > > 2008-07-30 H.J. Lu <hongjiu...@intel.com> > Joey Ye <joey...@intel.com> > > * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. > * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. > > isn't expected to work with libgcc. > > gcc/ > > PR target/95021 > * config/i386/i386-features.c (has_non_address_hard_reg): > Renamed to ... > (pseudo_reg_set): This. Return the SET expression. Ignore > pseudo register push. > (general_scalar_to_vector_candidate_p): Combine single_set and > has_non_address_hard_reg calls to pseudo_reg_set. > (timode_scalar_to_vector_candidate_p): Likewise. > * config/i386/i386.md (*pushv1ti2): New pattern. > > gcc/testsuite/ > > PR target/95021 > * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. > * gcc.target/i386/pr95021-1.c: New test. > * gcc.target/i386/pr95021-2.c: Likewise. > * gcc.target/i386/pr95021-3.c: Likewise. > * gcc.target/i386/pr95021-4.c: Likewise. > * gcc.target/i386/pr95021-5.c: Likewise.
OK with a small nit in i386.md and testcase changes, as explained inline. Thanks, Uros. > --- > gcc/config/i386/i386-features.c | 37 +++--- > gcc/config/i386/i386.md | 16 +++ > .../c-c++-common/dfp/func-vararg-mixed-2.c | 105 ------------------ > gcc/testsuite/gcc.target/i386/pr95021-1.c | 25 +++++ > gcc/testsuite/gcc.target/i386/pr95021-2.c | 53 +++++++++ > gcc/testsuite/gcc.target/i386/pr95021-3.c | 25 +++++ > gcc/testsuite/gcc.target/i386/pr95021-4.c | 25 +++++ > gcc/testsuite/gcc.target/i386/pr95021-5.c | 59 ++++++++++ > 8 files changed, 224 insertions(+), 121 deletions(-) > delete mode 100644 gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-5.c > > diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c > index 78fb373db6e..b9b764c092a 100644 > --- a/gcc/config/i386/i386-features.c > +++ b/gcc/config/i386/i386-features.c > @@ -1253,25 +1253,36 @@ scalar_chain::convert () > return converted_insns; > } > > -/* Return 1 if INSN uses or defines a hard register. > - Hard register uses in a memory address are ignored. > - Clobbers and flags definitions are ignored. */ > +/* Return the SET expression if INSN doesn't reference hard register. > + Return NULL if INSN uses or defines a hard register, excluding > + pseudo register pushes, hard register uses in a memory address, > + clobbers and flags definitions. */ > > -static bool > -has_non_address_hard_reg (rtx_insn *insn) > +static rtx > +pseudo_reg_set (rtx_insn *insn) > { > + rtx set = single_set (insn); > + if (!set) > + return NULL; > + > + /* Check pseudo register push first. */ > + if (REG_P (SET_SRC (set)) > + && !HARD_REGISTER_P (SET_SRC (set)) > + && push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))) > + return set; > + > df_ref ref; > FOR_EACH_INSN_DEF (ref, insn) > if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) > && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) > && DF_REF_REGNO (ref) != FLAGS_REG) > - return true; > + return NULL; > > FOR_EACH_INSN_USE (ref, insn) > if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) > - return true; > + return NULL; > > - return false; > + return set; > } > > /* Check if comparison INSN may be transformed > @@ -1345,14 +1356,11 @@ convertible_comparison_p (rtx_insn *insn, enum > machine_mode mode) > static bool > general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode) > { > - rtx def_set = single_set (insn); > + rtx def_set = pseudo_reg_set (insn); > > if (!def_set) > return false; > > - if (has_non_address_hard_reg (insn)) > - return false; > - > rtx src = SET_SRC (def_set); > rtx dst = SET_DEST (def_set); > > @@ -1442,14 +1450,11 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, > enum machine_mode mode) > static bool > timode_scalar_to_vector_candidate_p (rtx_insn *insn) > { > - rtx def_set = single_set (insn); > + rtx def_set = pseudo_reg_set (insn); > > if (!def_set) > return false; > > - if (has_non_address_hard_reg (insn)) > - return false; > - > rtx src = SET_SRC (def_set); > rtx dst = SET_DEST (def_set); > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 1bf0c1a7f01..bcb44a3fb46 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -1700,6 +1700,22 @@ (define_insn "*pushdi2_rex64" > [(set_attr "type" "push,multi,multi") > (set_attr "mode" "DI")]) > > +(define_insn_and_split "*pushv1ti2" > + [(set (match_operand:V1TI 0 "push_operand" "=<") > + (match_operand:V1TI 1 "register_operand" "v"))] > + "TARGET_64BIT && TARGET_STV" > + "#" > + "&& reload_completed" > + [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2))) > + (set (match_dup 0) (match_dup 1))] > +{ > + operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (V1TImode))); > + /* Preserve memory attributes. */ > + operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx); > +} > + [(set_attr "type" "multi") > + (set_attr "mode" "TI")]) Please put the above pattern just above DWI "*push<mode>2". > + > ;; Convert impossible pushes of immediate to existing instructions. > ;; First try to get scratch register and go through it. In case this > ;; fails, push sign extended lower part first and then overwrite > diff --git a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c > b/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c > deleted file mode 100644 > index 02cafb016d1..00000000000 > --- a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c > +++ /dev/null > @@ -1,105 +0,0 @@ > -/* { dg-do run { target { { i?86-*-* x86_64-*-* } && ia32 } } } */ > -/* { dg-options "-mpreferred-stack-boundary=2" } */ > - > -/* C99 6.5.2.2 Function calls. > - Test passing varargs of the combination of decimal float types and > - other types. */ > - > -#include <stdarg.h> > -#include "dfp-dbg.h" > - > -/* Supposing the list of varying number of arguments is: > - unsigned int, _Decimal128, double, _Decimal32, _Decimal64. */ > - > -static _Decimal32 > -vararg_d32 (unsigned arg, ...) > -{ > - va_list ap; > - _Decimal32 result; > - > - va_start (ap, arg); > - > - va_arg (ap, unsigned int); > - va_arg (ap, _Decimal128); > - va_arg (ap, double); > - result = va_arg (ap, _Decimal32); > - > - va_end (ap); > - return result; > -} > - > -static _Decimal32 > -vararg_d64 (unsigned arg, ...) > -{ > - va_list ap; > - _Decimal64 result; > - > - va_start (ap, arg); > - > - va_arg (ap, unsigned int); > - va_arg (ap, _Decimal128); > - va_arg (ap, double); > - va_arg (ap, _Decimal32); > - result = va_arg (ap, _Decimal64); > - > - va_end (ap); > - return result; > -} > - > -static _Decimal128 > -vararg_d128 (unsigned arg, ...) > -{ > - va_list ap; > - _Decimal128 result; > - > - va_start (ap, arg); > - > - va_arg (ap, unsigned int); > - result = va_arg (ap, _Decimal128); > - > - va_end (ap); > - return result; > -} > - > -static unsigned int > -vararg_int (unsigned arg, ...) > -{ > - va_list ap; > - unsigned int result; > - > - va_start (ap, arg); > - > - result = va_arg (ap, unsigned int); > - > - va_end (ap); > - return result; > -} > - > -static double > -vararg_double (unsigned arg, ...) > -{ > - va_list ap; > - float result; > - > - va_start (ap, arg); > - > - va_arg (ap, unsigned int); > - va_arg (ap, _Decimal128); > - result = va_arg (ap, double); > - > - va_end (ap); > - return result; > -} > - > - > -int > -main () > -{ > - if (vararg_d32 (3, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 3.0df) FAILURE > - if (vararg_d64 (4, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 4.0dd) FAILURE > - if (vararg_d128 (1, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 1.0dl) FAILURE > - if (vararg_int (0, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 0) FAILURE > - if (vararg_double (2, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 2.0) FAILURE > - > - FINISH > -} > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-1.c > b/gcc/testsuite/gcc.target/i386/pr95021-1.c > new file mode 100644 > index 00000000000..218776240ee > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr95021-1.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile { target ia32 } } */ > +/* { dg-options "-O2 -msse2 -mstv -mregparm=0 -W" } * No need for -mregparm=0, it is the the default. Also, do we really need -W? > +/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */ Please rather scan for "movq %xmmX, (%esp)", this is what we are looking for, the bad code has: movd %xmm1, 20(%esp) pushl 20(%esp) Also, we can check that "psrlq $32, %xmm1" is not there, which means that DImode value wasn't broken to SImode. Please improve this test in all 32bit testcases. > + > +typedef long long a; > +struct __jmp_buf_tag { > +}; > +typedef struct __jmp_buf_tag sigjmp_buf[1]; > +sigjmp_buf jmp_buf; > +int __sigsetjmp (sigjmp_buf, int); > +typedef struct { > + a target; > +} b; > +extern a *target_p; > +b *c; > +extern void foo (a); > +void > +d(void) > +{ > + if (__sigsetjmp(jmp_buf, 1)) { > + a target = *target_p; > + c->target = target; > + foo(target); > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-2.c > b/gcc/testsuite/gcc.target/i386/pr95021-2.c > new file mode 100644 > index 00000000000..473b5f81664 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr95021-2.c > @@ -0,0 +1,53 @@ > +/* { dg-do run { target ia32 } } */ > +/* { dg-require-effective-target sse2_runtime } */ > +/* { dg-options "-O2 -msse2 -mstv -W" } */ This is the same testcase as the one above. The default is -mregparm=0. > + > +#include <stdlib.h> > +#include <setjmp.h> > + > +jmp_buf buf; > + > +long long *target_p; > +long long *c; > + > +int count; > + > +__attribute__ ((noclone, noinline)) > +void > +foo (long long x) > +{ > + if (x != *c) > + abort (); > + if (!count) > + { > + count++; > + longjmp (buf, 1); > + } > +} > + > +__attribute__ ((noclone, noinline)) > +void > +bar (void) > +{ > + if (setjmp (buf)) > + { > + long long target = *target_p; > + *c = target; > + foo (target); > + } > + else > + foo (0); > +} > + > +int > +main () > +{ > + long long val = 30; > + long long local = 0; > + target_p = &val; > + c = &local; > + bar (); > + if (val != local) > + abort (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-3.c > b/gcc/testsuite/gcc.target/i386/pr95021-3.c > new file mode 100644 > index 00000000000..b213b5db072 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr95021-3.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile { target ia32 } } */ > +/* { dg-options "-O2 -msse2 -mstv -mregparm=3 -W" } */ > +/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */ Just #include "pr95021-1.c". No need to dupicate the same code. > +typedef long long a; > +struct __jmp_buf_tag { > +}; > +typedef struct __jmp_buf_tag sigjmp_buf[1]; > +sigjmp_buf jmp_buf; > +int __sigsetjmp (sigjmp_buf, int); > +typedef struct { > + a target; > +} b; > +extern a *target_p; > +b *c; > +extern void foo (a); > +void > +d(void) > +{ > + if (__sigsetjmp(jmp_buf, 1)) { > + a target = *target_p; > + c->target = target; > + foo(target); > + }> +} > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-4.c > b/gcc/testsuite/gcc.target/i386/pr95021-4.c > new file mode 100644 > index 00000000000..a080d7b289f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr95021-4.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile { target int128 } } */ > +/* { dg-options "-O2 -msse2 -mstv -W" } */ > +/* { dg-final { scan-assembler "movdqa\[ \t\]+\[^\n\]*, %xmm" } } */ Please scan for "movaps %xmmX, (%rsp)" here. > + > +typedef __int128 a; > +struct __jmp_buf_tag { > +}; > +typedef struct __jmp_buf_tag sigjmp_buf[1]; > +sigjmp_buf jmp_buf; > +int __sigsetjmp (sigjmp_buf, int); > +typedef struct { > + a target; > +} b; > +extern a *target_p; > +b *c; > +extern void foo (int, int, int, int, int, int, a); > +void > +d(void) > +{ > + if (__sigsetjmp(jmp_buf, 1)) { > + a target = *target_p; > + c->target = target; > + foo(1, 2, 3, 4, 5, 6, target); > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-5.c > b/gcc/testsuite/gcc.target/i386/pr95021-5.c > new file mode 100644 > index 00000000000..3fcf2c3922e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr95021-5.c > @@ -0,0 +1,59 @@ > +/* { dg-do run { target int128 } } */ > +/* { dg-require-effective-target sse2_runtime } */ > +/* { dg-options "-O2 -msse2 -mstv -W" } */ > + > +#include <stdlib.h> > +#include <setjmp.h> > + > +jmp_buf buf; > + > +__int128 *target_p; > +__int128 *c; > + > +int count; > + > +__attribute__ ((noclone, noinline)) > +void > +foo (__int128 i1, __int128 i2, __int128 i3, __int128 x) > +{ > + if (i1 != 0xbadbeef1) > + abort (); > + if (i2 != 0x2badbeef) > + abort (); > + if (i3 != 0xbad3beef) > + abort (); > + if (x != *c) > + abort (); > + if (!count) > + { > + count++; > + longjmp (buf, 1); > + } > +} > + > +__attribute__ ((noclone, noinline)) > +void > +bar (void) > +{ > + if (setjmp (buf)) > + { > + __int128 target = *target_p; > + *c = target; > + foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, target); > + } > + else > + foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, 0); > +} > + > +int > +main () > +{ > + __int128 val = 30; > + __int128 local = 0; > + target_p = &val; > + c = &local; > + bar (); > + if (val != local) > + abort (); > + return 0; > +} > -- > 2.26.2 >