from:"stefan"

[ABOUT-TO-PUSH PATCH] MAINTAINERS (s390 port): Add myself

2024-10-10 Thread Stefan Schulze Frielinghaus

ChangeLog:

* MAINTAINERS (s390 port): Add myself.
---
 I hope the overflow into the email column doesn't break any tooling.  I
 will leave the patch as is for some time before pushing.

 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9257b33ff08..f76d12f7f3f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -120,6 +120,7 @@ rs6000 vector extns Aldy Hernandez  

 rx port Nick Clifton
 s390 port   Ulrich Weigand  
 s390 port   Andreas Krebbel 
+s390 port   Stefan Schulze Frielinghaus 
 sh port Alexandre Oliva 
 sh port Oleg Endo   
 sparc port  David S. Miller 
-- 
2.45.2

[PATCH] s390: Remove -m{,no-}lra option

2024-09-19 Thread Stefan Schulze Frielinghaus

I had missed the two test cases and have now removed them since they
depend on -mno-lra.

-- 8< --

Since the old reload pass is about to be removed and we defaulted to LRA
for over a decade, remove option -m{,no-}lra.

PR target/113953

gcc/ChangeLog:

* config/s390/s390.cc (s390_lra_p): Remove.
(TARGET_LRA_P): Remove.
* config/s390/s390.opt (mlra): Remove.
* config/s390/s390.opt.urls (mlra): Remove.

gcc/testsuite/ChangeLog:

* gcc.target/s390/TI-constants-nolra.c: Removed.
* gcc.target/s390/pr79895.c: Removed.
---
 gcc/config/s390/s390.cc   | 10 
 gcc/config/s390/s390.opt  |  4 --
 gcc/config/s390/s390.opt.urls |  2 -
 .../gcc.target/s390/TI-constants-nolra.c  | 47 ---
 gcc/testsuite/gcc.target/s390/pr79895.c   |  9 
 5 files changed, 72 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/s390/TI-constants-nolra.c
 delete mode 100644 gcc/testsuite/gcc.target/s390/pr79895.c

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index c9172d1153a..25d43ae3e13 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -11342,13 +11342,6 @@ s390_can_change_mode_class (machine_mode from_mode,
   return true;
 }
 
-/* Return true if we use LRA instead of reload pass.  */
-static bool
-s390_lra_p (void)
-{
-  return s390_lra_flag;
-}
-
 /* Return true if register FROM can be eliminated via register TO.  */
 
 static bool
@@ -18444,9 +18437,6 @@ s390_c_mode_for_floating_type (enum tree_index ti)
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P s390_legitimate_constant_p
 
-#undef TARGET_LRA_P
-#define TARGET_LRA_P s390_lra_p
-
 #undef TARGET_CAN_ELIMINATE
 #define TARGET_CAN_ELIMINATE s390_can_eliminate
 
diff --git a/gcc/config/s390/s390.opt b/gcc/config/s390/s390.opt
index a5b5aa95a12..23ea4b8232d 100644
--- a/gcc/config/s390/s390.opt
+++ b/gcc/config/s390/s390.opt
@@ -229,10 +229,6 @@ Set the branch costs for conditional branch instructions.  
Reasonable
 values are small, non-negative integers.  The default branch cost is
 1.
 
-mlra
-Target Var(s390_lra_flag) Init(1) Save
-Use LRA instead of reload.
-
 mpic-data-is-text-relative
 Target Var(s390_pic_data_is_text_relative) 
Init(TARGET_DEFAULT_PIC_DATA_IS_TEXT_RELATIVE)
 Assume data segments are relative to text segment.
diff --git a/gcc/config/s390/s390.opt.urls b/gcc/config/s390/s390.opt.urls
index ab1e761efa8..bc772d2ffc7 100644
--- a/gcc/config/s390/s390.opt.urls
+++ b/gcc/config/s390/s390.opt.urls
@@ -74,8 +74,6 @@ UrlSuffix(gcc/S_002f390-and-zSeries-Options.html#index-mzarch)
 
 ; skipping UrlSuffix for 'mbranch-cost=' due to finding no URLs
 
-; skipping UrlSuffix for 'mlra' due to finding no URLs
-
 ; skipping UrlSuffix for 'mpic-data-is-text-relative' due to finding no URLs
 
 ; skipping UrlSuffix for 'mindirect-branch=' due to finding no URLs
diff --git a/gcc/testsuite/gcc.target/s390/TI-constants-nolra.c 
b/gcc/testsuite/gcc.target/s390/TI-constants-nolra.c
deleted file mode 100644
index b9948fc4aa5..000
--- a/gcc/testsuite/gcc.target/s390/TI-constants-nolra.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/* { dg-do compile { target int128 } } */
-/* { dg-options "-O3 -mno-lra" } */
-
-/* 2x lghi */
-__int128 a() {
-  return 0;
-}
-
-/* 2x lghi */
-__int128 b() {
-  return -1;
-}
-
-/* 2x lghi */
-__int128 c() {
-  return -2;
-}
-
-/* lghi + llilh */
-__int128 d() {
-  return 16000 << 16;
-}
-
-/* lghi + llihf */
-__int128 e() {
-  return (unsigned long long)8 << 32;
-}
-
-/* lghi + llihf */
-__int128 f() {
-  return (unsigned __int128)8 << 96;
-}
-
-/* llihf + llihf - this is handled via movti_bigconst pattern */
-__int128 g() {
-  return ((unsigned __int128)8 << 96) | ((unsigned __int128)8 << 32);
-}
-
-/* Literal pool */
-__int128 h() {
-  return ((unsigned __int128)8 << 32) | 1;
-}
-
-/* Literal pool */
-__int128 i() {
-  return (((unsigned __int128)8 << 32) | 1) << 64;
-}
diff --git a/gcc/testsuite/gcc.target/s390/pr79895.c 
b/gcc/testsuite/gcc.target/s390/pr79895.c
deleted file mode 100644
index 02374e4b8a8..000
--- a/gcc/testsuite/gcc.target/s390/pr79895.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* { dg-do compile { target int128 } } */
-/* { dg-options "-O1 -mno-lra" } */
-
-unsigned __int128 g;
-void
-foo ()
-{
-  g = (unsigned __int128)1 << 127;
-}
-- 
2.45.2

[PATCH] s390: Remove -m{,no-}lra option

2024-09-19 Thread Stefan Schulze Frielinghaus

Since the old reload pass is about to be removed and we defaulted to LRA
for over a decade, remove option -m{,no-}lra.

PR target/113953

gcc/ChangeLog:

* config/s390/s390.cc (s390_lra_p): Remove.
(TARGET_LRA_P): Remove.
* config/s390/s390.opt (mlra): Remove.
* config/s390/s390.opt.urls (mlra): Remove.
---
 Assuming that bootstrap and regtest (which are still running) finish
 successfully, ok for mainline?

 gcc/config/s390/s390.cc   | 10 --
 gcc/config/s390/s390.opt  |  4 
 gcc/config/s390/s390.opt.urls |  2 --
 3 files changed, 16 deletions(-)

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index c9172d1153a..25d43ae3e13 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -11342,13 +11342,6 @@ s390_can_change_mode_class (machine_mode from_mode,
   return true;
 }
 
-/* Return true if we use LRA instead of reload pass.  */
-static bool
-s390_lra_p (void)
-{
-  return s390_lra_flag;
-}
-
 /* Return true if register FROM can be eliminated via register TO.  */
 
 static bool
@@ -18444,9 +18437,6 @@ s390_c_mode_for_floating_type (enum tree_index ti)
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P s390_legitimate_constant_p
 
-#undef TARGET_LRA_P
-#define TARGET_LRA_P s390_lra_p
-
 #undef TARGET_CAN_ELIMINATE
 #define TARGET_CAN_ELIMINATE s390_can_eliminate
 
diff --git a/gcc/config/s390/s390.opt b/gcc/config/s390/s390.opt
index a5b5aa95a12..23ea4b8232d 100644
--- a/gcc/config/s390/s390.opt
+++ b/gcc/config/s390/s390.opt
@@ -229,10 +229,6 @@ Set the branch costs for conditional branch instructions.  
Reasonable
 values are small, non-negative integers.  The default branch cost is
 1.
 
-mlra
-Target Var(s390_lra_flag) Init(1) Save
-Use LRA instead of reload.
-
 mpic-data-is-text-relative
 Target Var(s390_pic_data_is_text_relative) 
Init(TARGET_DEFAULT_PIC_DATA_IS_TEXT_RELATIVE)
 Assume data segments are relative to text segment.
diff --git a/gcc/config/s390/s390.opt.urls b/gcc/config/s390/s390.opt.urls
index ab1e761efa8..bc772d2ffc7 100644
--- a/gcc/config/s390/s390.opt.urls
+++ b/gcc/config/s390/s390.opt.urls
@@ -74,8 +74,6 @@ UrlSuffix(gcc/S_002f390-and-zSeries-Options.html#index-mzarch)
 
 ; skipping UrlSuffix for 'mbranch-cost=' due to finding no URLs
 
-; skipping UrlSuffix for 'mlra' due to finding no URLs
-
 ; skipping UrlSuffix for 'mpic-data-is-text-relative' due to finding no URLs
 
 ; skipping UrlSuffix for 'mindirect-branch=' due to finding no URLs
-- 
2.45.2

[PATCH] s390: Add expander for uaddc/usubc optabs

2024-09-18 Thread Stefan Schulze Frielinghaus

Bootstrapped and regtested on s390.  Both expanders are constrained to
z196 because of the conditional moves.  I guess this is reasonable
nowadays.

Would be great if you could have a second look that setting the
carry/borrow bit (bit 18 of the PSW) is indeed correct.  Brain twisted
me at first ;-)

-- 8< --

gcc/ChangeLog:

* config/s390/s390.md (*add3_carry1_cc): Renamed to ...
(add3_carry1_cc): this and in order to use the
corresponding gen function, encode CC mode into pattern.
(*sub3_borrow_cc): Renamed to ...
(sub3_borrow_cc): this and in order to use the
corresponding gen function, encode CC mode into pattern.
(*add3_alc_carry1_cc): Renamed to ...
(add3_alc_carry1_cc): this and in order to use the
corresponding gen function, encode CC mode into pattern.
(sub3_slb_borrow1_cc): New.
(uaddc5): New.
(usubc5): New.

gcc/testsuite/ChangeLog:

* gcc.target/s390/uaddc-1.c: New test.
* gcc.target/s390/uaddc-2.c: New test.
* gcc.target/s390/usubc-1.c: New test.
* gcc.target/s390/usubc-2.c: New test.
---
 gcc/config/s390/s390.md | 103 +++-
 gcc/testsuite/gcc.target/s390/uaddc-1.c |  80 ++
 gcc/testsuite/gcc.target/s390/uaddc-2.c |  25 ++
 gcc/testsuite/gcc.target/s390/usubc-1.c |  80 ++
 gcc/testsuite/gcc.target/s390/usubc-2.c |  25 ++
 5 files changed, 295 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/uaddc-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/uaddc-2.c
 create mode 100644 gcc/testsuite/gcc.target/s390/usubc-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/usubc-2.c

diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 4a225ae24f3..6fd3f943fe1 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -6001,14 +6001,14 @@
 z10_super_E1,z10_super_E1,z10_super_E1")])
 
 ; alr, alfi, slfi, al, aly, alrk, alhsik, algr, algfi, slgfi, alg, alsi, 
algsi, algrk, alghsik
-(define_insn "*add3_carry1_cc"
-  [(set (reg CC_REGNUM)
-(compare (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,d, 
0, 0,d,0,0,0")
-  (match_operand:GPR 2 "general_operand"  " 
d,d,Op,On,K,R,T,C"))
- (match_dup 1)))
-   (set (match_operand:GPR 0 "nonimmediate_operand""=d,d, 
d, d,d,d,d,d")
+(define_insn "add3_carry1_cc"
+  [(set (reg:CCL1 CC_REGNUM)
+(compare:CCL1 (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" 
"%0,d, 0, 0,d,0,0,0")
+   (match_operand:GPR 2 "general_operand"  " 
d,d,Op,On,K,R,T,C"))
+ (match_dup 1)))
+   (set (match_operand:GPR 0 "nonimmediate_operand" 
"=d,d, d, d,d,d,d,d")
 (plus:GPR (match_dup 1) (match_dup 2)))]
-  "s390_match_ccmode (insn, CCL1mode)"
+  ""
   "@
alr\t%0,%2
alrk\t%0,%1,%2
@@ -6541,14 +6541,14 @@
(set_attr "z10prop" "z10_super_c_E1,*,z10_super_E1,z10_super_E1")])
 
 ; slr, sl, sly, slgr, slg, slrk, slgrk
-(define_insn "*sub3_borrow_cc"
-  [(set (reg CC_REGNUM)
-(compare (minus:GPR (match_operand:GPR 1 "register_operand" "0,d,0,0")
-   (match_operand:GPR 2 "general_operand"  "d,d,R,T"))
- (match_dup 1)))
-   (set (match_operand:GPR 0 "register_operand""=d,d,d,d")
+(define_insn "sub3_borrow_cc"
+  [(set (reg:CCL2 CC_REGNUM)
+(compare:CCL2 (minus:GPR (match_operand:GPR 1 "register_operand" 
"0,d,0,0")
+(match_operand:GPR 2 "general_operand"  
"d,d,R,T"))
+ (match_dup 1)))
+   (set (match_operand:GPR 0 "register_operand" 
"=d,d,d,d")
 (minus:GPR (match_dup 1) (match_dup 2)))]
-  "s390_match_ccmode (insn, CCL2mode)"
+  ""
   "@
slr\t%0,%2
slrk\t%0,%1,%2
@@ -6754,22 +6754,50 @@
 ; add(di|si)cc instruction pattern(s).
 ;
 
+(define_expand "uaddc5"
+  [(match_operand:GPR 0 "register_operand")
+   (match_operand:GPR 1 "nonimmediate_operand")
+   (match_operand:GPR 2 "nonimmediate_operand")
+   (match_operand:GPR 3 "nonimmediate_operand")
+   (match_operand:GPR 4 "general_operand")]
+  "TARGET_Z196 && (mode != DImode || TARGET_64BIT)"
+{
+  rtx cond = gen_rtx_LTU (mode, gen_rtx_REG (CCL1mode, CC_REGNUM), 
const0_rtx);
+  if (operands[4] == const0_rtx)
+emit_insn (gen_add3_carry1_cc (operands[0], operands[2], 
operands[3]));
+  else
+{
+  rtx tmp;
+  if (CONSTANT_P (operands[4]))
+   {
+ tmp = gen_reg_rtx (SImode);
+ emit_move_insn (tmp, operands[4]);
+   }
+  else
+   tmp = operands[4];
+  s390_emit_compare (LTU, tmp, const0_rtx);
+  emit_insn (gen_add3_alc_carry1_cc (operands[0], operands[2], 
operands[3], cond));
+}
+  emit_insn (gen_movcc (operands[1], cond, const1_rtx, const0_rtx));
+  DONE;
+})
+
 ;

Re: [RFC 0/4] Hard Register Constraints

2024-09-18 Thread Stefan Schulze Frielinghaus

On Wed, Sep 18, 2024 at 03:53:37PM +0200, Michael Matz wrote:
> Hello,
> 
> On Thu, 12 Sep 2024, Stefan Schulze Frielinghaus wrote:
> 
> > > > #define call_on_stack(stack, func, asm_call, argconstr...)  
> > > > \
> > > > {   
> > > > \
> > > >  register void *tos asm("r11"); 
> > > >  \
> > > > 
> > > >  \
> > > >  tos = ((void *)(stack));   
> > > >  \
> > > > 
> > > >  \
> > > >  asm_inline volatile(   
> > > >  \
> > > >  "movq   %%rsp, (%[tos]) \n"
> > > >  \
> > > >  "movq   %[tos], %%rsp   \n"
> > > >  \
> > > > 
> > > >  \
> > > >  asm_call   
> > > >  \
> > > > 
> > > >  \
> > > >  "popq   %%rsp   \n"
> > > >  \
> > > > 
> > > >  \
> > > >  : "+r" (tos), ASM_CALL_CONSTRAINT  
> > > >  \
> > > >  : [__func] "i" (func), [tos] "r" (tos) argconstr   
> > > >  \
> > > >  : "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10",  
> > > >  \
> > > >"memory" 
> > > >  \
> > > >  ); 
> > > >  \
> > > > }
> > 
> > I didn't find documentation how "digit references" behave in combination
> > with register asm.
> 
> Because noone thought of that corner case while documenting stuff :-)
> 
> As you say: it only works because the involved inputs/outputs are the same 
> expression.  If they weren't the inconsistency would be detected in 
> reload/LRA when the necessary reloads would need to be generated and the 
> pass would find that that's impossible.
> 
> Now, question is, what to do in this case in the light of a new feature.  
> I would say that while perhaps sometimes convenient it's more likely to be 
> a programmers fault, so for your new hardreg constraints it seems better 
> to ...
> 
> > Anyway, I digress.  I haven't made up my mind how hard register
> > constraints should behave in those cases, i.e., in cases where multiple
> > inputs share the same register.  If the inputs are different or may be
> > different, then we can reject those programs.
> > 
> > asm ("" : "={r4}" (x) : "{r5}" (42), "{r5}" (24));
> > 
> > Whereas if the operands are provable equal (assuming y is not volatile)
> > 
> > asm ("" : "={r4}" (x) : "{r5}" (y), "{r5}" (y));
> > 
> > we could accept those programs.  Currently, I error out even for
> > programs of the latter form which may be a bit too restrictive.
> 
> ... do exactly this, error out unconditionally.  I wouldn't change 
> behaviour for existing features, i.e. register-asm vars plus matching 
> constraints (if for inout operands, or explicit matching constraints 
> doesn't matter) because there's existing usage that happens to work fine.
> 
> Why unconditionally and not just "when expressions are different"?  
> Because the latter is inherently hard to see when optimizations are 
> involved: is "(a + 0)" the same as "(a)"?  At which optimization levels?
> What if the "0" is an expression that needs further analysis to see that 
> it's actually zero?  And so on.

Thanks for sharing this.  I also tend to error out in those cases as it
rather looks like a programming error.  I came up with an updated
version just a few minutes ago where I added some documentation which
also discusses this.  I also added some discussion for output operands
where I also tend to error out because those look like programming
errors.

> 
> If it's not easily possible to error out only for the new hard-reg 
> constraints, and accept whatever is there for register-asm vars and 
> matching constraints, then I would opt to also _not_ error out for the new 
> feature, though.  Essentially that's saying that if the user writes 
> wacky code then its their responsibility that everything works out, which 
> is exactly what the current implementation does: if at the end no reloads 
> are required, it's fine (because it indeed adheres to all given 
> constraints!), otherwise we give an error.

I will have a look and try to distinguish between both mechanisms during
error checking.

Thanks!
Stefan

[PATCH v2 3/4] genoutput: Verify hard register constraints

2024-09-18 Thread Stefan Schulze Frielinghaus

Since genoutput has no information about hard register names we cannot
statically verify those names in constraints of the machine description.
Therefore, we have to do it at runtime.  Although verification shouldn't
be too expensive, restrict it to checking builds.  This should be
sufficient since hard register constraints in machine descriptions
probably change rarely, and each commit should be tested with checking
anyway, or at the very least before a release is taken.
---
 gcc/genoutput.cc | 46 ++
 gcc/output.h |  2 ++
 gcc/toplev.cc|  4 
 3 files changed, 52 insertions(+)

diff --git a/gcc/genoutput.cc b/gcc/genoutput.cc
index 2ffb2fb28d2..4f4fde83608 100644
--- a/gcc/genoutput.cc
+++ b/gcc/genoutput.cc
@@ -200,6 +200,8 @@ static const char indep_constraints[] = ",=+%*?!^$#&g";
 static class constraint_data *
 constraints_by_letter_table[1 << CHAR_BIT];
 
+static hash_set used_reg_names;
+
 static int mdep_constraint_len (const char *, file_location, int);
 static void note_constraint (md_rtx_info *);
 
@@ -1156,6 +1158,45 @@ main (int argc, const char **argv)
   output_insn_data ();
   output_get_insn_name ();
 
+  /* Since genoutput has no information about hard register names we cannot
+ statically verify hard register names in constraints of the machine
+ description.  Therefore, we have to do it at runtime.  Although
+ verification shouldn't be too expensive, restrict it to checking builds.
+   */
+  printf ("\n\n#if CHECKING_P\n");
+  if (used_reg_names.is_empty ())
+printf ("void verify_reg_names_in_constraints () { }\n");
+  else
+{
+  size_t max_len = 0;
+  for (auto it = used_reg_names.begin (); it != used_reg_names.end (); 
++it)
+   {
+ size_t len = strlen (*it);
+ if (len > max_len)
+   max_len = len;
+   }
+  printf ("void\nverify_reg_names_in_constraints ()\n{\n");
+  printf ("  static const char hregnames[%zu][%zu] = {\n",
+ used_reg_names.elements (), max_len + 1);
+  auto it = used_reg_names.begin ();
+  while (it != used_reg_names.end ())
+   {
+ printf ("\"%s\"", *it);
+ ++it;
+ if (it != used_reg_names.end ())
+   printf (",");
+ printf ("\n");
+   }
+  printf ("  };\n");
+  printf ("  for (size_t i = 0; i < %zu; ++i)\n",
+ used_reg_names.elements ());
+  printf ("if (decode_reg_name (hregnames[i]) < 0)\n");
+  printf ("  internal_error (\"invalid register %%qs used in "
+ "constraint of machine description\", hregnames[i]);\n");
+  printf ("}\n");
+}
+  printf ("#endif\n");
+
   fflush (stdout);
   return (ferror (stdout) != 0 || have_error
? FATAL_EXIT_CODE : SUCCESS_EXIT_CODE);
@@ -1294,6 +1335,11 @@ mdep_constraint_len (const char *s, file_location loc, 
int opno)
   ptrdiff_t len = end - s;
   if (*end == '}' && len > 1 && len < 31)
{
+ char *regname = new char[len];
+ memcpy (regname, s + 1, len - 1);
+ regname[len - 1] = '\0';
+ if (used_reg_names.add (regname))
+   delete[] regname;
  return len + 1;
}
 }
diff --git a/gcc/output.h b/gcc/output.h
index 46b0033b221..5f0f8a6098c 100644
--- a/gcc/output.h
+++ b/gcc/output.h
@@ -636,4 +636,6 @@ extern int default_address_cost (rtx, machine_mode, 
addr_space_t, bool);
 /* Stack usage.  */
 extern void output_stack_usage (void);
 
+extern void verify_reg_names_in_constraints ();
+
 #endif /* ! GCC_OUTPUT_H */
diff --git a/gcc/toplev.cc b/gcc/toplev.cc
index bc442a08c63..34c372ad1a2 100644
--- a/gcc/toplev.cc
+++ b/gcc/toplev.cc
@@ -1817,6 +1817,10 @@ backend_init_target (void)
 static void
 backend_init (void)
 {
+#if CHECKING_P
+  verify_reg_names_in_constraints ();
+#endif
+
   init_emit_once ();
 
   init_rtlanal ();
-- 
2.45.2

[PATCH v2 2/4] Error handling for hard register constraints

2024-09-18 Thread Stefan Schulze Frielinghaus

This implements some basic error handling for hard register constraints
including potential conflicts with register asm operands.

In contrast to register asm operands, hard register constraints allow
more than just one register per operand.  Even more than just one
register per alternative.  For example, a valid constraint for an
operand is "{r0}{r1}m,{r2}".  However, this also means that we have to
make sure that each register is used at most once in each alternative
over all outputs and likewise over all inputs.  For asm statements this
is done by this patch during gimplification.  For hard register
constraints used in machine description, error handling is still a todo
and I haven't investigated this so far and consider this rather a low
priority.

There are 9/10 call sites for parse_{input,output}_constraint() which I
didn't dare to touch in the first run.  If this patch is about to be
accepted I could change those call sites and explicitly pass a null
pointer instead of overloading those functions as it is done right now.
I consider this an implementation nit and didn't want to clutter the
patch for reviewing.
---
 gcc/cfgexpand.cc  |  42 
 gcc/gimplify.cc   |  73 +-
 gcc/gimplify_reg_info.h   | 130 ++
 gcc/stmt.cc   | 229 +-
 gcc/stmt.h|   8 +-
 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c   |  83 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c   |  20 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c   |  21 ++
 gcc/testsuite/gcc.dg/pr87600-2.c  |  30 +--
 gcc/testsuite/gcc.dg/pr87600-3.c  |  35 +++
 gcc/testsuite/gcc.dg/pr87600-3.s  |   0
 .../gcc.target/s390/asm-hard-reg-1.c  | 103 
 .../gcc.target/s390/asm-hard-reg-2.c  |  43 
 .../gcc.target/s390/asm-hard-reg-3.c  |  42 
 gcc/testsuite/lib/scanasm.exp |   4 +
 15 files changed, 779 insertions(+), 84 deletions(-)
 create mode 100644 gcc/gimplify_reg_info.h
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c
 create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.c
 create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.s
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 13f8c08d295..fdbbd93f1b5 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -2966,44 +2966,6 @@ expand_asm_loc (tree string, int vol, location_t locus)
   emit_insn (body);
 }
 
-/* Return the number of times character C occurs in string S.  */
-static int
-n_occurrences (int c, const char *s)
-{
-  int n = 0;
-  while (*s)
-n += (*s++ == c);
-  return n;
-}
-
-/* A subroutine of expand_asm_operands.  Check that all operands have
-   the same number of alternatives.  Return true if so.  */
-
-static bool
-check_operand_nalternatives (const vec &constraints)
-{
-  unsigned len = constraints.length();
-  if (len > 0)
-{
-  int nalternatives = n_occurrences (',', constraints[0]);
-
-  if (nalternatives + 1 > MAX_RECOG_ALTERNATIVES)
-   {
- error ("too many alternatives in %");
- return false;
-   }
-
-  for (unsigned i = 1; i < len; ++i)
-   if (n_occurrences (',', constraints[i]) != nalternatives)
- {
-   error ("operand constraints for % differ "
-  "in number of alternatives");
-   return false;
- }
-}
-  return true;
-}
-
 /* Check for overlap between registers marked in CLOBBERED_REGS and
anything inappropriate in T.  Emit error and return the register
variable definition for error, NULL_TREE for ok.  */
@@ -3169,10 +3131,6 @@ expand_asm_stmt (gasm *stmt)
= TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (t)));
 }
 
-  /* ??? Diagnose during gimplification?  */
-  if (! check_operand_nalternatives (constraints))
-return;
-
   /* Count the number of meaningful clobbered registers, ignoring what
  we would ignore later.  */
   auto_vec clobber_rvec;
diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 26a216e151d..08e0b5d047b 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -70,6 +70,10 @@ along with GCC; see the file COPYING3.  If not see
 #include "omp-offload.h"
 #include "context.h"
 #include "tree-nested.h"
+#include "insn-config.h"
+#include "recog.h"
+#include "output.h"
+#include "gimplify_reg_info.h"
 
 /* Identifier for a basic condition, mapping it to other basic conditions of
its Boolean expression.  Basic conditions given the same uid (in the same
@@ -7009,6 +7013,42 @@ gimplify_addr_expr (tree *expr_p, gimple_seq *pre_p, 
gimple_seq *

[PATCH v2 4/4] Rewrite register asm into hard register constraints

2024-09-18 Thread Stefan Schulze Frielinghaus

Currently a register asm already materializes during expand.  This
means, a hard register is allocated for the very first access of a
register asm as e.g. in an assignment.  As a consequence this might lead
to suboptimal register allocation if the assignment and the using asm
statement are spread far apart.  Even more problematic are function
calls in between register asm assignments and its using asm statement
since hard registers may be clobbered by a call.  The former may be
solved by pulling register asm assignments and asm statements close by.
However, the latter is not easily solved since sometimes function calls
are implicit.  For example

int
foo (int *x)
{
  register int y asm ("0") = 42;
  register int z asm ("1") = *x;
  asm ("bar\t%0,%1" : "+r" (z) : "r" (y));
  return z;
}

If compiled with address sanitizer, then a function call is introduced
for the memory load which in turn may interfere with the initialization
of register asm y.  Likewise, for some targets and configurations even
an operation like an addition may lead to an implicit library call.

In contrast hard register constraints materialize during register
allocation and therefore do not suffer from this, i.e., asm operands are
kept in pseudos until RA.  This patch adds the feature of rewriting
local register asm into code which exploits hard register constraints.
For example

register int global asm ("r3");

int foo (int x0)
{
  register int x asm ("r4") = x0;
  register int y asm ("r5");

  asm ("bar\t%0,%1,%2" : "=r" (x) : "0" (x), "r" (global));
  x += 42;
  asm ("baz\t%0,%1" : "=r" (y) : "r" (x));

  return y;
}

is rewritten during gimplification into

register int global asm ("r3");

int foo (int x0)
{
  int x = x0;
  int y;

  asm ("bar\t%0,%1,%2" : "={r4}" (x) : "0" (x), "r" (global));
  x += 42;
  asm ("baz\t%0,%1" : "={r5}" (y) : "{r4}" (x));

  return y;
}

The resulting code solely relies on hard register constraints modulo
global register asm.

Since I consider this as an experimental feature it is hidden behind new
flag -fdemote-register-asm (I'm open for other naming suggestions).
---
 gcc/common.opt|  4 +
 gcc/gimplify.cc   | 78 +++
 .../gcc.dg/asm-hard-reg-demotion-1.c  | 19 +
 .../gcc.dg/asm-hard-reg-demotion-2.c  | 19 +
 gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h  | 52 +
 5 files changed, 172 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-1.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-2.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h

diff --git a/gcc/common.opt b/gcc/common.opt
index ea39f87ae71..859a735a0b7 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3422,6 +3422,10 @@ fverbose-asm
 Common Var(flag_verbose_asm)
 Add extra commentary to assembler output.
 
+fdemote-register-asm
+Common Var(flag_demote_register_asm) Init(0)
+Demote local register asm and use hard register constraints instead
+
 fvisibility=
 Common Joined RejectNegative Enum(symbol_visibility) Var(default_visibility) 
Init(VISIBILITY_DEFAULT)
 -fvisibility=[default|internal|hidden|protected]   Set the default symbol 
visibility.
diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 08e0b5d047b..c9bd1769c28 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -7049,6 +7049,73 @@ num_alternatives (const_tree link)
   return num + 1;
 }
 
+static hash_set demote_register_asm;
+
+static void
+gimplify_demote_register_asm (tree link)
+{
+  if (!flag_demote_register_asm)
+return;
+  tree op = TREE_VALUE (link);
+  if (!VAR_P (op) || !DECL_HARD_REGISTER (op) || is_global_var (op))
+return;
+  tree id = DECL_ASSEMBLER_NAME (op);
+  const char *regname = IDENTIFIER_POINTER (id);
+  ++regname;
+  int regno = decode_reg_name (regname);
+  if (regno < 0)
+/* This indicates an error and we error out later on.  */
+return;
+  const char *constraint = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE 
(link)));
+  auto_vec constraint_new;
+  for (const char *p = constraint; *p; )
+{
+  bool pushed = false;
+  switch (*p)
+   {
+   case '+': case '=': case '%': case '?': case '!': case '*': case '&':
+   case '#': case '$': case '^': case '{': case 'E': case 'F': case 'G':
+   case 'H': case 's': case 'i': case 'n': case 'I': case 'J': case 'K':
+   case 'L': case 'M': case 'N': case 'O': case 'P': case ',': case '0':
+   case '1': case '2': case '3': case '4': case '5': case '6': case '7':
+   case '8': case '9': case '[': case '<': case '>': case 'g': case 'X':
+ break;
+
+   default:
+ if (!ISALPHA (*p))
+   break;
+ enum constraint_num cn = lookup_constraint (p);
+ enum reg_class rclass = reg_class_for_constraint (cn);
+ if (rclass != NO_REGS || insn_extra_address_constraint (cn))
+   {
+ gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (regno), r

[PATCH v2 0/4] Hard Register Constraints

2024-09-18 Thread Stefan Schulze Frielinghaus

This is a follow-up to
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662725.html

I basically added only some documentation to the first patch.  If you
think that gcc/doc/extend.texi isn't the right place (especially the
discussion part which will be dropped/rephrased in the end anyway), then
just let me know.

Stefan Schulze Frielinghaus (4):
  Hard register constraints
  Error handling for hard register constraints
  genoutput: Verify hard register constraints
  Rewrite register asm into hard register constraints

 gcc/cfgexpand.cc  |  42 ---
 gcc/common.opt|   4 +
 gcc/doc/extend.texi   | 189 
 gcc/doc/md.texi   |   6 +
 gcc/function.cc   | 116 
 gcc/genoutput.cc  |  60 
 gcc/genpreds.cc   |   4 +-
 gcc/gimplify.cc   | 151 +-
 gcc/gimplify_reg_info.h   | 130 +
 gcc/ira.cc|  79 +-
 gcc/lra-constraints.cc|  13 +
 gcc/output.h  |   2 +
 gcc/recog.cc  |  11 +-
 gcc/stmt.cc   | 268 +-
 gcc/stmt.h|   9 +-
 gcc/testsuite/gcc.dg/asm-hard-reg-1.c |  85 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-2.c |  33 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-3.c |  25 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-4.c |  50 
 gcc/testsuite/gcc.dg/asm-hard-reg-5.c |  36 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-6.c |  60 
 gcc/testsuite/gcc.dg/asm-hard-reg-7.c |  41 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-8.c |  49 
 .../gcc.dg/asm-hard-reg-demotion-1.c  |  19 ++
 .../gcc.dg/asm-hard-reg-demotion-2.c  |  19 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h  |  52 
 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c   |  83 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c   |  20 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c   |  21 ++
 gcc/testsuite/gcc.dg/pr87600-2.c  |  30 +-
 gcc/testsuite/gcc.dg/pr87600-3.c  |  35 +++
 gcc/testsuite/gcc.dg/pr87600-3.s  |   0
 .../gcc.target/s390/asm-hard-reg-1.c  | 103 +++
 .../gcc.target/s390/asm-hard-reg-2.c  |  43 +++
 .../gcc.target/s390/asm-hard-reg-3.c  |  42 +++
 gcc/testsuite/lib/scanasm.exp |   4 +
 gcc/toplev.cc |   4 +
 37 files changed, 1851 insertions(+), 87 deletions(-)
 create mode 100644 gcc/gimplify_reg_info.h
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-1.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-2.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-3.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-4.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-5.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-6.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-7.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-8.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-1.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-2.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c
 create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.c
 create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.s
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c

-- 
2.45.2

[PATCH v2 1/4] Hard register constraints

2024-09-18 Thread Stefan Schulze Frielinghaus

Implement hard register constraints of the form {regname} where regname
must be any valid register name for the target.  Such constraints may be
used in asm statements as a replacement for register asm and in machine
descriptions.

Due to optimizations it is not unexpected if two or more inputs require
the same value, then those also share a common pseudo.  However, this in
turn may lead to unsatisfiable asm where multiple inputs with different
hard register constraints share the same pseudo.  Therefore, we have to
introduce copies of such a pseudo and use these for conflicting inputs.
This is done prior to RA during asmcons in match_asm_constraints_2().
While IRA tries to reduce live ranges, it also replaces some
register-register moves.  That in turn might undo those copies of a
pseudo which we just introduced during asmcons.  Thus, check in
decrease_live_ranges_number() via valid_replacement_for_asm_input_p()
whether it is valid to perform a replacement.

The remainder of the patch mostly deals with parsing and decoding hard
register constraints.  The actual work is done by LRA in
process_alt_operands() where a register filter, according to the
constraint, is installed.

For the sake of "reviewability" and in order to show the beauty of LRA,
error handling (which gets pretty involved) is spread out into a
subsequent patch.

Limitation: Currently, a fixed register cannot be used as hard register
constraint.  For example, accessing the stack pointer on x86_64 via

void *
foo (void)
{
  void *y;
  __asm__ ("" : "={rsp}" (y));
  return y;
}

leads to an error.  This is unfortunate since register asm does not have
this limitation.  The culprit seems to be that during reload
ira_class_hard_regs_num[rclass] does not even include fixed registers
which is why lra_assign() ultimately fails.  Does anyone have an idea
how to lift this limitation?  Maybe there is even a shortcut in order to
force a pseudo into a hard reg?
---
 gcc/doc/extend.texi   | 189 ++
 gcc/doc/md.texi   |   6 +
 gcc/function.cc   | 116 
 gcc/genoutput.cc  |  14 ++
 gcc/genpreds.cc   |   4 +-
 gcc/ira.cc|  79 ++-
 gcc/lra-constraints.cc|  13 ++
 gcc/recog.cc  |  11 +-
 gcc/stmt.cc   |  39 ++
 gcc/stmt.h|   1 +
 gcc/testsuite/gcc.dg/asm-hard-reg-1.c |  85 
 gcc/testsuite/gcc.dg/asm-hard-reg-2.c |  33 +
 gcc/testsuite/gcc.dg/asm-hard-reg-3.c |  25 
 gcc/testsuite/gcc.dg/asm-hard-reg-4.c |  50 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-5.c |  36 +
 gcc/testsuite/gcc.dg/asm-hard-reg-6.c |  60 
 gcc/testsuite/gcc.dg/asm-hard-reg-7.c |  41 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-8.c |  49 +++
 18 files changed, 848 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-1.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-2.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-3.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-4.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-5.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-6.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-7.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-8.c

diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 0ea7a87053c..f8cbbabeba7 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -12549,6 +12549,7 @@ the two, as explained in the sections below.
 @menu
 * Global Register Variables::   Variables declared at global scope.
 * Local Register Variables::Variables declared within a function.
+* Hard Register Constraints::   Operands forced into specific machine 
registers.
 @end menu
 
 @node Global Register Variables
@@ -12754,6 +12755,194 @@ with slightly different characteristics (@pxref{MIPS 
Coprocessors,,
 Defining coprocessor specifics for MIPS targets, gccint, 
 GNU Compiler Collection (GCC) Internals}).
 
+@node Hard Register Constraints
+@subsubsection Hard Register Constraints
+
+Similar to register @code{asm} but still distinct, hard register constraints
+are another way to force operands of inline @code{asm} into specific machine
+registers.  In contrast to register @code{asm} where a variable is bound to a
+machine register, a hard register constraint loads an @code{asm} operand into a
+machine register.  Assume in the following that @code{r4} is a general-purpose
+register, @code{f5} a floating-point register, and @code{v6} a vector register
+for some target.
+
+@smallexample
+int x;
+int y __attribute__ ((vector_size (16)));
+@dots{}
+asm ("some instructions"
+ : "=@{r4@}" (x)
+ : "@{f5@}" (42.0), "@{v6@}" (y));
+@end smallexample
+
+For the inline @code{asm}, variable @code{x} is loaded into register @code{r4},
+and @code{y} into @code{v6}.  Furthermore, constant @co

[PATCH] s390: Fix TF to FPRX2 conversion [PR115860]

2024-09-12 Thread Stefan Schulze Frielinghaus

Bootstrapped and regtested on s390.  Approved offlist and, as also
discussed offlist, I went for removing format specifier %V.  This fixes

FAIL: g++.dg/cpp23/ext-floating14.C  -std=gnu++23 execution test
FAIL: g++.dg/cpp23/ext-floating14.C  -std=gnu++26 execution test
FAIL: c-c++-common/ubsan/float-cast-overflow-7.c   -O2  execution test
FAIL: c-c++-common/ubsan/float-cast-overflow-7.c   -O2 -flto 
-fno-use-linker-plugin -flto-partition=none  execution test
FAIL: c-c++-common/ubsan/float-cast-overflow-7.c   -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O0  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O1  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O2  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O2 -flto 
-fno-use-linker-plugin -flto-partition=none  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O3 -g  execution 
test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -Os  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O0  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O1  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O2  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O2 -flto 
-fno-use-linker-plugin -flto-partition=none  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O3 -g  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -Os  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O0  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O1  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O2  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O2 -flto -fno-use-linker-plugin 
-flto-partition=none  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O2 -flto -fuse-linker-plugin 
-fno-fat-lto-objects  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O3 -g  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -Os  execution test
FAIL: gfortran.dg/pr96711.f90   -O0  execution test
FAIL: libffi.closures/nested_struct5.c -W -Wall -Wno-psabi -O2 output pattern 
test
FAIL: libphobos.phobos/std/algorithm/mutation.d execution test
FAIL: libphobos.phobos/std/conv.d execution test
FAIL: libphobos.phobos/std/internal/math/errorfunction.d execution test
FAIL: libphobos.phobos/std/variant.d execution test
FAIL: libphobos.phobos_shared/std/algorithm/mutation.d execution test
FAIL: libphobos.phobos_shared/std/conv.d execution test
FAIL: libphobos.phobos_shared/std/internal/math/errorfunction.d execution test
FAIL: libphobos.phobos_shared/std/variant.d execution test

I will push shortly.

-- >8 --

Currently subregs originating from *tf_to_fprx2_0 and *tf_to_fprx2_1
survive register allocation.  This in turn leads to wrong register
renaming.  Keeping the current approach would mean we need two insns for
*tf_to_fprx2_0 and *tf_to_fprx2_1, respectively.  Something along the
lines

(define_insn "*tf_to_fprx2_0"
  [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "=f") 0)
(unspec:DF [(match_operand:TF 1 "general_operand" "v")]
   UNSPEC_TF_TO_FPRX2_0))]
  "TARGET_VXE"
  "#")

(define_insn "*tf_to_fprx2_0"
  [(set (match_operand:DF 0 "nonimmediate_operand" "=f")
(unspec:DF [(match_operand:TF 1 "general_operand" "v")]
   UNSPEC_TF_TO_FPRX2_0))]
  "TARGET_VXE"
  "vpdi\t%v0,%v1,%v0,1"
  [(set_attr "op_type" "VRR")])

and similar for *tf_to_fprx2_1.  Note, pre register allocation operand 0
has mode FPRX2 and afterwards DF once subregs have been eliminated.

Since we always copy a whole vector register into a floating-point
register pair, another way to fix this is to merge *tf_to_fprx2_0 and
*tf_to_fprx2_1 into a single insn which means we don't have to use
subregs at all.  The downside of this is that the assembler template
contains two instructions, now.  The upside is that we don't have to
come up with some artificial insn before RA which might be more
readable/maintainable.  That is implemented by this patch.

In commit r11-4872-ge627cda5686592, the output operand specifier %V was
introduced which is used in tf_to_fprx2 only, now.  Instead of coming up
with its counterpart %F for floating-point registers, which would also
only be used in tf_to_fprx2, I print the operands directly.  This
renders %V unused which is why it is removed by this patch.

gcc/ChangeLog:

PR 115860
* config/s390/s390.cc (print_operand): Remove operand specifier
%V.
* config/s390

[PATCH] s390: Fix AQ and AR constraints

2024-09-12 Thread Stefan Schulze Frielinghaus

Ensure for AQ and AR constraints that the resulting displacement after
adding any positive offset less than the size of the object being
referenced is still valid.

Bootstrapped and regtested on s390.  As approved by
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662865.html
I will push shortly.

gcc/ChangeLog:

* config/s390/s390.cc (s390_mem_constraint): Check displacement
for AQ and AR constraints.
---
 gcc/config/s390/s390.cc | 12 
 1 file changed, 12 insertions(+)

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 7aea776da2f..ae1f369e19d 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -3714,6 +3714,18 @@ s390_mem_constraint (const char *str, rtx op)
   if ((reload_completed || reload_in_progress)
  ? !offsettable_memref_p (op) : !offsettable_nonstrict_memref_p (op))
return 0;
+  /* offsettable_memref_p ensures only that any positive offset added to
+the address forms a valid general address.  For AQ and AR constraints
+we also have to verify that the resulting displacement after adding
+any positive offset less than the size of the object being referenced
+is still valid.  */
+  if (str[1] == 'Q' || str[1] == 'R')
+   {
+ int o = GET_MODE_SIZE (GET_MODE (op)) - 1;
+ rtx tmp = adjust_address (op, QImode, o);
+ if (!s390_check_qrst_address (str[1], XEXP (tmp, 0), true))
+   return 0;
+   }
   return s390_check_qrst_address (str[1], XEXP (op, 0), true);
 case 'B':
   /* Check for non-literal-pool variants of memory constraints.  */
-- 
2.45.2

Re: [RFC 0/4] Hard Register Constraints

2024-09-12 Thread Stefan Schulze Frielinghaus

On Thu, Sep 12, 2024 at 04:03:33PM +0200, Georg-Johann Lay wrote:
> 
> 
> Am 10.09.24 um 16:20 schrieb Stefan Schulze Frielinghaus:
> > This series introduces hard register constraints.  The first patch
> > enables hard register constraints for asm statements and for
> > machine descriptions.  The subsequent patch adds some basic error
> > handling for asm statements.  The third patch adds some verification of
> > register names used in machine description.  The fourth and last patch
> > adds the feature of rewriting local register asm into hard register
> > constraints.
> > 
> > This series was bootstrapped and regtested on s390.  Furthermore, the
> > new dg-compile tests were verified via cross compilers for the enabled
> > targets.  There is still some fallout if -fdemote-register-asm is used
> > since a couple of features are missing as e.g. erroring out during
> > gimplification if the clobber set of registers intersects with
> > input/output registers.
> > 
> > As a larger test vehicle I've compiled and regtested glibc on s390 using
> > -fdemote-register-asm without any fallout.  On x86_64 this fails due to
> > the limitation that fixed registers are currently not supported for hard
> > register constraints (see commit message of the first patch).  This is
> > also the reason why I'm posting this series already since I was hoping
> > to get some feedback about this limitation.
> > 
> > Furthermore, I've compiled the Linux kernel on s390 and x86_64 with
> > -fdemote-register-asm.  Interestingly, the Linux kernel for x86_64 makes
> > use of the following asm statement:
> > 
> > #define call_on_stack(stack, func, asm_call, argconstr...)  \
> > {   \
> >  register void *tos asm("r11");  \
> >  \
> >  tos = ((void *)(stack));\
> >  \
> >  asm_inline volatile(\
> >  "movq   %%rsp, (%[tos]) \n" \
> >  "movq   %[tos], %%rsp   \n" \
> >  \
> >  asm_call\
> >  \
> >  "popq   %%rsp   \n" \
> >  \
> >  : "+r" (tos), ASM_CALL_CONSTRAINT   \
> >  : [__func] "i" (func), [tos] "r" (tos) argconstr\
> >  : "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10",   \
> >"memory"  \
> >  );  \
> > }
> > 
> > Note the output
> >"+r" (tos)
> > and the input
> >[tos] "r" (tos)
> > Currently I error out for this since I consider this as two inputs using
> > the same hard register.  One time an implicit input via '+' and a second
> > time via the explicit input.  Thus, actually I would expect a '='
> 
> Would you explain why the two operands are supposed to live in the same
> hard register?
> 
> From my understanding of asm semantics, this gives you two copies of
> tos:  The 1st one may be altered by the asm, and the 2nd one may not be
> changed.  As the operands neither refer to each other by "0" nor don't
> they use the same (single-register) register constraint, there is no
> reason / requirement to allocate the two operands to the same reg, no?

During gimplification an inout operand is canonicalized into one output
and one input operand.  The input operand refers via a digit to the
output operand.  For example

asm ("" : "+r" (x));

is rewritten into

asm ("" : "=r" (x) : "0" (x));

I didn't find documentation on how "digit references" behave in combination
with register asm.  At least it is not defined here
https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#index-0-in-constraint

Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]

2024-09-11 Thread Stefan Schulze Frielinghaus

On Wed, Sep 11, 2024 at 08:57:23PM +0200, Ilya Leoshkevich wrote:
> On Wed, 2024-09-11 at 16:44 +0200, Stefan Schulze Frielinghaus wrote:
> > On Wed, Sep 11, 2024 at 01:59:48PM +0200, Ilya Leoshkevich wrote:
> > > On Wed, 2024-09-11 at 13:34 +0200, Stefan Schulze Frielinghaus
> > > wrote:
> > > > On Wed, Sep 11, 2024 at 01:22:30PM +0200, Ilya Leoshkevich wrote:
> > > > > On Wed, 2024-09-11 at 12:35 +0200, Stefan Schulze Frielinghaus
> > > > > wrote:
> > > > > > On Wed, Sep 11, 2024 at 11:47:54AM +0200, Ilya Leoshkevich
> > > > > > wrote:
> > > > > > > On Fri, 2024-08-16 at 09:41 +0200, Stefan Schulze
> > > > > > > Frielinghaus
> > > > > > > wrote:
> > > > > > > > Currently subregs originating from *tf_to_fprx2_0 and
> > > > > > > > *tf_to_fprx2_1
> > > > > > > > survive register allocation.  This in turn leads to wrong
> > > > > > > > register
> > > > > > > > renaming.  Keeping the current approach would mean we
> > > > > > > > need
> > > > > > > > two
> > > > > > > > insns
> > > > > > > > for
> > > > > > > > *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively. 
> > > > > > > > Something
> > > > > > > > along
> > > > > > > > the
> > > > > > > > lines
> > > > > > > > 
> > > > > > > > (define_insn "*tf_to_fprx2_0"
> > > > > > > >   [(set (subreg:DF (match_operand:FPRX2 0
> > > > > > > > "nonimmediate_operand"
> > > > > > > > "=f") 0)
> > > > > > > >     (unspec:DF [(match_operand:TF 1 "general_operand"
> > > > > > > > "v")]
> > > > > > > >    UNSPEC_TF_TO_FPRX2_0))]
> > > > > > > >   "TARGET_VXE"
> > > > > > > >   "#")
> > > > > > > > 
> > > > > > > > (define_insn "*tf_to_fprx2_0"
> > > > > > > >   [(set (match_operand:DF 0 "nonimmediate_operand" "=f")
> > > > > > > >     (unspec:DF [(match_operand:TF 1 "general_operand"
> > > > > > > > "v")]
> > > > > > > >    UNSPEC_TF_TO_FPRX2_0))]
> > > > > > > >   "TARGET_VXE"
> > > > > > > >   "vpdi\t%v0,%v1,%v0,1
> > > > > > > >   [(set_attr "op_type" "VRR")])
> > > > > > > > 
> > > > > > > > and similar for *tf_to_fprx2_1.  Note, pre register
> > > > > > > > allocation
> > > > > > > > operand 0
> > > > > > > > has mode FPRX2 and afterwards DF once subregs have been
> > > > > > > > eliminated.
> > > > > > > > 
> > > > > > > > Since we always copy a whole vector register into a
> > > > > > > > floating-
> > > > > > > > point
> > > > > > > > register pair, another way to fix this is to merge
> > > > > > > > *tf_to_fprx2_0
> > > > > > > > and
> > > > > > > > *tf_to_fprx2_1 into a single insn which means we don't
> > > > > > > > have
> > > > > > > > to
> > > > > > > > use
> > > > > > > > subregs at all.  The downside of this is that the
> > > > > > > > assembler
> > > > > > > > template
> > > > > > > > contains two instructions, now.  The upside is that we
> > > > > > > > don't
> > > > > > > > have
> > > > > > > > to
> > > > > > > > come up with some artificial insn before RA which might
> > > > > > > > be
> > > > > > > > more
> > > > > > > > readable/maintainable.  That is implemented by this
> > > > > > > > patch.
> > > > > > > > 
> > > > > > > > In commit r11-4872-ge627cda5686592, the output operand

Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]

2024-09-11 Thread Stefan Schulze Frielinghaus

On Wed, Sep 11, 2024 at 01:59:48PM +0200, Ilya Leoshkevich wrote:
> On Wed, 2024-09-11 at 13:34 +0200, Stefan Schulze Frielinghaus wrote:
> > On Wed, Sep 11, 2024 at 01:22:30PM +0200, Ilya Leoshkevich wrote:
> > > On Wed, 2024-09-11 at 12:35 +0200, Stefan Schulze Frielinghaus
> > > wrote:
> > > > On Wed, Sep 11, 2024 at 11:47:54AM +0200, Ilya Leoshkevich wrote:
> > > > > On Fri, 2024-08-16 at 09:41 +0200, Stefan Schulze Frielinghaus
> > > > > wrote:
> > > > > > Currently subregs originating from *tf_to_fprx2_0 and
> > > > > > *tf_to_fprx2_1
> > > > > > survive register allocation.  This in turn leads to wrong
> > > > > > register
> > > > > > renaming.  Keeping the current approach would mean we need
> > > > > > two
> > > > > > insns
> > > > > > for
> > > > > > *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively.  Something
> > > > > > along
> > > > > > the
> > > > > > lines
> > > > > > 
> > > > > > (define_insn "*tf_to_fprx2_0"
> > > > > >   [(set (subreg:DF (match_operand:FPRX2 0
> > > > > > "nonimmediate_operand"
> > > > > > "=f") 0)
> > > > > >     (unspec:DF [(match_operand:TF 1 "general_operand"
> > > > > > "v")]
> > > > > >    UNSPEC_TF_TO_FPRX2_0))]
> > > > > >   "TARGET_VXE"
> > > > > >   "#")
> > > > > > 
> > > > > > (define_insn "*tf_to_fprx2_0"
> > > > > >   [(set (match_operand:DF 0 "nonimmediate_operand" "=f")
> > > > > >     (unspec:DF [(match_operand:TF 1 "general_operand"
> > > > > > "v")]
> > > > > >    UNSPEC_TF_TO_FPRX2_0))]
> > > > > >   "TARGET_VXE"
> > > > > >   "vpdi\t%v0,%v1,%v0,1
> > > > > >   [(set_attr "op_type" "VRR")])
> > > > > > 
> > > > > > and similar for *tf_to_fprx2_1.  Note, pre register
> > > > > > allocation
> > > > > > operand 0
> > > > > > has mode FPRX2 and afterwards DF once subregs have been
> > > > > > eliminated.
> > > > > > 
> > > > > > Since we always copy a whole vector register into a floating-
> > > > > > point
> > > > > > register pair, another way to fix this is to merge
> > > > > > *tf_to_fprx2_0
> > > > > > and
> > > > > > *tf_to_fprx2_1 into a single insn which means we don't have
> > > > > > to
> > > > > > use
> > > > > > subregs at all.  The downside of this is that the assembler
> > > > > > template
> > > > > > contains two instructions, now.  The upside is that we don't
> > > > > > have
> > > > > > to
> > > > > > come up with some artificial insn before RA which might be
> > > > > > more
> > > > > > readable/maintainable.  That is implemented by this patch.
> > > > > > 
> > > > > > In commit r11-4872-ge627cda5686592, the output operand
> > > > > > specifier
> > > > > > %V
> > > > > > was
> > > > > > introduced which is used in tf_to_fprx2 only, now.  I didn't
> > > > > > come
> > > > > > up
> > > > > > with its counterpart like %F for floating-point registers. 
> > > > > > Instead I
> > > > > > printed the register pair in the output function directly. 
> > > > > > This
> > > > > > spares
> > > > > > us a new and "rare" format specifier for a single insn.  I
> > > > > > don't
> > > > > > have
> > > > > > a
> > > > > > strong opinion which option to choose, however, we should
> > > > > > either
> > > > > > add
> > > > > > %F
> > > > > > in order to mimic the same behaviour as %V or getting rid of
> > > > > > %V
> > > > > > and
> > > > > > inline the logic in the

Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]

2024-09-11 Thread Stefan Schulze Frielinghaus

On Wed, Sep 11, 2024 at 01:22:30PM +0200, Ilya Leoshkevich wrote:
> On Wed, 2024-09-11 at 12:35 +0200, Stefan Schulze Frielinghaus wrote:
> > On Wed, Sep 11, 2024 at 11:47:54AM +0200, Ilya Leoshkevich wrote:
> > > On Fri, 2024-08-16 at 09:41 +0200, Stefan Schulze Frielinghaus
> > > wrote:
> > > > Currently subregs originating from *tf_to_fprx2_0 and
> > > > *tf_to_fprx2_1
> > > > survive register allocation.  This in turn leads to wrong
> > > > register
> > > > renaming.  Keeping the current approach would mean we need two
> > > > insns
> > > > for
> > > > *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively.  Something along
> > > > the
> > > > lines
> > > > 
> > > > (define_insn "*tf_to_fprx2_0"
> > > >   [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand"
> > > > "=f") 0)
> > > >     (unspec:DF [(match_operand:TF 1 "general_operand" "v")]
> > > >    UNSPEC_TF_TO_FPRX2_0))]
> > > >   "TARGET_VXE"
> > > >   "#")
> > > > 
> > > > (define_insn "*tf_to_fprx2_0"
> > > >   [(set (match_operand:DF 0 "nonimmediate_operand" "=f")
> > > >     (unspec:DF [(match_operand:TF 1 "general_operand" "v")]
> > > >    UNSPEC_TF_TO_FPRX2_0))]
> > > >   "TARGET_VXE"
> > > >   "vpdi\t%v0,%v1,%v0,1
> > > >   [(set_attr "op_type" "VRR")])
> > > > 
> > > > and similar for *tf_to_fprx2_1.  Note, pre register allocation
> > > > operand 0
> > > > has mode FPRX2 and afterwards DF once subregs have been
> > > > eliminated.
> > > > 
> > > > Since we always copy a whole vector register into a floating-
> > > > point
> > > > register pair, another way to fix this is to merge *tf_to_fprx2_0
> > > > and
> > > > *tf_to_fprx2_1 into a single insn which means we don't have to
> > > > use
> > > > subregs at all.  The downside of this is that the assembler
> > > > template
> > > > contains two instructions, now.  The upside is that we don't have
> > > > to
> > > > come up with some artificial insn before RA which might be more
> > > > readable/maintainable.  That is implemented by this patch.
> > > > 
> > > > In commit r11-4872-ge627cda5686592, the output operand specifier
> > > > %V
> > > > was
> > > > introduced which is used in tf_to_fprx2 only, now.  I didn't come
> > > > up
> > > > with its counterpart like %F for floating-point registers. 
> > > > Instead I
> > > > printed the register pair in the output function directly.  This
> > > > spares
> > > > us a new and "rare" format specifier for a single insn.  I don't
> > > > have
> > > > a
> > > > strong opinion which option to choose, however, we should either
> > > > add
> > > > %F
> > > > in order to mimic the same behaviour as %V or getting rid of %V
> > > > and
> > > > inline the logic in the output function.  I lean towards the
> > > > latter.
> > > > Any preferences?
> > > > ---
> > > >  gcc/config/s390/s390.md    |  2 +
> > > >  gcc/config/s390/vector.md  | 66 +++-
> > > > 
> > > > --
> > > >  gcc/testsuite/gcc.target/s390/pr115860-1.c | 26 +
> > > >  3 files changed, 60 insertions(+), 34 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/s390/pr115860-1.c
> > > 
> > > [...]
> > > 
> > > > +  char buf[64];
> > > > +  switch (which_alternative)
> > > > +    {
> > > > +    case 0:
> > > > +  if (REGNO (operands[0]) == REGNO (operands[1]))
> > > > +   return "vpdi\t%V0,%v1,%V0,5";
> > > > +  else
> > > > +   return "ldr\t%f0,%f1;vpdi\t%V0,%v1,%V0,5";
> > > > +    case 1:
> > > > +  {
> > > > +   const char *reg_pair = reg_names[REGNO (operands[0]) +
> > > > 1];
> > > > +   snprintf (buf, sizeof (buf),
> > > > "ld\t%%f0,%%1;ld\t%%%s,8+%%1",
> > >

Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]

2024-09-11 Thread Stefan Schulze Frielinghaus

On Wed, Sep 11, 2024 at 11:47:54AM +0200, Ilya Leoshkevich wrote:
> On Fri, 2024-08-16 at 09:41 +0200, Stefan Schulze Frielinghaus wrote:
> > Currently subregs originating from *tf_to_fprx2_0 and *tf_to_fprx2_1
> > survive register allocation.  This in turn leads to wrong register
> > renaming.  Keeping the current approach would mean we need two insns
> > for
> > *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively.  Something along the
> > lines
> > 
> > (define_insn "*tf_to_fprx2_0"
> >   [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand"
> > "=f") 0)
> >     (unspec:DF [(match_operand:TF 1 "general_operand" "v")]
> >    UNSPEC_TF_TO_FPRX2_0))]
> >   "TARGET_VXE"
> >   "#")
> > 
> > (define_insn "*tf_to_fprx2_0"
> >   [(set (match_operand:DF 0 "nonimmediate_operand" "=f")
> >     (unspec:DF [(match_operand:TF 1 "general_operand" "v")]
> >    UNSPEC_TF_TO_FPRX2_0))]
> >   "TARGET_VXE"
> >   "vpdi\t%v0,%v1,%v0,1
> >   [(set_attr "op_type" "VRR")])
> > 
> > and similar for *tf_to_fprx2_1.  Note, pre register allocation
> > operand 0
> > has mode FPRX2 and afterwards DF once subregs have been eliminated.
> > 
> > Since we always copy a whole vector register into a floating-point
> > register pair, another way to fix this is to merge *tf_to_fprx2_0 and
> > *tf_to_fprx2_1 into a single insn which means we don't have to use
> > subregs at all.  The downside of this is that the assembler template
> > contains two instructions, now.  The upside is that we don't have to
> > come up with some artificial insn before RA which might be more
> > readable/maintainable.  That is implemented by this patch.
> > 
> > In commit r11-4872-ge627cda5686592, the output operand specifier %V
> > was
> > introduced which is used in tf_to_fprx2 only, now.  I didn't come up
> > with its counterpart like %F for floating-point registers.  Instead I
> > printed the register pair in the output function directly.  This
> > spares
> > us a new and "rare" format specifier for a single insn.  I don't have
> > a
> > strong opinion which option to choose, however, we should either add
> > %F
> > in order to mimic the same behaviour as %V or getting rid of %V and
> > inline the logic in the output function.  I lean towards the latter.
> > Any preferences?
> > ---
> >  gcc/config/s390/s390.md    |  2 +
> >  gcc/config/s390/vector.md  | 66 +++-
> > --
> >  gcc/testsuite/gcc.target/s390/pr115860-1.c | 26 +
> >  3 files changed, 60 insertions(+), 34 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/s390/pr115860-1.c
> 
> [...]
> 
> > +  char buf[64];
> > +  switch (which_alternative)
> > +    {
> > +    case 0:
> > +  if (REGNO (operands[0]) == REGNO (operands[1]))
> > +   return "vpdi\t%V0,%v1,%V0,5";
> > +  else
> > +   return "ldr\t%f0,%f1;vpdi\t%V0,%v1,%V0,5";
> > +    case 1:
> > +  {
> > +   const char *reg_pair = reg_names[REGNO (operands[0]) + 1];
> > +   snprintf (buf, sizeof (buf), "ld\t%%f0,%%1;ld\t%%%s,8+%%1",
> > reg_pair);
> 
> I wonder if there is a corner case where 8+ does not fit into short
> displacement?

That is covered by constraint AR, i.e., for short displacement, and AT
for long displacement.

[RFC 3/4] genoutput: Verify hard register constraints

2024-09-10 Thread Stefan Schulze Frielinghaus

Since genoutput has no information about hard register names we cannot
statically verify those names in constraints of the machine description.
Therefore, we have to do it at runtime.  Although verification shouldn't
be too expensive, restrict it to checking builds.  This should be
sufficient since hard register constraints in machine descriptions
probably change rarely, and each commit should be tested with checking
anyway, or at the very least before a release is taken.
---
 gcc/genoutput.cc | 46 ++
 gcc/output.h |  2 ++
 gcc/toplev.cc|  4 
 3 files changed, 52 insertions(+)

diff --git a/gcc/genoutput.cc b/gcc/genoutput.cc
index 2ffb2fb28d2..4f4fde83608 100644
--- a/gcc/genoutput.cc
+++ b/gcc/genoutput.cc
@@ -200,6 +200,8 @@ static const char indep_constraints[] = ",=+%*?!^$#&g";
 static class constraint_data *
 constraints_by_letter_table[1 << CHAR_BIT];
 
+static hash_set used_reg_names;
+
 static int mdep_constraint_len (const char *, file_location, int);
 static void note_constraint (md_rtx_info *);
 
@@ -1156,6 +1158,45 @@ main (int argc, const char **argv)
   output_insn_data ();
   output_get_insn_name ();
 
+  /* Since genoutput has no information about hard register names we cannot
+ statically verify hard register names in constraints of the machine
+ description.  Therefore, we have to do it at runtime.  Although
+ verification shouldn't be too expensive, restrict it to checking builds.
+   */
+  printf ("\n\n#if CHECKING_P\n");
+  if (used_reg_names.is_empty ())
+printf ("void verify_reg_names_in_constraints () { }\n");
+  else
+{
+  size_t max_len = 0;
+  for (auto it = used_reg_names.begin (); it != used_reg_names.end (); 
++it)
+   {
+ size_t len = strlen (*it);
+ if (len > max_len)
+   max_len = len;
+   }
+  printf ("void\nverify_reg_names_in_constraints ()\n{\n");
+  printf ("  static const char hregnames[%zu][%zu] = {\n",
+ used_reg_names.elements (), max_len + 1);
+  auto it = used_reg_names.begin ();
+  while (it != used_reg_names.end ())
+   {
+ printf ("\"%s\"", *it);
+ ++it;
+ if (it != used_reg_names.end ())
+   printf (",");
+ printf ("\n");
+   }
+  printf ("  };\n");
+  printf ("  for (size_t i = 0; i < %zu; ++i)\n",
+ used_reg_names.elements ());
+  printf ("if (decode_reg_name (hregnames[i]) < 0)\n");
+  printf ("  internal_error (\"invalid register %%qs used in "
+ "constraint of machine description\", hregnames[i]);\n");
+  printf ("}\n");
+}
+  printf ("#endif\n");
+
   fflush (stdout);
   return (ferror (stdout) != 0 || have_error
? FATAL_EXIT_CODE : SUCCESS_EXIT_CODE);
@@ -1294,6 +1335,11 @@ mdep_constraint_len (const char *s, file_location loc, 
int opno)
   ptrdiff_t len = end - s;
   if (*end == '}' && len > 1 && len < 31)
{
+ char *regname = new char[len];
+ memcpy (regname, s + 1, len - 1);
+ regname[len - 1] = '\0';
+ if (used_reg_names.add (regname))
+   delete[] regname;
  return len + 1;
}
 }
diff --git a/gcc/output.h b/gcc/output.h
index 46b0033b221..5f0f8a6098c 100644
--- a/gcc/output.h
+++ b/gcc/output.h
@@ -636,4 +636,6 @@ extern int default_address_cost (rtx, machine_mode, 
addr_space_t, bool);
 /* Stack usage.  */
 extern void output_stack_usage (void);
 
+extern void verify_reg_names_in_constraints ();
+
 #endif /* ! GCC_OUTPUT_H */
diff --git a/gcc/toplev.cc b/gcc/toplev.cc
index bc442a08c63..34c372ad1a2 100644
--- a/gcc/toplev.cc
+++ b/gcc/toplev.cc
@@ -1817,6 +1817,10 @@ backend_init_target (void)
 static void
 backend_init (void)
 {
+#if CHECKING_P
+  verify_reg_names_in_constraints ();
+#endif
+
   init_emit_once ();
 
   init_rtlanal ();
-- 
2.45.2

[RFC 1/4] Hard register constraints

2024-09-10 Thread Stefan Schulze Frielinghaus

Implement hard register constraints of the form {regname} where regname
must be any valid register name for the target.  Such constraints may be
used in asm statements as a replacement for register asm and in machine
descriptions.

Due to optimizations, it is not unexpected that two or more inputs which
require the same value also share a common pseudo.  However, this in
turn may lead to unsatisfiable asm where multiple inputs with different
hard register constraints share the same pseudo.  Therefore, we have to
introduce copies of such a pseudo and use these for conflicting inputs.
This is done prior to RA during asmcons in match_asm_constraints_2().
While IRA tries to reduce live ranges, it also replaces some
register-register moves.  That in turn might undo those copies of a
pseudo which we just introduced during asmcons.  Thus, check in
decrease_live_ranges_number() via valid_replacement_for_asm_input_p()
whether it is valid to perform a replacement.

The remainder of the patch mostly deals with parsing and decoding hard
register constraints.  The actual work is done by LRA in
process_alt_operands() where a register filter, according to the
constraint, is installed.

For the sake of "reviewability" and in order to show the beauty of LRA,
error handling (which gets pretty involved) is spread out into a
subsequent patch.

Limitation: Currently, a fixed register cannot be used as hard register
constraint.  For example, accessing the stack pointer on x86_64 via

void *
foo (void)
{
  void *y;
  __asm__ ("" : "={rsp}" (y));
  return y;
}

leads to an error.  This is unfortunate since register asm does not have
this limitation.  The culprit seems to be that during reload
ira_class_hard_regs_num[rclass] does not even include fixed registers
which is why lra_assign() ultimately fails.  Does anyone have an idea
how to lift this limitation?  Maybe there is even a shortcut in order to
force a pseudo into a hard reg?
---
 gcc/function.cc   | 116 ++
 gcc/genoutput.cc  |  14 
 gcc/genpreds.cc   |   4 +-
 gcc/ira.cc|  79 +-
 gcc/lra-constraints.cc|  13 +++
 gcc/recog.cc  |  11 ++-
 gcc/stmt.cc   |  39 +
 gcc/stmt.h|   1 +
 gcc/testsuite/gcc.dg/asm-hard-reg-1.c |  85 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-2.c |  33 
 gcc/testsuite/gcc.dg/asm-hard-reg-3.c |  25 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-4.c |  50 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-5.c |  36 
 gcc/testsuite/gcc.dg/asm-hard-reg-6.c |  60 +
 gcc/testsuite/gcc.dg/asm-hard-reg-7.c |  41 +
 gcc/testsuite/gcc.dg/asm-hard-reg-8.c |  49 +++
 16 files changed, 653 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-1.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-2.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-3.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-4.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-5.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-6.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-7.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-8.c

diff --git a/gcc/function.cc b/gcc/function.cc
index a6f6de34942..bf5992f2b06 100644
--- a/gcc/function.cc
+++ b/gcc/function.cc
@@ -6974,6 +6974,115 @@ match_asm_constraints_1 (rtx_insn *insn, rtx *p_sets, 
int noutputs)
 df_insn_rescan (insn);
 }
 
+/* It is expected and desired that optimizations coalesce multiple pseudos into
+   one whenever possible.  However, in case of hard register constraints we may
+   have to undo this and introduce copies since otherwise we could constraint a
+   single pseudo to different hard registers.  For example, during register
+   allocation the following insn would be unsatisfiable since pseudo 60 is
+   constrained to hard register r5 and r6 at the same time.
+
+   (insn 7 5 0 2 (asm_operands/v ("foo") ("") 0 [
+  (reg:DI 60) repeated x2
+  ]
+   [
+  (asm_input:DI ("{r5}") t.c:4)
+  (asm_input:DI ("{r6}") t.c:4)
+  ]
+   [] t.c:4) "t.c":4:3 -1
+   (expr_list:REG_DEAD (reg:DI 60)
+  (nil)))
+
+   Therefore, introduce a copy of pseudo 60 and transform it into
+
+   (insn 10 5 7 2 (set (reg:DI 62)
+  (reg:DI 60)) "t.c":4:3 1503 {*movdi_64}
+   (nil))
+   (insn 7 10 11 2 (asm_operands/v ("foo") ("") 0 [
+  (reg:DI 60)
+  (reg:DI 62)
+  ]
+   [
+  (asm_input:DI ("{r5}") t.c:4)
+  (asm_input:DI ("{r6}") t.c:4)
+  ]
+   [] t.c:4) "t.c":4:3 -1
+   (expr_list:REG_DEAD (reg:DI 62)
+  (expr_list:REG_DEAD (reg:DI 60)
+  (nil
+
+   Now, LRA can assign pseudo 60 to r5, and pseudo 62 to r6.
+
+

[RFC 4/4] Rewrite register asm into hard register constraints

2024-09-10 Thread Stefan Schulze Frielinghaus

Currently a register asm already materializes during expand.  This
means, a hard register is allocated for the very first access of a
register asm as e.g. in an assignment.  As a consequence this might lead
to suboptimal register allocation if the assignment and the using asm
statement are spread far apart.  Even more problematic are function
calls in between register asm assignments and its using asm statement
since hard registers may be clobbered by a call.  The former may be
solved by pulling register asm assignments and asm statements close by.
However, the latter is not easily solved since sometimes function calls
are implicit.  For example

int
foo (int *x)
{
  register int y asm ("0") = 42;
  register int z asm ("1") = *x;
  asm ("bar\t%0,%1" : "+r" (z) : "r" (y));
  return z;
}

If compiled with address sanitizer, then a function call is introduced
for the memory load which in turn may interfere with the initialization
of register asm y.  Likewise, for some targets and configurations even
an operation like an addition may lead to an implicit library call.

In contrast hard register constraints materialize during register
allocation and therefore do not suffer from this, i.e., asm operands are
kept in pseudos until RA.  This patch adds the feature of rewriting
local register asm into code which exploits hard register constraints.
For example

register int global asm ("r3");

int foo (int x0)
{
  register int x asm ("r4") = x0;
  register int y asm ("r5");

  asm ("bar\t%0,%1,%2" : "=r" (x) : "0" (x), "r" (global));
  x += 42;
  asm ("baz\t%0,%1" : "=r" (y) : "r" (x));

  return y;
}

is rewritten during gimplification into

register int global asm ("r3");

int foo (int x0)
{
  int x = x0;
  int y;

  asm ("bar\t%0,%1,%2" : "={r4}" (x) : "0" (x), "r" (global));
  x += 42;
  asm ("baz\t%0,%1" : "={r5}" (y) : "{r4}" (x));

  return y;
}

The resulting code solely relies on hard register constraints modulo
global register asm.

Since I consider this an experimental feature, it is hidden behind the new
flag -fdemote-register-asm (I'm open for other naming suggestions).
---
 gcc/common.opt|  4 +
 gcc/gimplify.cc   | 78 +++
 .../gcc.dg/asm-hard-reg-demotion-1.c  | 19 +
 .../gcc.dg/asm-hard-reg-demotion-2.c  | 19 +
 gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h  | 52 +
 5 files changed, 172 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-1.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-2.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h

diff --git a/gcc/common.opt b/gcc/common.opt
index ea39f87ae71..859a735a0b7 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3422,6 +3422,10 @@ fverbose-asm
 Common Var(flag_verbose_asm)
 Add extra commentary to assembler output.
 
+fdemote-register-asm
+Common Var(flag_demote_register_asm) Init(0)
+Demote local register asm and use hard register constraints instead
+
 fvisibility=
 Common Joined RejectNegative Enum(symbol_visibility) Var(default_visibility) 
Init(VISIBILITY_DEFAULT)
 -fvisibility=[default|internal|hidden|protected]   Set the default symbol 
visibility.
diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 08e0b5d047b..c9bd1769c28 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -7049,6 +7049,73 @@ num_alternatives (const_tree link)
   return num + 1;
 }
 
+static hash_set demote_register_asm;
+
+static void
+gimplify_demote_register_asm (tree link)
+{
+  if (!flag_demote_register_asm)
+return;
+  tree op = TREE_VALUE (link);
+  if (!VAR_P (op) || !DECL_HARD_REGISTER (op) || is_global_var (op))
+return;
+  tree id = DECL_ASSEMBLER_NAME (op);
+  const char *regname = IDENTIFIER_POINTER (id);
+  ++regname;
+  int regno = decode_reg_name (regname);
+  if (regno < 0)
+/* This indicates an error and we error out later on.  */
+return;
+  const char *constraint = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE 
(link)));
+  auto_vec constraint_new;
+  for (const char *p = constraint; *p; )
+{
+  bool pushed = false;
+  switch (*p)
+   {
+   case '+': case '=': case '%': case '?': case '!': case '*': case '&':
+   case '#': case '$': case '^': case '{': case 'E': case 'F': case 'G':
+   case 'H': case 's': case 'i': case 'n': case 'I': case 'J': case 'K':
+   case 'L': case 'M': case 'N': case 'O': case 'P': case ',': case '0':
+   case '1': case '2': case '3': case '4': case '5': case '6': case '7':
+   case '8': case '9': case '[': case '<': case '>': case 'g': case 'X':
+ break;
+
+   default:
+ if (!ISALPHA (*p))
+   break;
+ enum constraint_num cn = lookup_constraint (p);
+ enum reg_class rclass = reg_class_for_constraint (cn);
+ if (rclass != NO_REGS || insn_extra_address_constraint (cn))
+   {
+ gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (regno), r

[RFC 2/4] Error handling for hard register constraints

2024-09-10 Thread Stefan Schulze Frielinghaus

This implements some basic error handling for hard register constraints
including potential conflicts with register asm operands.

In contrast to register asm operands, hard register constraints allow
more than just one register per operand.  Even more than just one
register per alternative.  For example, a valid constraint for an
operand is "{r0}{r1}m,{r2}".  However, this also means that we have to
make sure that each register is used at most once in each alternative
over all outputs and likewise over all inputs.  For asm statements this
is done by this patch during gimplification.  For hard register
constraints used in machine description, error handling is still a todo
and I haven't investigated this so far and consider this rather a low
priority.

There are 9/10 call sites for parse_{input,output}_constraint() which I
didn't dare to touch in the first run.  If this patch is about to be
accepted I could change those call sites and explicitly pass a null
pointer instead of overloading those functions as it is done right now.
I consider this an implementation nit and didn't want to clutter the
patch for reviewing.
---
 gcc/cfgexpand.cc  |  42 
 gcc/gimplify.cc   |  73 +-
 gcc/gimplify_reg_info.h   | 130 ++
 gcc/stmt.cc   | 229 +-
 gcc/stmt.h|   8 +-
 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c   |  83 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c   |  20 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c   |  21 ++
 gcc/testsuite/gcc.dg/pr87600-2.c  |  30 +--
 gcc/testsuite/gcc.dg/pr87600-3.c  |  35 +++
 gcc/testsuite/gcc.dg/pr87600-3.s  |   0
 .../gcc.target/s390/asm-hard-reg-1.c  | 103 
 .../gcc.target/s390/asm-hard-reg-2.c  |  43 
 .../gcc.target/s390/asm-hard-reg-3.c  |  42 
 gcc/testsuite/lib/scanasm.exp |   4 +
 15 files changed, 779 insertions(+), 84 deletions(-)
 create mode 100644 gcc/gimplify_reg_info.h
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c
 create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c
 create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.c
 create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.s
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 13f8c08d295..fdbbd93f1b5 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -2966,44 +2966,6 @@ expand_asm_loc (tree string, int vol, location_t locus)
   emit_insn (body);
 }
 
-/* Return the number of times character C occurs in string S.  */
-static int
-n_occurrences (int c, const char *s)
-{
-  int n = 0;
-  while (*s)
-n += (*s++ == c);
-  return n;
-}
-
-/* A subroutine of expand_asm_operands.  Check that all operands have
-   the same number of alternatives.  Return true if so.  */
-
-static bool
-check_operand_nalternatives (const vec &constraints)
-{
-  unsigned len = constraints.length();
-  if (len > 0)
-{
-  int nalternatives = n_occurrences (',', constraints[0]);
-
-  if (nalternatives + 1 > MAX_RECOG_ALTERNATIVES)
-   {
- error ("too many alternatives in %");
- return false;
-   }
-
-  for (unsigned i = 1; i < len; ++i)
-   if (n_occurrences (',', constraints[i]) != nalternatives)
- {
-   error ("operand constraints for % differ "
-  "in number of alternatives");
-   return false;
- }
-}
-  return true;
-}
-
 /* Check for overlap between registers marked in CLOBBERED_REGS and
anything inappropriate in T.  Emit error and return the register
variable definition for error, NULL_TREE for ok.  */
@@ -3169,10 +3131,6 @@ expand_asm_stmt (gasm *stmt)
= TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (t)));
 }
 
-  /* ??? Diagnose during gimplification?  */
-  if (! check_operand_nalternatives (constraints))
-return;
-
   /* Count the number of meaningful clobbered registers, ignoring what
  we would ignore later.  */
   auto_vec clobber_rvec;
diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 26a216e151d..08e0b5d047b 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -70,6 +70,10 @@ along with GCC; see the file COPYING3.  If not see
 #include "omp-offload.h"
 #include "context.h"
 #include "tree-nested.h"
+#include "insn-config.h"
+#include "recog.h"
+#include "output.h"
+#include "gimplify_reg_info.h"
 
 /* Identifier for a basic condition, mapping it to other basic conditions of
its Boolean expression.  Basic conditions given the same uid (in the same
@@ -7009,6 +7013,42 @@ gimplify_addr_expr (tree *expr_p, gimple_seq *pre_p, 
gimple_seq *

[RFC 0/4] Hard Register Constraints

2024-09-10 Thread Stefan Schulze Frielinghaus

This series introduces hard register constraints.  The first patch
enables hard register constraints for asm statements and for
machine descriptions.  The subsequent patch adds some basic error
handling for asm statements.  The third patch adds some verification of
register names used in machine description.  The fourth and last patch
adds the feature of rewriting local register asm into hard register
constraints.

This series was bootstrapped and regtested on s390.  Furthermore, the
new dg-compile tests were verified via cross compilers for the enabled
targets.  There is still some fallout if -fdemote-register-asm is used
since a couple of features are missing as e.g. erroring out during
gimplification if the clobber set of registers intersects with
input/output registers.

As a larger test vehicle I've compiled and regtested glibc on s390 using
-fdemote-register-asm without any fallout.  On x86_64 this fails due to
the limitation that fixed registers are currently not supported for hard
register constraints (see commit message of the first patch).  This is
also the reason why I'm posting this series already since I was hoping
to get some feedback about this limitation.

Furthermore, I've compiled the Linux kernel on s390 and x86_64 with
-fdemote-register-asm.  Interestingly, the Linux kernel for x86_64 makes
use of the following asm statement:

#define call_on_stack(stack, func, asm_call, argconstr...)  \
{   \
register void *tos asm("r11");  \
\
tos = ((void *)(stack));\
\
asm_inline volatile(\
"movq   %%rsp, (%[tos]) \n" \
"movq   %[tos], %%rsp   \n" \
\
asm_call\
\
"popq   %%rsp   \n" \
\
: "+r" (tos), ASM_CALL_CONSTRAINT   \
: [__func] "i" (func), [tos] "r" (tos) argconstr\
: "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10",   \
  "memory"  \
);  \
}

Note the output
  "+r" (tos)
and the input
  [tos] "r" (tos)
Currently I error out for this since I consider this as two inputs using
the same hard register.  One time an implicit input via '+' and a second
time via the explicit input.  Thus, actually I would expect a '='
instead of a '+' for the output constraint since the input is explicitly
mentioned, or remove the input entirely and just use the inoutput
   [tos] "+r" (tos)
If you consider this valid asm I would have to adjust the error
handling.  Either way, this is just about error handling and doesn't
really affect code generation.

Stefan Schulze Frielinghaus (4):
  Hard register constraints
  Error handling for hard register constraints
  genoutput: Verify hard register constraints
  Rewrite register asm into hard register constraints

 gcc/cfgexpand.cc  |  42 ---
 gcc/common.opt|   4 +
 gcc/function.cc   | 116 
 gcc/genoutput.cc  |  60 
 gcc/genpreds.cc   |   4 +-
 gcc/gimplify.cc   | 151 +-
 gcc/gimplify_reg_info.h   | 130 +
 gcc/ira.cc|  79 +-
 gcc/lra-constraints.cc|  13 +
 gcc/output.h  |   2 +
 gcc/recog.cc  |  11 +-
 gcc/stmt.cc   | 268 +-
 gcc/stmt.h|   9 +-
 gcc/testsuite/gcc.dg/asm-hard-reg-1.c |  85 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-2.c |  33 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-3.c |  25 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-4.c |  50 
 gcc/testsuite/gcc.dg/asm-hard-reg-5.c |  36 +++
 gcc/testsuite/gcc.dg/asm-hard-reg-6.c |  60 
 gcc/testsuite/gcc.dg/asm-hard-reg-7.c |

Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]

2024-09-08 Thread Stefan Schulze Frielinghaus

Ping

On Fri, Aug 16, 2024 at 09:41:55AM +0200, Stefan Schulze Frielinghaus wrote:
> Currently subregs originating from *tf_to_fprx2_0 and *tf_to_fprx2_1
> survive register allocation.  This in turn leads to wrong register
> renaming.  Keeping the current approach would mean we need two insns for
> *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively.  Something along the
> lines
> 
> (define_insn "*tf_to_fprx2_0"
>   [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "=f") 0)
> (unspec:DF [(match_operand:TF 1 "general_operand" "v")]
>UNSPEC_TF_TO_FPRX2_0))]
>   "TARGET_VXE"
>   "#")
> 
> (define_insn "*tf_to_fprx2_0"
>   [(set (match_operand:DF 0 "nonimmediate_operand" "=f")
> (unspec:DF [(match_operand:TF 1 "general_operand" "v")]
>UNSPEC_TF_TO_FPRX2_0))]
>   "TARGET_VXE"
>   "vpdi\t%v0,%v1,%v0,1
>   [(set_attr "op_type" "VRR")])
> 
> and similar for *tf_to_fprx2_1.  Note, pre register allocation operand 0
> has mode FPRX2 and afterwards DF once subregs have been eliminated.
> 
> Since we always copy a whole vector register into a floating-point
> register pair, another way to fix this is to merge *tf_to_fprx2_0 and
> *tf_to_fprx2_1 into a single insn which means we don't have to use
> subregs at all.  The downside of this is that the assembler template
> contains two instructions, now.  The upside is that we don't have to
> come up with some artificial insn before RA which might be more
> readable/maintainable.  That is implemented by this patch.
> 
> In commit r11-4872-ge627cda5686592, the output operand specifier %V was
> introduced which is used in tf_to_fprx2 only, now.  I didn't come up
> with its counterpart like %F for floating-point registers.  Instead I
> printed the register pair in the output function directly.  This spares
> us a new and "rare" format specifier for a single insn.  I don't have a
> strong opinion which option to choose, however, we should either add %F
> in order to mimic the same behaviour as %V or getting rid of %V and
> inline the logic in the output function.  I lean towards the latter.
> Any preferences?
> ---
>  gcc/config/s390/s390.md|  2 +
>  gcc/config/s390/vector.md  | 66 +++---
>  gcc/testsuite/gcc.target/s390/pr115860-1.c | 26 +
>  3 files changed, 60 insertions(+), 34 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/s390/pr115860-1.c
> 
> diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
> index 3d5759d6252..31240899934 100644
> --- a/gcc/config/s390/s390.md
> +++ b/gcc/config/s390/s390.md
> @@ -241,6 +241,8 @@
> UNSPEC_VEC_VFMIN
> UNSPEC_VEC_VFMAX
>  
> +   UNSPEC_TF_TO_FPRX2
> +
> UNSPEC_NNPA_VCLFNHS_V8HI
> UNSPEC_NNPA_VCLFNLS_V8HI
> UNSPEC_NNPA_VCRNFS_V8HI
> diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
> index a75b7cb5825..561182e0c2c 100644
> --- a/gcc/config/s390/vector.md
> +++ b/gcc/config/s390/vector.md
> @@ -907,36 +907,36 @@
>"vmrlg\t%0,%1,%2";
>[(set_attr "op_type" "VRR")])
>  
> -
> -(define_insn "*tf_to_fprx2_0"
> -  [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0)
> - (subreg:DF (match_operand:TF1 "general_operand"   "v") 0))]
> -  "TARGET_VXE"
> -  ; M4 == 1 corresponds to %v0[0] = %v1[0]; %v0[1] = %v0[1];
> -  "vpdi\t%v0,%v1,%v0,1"
> -  [(set_attr "op_type" "VRR")])
> -
> -(define_insn "*tf_to_fprx2_1"
> -  [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8)
> - (subreg:DF (match_operand:TF1 "general_operand"   "v") 8))]
> +(define_insn "tf_to_fprx2"
> +  [(set (match_operand:FPRX2 0 "register_operand" "=f,f ,f")
> + (unspec:FPRX2 [(match_operand:TF 1 "general_operand"   "v,AR,AT")]
> +   UNSPEC_TF_TO_FPRX2))]
>"TARGET_VXE"
> -  ; M4 == 5 corresponds to %V0[0] = %v1[1]; %V0[1] = %V0[1];
> -  "vpdi\t%V0,%v1,%V0,5"
> -  [(set_attr "op_type" "VRR")])
> -
> -(define_insn_and_split "tf_to_fprx2"
> -  [(set (match_operand:FPRX20 "nonimmediate_operand" "=f,f")
> - (subreg:FPRX2 (match_operand:TF 1 "general_operand"   "v,AR") 0))]
> -  "TARGET_VXE"
> -  "#&q

Re: [PATCH] s390: Fix strict_low_part generation

2024-09-08 Thread Stefan Schulze Frielinghaus

Ping

On Fri, Aug 16, 2024 at 09:14:02AM +0200, Stefan Schulze Frielinghaus wrote:
> In s390_expand_insv(), if generating code for ICM et al. src is a MEM
> and gen_lowpart might force src into a register such that we end up with
> patterns which do not match anymore.  Use adjust_address() instead in
> order to preserve a MEM.
> 
> Furthermore, it is not straight forward to enforce a subreg.  For
> example, in case of a paradoxical subreg, gen_lowpart() may return a
> register.  In order to compensate this, s390_gen_lowpart_subreg() emits
> a reference to a pseudo which does not coincide with its definition
> which is wrong.  Additionally, if dest is a paradoxical subreg, then do
> not try to emit a strict_low_part since it could mean that dest was not
> initialized even though this might be fixed up later by init-regs.
> 
> Splitter for insn *get_tp_64, *zero_extendhisi2_31,
> *zero_extendqisi2_31, *zero_extendqihi2_31 are applied after reload.
> Thus, operands[0] is a hard register and gen_lowpart (m, operands[0])
> just returns the hard register for mode m which is fine to use as an
> argument for strict_low_part, i.e., we do not need to enforce subregs
> here since after reload subregs are supposed to be eliminated anyway.
> 
> This fixes gcc.dg/torture/pr111821.c.
> 
> gcc/ChangeLog:
> 
>   * config/s390/s390-protos.h (s390_gen_lowpart_subreg): Remove.
>   * config/s390/s390.cc (s390_gen_lowpart_subreg): Remove.
>   (s390_expand_insv): Use adjust_address() and emit a
>   strict_low_part only in case of a natural subreg.
>   * config/s390/s390.md: Use gen_lowpart() instead of
>   s390_gen_lowpart_subreg().
> ---
>  Bootstrapped and regtested on s390.  Ok for mainline,gcc12,gcc13,gcc14?
> 
>  gcc/config/s390/s390-protos.h |  1 -
>  gcc/config/s390/s390.cc   | 47 +++
>  gcc/config/s390/s390.md   | 13 +-
>  3 files changed, 20 insertions(+), 41 deletions(-)
> 
> diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
> index b4646ccb606..e7ac59d17da 100644
> --- a/gcc/config/s390/s390-protos.h
> +++ b/gcc/config/s390/s390-protos.h
> @@ -50,7 +50,6 @@ extern void s390_set_has_landing_pad_p (bool);
>  extern bool s390_hard_regno_rename_ok (unsigned int, unsigned int);
>  extern int s390_class_max_nregs (enum reg_class, machine_mode);
>  extern bool s390_return_addr_from_memory(void);
> -extern rtx s390_gen_lowpart_subreg (machine_mode, rtx);
>  extern bool s390_fma_allowed_p (machine_mode);
>  #if S390_USE_TARGET_ATTRIBUTE
>  extern tree s390_valid_target_attribute_tree (tree args,
> diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
> index 7aea776da2f..7cdcebfc08b 100644
> --- a/gcc/config/s390/s390.cc
> +++ b/gcc/config/s390/s390.cc
> @@ -516,31 +516,6 @@ s390_return_addr_from_memory ()
>return cfun_gpr_save_slot(RETURN_REGNUM) == SAVE_SLOT_STACK;
>  }
>  
> -/* Generate a SUBREG for the MODE lowpart of EXPR.
> -
> -   In contrast to gen_lowpart it will always return a SUBREG
> -   expression.  This is useful to generate STRICT_LOW_PART
> -   expressions.  */
> -rtx
> -s390_gen_lowpart_subreg (machine_mode mode, rtx expr)
> -{
> -  rtx lowpart = gen_lowpart (mode, expr);
> -
> -  /* There might be no SUBREG in case it could be applied to the hard
> - REG rtx or it could be folded with a paradoxical subreg.  Bring
> - it back.  */
> -  if (!SUBREG_P (lowpart))
> -{
> -  machine_mode reg_mode = TARGET_ZARCH ? DImode : SImode;
> -  gcc_assert (REG_P (lowpart));
> -  lowpart = gen_lowpart_SUBREG (mode,
> - gen_rtx_REG (reg_mode,
> -  REGNO (lowpart)));
> -}
> -
> -  return lowpart;
> -}
> -
>  /* Return nonzero if it's OK to use fused multiply-add for MODE.  */
>  bool
>  s390_fma_allowed_p (machine_mode mode)
> @@ -7112,15 +7087,21 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
>/* Emit a strict_low_part pattern if possible.  */
>if (smode_bsize == bitsize && bitpos == mode_bsize - smode_bsize)
>   {
> -   rtx low_dest = s390_gen_lowpart_subreg (smode, dest);
> -   rtx low_src = gen_lowpart (smode, src);
> -
> -   switch (smode)
> +   rtx low_dest = gen_lowpart (smode, dest);
> +   if (SUBREG_P (low_dest) && !paradoxical_subreg_p (low_dest))
>   {
> - case E_QImode: emit_insn (gen_movstrictqi (low_dest, low_src)); 
> return true;
> - case E_HImode: emit_insn (gen_movstricthi (low_dest, low_src)); 
> return true;
> - case E_SImode: emit_insn (gen_movstrictsi (low_de

[PATCH] s390: Fix TF to FPRX2 conversion [PR115860]

2024-08-16 Thread Stefan Schulze Frielinghaus

Currently subregs originating from *tf_to_fprx2_0 and *tf_to_fprx2_1
survive register allocation.  This in turn leads to wrong register
renaming.  Keeping the current approach would mean we need two insns for
*tf_to_fprx2_0 and *tf_to_fprx2_1, respectively.  Something along the
lines

(define_insn "*tf_to_fprx2_0"
  [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "=f") 0)
(unspec:DF [(match_operand:TF 1 "general_operand" "v")]
   UNSPEC_TF_TO_FPRX2_0))]
  "TARGET_VXE"
  "#")

(define_insn "*tf_to_fprx2_0"
  [(set (match_operand:DF 0 "nonimmediate_operand" "=f")
(unspec:DF [(match_operand:TF 1 "general_operand" "v")]
   UNSPEC_TF_TO_FPRX2_0))]
  "TARGET_VXE"
  "vpdi\t%v0,%v1,%v0,1
  [(set_attr "op_type" "VRR")])

and similar for *tf_to_fprx2_1.  Note, pre register allocation operand 0
has mode FPRX2 and afterwards DF once subregs have been eliminated.

Since we always copy a whole vector register into a floating-point
register pair, another way to fix this is to merge *tf_to_fprx2_0 and
*tf_to_fprx2_1 into a single insn which means we don't have to use
subregs at all.  The downside of this is that the assembler template
contains two instructions, now.  The upside is that we don't have to
come up with some artificial insn before RA which might be more
readable/maintainable.  That is implemented by this patch.

In commit r11-4872-ge627cda5686592, the output operand specifier %V was
introduced which is used in tf_to_fprx2 only, now.  I didn't come up
with its counterpart like %F for floating-point registers.  Instead I
printed the register pair in the output function directly.  This spares
us a new and "rare" format specifier for a single insn.  I don't have a
strong opinion which option to choose, however, we should either add %F
in order to mimic the same behaviour as %V or get rid of %V and
inline the logic in the output function.  I lean towards the latter.
Any preferences?
---
 gcc/config/s390/s390.md|  2 +
 gcc/config/s390/vector.md  | 66 +++---
 gcc/testsuite/gcc.target/s390/pr115860-1.c | 26 +
 3 files changed, 60 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/pr115860-1.c

diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 3d5759d6252..31240899934 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -241,6 +241,8 @@
UNSPEC_VEC_VFMIN
UNSPEC_VEC_VFMAX
 
+   UNSPEC_TF_TO_FPRX2
+
UNSPEC_NNPA_VCLFNHS_V8HI
UNSPEC_NNPA_VCLFNLS_V8HI
UNSPEC_NNPA_VCRNFS_V8HI
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index a75b7cb5825..561182e0c2c 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -907,36 +907,36 @@
   "vmrlg\t%0,%1,%2";
   [(set_attr "op_type" "VRR")])
 
-
-(define_insn "*tf_to_fprx2_0"
-  [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0)
-   (subreg:DF (match_operand:TF1 "general_operand"   "v") 0))]
-  "TARGET_VXE"
-  ; M4 == 1 corresponds to %v0[0] = %v1[0]; %v0[1] = %v0[1];
-  "vpdi\t%v0,%v1,%v0,1"
-  [(set_attr "op_type" "VRR")])
-
-(define_insn "*tf_to_fprx2_1"
-  [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8)
-   (subreg:DF (match_operand:TF1 "general_operand"   "v") 8))]
+(define_insn "tf_to_fprx2"
+  [(set (match_operand:FPRX2 0 "register_operand" "=f,f ,f")
+   (unspec:FPRX2 [(match_operand:TF 1 "general_operand"   "v,AR,AT")]
+ UNSPEC_TF_TO_FPRX2))]
   "TARGET_VXE"
-  ; M4 == 5 corresponds to %V0[0] = %v1[1]; %V0[1] = %V0[1];
-  "vpdi\t%V0,%v1,%V0,5"
-  [(set_attr "op_type" "VRR")])
-
-(define_insn_and_split "tf_to_fprx2"
-  [(set (match_operand:FPRX20 "nonimmediate_operand" "=f,f")
-   (subreg:FPRX2 (match_operand:TF 1 "general_operand"   "v,AR") 0))]
-  "TARGET_VXE"
-  "#"
-  "!(MEM_P (operands[1]) && MEM_VOLATILE_P (operands[1]))"
-  [(set (match_dup 2) (match_dup 3))
-   (set (match_dup 4) (match_dup 5))]
 {
-  operands[2] = simplify_gen_subreg (DFmode, operands[0], FPRX2mode, 0);
-  operands[3] = simplify_gen_subreg (DFmode, operands[1], TFmode, 0);
-  operands[4] = simplify_gen_subreg (DFmode, operands[0], FPRX2mode, 8);
-  operands[5] = simplify_gen_subreg (DFmode, operands[1], TFmode, 8);
+  char buf[64];
+  switch (which_alternative)
+{
+case 0:
+  if (REGNO (operands[0]) == REGNO (operands[1]))
+   return "vpdi\t%V0,%v1,%V0,5";
+  else
+   return "ldr\t%f0,%f1;vpdi\t%V0,%v1,%V0,5";
+case 1:
+  {
+   const char *reg_pair = reg_names[REGNO (operands[0]) + 1];
+   snprintf (buf, sizeof (buf), "ld\t%%f0,%%1;ld\t%%%s,8+%%1", reg_pair);
+   output_asm_insn (buf, operands);
+   return "";
+  }
+case 2:
+  {
+   const char *reg_pair = reg_names[REGNO (operands[0]) + 1];
+   snprintf (buf, sizeof (buf), "ldy\t%%f0,%%1;ldy\t%%%s,8+%%1", reg_pair);
+

[PATCH] s390: Fix strict_low_part generation

2024-08-16 Thread Stefan Schulze Frielinghaus

In s390_expand_insv(), if generating code for ICM et al. src is a MEM
and gen_lowpart might force src into a register such that we end up with
patterns which do not match anymore.  Use adjust_address() instead in
order to preserve a MEM.

Furthermore, it is not straightforward to enforce a subreg.  For
example, in case of a paradoxical subreg, gen_lowpart() may return a
register.  In order to compensate for this, s390_gen_lowpart_subreg() emits
a reference to a pseudo which does not coincide with its definition
which is wrong.  Additionally, if dest is a paradoxical subreg, then do
not try to emit a strict_low_part since it could mean that dest was not
initialized even though this might be fixed up later by init-regs.

Splitters for insns *get_tp_64, *zero_extendhisi2_31,
*zero_extendqisi2_31, *zero_extendqihi2_31 are applied after reload.
Thus, operands[0] is a hard register and gen_lowpart (m, operands[0])
just returns the hard register for mode m which is fine to use as an
argument for strict_low_part, i.e., we do not need to enforce subregs
here since after reload subregs are supposed to be eliminated anyway.

This fixes gcc.dg/torture/pr111821.c.

gcc/ChangeLog:

* config/s390/s390-protos.h (s390_gen_lowpart_subreg): Remove.
* config/s390/s390.cc (s390_gen_lowpart_subreg): Remove.
(s390_expand_insv): Use adjust_address() and emit a
strict_low_part only in case of a natural subreg.
* config/s390/s390.md: Use gen_lowpart() instead of
s390_gen_lowpart_subreg().
---
 Bootstrapped and regtested on s390.  Ok for mainline,gcc12,gcc13,gcc14?

 gcc/config/s390/s390-protos.h |  1 -
 gcc/config/s390/s390.cc   | 47 +++
 gcc/config/s390/s390.md   | 13 +-
 3 files changed, 20 insertions(+), 41 deletions(-)

diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index b4646ccb606..e7ac59d17da 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -50,7 +50,6 @@ extern void s390_set_has_landing_pad_p (bool);
 extern bool s390_hard_regno_rename_ok (unsigned int, unsigned int);
 extern int s390_class_max_nregs (enum reg_class, machine_mode);
 extern bool s390_return_addr_from_memory(void);
-extern rtx s390_gen_lowpart_subreg (machine_mode, rtx);
 extern bool s390_fma_allowed_p (machine_mode);
 #if S390_USE_TARGET_ATTRIBUTE
 extern tree s390_valid_target_attribute_tree (tree args,
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 7aea776da2f..7cdcebfc08b 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -516,31 +516,6 @@ s390_return_addr_from_memory ()
   return cfun_gpr_save_slot(RETURN_REGNUM) == SAVE_SLOT_STACK;
 }
 
-/* Generate a SUBREG for the MODE lowpart of EXPR.
-
-   In contrast to gen_lowpart it will always return a SUBREG
-   expression.  This is useful to generate STRICT_LOW_PART
-   expressions.  */
-rtx
-s390_gen_lowpart_subreg (machine_mode mode, rtx expr)
-{
-  rtx lowpart = gen_lowpart (mode, expr);
-
-  /* There might be no SUBREG in case it could be applied to the hard
- REG rtx or it could be folded with a paradoxical subreg.  Bring
- it back.  */
-  if (!SUBREG_P (lowpart))
-{
-  machine_mode reg_mode = TARGET_ZARCH ? DImode : SImode;
-  gcc_assert (REG_P (lowpart));
-  lowpart = gen_lowpart_SUBREG (mode,
-   gen_rtx_REG (reg_mode,
-REGNO (lowpart)));
-}
-
-  return lowpart;
-}
-
 /* Return nonzero if it's OK to use fused multiply-add for MODE.  */
 bool
 s390_fma_allowed_p (machine_mode mode)
@@ -7112,15 +7087,21 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
   /* Emit a strict_low_part pattern if possible.  */
   if (smode_bsize == bitsize && bitpos == mode_bsize - smode_bsize)
{
- rtx low_dest = s390_gen_lowpart_subreg (smode, dest);
- rtx low_src = gen_lowpart (smode, src);
-
- switch (smode)
+ rtx low_dest = gen_lowpart (smode, dest);
+ if (SUBREG_P (low_dest) && !paradoxical_subreg_p (low_dest))
{
-   case E_QImode: emit_insn (gen_movstrictqi (low_dest, low_src)); 
return true;
-   case E_HImode: emit_insn (gen_movstricthi (low_dest, low_src)); 
return true;
-   case E_SImode: emit_insn (gen_movstrictsi (low_dest, low_src)); 
return true;
-   default: break;
+ poly_int64 offset = GET_MODE_SIZE (mode) - GET_MODE_SIZE (smode);
+ rtx low_src = adjust_address (src, smode, offset);
+ switch (smode)
+   {
+   case E_QImode: emit_insn (gen_movstrictqi (low_dest, low_src));
+  return true;
+   case E_HImode: emit_insn (gen_movstricthi (low_dest, low_src));
+  return true;
+   case E_SImode: emit_insn (gen_movstrictsi (low_dest, low_src));
+  retu

[PATCH] s390: Remove vector intrinsics

2024-08-08 Thread Stefan Schulze Frielinghaus

The following intrinsics are not implemented.  Thus, remove them.

Ok for mainline?

gcc/ChangeLog:

* config/s390/vecintrin.h (vec_vstbrh): Remove.
(vec_vstbrf): Remove.
(vec_vstbrg): Remove.
(vec_vstbrq): Remove.
(vec_vstbrf_flt): Remove.
(vec_vstbrg_dbl): Remove.
(vec_vsterb): Remove.
(vec_vsterh): Remove.
(vec_vsterf): Remove.
(vec_vsterg): Remove.
(vec_vsterf_flt): Remove.
(vec_vsterg_dbl): Remove.
---
 gcc/config/s390/vecintrin.h | 14 --
 1 file changed, 14 deletions(-)

diff --git a/gcc/config/s390/vecintrin.h b/gcc/config/s390/vecintrin.h
index daeed91ef97..de29f913637 100644
--- a/gcc/config/s390/vecintrin.h
+++ b/gcc/config/s390/vecintrin.h
@@ -160,20 +160,6 @@ __lcbb(const void *ptr, int bndry)
   cc != 3 ? 1 : 0; \
 })
 
-#define vec_vstbrh vec_vlbrh
-#define vec_vstbrf vec_vlbrf
-#define vec_vstbrg vec_vlbrg
-#define vec_vstbrq vec_vlbrq
-#define vec_vstbrf_flt vec_vlbrf_flt
-#define vec_vstbrg_dbl vec_vlbrg_dbl
-
-#define vec_vsterb vec_vlerb
-#define vec_vsterh vec_vlerh
-#define vec_vsterf vec_vlerh
-#define vec_vsterg vec_vlerh
-#define vec_vsterf_flt vec_vlerf_flt
-#define vec_vsterg_dbl vec_vlerg_dbl
-
 #define vec_extend_to_fp32_hi __builtin_s390_vclfnhs
 #define vec_extend_to_fp32_lo __builtin_s390_vclfnls
 #define vec_round_from_fp32 __builtin_s390_vcrnfs
-- 
2.45.2

[PATCH] s390: Fix high-level builtins vec_gfmsum{,_accum}_128

2024-08-08 Thread Stefan Schulze Frielinghaus

Starting with r14-9449-g9f2b16ce1efef0 builtins were streamlined with
those in LLVM.  In particular s390_vgfm{,a}g have been changed from
UV16QI to UINT128 in order to match those in LLVM.  However, these
low-level builtins are directly used by the high-level builtins
vec_gfmsum{,_accum}_128 which expect UV16QI instead.  Therefore,
introduce new low-level builtins s390_vgfm{,a}g_128 and make use of
them, respectively.

Bootstrapped on s390.  Ok for mainline and releases/gcc-14?

gcc/ChangeLog:

* config/s390/s390-builtin-types.def (BT_FN_UV16QI_UV2DI_UV2DI):
New.
(BT_FN_UV16QI_UV2DI_UV2DI_UV16QI): New.
* config/s390/s390-builtins.def (s390_vgfmg_128): New.
(s390_vgfmag_128): New.
* config/s390/vecintrin.h (vec_gfmsum_128): Use s390_vgfmg_128.
(vec_gfmsum_accum_128): Use s390_vgfmag_128.
---
 gcc/config/s390/s390-builtin-types.def | 2 ++
 gcc/config/s390/s390-builtins.def  | 2 ++
 gcc/config/s390/vecintrin.h| 4 ++--
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/config/s390/s390-builtin-types.def 
b/gcc/config/s390/s390-builtin-types.def
index d70eaade8ea..e6f5631ed7a 100644
--- a/gcc/config/s390/s390-builtin-types.def
+++ b/gcc/config/s390/s390-builtin-types.def
@@ -221,6 +221,7 @@ DEF_FN_TYPE_2 (BT_FN_UV16QI_UCHAR_UCHAR, BT_UV16QI, 
BT_UCHAR, BT_UCHAR)
 DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_INTPTR, BT_UV16QI, BT_UV16QI, BT_INTPTR)
 DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_UCHAR, BT_UV16QI, BT_UV16QI, BT_UCHAR)
 DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UV2DI_UV2DI, BT_UV16QI, BT_UV2DI, BT_UV2DI)
 DEF_FN_TYPE_2 (BT_FN_UV16QI_UV8HI_UV8HI, BT_UV16QI, BT_UV8HI, BT_UV8HI)
 DEF_FN_TYPE_2 (BT_FN_UV2DI_UCHAR_UCHAR, BT_UV2DI, BT_UCHAR, BT_UCHAR)
 DEF_FN_TYPE_2 (BT_FN_UV2DI_ULONGLONG_INT, BT_UV2DI, BT_ULONGLONG, BT_INT)
@@ -299,6 +300,7 @@ DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UCHAR_INT, BT_UV16QI, 
BT_UV16QI, BT_UCHAR, BT
 DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UV16QI_INT, BT_UV16QI, BT_UV16QI, 
BT_UV16QI, BT_INT)
 DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UV16QI_INTPTR, BT_UV16QI, BT_UV16QI, 
BT_UV16QI, BT_INTPTR)
 DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UV16QI_UV16QI, BT_UV16QI, BT_UV16QI, 
BT_UV16QI, BT_UV16QI)
+DEF_FN_TYPE_3 (BT_FN_UV16QI_UV2DI_UV2DI_UV16QI, BT_UV16QI, BT_UV2DI, BT_UV2DI, 
BT_UV16QI)
 DEF_FN_TYPE_3 (BT_FN_UV16QI_UV8HI_UV8HI_INTPTR, BT_UV16QI, BT_UV8HI, BT_UV8HI, 
BT_INTPTR)
 DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_ULONGLONG_INT, BT_UV2DI, BT_UV2DI, 
BT_ULONGLONG, BT_INT)
 DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_UV2DI_INT, BT_UV2DI, BT_UV2DI, BT_UV2DI, 
BT_INT)
diff --git a/gcc/config/s390/s390-builtins.def 
b/gcc/config/s390/s390-builtins.def
index 3a63213e571..7f6190fa810 100644
--- a/gcc/config/s390/s390-builtins.def
+++ b/gcc/config/s390/s390-builtins.def
@@ -1666,6 +1666,7 @@ B_DEF  (s390_vgfmb, vec_gfmsumv16qi,  
  0,
 B_DEF  (s390_vgfmh, vec_gfmsumv8hi, 0, 
 B_VX,   0,  BT_FN_UV4SI_UV8HI_UV8HI)
 B_DEF  (s390_vgfmf, vec_gfmsumv4si, 0, 
 B_VX,   0,  BT_FN_UV2DI_UV4SI_UV4SI)
 B_DEF  (s390_vgfmg, vec_gfmsum_128, 0, 
 B_VX,   0,  BT_FN_UINT128_UV2DI_UV2DI)
+B_DEF  (s390_vgfmg_128, vec_gfmsum_128, 0, 
 B_VX,   0,  BT_FN_UV16QI_UV2DI_UV2DI)
 
 OB_DEF (s390_vec_gfmsum_accum,  
s390_vec_gfmsum_accum_u8,s390_vec_gfmsum_accum_u32,B_VX,
BT_FN_OV4SI_OV4SI_OV4SI_OV4SI)
 OB_DEF_VAR (s390_vec_gfmsum_accum_u8,   s390_vgfmab,0, 
 0,  BT_OV_UV8HI_UV16QI_UV16QI_UV8HI)
@@ -1676,6 +1677,7 @@ B_DEF  (s390_vgfmab,
vec_gfmsum_accumv16qi,0,
 B_DEF  (s390_vgfmah,vec_gfmsum_accumv8hi,0,
 B_VX,   0,  BT_FN_UV4SI_UV8HI_UV8HI_UV4SI)
 B_DEF  (s390_vgfmaf,vec_gfmsum_accumv4si,0,
 B_VX,   0,  BT_FN_UV2DI_UV4SI_UV4SI_UV2DI)
 B_DEF  (s390_vgfmag,vec_gfmsum_accum_128,0,
 B_VX,   0,  BT_FN_UINT128_UV2DI_UV2DI_UINT128)
+B_DEF  (s390_vgfmag_128,vec_gfmsum_accum_128,0,
 B_VX,   0,  BT_FN_UV16QI_UV2DI_UV2DI_UV16QI)
 
 OB_DEF (s390_vec_abs,   s390_vec_abs_s8,s390_vec_abs_dbl,  
 B_VX,   BT_FN_OV4SI_OV4SI)
 OB_DEF_VAR (s390_vec_abs_s8,s390_vlpb,  0, 
 0,  BT_OV_V16QI_V16QI)
diff --git a/gcc/config/s390/vecintrin.h b/gcc/config/s390/vecintrin.h
index 9abbd761067..daeed91ef97 100644
--- a/gcc/config/s390/vecintrin.h
+++ b/gcc/config/s390/vecintrin.h
@@ -98,8 +98,8 @@ __lcbb(const void *ptr, int bndry)
 #define vec_splat_u64 __builtin_s390

Re: [PATCH v2] Hard register constraints

2024-08-05 Thread Stefan Schulze Frielinghaus

On Mon, Aug 05, 2024 at 02:19:50PM +0200, Georg-Johann Lay wrote:
> Am 05.08.24 um 12:28 schrieb Stefan Schulze Frielinghaus:
> > This is a follow-up of
> > https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654013.html
> > 
> > What has changed?
> > 
> > - Rebased and fixed an issue in constrain_operands which manifested
> > after late-combine.
> > 
> > - Introduced new test cases for Arm, Intel, POWER, RISCV, S/390 for 32-
> > and 64-bit where appropriate (including register pairs etc.).  Test
> > gcc.dg/asm-hard-reg-7.c is a bit controversial since I'm testing for an
> > anti feature here, i.e., I'm testing for register asm in conjunction
> > with calls.  I'm fine with removing it in the end but I wanted to keep
> > it in for demonstration purposes at least during discussion of this
> > patch.
> > 
> > - Split test pr87600-2.c into pr87600-2.c and pr87600-3.c since test0
> > errors out early, now.  Otherwise, the remaining errors would not be
> > reported.  Beside that the error message has slightly changed.
> > 
> > - Modified genoutput.cc in order to allow hard register constraints in
> > machine descriptions.  For example, on s390 the instruction mvcrl makes
> 
> As I already said, such a feature would be great.  Some questions:
> 
> Which pass is satisfying that constraint? AFAIK for local reg vars,
> it is asmcons, but for register constraints in md it is the register
> allocator.

This is done by reload during process_alt_operands().  Basically
every other change in gimplify.cc, stmt.cc etc. is only there in order
to do some error checking and have some proper diagnostics.

> The avr backend has many insns that use explicit hard regs in order to
> model some libcalls (ones with footprints smaller than ABI, or that
> deviate from the ABI).  A proper way would be to add a register
> constraint for each possible hard reg, e.g. R20_1 for QImode in R20,
> R20_2 for HImode in R20, etc.  This would require a dozen or more
> new register classes, and the problem with that is that register
> allocation produces less efficient code even for cases that do
> not use these new constraints.  So I gave up that approach.
> 
> How does your feature work? Does it imply that for each hreg
> constraint there must be an according register class?

No.  During reload I limit the set of registers by installing a filter
and let RA solve it.

> 
> Obviously local reg vars don't require respective reg classes,
> so I thought about representing such insns as asm_input or
> whatever, but that's pure hack and would never pass a review...
> 
> > use of the implicit register r0 which we currently deal with as follows:
> > 
> > (define_insn "*mvcrl"
> >[(set (match_operand:BLK 0 "memory_operand" "=Q")
> > (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q")
> >  (reg:SI GPR0_REGNUM)]
> > UNSPEC_MVCRL))]
> >"TARGET_Z15"
> >"mvcrl\t%0,%1"
> >[(set_attr "op_type" "SSE")])
> > 
> > (define_expand "mvcrl"
> >[(set (reg:SI GPR0_REGNUM) (match_operand:SI 2 "general_operand"))
> > (set (match_operand:BLK 0 "memory_operand" "=Q")
> > (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q")
> >  (reg:SI GPR0_REGNUM)]
> > UNSPEC_MVCRL))]
> >"TARGET_Z15"
> >"")
> > 
> > In the expander we ensure that GPR0 is setup correctly.  With this patch
> > we could simply write
> > 
> > (define_insn "mvcrl"
> >[(set (match_operand:BLK 0 "memory_operand" "=Q")
> >  (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q")
> >   (match_operand:SI 2 "general_operand" "{r0}")]
> >  UNSPEC_MVCRL))]
> >"TARGET_Z15"
> >"mvcrl\t%0,%1"
> >[(set_attr "op_type" "SSE")])
> > 
> > What I dislike is that I didn't find a way to verify hard register names
> 
> Are plain register numbers also supported? Like "{0}" ?
> (Provided regno(r0) == 0).

Basically whatever passes decode_reg_name() is allowed.

> 
> > during genoutput, i.e., ensuring that the name is valid after all.  This
> > is due to the fact how reg_names is defined which cannot be accessed by
> > genoutput.  The same holds true for REGISTER_NAMES et al. which may
> > referenc

[PATCH v2] Hard register constraints

2024-08-05 Thread Stefan Schulze Frielinghaus

This is a follow-up of
https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654013.html

What has changed?

- Rebased and fixed an issue in constrain_operands which manifested
after late-combine.

- Introduced new test cases for Arm, Intel, POWER, RISCV, S/390 for 32-
and 64-bit where appropriate (including register pairs etc.).  Test
gcc.dg/asm-hard-reg-7.c is a bit controversial since I'm testing for an
anti feature here, i.e., I'm testing for register asm in conjunction
with calls.  I'm fine with removing it in the end but I wanted to keep
it in for demonstration purposes at least during discussion of this
patch.

- Split test pr87600-2.c into pr87600-2.c and pr87600-3.c since test0
errors out early, now.  Otherwise, the remaining errors would not be
reported.  Besides that, the error message has slightly changed.

- Modified genoutput.cc in order to allow hard register constraints in
machine descriptions.  For example, on s390 the instruction mvcrl makes
use of the implicit register r0 which we currently deal with as follows:

(define_insn "*mvcrl"
  [(set (match_operand:BLK 0 "memory_operand" "=Q")
   (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q")
(reg:SI GPR0_REGNUM)]
   UNSPEC_MVCRL))]
  "TARGET_Z15"
  "mvcrl\t%0,%1"
  [(set_attr "op_type" "SSE")])

(define_expand "mvcrl"
  [(set (reg:SI GPR0_REGNUM) (match_operand:SI 2 "general_operand"))
   (set (match_operand:BLK 0 "memory_operand" "=Q")
   (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q")
(reg:SI GPR0_REGNUM)]
   UNSPEC_MVCRL))]
  "TARGET_Z15"
  "")

In the expander we ensure that GPR0 is setup correctly.  With this patch
we could simply write

(define_insn "mvcrl"
  [(set (match_operand:BLK 0 "memory_operand" "=Q")
(unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q")
 (match_operand:SI 2 "general_operand" "{r0}")]
UNSPEC_MVCRL))]
  "TARGET_Z15"
  "mvcrl\t%0,%1"
  [(set_attr "op_type" "SSE")])

What I dislike is that I didn't find a way to verify hard register names
during genoutput, i.e., ensuring that the name is valid after all.  This
is due to the way reg_names is defined, which cannot be accessed by
genoutput.  The same holds true for REGISTER_NAMES et al. which may
reference some target specific variable (see e.g. POWER).  Thus, in case
of an invalid register name in a machine description file we do not
end-up with a genoutput-time error but instead fail at run-time in
process_alt_operands():

   case '{':
   {
 int regno = parse_constraint_regname (p);
 gcc_assert (regno >= 0);
 cl = REGNO_REG_CLASS (regno);
 CLEAR_HARD_REG_SET (hregset);
 SET_HARD_REG_BIT (hregset, regno);
 cl_filter = &hregset;
 goto reg;
   }

This is rather unfortunate but I couldn't find a way to validate
register names during genoutput.  If no one else has an idea I will
replace gcc_assert with a more expressive error message.

What's next?

I was thinking about replacing register asm with the new hard register
constraint.  This would solve problems like demonstrated by
gcc.dg/asm-hard-reg-7.c.  For example, we could replace the constraint

   register int x asm ("r5") = 42;
   asm ("foo   %0" :: "r" (x));

with

   register int x asm ("r5") = 42;
   asm ("foo   %0" :: "{r5}" (x));

and ignore any further effect of the register asm.  However, I haven't
really thought this through and there are certainly cases which are
currently allowed which cannot trivially be converted as e.g. here:

   register int x asm ("r5") = 42;
   asm ("foo   %0" :: "rd" (x));

Multiple alternatives are kind of strange in combination with register
asm.  For example, on s390 the two constraints "r" and "d" restrict both
to GPRs.  That is not a show stopper but certainly something which needs
some consideration.  If you can think of some wild combinations/edge
cases I would be happy to hear about them.  Anyhow, this is something for a
further patch.

Last but not least, if there is enough consent to accept this feature, I
will start writing up some documentation.

Bootstrapped and regtested on Arm, Intel, POWER, RISCV, S/390.  I have
only verified the 32-bit tests via cross compilers and didn't execute
them in contrast to 64-bit targets.
---
 gcc/cfgexpand.cc  |  42 -
 gcc/genoutput.cc  |  12 ++
 gcc/genpreds.cc   |   4 +-
 gcc/gimplify.cc   | 134 ++-
 gcc/lra-constraints.cc|  13 ++
 gcc/recog.cc  |  11 +-
 gcc/stmt.cc   | 155 +-
 gcc/stmt.h|  12 +-
 gcc/testsuite/gcc.dg/asm-hard-reg-1.c |  85 ++
 gcc/testsuite/gcc.dg/asm-hard-reg-2.c |  33 
 gcc/testsuite/gcc.dg/asm-hard-reg-3.c |  25 +

Re: [PATCH] regrename: Skip renaming register pairs [PR115860]

2024-07-24 Thread Stefan Schulze Frielinghaus

On Tue, Jul 23, 2024 at 11:40:00AM -0600, Jeff Law wrote:
> 
> 
> On 7/23/24 9:45 AM, Stefan Schulze Frielinghaus wrote:
> 
> > 
> > > They come from:
> > > ```
> > > (define_insn "*tf_to_fprx2_0"
> > >[(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0)
> > >  (subreg:DF (match_operand:TF1 "general_operand"   "v") 
> > > 0))]
> > > ...
> > > (define_insn "*tf_to_fprx2_1"
> > >[(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8)
> > >  (subreg:DF (match_operand:TF1 "general_operand"   "v") 
> > > 8))]
> > > 
> > > ```
> > > 
> > > I am not sure if that is a valid thing to do. s390 backend is the only
> > > one that has insn patterns like this. all that uses "+" use either
> > > strict_lowpart or zero_extract for the lhs or just a pure set.
> > > Maybe there is a better way of representing this. Maybe using unspec here?
> > 
> > I gave unspec a try and came up with
> > 
> > (define_insn "*tf_to_fprx2_0"
> >[(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0)
> >  (unspec:DF [(match_operand:TF1 "general_operand"   "v")] 
> > UNSPEC_TF_TO_FPRX2_0))]
> >"TARGET_VXE"
> >; M4 == 1 corresponds to %v0[0] = %v1[0]; %v0[1] = %v0[1];
> >"vpdi\t%v0,%v1,%v0,1"
> >[(set_attr "op_type" "VRR")])
> > 
> > (define_insn "*tf_to_fprx2_1"
> >[(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8)
> >  (unspec:DF [(match_operand:TF1 "general_operand"   "v")] 
> > UNSPEC_TF_TO_FPRX2_1))]
> >"TARGET_VXE"
> >; M4 == 5 corresponds to %V0[0] = %v1[1]; %V0[1] = %V0[1];
> >"vpdi\t%V0,%v1,%V0,5"
> >[(set_attr "op_type" "VRR")])
> > 
> > which seems to work.  However, I'm still getting subregs at final:
> > 
> > (insn 3 18 7 (set (reg/v:TF 18 %f4 [orig:62 x ] [62])
> >  (mem/c:TF (reg:DI 2 %r2 [65]) [1 x+0 S16 A64])) "t.c":3:1 421 
> > {movtf_vr}
> >   (expr_list:REG_DEAD (reg:DI 2 %r2 [65])
> >  (nil)))
> > (insn 7 3 8 (set (subreg:DF (reg:FPRX2 16 %f0 [64]) 0)
> >  (unspec:DF [
> >  (reg/v:TF 18 %f4 [orig:62 x ] [62])
> >  ] UNSPEC_TF_TO_FPRX2_0)) "t.c":4:10 569 {*tf_to_fprx2_0}
> >   (nil))
> > (insn 8 7 14 (set (subreg:DF (reg:FPRX2 16 %f0 [64]) 8)
> >  (unspec:DF [
> >  (reg/v:TF 18 %f4 [orig:62 x ] [62])
> >  ] UNSPEC_TF_TO_FPRX2_1)) "t.c":4:10 570 {*tf_to_fprx2_1}
> >   (expr_list:REG_DEAD (reg/v:TF 18 %f4 [orig:62 x ] [62])
> >  (nil)))
> > 
> > Thus, I'm not sure whether this really solves the problem or rather
> > shifts around it.  I'm still a bit puzzled why the initial RTL is
> > invalid.  If I understood you correctly Jeff, then we are missing a
> > pattern which would match once the subregs are eliminated.  Since none
> > exists the subregs survive and regrename gets confused.  This basically
> > means that subregs of register pairs must not survive RA and the unspec
> > solution from above is no real solution.
> I'd tend to agree.  The routine in question is cleanup_subreg_operands and
> from a quick looksie it's not going to work for the insn in question because
> cleanup_subreg_operands actually looks down into the recog data structures
> for each operand.  In the case above the subreg is explicit in the RTL
> rather than matched by the operand predicate.

Right, I did some further tests over night where I also added patterns
in order to match variants where the subregs are eliminated and that
seems to work.  I still haven't made up my mind which route would be
best.  Anyhow, it is clear that this patch should be dropped and I will
come up with a solution for the target.

Thank you Andrew and Jeff for pointing this out.  Some myths about
subregs have been revealed for me :)

Cheers,
Stefan

Re: [PATCH] regrename: Skip renaming register pairs [PR115860]

2024-07-23 Thread Stefan Schulze Frielinghaus

On Mon, Jul 22, 2024 at 08:16:16PM -0700, Andrew Pinski wrote:
> On Sun, Jul 21, 2024 at 11:47 PM Stefan Schulze Frielinghaus
> > diff --git a/gcc/regrename.cc b/gcc/regrename.cc
> > index 054e601740b..6ae5a2309d0 100644
> > --- a/gcc/regrename.cc
> > +++ b/gcc/regrename.cc
> > @@ -1113,6 +1113,10 @@ scan_rtx_reg (rtx_insn *insn, rtx *loc, enum 
> > reg_class cl, enum scan_actions act
> >
> >   c = create_new_chain (this_regno, this_nregs, loc, insn, cl);
> >
> > + /* Give up early in case of register pairs.  */
> > + if (this_nregs != 1)
> > +   c->cannot_rename = 1;
> 
> 
> I am a bit worried this will make TImode (and DImode for 32bit targets) worse.
> And it might make aarch64's vector struct types much worse than they
> are currently.
> It is interesting how there is a subreg of a hardregister after reload
> showing up here. Is that on purpose?

Good catch.  I don't think this was on purpose.  When looking at the
dump I rather thought this is valid RTL and didn't question it since
subregs for register pairs got "expanded" during final.

> They come from:
> ```
> (define_insn "*tf_to_fprx2_0"
>   [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0)
> (subreg:DF (match_operand:TF1 "general_operand"   "v") 0))]
> ...
> (define_insn "*tf_to_fprx2_1"
>   [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8)
> (subreg:DF (match_operand:TF1 "general_operand"   "v") 8))]
> 
> ```
> 
> I am not sure if that is a valid thing to do. s390 backend is the only
> one that has insn patterns like this. all that uses "+" use either
> strict_lowpart or zero_extract for the lhs or just a pure set.
> Maybe there is a better way of representing this. Maybe using unspec here?

I gave unspec a try and came up with

(define_insn "*tf_to_fprx2_0"
  [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0)
(unspec:DF [(match_operand:TF 1 "general_operand"   "v")] 
UNSPEC_TF_TO_FPRX2_0))]
  "TARGET_VXE"
  ; M4 == 1 corresponds to %v0[0] = %v1[0]; %v0[1] = %v0[1];
  "vpdi\t%v0,%v1,%v0,1"
  [(set_attr "op_type" "VRR")])

(define_insn "*tf_to_fprx2_1"
  [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8)
(unspec:DF [(match_operand:TF 1 "general_operand"   "v")] 
UNSPEC_TF_TO_FPRX2_1))]
  "TARGET_VXE"
  ; M4 == 5 corresponds to %V0[0] = %v1[1]; %V0[1] = %V0[1];
  "vpdi\t%V0,%v1,%V0,5"
  [(set_attr "op_type" "VRR")])

which seems to work.  However, I'm still getting subregs at final:

(insn 3 18 7 (set (reg/v:TF 18 %f4 [orig:62 x ] [62])
(mem/c:TF (reg:DI 2 %r2 [65]) [1 x+0 S16 A64])) "t.c":3:1 421 {movtf_vr}
 (expr_list:REG_DEAD (reg:DI 2 %r2 [65])
(nil)))
(insn 7 3 8 (set (subreg:DF (reg:FPRX2 16 %f0 [64]) 0)
(unspec:DF [
(reg/v:TF 18 %f4 [orig:62 x ] [62])
] UNSPEC_TF_TO_FPRX2_0)) "t.c":4:10 569 {*tf_to_fprx2_0}
 (nil))
(insn 8 7 14 (set (subreg:DF (reg:FPRX2 16 %f0 [64]) 8)
(unspec:DF [
(reg/v:TF 18 %f4 [orig:62 x ] [62])
] UNSPEC_TF_TO_FPRX2_1)) "t.c":4:10 570 {*tf_to_fprx2_1}
 (expr_list:REG_DEAD (reg/v:TF 18 %f4 [orig:62 x ] [62])
(nil)))

Thus, I'm not sure whether this really solves the problem or rather
shifts around it.  I'm still a bit puzzled why the initial RTL is
invalid.  If I understood you correctly Jeff, then we are missing a
pattern which would match once the subregs are eliminated.  Since none
exists the subregs survive and regrename gets confused.  This basically
means that subregs of register pairs must not survive RA and the unspec
solution from above is no real solution.

Since the only purpose of tf_to_fprx2_0 and tf_to_fprx2_1 is to move a
long double from a vector register into an FP register pair, one could
also merge both insns into one and emit two instructions in the assembler
template.  This would at least circumvent the subreg issue.

(define_insn "tf_to_fprx2"
  [(set (match_operand:FPRX2 0 "nonimmediate_operand" "=f")
(unspec:FPRX2 [(match_operand:TF 1 "general_operand"   "v")] 
UNSPEC_TF_TO_FPRX2))]
  "TARGET_VXE"
  "vpdi\t%v0,%v1,%v0,1;vpdi\t%V0,%v1,%V0,5"
  [(set_attr "length" "12")
   (set_attr "op_type" "VRR")])

I will give this a try tomorrow.

Thanks,
Stefan

[PATCH] regrename: Skip renaming register pairs [PR115860]

2024-07-21 Thread Stefan Schulze Frielinghaus

It is not trivial to decide when a write of a register pair terminates
or starts a new chain.  For example, prior to regrename we have

(insn 91 38 36 5 (set (reg:FPRX2 16 %f0 [orig:76 x ] [76])
(const_double:FPRX2 0.0 [0x0.0p+0])) 
"float-cast-overflow-7-reduced.c":5:55 discrim 2 1507 {*movfprx2_64}
 (expr_list:REG_EQUAL (const_double:FPRX2 0.0 [0x0.0p+0])
(nil)))
(insn 36 91 37 5 (set (subreg:DF (reg:FPRX2 16 %f0 [orig:76 x ] [76]) 0)
(mem/c:DF (plus:DI (reg/f:DI 15 %r15)
(const_int 160 [0xa0])) [7 %sfp+-32 S8 A64])) 
"float-cast-overflow-7-reduced.c":5:55 discrim 2 1512 {*movdf_64dfp}
 (nil))
(insn 37 36 43 5 (set (subreg:DF (reg:FPRX2 16 %f0 [orig:76 x ] [76]) 8)
(mem/c:DF (plus:DI (reg/f:DI 15 %r15)
(const_int 168 [0xa8])) [7 %sfp+-24 S8 A64])) 
"float-cast-overflow-7-reduced.c":5:55 discrim 2 1512 {*movdf_64dfp}
 (nil))

where insn 91 writes both registers of a register pair and it is clear
that an existing chain must be terminated and a new started.  Insn 36
and 37 write only into one register of a corresponding register pair.
For each write on its own it is not obvious when to terminate an
existing chain and to start a new one.  In other words, once insn 36
materializes and 37 didn't we are kind of in a limbo state.  Tracking
this correctly is inherently hard and I'm not entirely sure whether
optimizations could even lead to more complicated cases where it is even
less clear when a chain terminates and a new has to be started.
Therefore, skip renaming of register pairs.

Bootstrapped and regtested on x86_64, aarch64, powerpc64le, and s390.
Ok for mainline?

This fixes on s390:
FAIL: g++.dg/cpp23/ext-floating14.C  -std=gnu++23 execution test
FAIL: g++.dg/cpp23/ext-floating14.C  -std=gnu++26 execution test
FAIL: c-c++-common/ubsan/float-cast-overflow-7.c   -O2  execution test
FAIL: c-c++-common/ubsan/float-cast-overflow-7.c   -O2 -flto 
-fno-use-linker-plugin -flto-partition=none  execution test
FAIL: c-c++-common/ubsan/float-cast-overflow-7.c   -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O0  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O1  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O2  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O2 -flto 
-fno-use-linker-plugin -flto-partition=none  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects  execution test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -O3 -g  execution 
test
FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c   -Os  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O0  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O1  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O2  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O2 -flto 
-fno-use-linker-plugin -flto-partition=none  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -O3 -g  execution test
FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c   -Os  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O0  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O1  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O2  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O2 -flto -fno-use-linker-plugin 
-flto-partition=none  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O2 -flto -fuse-linker-plugin 
-fno-fat-lto-objects  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -O3 -g  execution test
FAIL: gcc.dg/torture/fp-int-convert-timode.c   -Os  execution test
FAIL: gfortran.dg/pr96711.f90   -O0  execution test
FAIL: TestSignalForwardingExternal
FAIL: go test misc/cgo/testcarchive
FAIL: libffi.closures/nested_struct5.c -W -Wall -Wno-psabi -O2 output pattern 
test
FAIL: libphobos.phobos/std/algorithm/mutation.d execution test
FAIL: libphobos.phobos/std/conv.d execution test
FAIL: libphobos.phobos/std/internal/math/errorfunction.d execution test
FAIL: libphobos.phobos/std/variant.d execution test
FAIL: libphobos.phobos_shared/std/algorithm/mutation.d execution test
FAIL: libphobos.phobos_shared/std/conv.d execution test
FAIL: libphobos.phobos_shared/std/internal/math/errorfunction.d execution test
FAIL: libphobos.phobos_shared/std/variant.d execution test

gcc/ChangeLog:

PR rtl-optimization/115860
* regrename.cc (scan_rtx_reg): Do not try to rename register
pairs.
---
 gcc/regrename.cc | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/regrename.cc b/gcc/regrename.cc
index 054e601740b..6ae5a2309d0 100644
---

Re: [PATCH] s390: Fix unresolved iterators bhfgq and xdee

2024-07-19 Thread Stefan Schulze Frielinghaus

I'm pinging this early since I would like to make sure that it gets into
14.2 RC which is about to be done on Tuesday 23rd July.

On Tue, Jul 16, 2024 at 04:50:29PM +0200, Stefan Schulze Frielinghaus wrote:
> Code attribute bhfgq is missing a mapping for TF.  This results in
> unresolved iterators in assembler templates for *bswaptf.
> 
> With the TF mapping added the base mnemonics vlbr and vstbr are not
> "used" anymore but only the extended mnemonics (vlbr<bhfgq> was
> interpreted as vlbr; likewise for vstbr).  Therefore, remove the base
> mnemonics from the scheduling description, otherwise, genattrtab would
> error about unknown mnemonics.
> 
> Likewise, for movtf_vr only the extended mnemonics for vrepi are used,
> now, which means the base mnemonic is "unused" and has to be removed
> from the scheduling description.
> 
> Similarly, we end up with unresolved iterators in assembler templates
> for mulfprx23 since code attribute xdee is missing a mapping for FPRX2.
> 
> Note, this is basically a cherry pick of commit r15-2060-ga4abda934aa426
> with the addition that vrepi is removed from the scheduling description,
> too.
> 
> Bootstrapped on s390.  Ok for release branches 12, 13, and 14?
> 
> gcc/ChangeLog:
> 
>   * config/s390/3931.md (vlbr, vstbr, vrepi): Remove.
>   * config/s390/s390.md (xdee): Add FPRX2 mapping.
>   * config/s390/vector.md (bhfgq): Add TF mapping.
> ---
>  gcc/config/s390/3931.md   | 7 ---
>  gcc/config/s390/s390.md   | 2 +-
>  gcc/config/s390/vector.md | 2 +-
>  3 files changed, 2 insertions(+), 9 deletions(-)
> 
> diff --git a/gcc/config/s390/3931.md b/gcc/config/s390/3931.md
> index bed1f6c21f1..9cb11b72bba 100644
> --- a/gcc/config/s390/3931.md
> +++ b/gcc/config/s390/3931.md
> @@ -404,7 +404,6 @@ vlvgg,
>  vlvgh,
>  vlvgp,
>  vst,
> -vstbr,
>  vstbrf,
>  vstbrg,
>  vstbrh,
> @@ -627,7 +626,6 @@ tm,
>  tmy,
>  vl,
>  vlbb,
> -vlbr,
>  vlbrf,
>  vlbrg,
>  vlbrh,
> @@ -661,7 +659,6 @@ vlreph,
>  vlrl,
>  vlrlr,
>  vst,
> -vstbr,
>  vstbrf,
>  vstbrg,
>  vstbrh,
> @@ -1077,7 +1074,6 @@ vrepb,
>  vrepf,
>  vrepg,
>  vreph,
> -vrepi,
>  vrepib,
>  vrepif,
>  vrepig,
> @@ -1930,7 +1926,6 @@ vrepb,
>  vrepf,
>  vrepg,
>  vreph,
> -vrepi,
>  vrepib,
>  vrepif,
>  vrepig,
> @@ -2156,7 +2151,6 @@ vistrfs,
>  vistrhs,
>  vl,
>  vlbb,
> -vlbr,
>  vlbrf,
>  vlbrg,
>  vlbrh,
> @@ -2248,7 +2242,6 @@ tbegin,
>  tbeginc,
>  tend,
>  vst,
> -vstbr,
>  vstbrf,
>  vstbrg,
>  vstbrh,
> diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
> index 50a828f2bbb..8edc1261c38 100644
> --- a/gcc/config/s390/s390.md
> +++ b/gcc/config/s390/s390.md
> @@ -744,7 +744,7 @@
>  ;; In FP templates, a <xdee> in "m<xdee>r" will expand to "mxr" in
>  ;; TF/TDmode, "mdr" in DF/DDmode, "meer" in SFmode and "mer in
>  ;; SDmode.
> -(define_mode_attr xdee [(TF "x") (DF "d") (SF "ee") (TD "x") (DD "d") (SD 
> "e")])
> +(define_mode_attr xdee [(TF "x") (FPRX2 "x") (DF "d") (SF "ee") (TD "x") (DD 
> "d") (SD "e")])
>  
>  ;; The decimal floating point variants of add, sub, div and mul support 3
>  ;; fp register operands.  The following attributes allow to merge the bfp and
> diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
> index 1bae1056951..f88e8b655fa 100644
> --- a/gcc/config/s390/vector.md
> +++ b/gcc/config/s390/vector.md
> @@ -134,7 +134,7 @@
>   (V1TI "q") (TI "q")
>   (V1SF "f") (V2SF "f") (V4SF "f")
>   (V1DF "g") (V2DF "g")
> - (V1TF "q")])
> + (V1TF "q") (TF "q")])
>  
>  ; This is for vmalhw. It gets an 'w' attached to avoid confusion with
>  ; multiply and add logical high vmalh.
> -- 
> 2.45.0
>

Re: [PATCH] s390: testsuite: Fix vcond-shift.c

2024-07-19 Thread Stefan Schulze Frielinghaus

On Thu, Jul 18, 2024 at 11:58:10PM -0700, Andrew Pinski wrote:
> On Thu, Jul 18, 2024 at 10:31 PM Stefan Schulze Frielinghaus
>  wrote:
> >
> > Previously we optimized expressions of the form a < 0 ? -1 : 0 to
> > (signed)a >> 31 during vcond expanding.  Since r15-1741-g2ccdd0f22312a1
> > this is done in match.pd.  The implementation in the back end as well as
> > in match.pd are basically the same but still distinct.  For the tests in
> > vcond-shift.c the back end emitted
> >
> >   (xx - (xx >> 31)) >> 1
> >
> > whereas now via match.pd
> >
> >   ((int) ((unsigned int) xx >> 31) + xx) >> 1
> >
> > which is basically the same.  We just have to adapt the scan-assembler
> > directives w.r.t. signed/unsigned shifts which is done by this patch.
> 
> Note I filed https://gcc.gnu.org/PR115999 because I noticed those 2
> form produce slightly different code generation for scalars (I assume
> it will produce similar issues for vectors too).

Thanks for the heads up.  In that case we should probably wait a bit
until a normal form or whatever has settled.

Cheers,
Stefan

> 
> Thanks,
> Andrew Pinski
> 
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/s390/vector/vcond-shift.c: Adapt to new match.pd
> > rule and change scan-assembler-times for shifts.
> > ---
> >  Regtested on s390.  Ok for mainline?
> >
> >  gcc/testsuite/gcc.target/s390/vector/vcond-shift.c | 12 ++--
> >  1 file changed, 6 insertions(+), 6 deletions(-)
> >
> > diff --git a/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c 
> > b/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c
> > index a6b4e97aa50..b942f44039d 100644
> > --- a/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c
> > +++ b/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c
> > @@ -3,13 +3,13 @@
> >  /* { dg-do compile { target { s390*-*-* } } } */
> >  /* { dg-options "-O3 -march=z13 -mzarch" } */
> >
> > -/* { dg-final { scan-assembler-times "vesraf\t%v.?,%v.?,31" 6 } } */
> > -/* { dg-final { scan-assembler-times "vesrah\t%v.?,%v.?,15" 6 } } */
> > -/* { dg-final { scan-assembler-times "vesrab\t%v.?,%v.?,7" 6 } } */
> > +/* { dg-final { scan-assembler-times "vesraf\t%v.?,%v.?,31" 4 } } */
> > +/* { dg-final { scan-assembler-times "vesrah\t%v.?,%v.?,15" 4 } } */
> > +/* { dg-final { scan-assembler-times "vesrab\t%v.?,%v.?,7" 4 } } */
> >  /* { dg-final { scan-assembler-not "vzero\t*" } } */
> > -/* { dg-final { scan-assembler-times "vesrlf\t%v.?,%v.?,31" 4 } } */
> > -/* { dg-final { scan-assembler-times "vesrlh\t%v.?,%v.?,15" 4 } } */
> > -/* { dg-final { scan-assembler-times "vesrlb\t%v.?,%v.?,7" 4 } } */
> > +/* { dg-final { scan-assembler-times "vesrlf\t%v.?,%v.?,31" 6 } } */
> > +/* { dg-final { scan-assembler-times "vesrlh\t%v.?,%v.?,15" 6 } } */
> > +/* { dg-final { scan-assembler-times "vesrlb\t%v.?,%v.?,7" 6 } } */
> >
> >  /* Make it expand to two vector operations.  */
> >  #define ITER(X) (2 * (16 / sizeof (X[1])))
> > --
> > 2.45.2
> >

[PATCH] s390: testsuite: Fix vcond-shift.c

2024-07-19 Thread Stefan Schulze Frielinghaus

Previously we optimized expressions of the form a < 0 ? -1 : 0 to
(signed)a >> 31 during vcond expanding.  Since r15-1741-g2ccdd0f22312a1
this is done in match.pd.  The implementation in the back end as well as
in match.pd are basically the same but still distinct.  For the tests in
vcond-shift.c the back end emitted

  (xx - (xx >> 31)) >> 1

whereas now via match.pd

  ((int) ((unsigned int) xx >> 31) + xx) >> 1

which is basically the same.  We just have to adapt the scan-assembler
directives w.r.t. signed/unsigned shifts which is done by this patch.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vcond-shift.c: Adapt to new match.pd
rule and change scan-assembler-times for shifts.
---
 Regtested on s390.  Ok for mainline?

 gcc/testsuite/gcc.target/s390/vector/vcond-shift.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c 
b/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c
index a6b4e97aa50..b942f44039d 100644
--- a/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c
+++ b/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c
@@ -3,13 +3,13 @@
 /* { dg-do compile { target { s390*-*-* } } } */
 /* { dg-options "-O3 -march=z13 -mzarch" } */
 
-/* { dg-final { scan-assembler-times "vesraf\t%v.?,%v.?,31" 6 } } */
-/* { dg-final { scan-assembler-times "vesrah\t%v.?,%v.?,15" 6 } } */
-/* { dg-final { scan-assembler-times "vesrab\t%v.?,%v.?,7" 6 } } */
+/* { dg-final { scan-assembler-times "vesraf\t%v.?,%v.?,31" 4 } } */
+/* { dg-final { scan-assembler-times "vesrah\t%v.?,%v.?,15" 4 } } */
+/* { dg-final { scan-assembler-times "vesrab\t%v.?,%v.?,7" 4 } } */
 /* { dg-final { scan-assembler-not "vzero\t*" } } */
-/* { dg-final { scan-assembler-times "vesrlf\t%v.?,%v.?,31" 4 } } */
-/* { dg-final { scan-assembler-times "vesrlh\t%v.?,%v.?,15" 4 } } */
-/* { dg-final { scan-assembler-times "vesrlb\t%v.?,%v.?,7" 4 } } */
+/* { dg-final { scan-assembler-times "vesrlf\t%v.?,%v.?,31" 6 } } */
+/* { dg-final { scan-assembler-times "vesrlh\t%v.?,%v.?,15" 6 } } */
+/* { dg-final { scan-assembler-times "vesrlb\t%v.?,%v.?,7" 6 } } */
 
 /* Make it expand to two vector operations.  */
 #define ITER(X) (2 * (16 / sizeof (X[1])))
-- 
2.45.2

[PATCH] s390: Fix unresolved iterators bhfgq and xdee

2024-07-16 Thread Stefan Schulze Frielinghaus

Code attribute bhfgq is missing a mapping for TF.  This results in
unresolved iterators in assembler templates for *bswaptf.

With the TF mapping added the base mnemonics vlbr and vstbr are not
"used" anymore but only the extended mnemonics (vlbr<bhfgq> was
interpreted as vlbr; likewise for vstbr).  Therefore, remove the base
mnemonics from the scheduling description, otherwise, genattrtab would
error about unknown mnemonics.

Likewise, for movtf_vr only the extended mnemonics for vrepi are used,
now, which means the base mnemonic is "unused" and has to be removed
from the scheduling description.

Similarly, we end up with unresolved iterators in assembler templates
for mulfprx23 since code attribute xdee is missing a mapping for FPRX2.

Note, this is basically a cherry pick of commit r15-2060-ga4abda934aa426
with the addition that vrepi is removed from the scheduling description,
too.

Bootstrapped on s390.  Ok for release branches 12, 13, and 14?

gcc/ChangeLog:

* config/s390/3931.md (vlbr, vstbr, vrepi): Remove.
* config/s390/s390.md (xdee): Add FPRX2 mapping.
* config/s390/vector.md (bhfgq): Add TF mapping.
---
 gcc/config/s390/3931.md   | 7 ---
 gcc/config/s390/s390.md   | 2 +-
 gcc/config/s390/vector.md | 2 +-
 3 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/gcc/config/s390/3931.md b/gcc/config/s390/3931.md
index bed1f6c21f1..9cb11b72bba 100644
--- a/gcc/config/s390/3931.md
+++ b/gcc/config/s390/3931.md
@@ -404,7 +404,6 @@ vlvgg,
 vlvgh,
 vlvgp,
 vst,
-vstbr,
 vstbrf,
 vstbrg,
 vstbrh,
@@ -627,7 +626,6 @@ tm,
 tmy,
 vl,
 vlbb,
-vlbr,
 vlbrf,
 vlbrg,
 vlbrh,
@@ -661,7 +659,6 @@ vlreph,
 vlrl,
 vlrlr,
 vst,
-vstbr,
 vstbrf,
 vstbrg,
 vstbrh,
@@ -1077,7 +1074,6 @@ vrepb,
 vrepf,
 vrepg,
 vreph,
-vrepi,
 vrepib,
 vrepif,
 vrepig,
@@ -1930,7 +1926,6 @@ vrepb,
 vrepf,
 vrepg,
 vreph,
-vrepi,
 vrepib,
 vrepif,
 vrepig,
@@ -2156,7 +2151,6 @@ vistrfs,
 vistrhs,
 vl,
 vlbb,
-vlbr,
 vlbrf,
 vlbrg,
 vlbrh,
@@ -2248,7 +2242,6 @@ tbegin,
 tbeginc,
 tend,
 vst,
-vstbr,
 vstbrf,
 vstbrg,
 vstbrh,
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 50a828f2bbb..8edc1261c38 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -744,7 +744,7 @@
 ;; In FP templates, a <xdee> in "m<xdee>r" will expand to "mxr" in
 ;; TF/TDmode, "mdr" in DF/DDmode, "meer" in SFmode and "mer in
 ;; SDmode.
-(define_mode_attr xdee [(TF "x") (DF "d") (SF "ee") (TD "x") (DD "d") (SD 
"e")])
+(define_mode_attr xdee [(TF "x") (FPRX2 "x") (DF "d") (SF "ee") (TD "x") (DD 
"d") (SD "e")])
 
 ;; The decimal floating point variants of add, sub, div and mul support 3
 ;; fp register operands.  The following attributes allow to merge the bfp and
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 1bae1056951..f88e8b655fa 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -134,7 +134,7 @@
(V1TI "q") (TI "q")
(V1SF "f") (V2SF "f") (V4SF "f")
(V1DF "g") (V2DF "g")
-   (V1TF "q")])
+   (V1TF "q") (TF "q")])
 
 ; This is for vmalhw. It gets an 'w' attached to avoid confusion with
 ; multiply and add logical high vmalh.
-- 
2.45.0

[PATCH] s390: Fix unresolved iterators bhfgq and xdee

2024-07-16 Thread Stefan Schulze Frielinghaus

Code attribute bhfgq is missing a mapping for TF.  This results in
unresolved iterators in assembler templates for *bswaptf.

With the TF mapping added the base mnemonics vlbr and vstbr are not
"used" anymore but only the extended mnemonics (vlbr<bhfgq> was
interpreted as vlbr; likewise for vstbr).  Therefore, remove the base
mnemonics from the scheduling description, otherwise, genattrtab would
error about unknown mnemonics.

Similarly, we end up with unresolved iterators in assembler templates
for mulfprx23 since code attribute xdee is missing a mapping for FPRX2.

gcc/ChangeLog:

* config/s390/3931.md (vlbr, vstbr): Remove.
* config/s390/s390.md (xdee): Add FPRX2 mapping.
* config/s390/vector.md (bhfgq): Add TF mapping.
---
 Bootstrapped and regtested on s390.  Ok for {mainline,12,13,14}?

 gcc/config/s390/3931.md   | 5 -
 gcc/config/s390/s390.md   | 2 +-
 gcc/config/s390/vector.md | 2 +-
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/gcc/config/s390/3931.md b/gcc/config/s390/3931.md
index 632c2456b6a..9f7a4c58755 100644
--- a/gcc/config/s390/3931.md
+++ b/gcc/config/s390/3931.md
@@ -404,7 +404,6 @@ vlvgg,
 vlvgh,
 vlvgp,
 vst,
-vstbr,
 vstbrf,
 vstbrg,
 vstbrh,
@@ -627,7 +626,6 @@ tm,
 tmy,
 vl,
 vlbb,
-vlbr,
 vlbrf,
 vlbrg,
 vlbrh,
@@ -661,7 +659,6 @@ vlreph,
 vlrl,
 vlrlr,
 vst,
-vstbr,
 vstbrf,
 vstbrg,
 vstbrh,
@@ -2148,7 +2145,6 @@ vistrfs,
 vistrhs,
 vl,
 vlbb,
-vlbr,
 vlbrf,
 vlbrg,
 vlbrh,
@@ -2240,7 +2236,6 @@ tbegin,
 tbeginc,
 tend,
 vst,
-vstbr,
 vstbrf,
 vstbrg,
 vstbrh,
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 303026f6af7..3d5759d6252 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -745,7 +745,7 @@
 ;; In FP templates, a <xdee> in "m<xdee>r" will expand to "mxr" in
 ;; TF/TDmode, "mdr" in DF/DDmode, "meer" in SFmode and "mer in
 ;; SDmode.
-(define_mode_attr xdee [(TF "x") (DF "d") (SF "ee") (TD "x") (DD "d") (SD 
"e")])
+(define_mode_attr xdee [(TF "x") (FPRX2 "x") (DF "d") (SF "ee") (TD "x") (DD 
"d") (SD "e")])
 
 ;; The decimal floating point variants of add, sub, div and mul support 3
 ;; fp register operands.  The following attributes allow to merge the bfp and
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 63678859657..cca9e3556c9 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -136,7 +136,7 @@
(V1TI "q") (TI "q")
(V1SF "f") (V2SF "f") (V4SF "f")
(V1DF "g") (V2DF "g")
-   (V1TF "q")])
+   (V1TF "q") (TF "q")])
 
 ; This is for vmalhw. It gets an 'w' attached to avoid confusion with
 ; multiply and add logical high vmalh.
-- 
2.45.2

Re: [PATCH] s390: Align cjump_64 and icjump_64

2024-07-11 Thread Stefan Schulze Frielinghaus

On Thu, Jul 11, 2024 at 07:32:17PM +0200, Stefan Schulze Frielinghaus wrote:
> On Thu, Jul 11, 2024 at 05:14:58PM +0200, Jakub Jelinek wrote:
> > On Thu, Jul 11, 2024 at 05:09:41PM +0200, Stefan Schulze Frielinghaus wrote:
> > > I didn't have the schedule for 11.5 RC in mind which is tomorrow and the
> > > release a week afterwards.  I hope this is still appropriate for 11.5?
> > 
> > From my side, if Andreas or somebody else approves it, it is tested on 11
> > branch and committed by tomorrow, it can be added.
> > But I'd like to know what patches I should wait for tomorrow and approximate
> > ETA (and ideally before end of working day in Europe).  Once rc1 is done, 
> > only
> > severe blockers will be possible.
> 
> The tester is running over night and will finish around 7 AM CEST.  I
> will let you know once it has finished.  If anything goes wrong we can
> skip this patch of course.

The tester was extremely slow this time and still hasn't finished.  I
don't want to rush it and risk introducing last-minute problems for the
11.5 release.  Since I'm testing for three different architectures and
the first one hasn't finished, let's drop this patch for 11.5.

Sorry for the noise,
Stefan

Re: [PATCH] s390: Align cjump_64 and icjump_64

2024-07-11 Thread Stefan Schulze Frielinghaus

On Thu, Jul 11, 2024 at 05:14:58PM +0200, Jakub Jelinek wrote:
> On Thu, Jul 11, 2024 at 05:09:41PM +0200, Stefan Schulze Frielinghaus wrote:
> > I didn't have the schedule for 11.5 RC in mind which is tomorrow and the
> > release a week afterwards.  I hope this is still appropriate for 11.5?
> 
> From my side, if Andreas or somebody else approves it, it is tested on 11
> branch and committed by tomorrow, it can be added.
> But I'd like to know what patches I should wait for tomorrow and approximate
> ETA (and ideally before end of working day in Europe).  Once rc1 is done, only
> severe blockers will be possible.

The tester is running over night and will finish around 7 AM CEST.  I
will let you know once it has finished.  If anything goes wrong we can
skip this patch of course.

Cheers,
Stefan

Re: [PATCH] s390: Align cjump_64 and icjump_64

2024-07-11 Thread Stefan Schulze Frielinghaus

On Thu, Jul 11, 2024 at 04:29:19PM +0200, Stefan Schulze Frielinghaus wrote:
> During machine reorg we optimize backward jumps and transform insns as
> e.g.
> 
> (jump_insn 118 117 119 (set (pc)
> (if_then_else (ne (reg:CCRAW 33 %cc)
> (const_int 8 [0x8]))
> (label_ref 134)
> (pc))) "dec_math_1.f90":204:8 discrim 1 2161 {*cjump_64}
>  (expr_list:REG_DEAD (reg:CCRAW 33 %cc)
> (int_list:REG_BR_PROB 719407028 (nil)))
>  -> 134)
> 
> into
> 
> (jump_insn 118 117 432 (set (pc)
> (if_then_else (ne (reg:CCRAW 33 %cc)
> (const_int 8 [0x8]))
> (pc)
> (label_ref 433))) "dec_math_1.f90":204:8 discrim 1 -1
>  (expr_list:REG_DEAD (reg:CCRAW 33 %cc)
> (int_list:REG_BR_PROB 719407028 (nil)))
>  -> 433)
> 
> The latter is not recognized anymore since *icjump_64 only matches
> CC_REGNUM against zero.  Fixed by aligning *cjump_64 and *icjump_64.
> 
> gcc/ChangeLog:
> 
>   * config/s390/s390.md (*icjump_64): Allow raw CC comparisons,
>   i.e., any constant integer between 0 and 15 for CC comparisons.
> ---
>  Bootstrap and regtest are still running.  Assuming no regressions, ok
>  for {mainline,11,12,13,14}?  Would be great to see this in 14.2 RC :)

I didn't have the schedule for 11.5 RC in mind which is tomorrow and the
release a week afterwards.  I hope this is still appropriate for 11.5?

Cheers,
Stefan

> 
>  gcc/config/s390/s390.md | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
> index f5d7003dfad..d3931b09417 100644
> --- a/gcc/config/s390/s390.md
> +++ b/gcc/config/s390/s390.md
> @@ -9556,7 +9556,8 @@
>  (define_insn "*icjump_64"
>[(set (pc)
>  (if_then_else
> -  (match_operator 1 "s390_comparison" [(reg CC_REGNUM) (const_int 
> 0)])
> +  (match_operator 1 "s390_comparison" [(reg CC_REGNUM)
> +(match_operand 2 
> "const_int_operand" "")])
>(pc)
>    (label_ref (match_operand 0 "" ""))))]
>""
> -- 
> 2.45.2
>

[PATCH] s390: Align cjump_64 and icjump_64

2024-07-11 Thread Stefan Schulze Frielinghaus

During machine reorg we optimize backward jumps and transform insns as
e.g.

(jump_insn 118 117 119 (set (pc)
(if_then_else (ne (reg:CCRAW 33 %cc)
(const_int 8 [0x8]))
(label_ref 134)
(pc))) "dec_math_1.f90":204:8 discrim 1 2161 {*cjump_64}
 (expr_list:REG_DEAD (reg:CCRAW 33 %cc)
(int_list:REG_BR_PROB 719407028 (nil)))
 -> 134)

into

(jump_insn 118 117 432 (set (pc)
(if_then_else (ne (reg:CCRAW 33 %cc)
(const_int 8 [0x8]))
(pc)
(label_ref 433))) "dec_math_1.f90":204:8 discrim 1 -1
 (expr_list:REG_DEAD (reg:CCRAW 33 %cc)
(int_list:REG_BR_PROB 719407028 (nil)))
 -> 433)

The latter is not recognized anymore since *icjump_64 only matches
CC_REGNUM against zero.  Fixed by aligning *cjump_64 and *icjump_64.

gcc/ChangeLog:

* config/s390/s390.md (*icjump_64): Allow raw CC comparisons,
i.e., any constant integer between 0 and 15 for CC comparisons.
---
 Bootstrap and regtest are still running.  Assuming no regressions, ok
 for {mainline,11,12,13,14}?  Would be great to see this in 14.2 RC :)

 gcc/config/s390/s390.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index f5d7003dfad..d3931b09417 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -9556,7 +9556,8 @@
 (define_insn "*icjump_64"
   [(set (pc)
 (if_then_else
-  (match_operator 1 "s390_comparison" [(reg CC_REGNUM) (const_int 0)])
+  (match_operator 1 "s390_comparison" [(reg CC_REGNUM)
+  (match_operand 2 
"const_int_operand" "")])
   (pc)
   (label_ref (match_operand 0 "" ""))))]
   ""
-- 
2.45.2

[PATCH] s390: Fully exploit vgm, vgbm, vrepi

2024-07-02 Thread Stefan Schulze Frielinghaus

Currently instructions vgm and vrepi are utilized only for constant
vectors where the element mode equals the element mode of the
corresponding instruction.  This patch lifts this restriction by making
use of those instructions for constant vectors even if element modes
do not coincide.  For example, the constant vector

  (v2di){0x7ffffffe7ffffffe, 0x7ffffffe7ffffffe}

can be loaded via vgmf %v0,1,30.  Similarly, the constant vector

  (v4si){0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa}

can be loaded via vrepiq %v0,-86.

Analogously, if the element mode of a constant vector is smaller than the
element mode of a corresponding instruction, we still may make use of
those instructions.  For example, the constant vector

  (v4si){0x00007fff, 0xfffe0000, 0x00007fff, 0xfffe0000}

can be loaded via vgmg %v0,17,46.  Similarly, the constant vector

  (v4si){-1, -16643, -1, -16643}

can be loaded via vrepig %v0,-16643.

Additionally this patch enables vgm, vgbm, vrepi for partial vectors,
i.e., vectors of size less than 16 bytes.  Basically this is done by
treating a vector as a full vector resulting in replicating constants
into the ignored bits whereas vgbm sets those to zero.

Furthermore, there is no restriction to integer vectors anymore, i.e.,
supporting scalars of mode up to and including TI and TF and also
floating-point vectors.

Here are some numbers how often instructions are emitted for SPEC 2017:

        w/o patch   w/ patch
vgbm          140        365
vgm         17508      24452
vrepi        1360       2775

I expect most (maybe even all) to save us a load from the literal pool.

gcc/ChangeLog:

* config/s390/2964.md: Remove extended mnemonics for vgm.
* config/s390/3906.md: Remove extended mnemonics for vgm.
* config/s390/3931.md: Remove extended mnemonics for vgm.
* config/s390/8561.md: Remove extended mnemonics for vgm.
* config/s390/constraints.md (jKK): Remove constraint.
(jzz): Add constraint.
* config/s390/s390-protos.h (s390_contiguous_bitmask_vector_p):
Add prototype.
(s390_constant_via_vgm_p): Add prototype.
(s390_constant_via_vrepi_p): Add prototype.
* config/s390/s390.cc (s390_contiguous_bitmask_vector_p): New
function.
(s390_constant_via_vgm_vrepi_helper): New function.
(s390_constant_via_vgm_p): New function.
(s390_constant_via_vgbm_p): For the sake of symmetry rename
s390_bytemask_vector_p into s390_constant_via_vgbm_p.
(s390_bytemask_vector_p): Deal with non-integer and partial
vectors.
(s390_constant_via_vrepi_p): New function.
(s390_legitimate_constant_p): Allow partial vectors.
(legitimate_reload_constant_p): Fix indentation.
(legitimate_reload_vector_constant_p): Restrict to constraints
j00, jm1, jxx, jyy, jzz only, i.e., allow partial vectors.
(s390_expand_vec_init): Also make use of vrepi if possible.
(print_operand): Add q,p,r for vgm,vrepi,vgbm, respectively.
Remove e,s,t for constant vectors.
* config/s390/s390.md (movti): Add variants utilizing
vgbm,vgm,vrepi.
* config/s390/vector.md (mov): Adapt variants
for vgbm,vgm,vrepi for the new scheme.
(mov): Adapt variants for vgbm,vgm for the new
scheme and add vrepi variant for modes V_8,V_16,V_32,V_64.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vec-copysign.c: Change to non-extended
mnemonic.
* gcc.target/s390/vector/vec-genmask-1.c: Change to non-extended
mnemonic.
* gcc.target/s390/vector/vec-init-1.c: Change to non-extended
mnemonic.
* gcc.target/s390/vector/vec-vrepi-1.c: Change to non-extended
mnemonic.
* gcc.target/s390/zvector/autovec-double-quiet-uneq.c: Change to
non-extended mnemonic.
* gcc.target/s390/zvector/autovec-float-quiet-uneq.c: Change to
non-extended mnemonic.
* gcc.target/s390/zvector/vec-genmask-1.c: Change to
non-extended mnemonic.
* gcc.target/s390/zvector/vec-splat-1.c: Change to non-extended
mnemonic.
* gcc.target/s390/zvector/vec-splat-2.c: Change to non-extended
mnemonic.
* gcc.target/s390/vector/vgbm-double-1.c: New test.
* gcc.target/s390/vector/vgbm-float-1.c: New test.
* gcc.target/s390/vector/vgbm-int128-1.c: New test.
* gcc.target/s390/vector/vgbm-integer-1.c: New test.
* gcc.target/s390/vector/vgbm-longdouble-1.c: New test.
* gcc.target/s390/vector/vgm-df-1.c: New test.
* gcc.target/s390/vector/vgm-di-1.c: New test.
* gcc.target/s390/vector/vgm-hi-1.c: New test.
* gcc.target/s390/vector/vgm-int128-1.c: New test.
* gcc.target/s390/vector/vgm-longdouble-1.c: New test.
* gcc.target/s390/vector/vgm-qi-1.c: New test.
* gcc.target/s390/vector/vgm-sf-1.c: New test.
* gcc.target/s390/vector/vgm-si-1.c: N

[PATCH] s390: Fix output template for movv1qi

2024-07-02 Thread Stefan Schulze Frielinghaus

Although for instructions MVI and MVIY it does not make a difference
whether the immediate is interpreted as signed or unsigned, GAS expects
unsigned immediates for instruction format SI_URD.

gcc/ChangeLog:

* config/s390/vector.md (mov<mode>): Fix output template for
movv1qi.
---
 Bootstrapped and regtested on s390.  Ok for {mainline,11,12,13,14}?

 gcc/config/s390/vector.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 40de0c75a7c..26fd505f2cd 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -368,8 +368,8 @@
lr\t%0,%1
mvi\t%0,0
mviy\t%0,0
-   mvi\t%0,-1
-   mviy\t%0,-1
+   mvi\t%0,255
+   mviy\t%0,255
lhi\t%0,0
lhi\t%0,-1
llc\t%0,%1
-- 
2.45.2

[PATCH 0/3] Prepare and drop vcond expanders

2024-07-01 Thread Stefan Schulze Frielinghaus

This drops vcond expanders.  The first patch
"s390: Emulate vec_cmp{eq,gt,gtu} for 128-bit integers" is somewhat
independent of the other two, since we already run into ICEs.  However,
after removing the vcond expanders the testsuite shows one additional
fallout without this patch, which is why I would like to make sure that
this patch lands first and have included it in this series.
Stefan Schulze Frielinghaus (3):
  s390: Emulate vec_cmp{eq,gt,gtu} for 128-bit integers
  s390: Enable vcond_mask for 128-bit ops
  s390: Drop vcond{,u} expanders

 gcc/config/s390/vector.md | 156 --
 .../gcc.target/s390/vector/vec-cmp-emu-1.c|  35 
 .../gcc.target/s390/vector/vec-cmp-emu-2.c|  18 ++
 .../gcc.target/s390/vector/vec-cmp-emu-3.c|  17 ++
 4 files changed, 175 insertions(+), 51 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-2.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-3.c

-- 
2.45.2

[PATCH 3/3] s390: Drop vcond{,u} expanders

2024-07-01 Thread Stefan Schulze Frielinghaus

Optabs vcond{,u} will be removed for GCC 15.  Since regtest shows no
fallout, dropping the expanders, now.

gcc/ChangeLog:

PR target/114189
* config/s390/vector.md (V_HW2): Remove.
(vcond): Remove.
(vcondu): Remove.
---
 Bootstrapped and regtested on s390.  Ok for mainline?

 gcc/config/s390/vector.md | 35 ---
 1 file changed, 35 deletions(-)

diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 0e57dd1650c..1caf732d1f9 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -27,14 +27,9 @@
V2SF V4SF V1DF V2DF V1TF V1TI TI])
 
 ; All modes directly supported by the hardware having full vector reg size
-; V_HW2 is for having two iterators expanding independently e.g. vcond.
-; It's similar to V_HW, but not fully identical: V1TI is not included, because
-; there are no 128-bit compares.
 (define_mode_iterator V_HW  [V16QI V8HI V4SI V2DI V1TI TI V2DF
 (V4SF "TARGET_VXE") (V1TF "TARGET_VXE")
 (TF "TARGET_VXE")])
-(define_mode_iterator V_HW2 [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE")
-(V1TF "TARGET_VXE") (TF "TARGET_VXE")])
 
 (define_mode_iterator VT_HW_HSDT [V8HI V4SI V4SF V2DI V2DF V1TI V1TF TI TF])
 (define_mode_iterator V_HW_HSD [V8HI V4SI (V4SF "TARGET_VXE") V2DI V2DF])
@@ -725,36 +720,6 @@
 }
 })
 
-(define_expand "vcond"
-  [(set (match_operand:V_HW 0 "register_operand" "")
-   (if_then_else:V_HW
-(match_operator 3 "vcond_comparison_operator"
-[(match_operand:V_HW2 4 "register_operand" "")
- (match_operand:V_HW2 5 "nonmemory_operand" "")])
-(match_operand:V_HW 1 "nonmemory_operand" "")
-(match_operand:V_HW 2 "nonmemory_operand" "")))]
-  "TARGET_VX && GET_MODE_NUNITS (mode) == GET_MODE_NUNITS 
(mode)"
-{
-  s390_expand_vcond (operands[0], operands[1], operands[2],
-GET_CODE (operands[3]), operands[4], operands[5]);
-  DONE;
-})
-
-(define_expand "vcondu"
-  [(set (match_operand:V_HW 0 "register_operand" "")
-   (if_then_else:V_HW
-(match_operator 3 "comparison_operator"
-[(match_operand:V_HW2 4 "register_operand" "")
- (match_operand:V_HW2 5 "nonmemory_operand" "")])
-(match_operand:V_HW 1 "nonmemory_operand" "")
-(match_operand:V_HW 2 "nonmemory_operand" "")))]
-  "TARGET_VX && GET_MODE_NUNITS (mode) == GET_MODE_NUNITS 
(mode)"
-{
-  s390_expand_vcond (operands[0], operands[1], operands[2],
-GET_CODE (operands[3]), operands[4], operands[5]);
-  DONE;
-})
-
 (define_expand "vcond_mask_"
   [(set (match_operand:VT 0 "register_operand" "")
(if_then_else:VT
-- 
2.45.2

[PATCH 1/3] s390: Emulate vec_cmp{eq,gt,gtu} for 128-bit integers

2024-07-01 Thread Stefan Schulze Frielinghaus

Mode iterator V_HW enables V1TI for target VXE which means
vec_cmpv1tiv1ti becomes available which leads to an ICE since there is
no corresponding insn.

Fixed by emulating comparisons and enabling mode V1TI unconditionally
for V_HW.  For the sake of symmetry, I also added TI mode to V_HW since
TF mode is already included.  As a consequence the consumers of V_HW
vec_{splat,slb,sld,sldw,sldb,srdb,srab,srb,test_mask_int,test_mask}
also become available for 128-bit integers.

This fixes gcc.c-torture/execute/pr105613.c and gcc.dg/pr106063.c.

gcc/ChangeLog:

* config/s390/vector.md (V_HW): Enable V1TI unconditionally and
add TI.
(vec_cmpu): Add 128-bit integer
variants.
(*vec_cmpeq_nocc_emu): Emulate operation.
(*vec_cmpgt_nocc_emu): Emulate operation.
(*vec_cmpgtu_nocc_emu): Emulate operation.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vec-cmp-emu-1.c: New test.
* gcc.target/s390/vector/vec-cmp-emu-2.c: New test.
* gcc.target/s390/vector/vec-cmp-emu-3.c: New test.
---
 Bootstrapped and regtested on s390.  Ok for mainline and GCC 14?

 gcc/config/s390/vector.md | 113 --
 .../gcc.target/s390/vector/vec-cmp-emu-1.c|  35 ++
 .../gcc.target/s390/vector/vec-cmp-emu-2.c|  18 +++
 .../gcc.target/s390/vector/vec-cmp-emu-3.c|  17 +++
 4 files changed, 171 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-2.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-3.c

diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 40de0c75a7c..032ec44542c 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -30,7 +30,7 @@
 ; V_HW2 is for having two iterators expanding independently e.g. vcond.
 ; It's similar to V_HW, but not fully identical: V1TI is not included, because
 ; there are no 128-bit compares.
-(define_mode_iterator V_HW  [V16QI V8HI V4SI V2DI (V1TI "TARGET_VXE") V2DF
+(define_mode_iterator V_HW  [V16QI V8HI V4SI V2DI V1TI TI V2DF
 (V4SF "TARGET_VXE") (V1TF "TARGET_VXE")
 (TF "TARGET_VXE")])
 (define_mode_iterator V_HW2 [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE")
@@ -50,6 +50,7 @@
 (define_mode_iterator VI_HW_HSDT [V8HI V4SI V2DI V1TI TI])
 (define_mode_iterator VI_HW_HS  [V8HI  V4SI])
 (define_mode_iterator VI_HW_QH  [V16QI V8HI])
+(define_mode_iterator VI_HW_T   [V1TI TI])
 
 ; Directly supported vector modes with a certain number of elements
 (define_mode_iterator V_HW_2   [V2DI V2DF])
@@ -151,7 +152,7 @@
(V1HI "V1HI") (V2HI "V2HI") (V4HI "V4HI") (V8HI 
"V8HI")
(V1SI "V1SI") (V2SI "V2SI") (V4SI "V4SI")
(V1DI "V1DI") (V2DI "V2DI")
-   (V1TI "V1TI")
+   (V1TI "V1TI") (TI "V1TI")
(V1SF "V1SI") (V2SF "V2SI") (V4SF "V4SI")
(V1DF "V1DI") (V2DF "V2DI")
(V1TF "V1TI") (TF "V1TI")])
@@ -160,7 +161,7 @@
(V1HI "v1hi") (V2HI "v2hi") (V4HI "v4hi") (V8HI 
"v8hi")
(V1SI "v1si") (V2SI "v2si") (V4SI "v4si")
(V1DI "v1di") (V2DI "v2di")
-   (V1TI "v1ti")
+   (V1TI "v1ti") (TI "v1ti")
(V1SF "v1si") (V2SF "v2si") (V4SF "v4si")
(V1DF "v1di") (V2DF "v2di")
(V1TF "v1ti") (TF   "v1ti")])
@@ -1956,11 +1957,11 @@
   DONE;
 })
 
-(define_expand "vec_cmpu"
-  [(set (match_operand:VI_HW0 "register_operand" "")
-   (match_operator:VI_HW   1 ""
- [(match_operand:VI_HW 2 "register_operand" "")
-  (match_operand:VI_HW 3 "register_operand" "")]))]
+(define_expand "vec_cmpu"
+  [(set (match_operand:VIT_HW0 "register_operand" "")
+   (match_operator:VIT_HW   1 ""
+ [(match_operand:VIT_HW 2 "register_operand" "")
+  (match_operand:VIT_HW 3 "register_operand" "")]))]
   "TARGET_VX"
 {
   s390_expand_vec_compare (operands[0], GET_CODE(operands[1]), operands[2], 
operands[3]);
@@ -1975,6 +1976,94 @@
   "vc\t%v2,%v0,%v1"
   [(set_attr "op_type" "VRR")])
 
+(define_insn_and_split "*vec_cmpeq_nocc_emu"
+  [(set (match_operand:VI_HW_T 0 "register_operand" "=v")
+   (eq:VI_HW_T (match_operand:VI_HW_T 1 "register_operand"  "v")
+   (match_operand:VI_HW_T 2 "register_operand"  "v")))]
+  "TARGET_VX"
+  "#"
+  "&& can_create_pseudo_p ()"
+  [(set (match_dup 3)
+   (eq:V2DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 4)
+   (vec_select:V2DI (match_dup 3) (parallel [(const_int 1) (const_int 
0)])))
+   (set (match_dup 3)
+   (and:V2DI (match_dup 3) (match_dup 4)))
+   (set (m

[PATCH 2/3] s390: Enable vcond_mask for 128-bit ops

2024-07-01 Thread Stefan Schulze Frielinghaus

In preparation of dropping vcond{,u,eq} optabs
https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654690.html
enable 128-bit operands for vcond_mask---including integer as well as
floating point.

This partially fixes PR115519 w.r.t. autovec-long-double-signaling-*.c
tests.

gcc/ChangeLog:

* config/s390/vector.md: Enable vcond_mask for 128-bit ops.
---
 Bootstrapped and regtested on s390.  Ok for mainline?

 gcc/config/s390/vector.md | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 032ec44542c..0e57dd1650c 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -756,12 +756,12 @@
 })
 
 (define_expand "vcond_mask_"
-  [(set (match_operand:V 0 "register_operand" "")
-   (if_then_else:V
+  [(set (match_operand:VT 0 "register_operand" "")
+   (if_then_else:VT
 (eq (match_operand: 3 "register_operand" "")
 (match_dup 4))
-(match_operand:V 2 "register_operand" "")
-(match_operand:V 1 "register_operand" "")))]
+(match_operand:VT 2 "register_operand" "")
+(match_operand:VT 1 "register_operand" "")))]
   "TARGET_VX"
   "operands[4] = CONST0_RTX (mode);")
 
-- 
2.45.2

Re: [PATCH] Hard register asm constraint

2024-06-28 Thread Stefan Schulze Frielinghaus

On Fri, Jun 28, 2024 at 11:46:08AM +0200, Georg-Johann Lay wrote:
> Am 27.06.24 um 10:51 schrieb Stefan Schulze Frielinghaus:
> > On Thu, Jun 27, 2024 at 09:45:32AM +0200, Georg-Johann Lay wrote:
> > > Am 24.05.24 um 11:13 Am 25.06.24 um 16:03 schrieb Paul Koning:
> > > > > On Jun 24, 2024, at 1:50 AM, Stefan Schulze Frielinghaus 
> > > > >  wrote:
> > > > > On Mon, Jun 10, 2024 at 07:19:19AM +0200, Stefan Schulze Frielinghaus 
> > > > > wrote:
> > > > > > On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze 
> > > > > > Frielinghaus wrote:
> > > > > > > This implements hard register constraints for inline asm.  A hard 
> > > > > > > register
> > > > > > > constraint is of the form {regname} where regname is any valid 
> > > > > > > register.  This
> > > > > > > basically renders register asm superfluous.  For example, the 
> > > > > > > snippet
> > > > > > > 
> > > > > > > int test (int x, int y)
> > > > > > > {
> > > > > > >register int r4 asm ("r4") = x;
> > > > > > >register int r5 asm ("r5") = y;
> > > > > > >unsigned int copy = y;
> > > > > > >asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy));
> > > > > > >return r4;
> > > > > > > }
> > > > > > > 
> > > > > > > could be rewritten into
> > > > > > > 
> > > > > > > int test (int x, int y)
> > > > > > > {
> > > > > > >asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
> > > > > > >return x;
> > > > > > > }
> > > > 
> > > > I like this idea but I'm wondering: regular constraints specify what 
> > > > sort of value is needed, for example an int vs. a short int vs. a 
> > > > float.  The notation you've shown doesn't seem to have that aspect.
> > > > 
> > > > The other comment is that I didn't see documentation updates to reflect 
> > > > this new feature.
> > > > 
> > > > paul
> > > > 
> > >   Stefan Schulze Frielinghaus:
> > > > This implements hard register constraints for inline asm.  A hard 
> > > > register
> > > > constraint is of the form {regname} where regname is any valid 
> > > > register.  This
> > > > basically renders register asm superfluous.  For example, the snippet
> > > > 
> > > > int test (int x, int y)
> > > > {
> > > > register int r4 asm ("r4") = x;
> > > > register int r5 asm ("r5") = y;
> > > > unsigned int copy = y;
> > > > asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy));
> > > > return r4;
> > > > }
> > > > 
> > > > could be rewritten into
> > > > 
> > > > int test (int x, int y)
> > > > {
> > > > asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
> > > > return x;
> > > > }
> > > 
> > > Hi, can this also be used in machine descriptions?
> > > 
> > > It would make some insn handling much simpler, for example in
> > > the avr backend.
> > > 
> > > That backend has insns that represent assembly sequences in libgcc
> > > which have a smaller register footprint than plain calls.  However
> > > this requires that such insns have explicit description of which regs
> > > go in and out.
> > > 
> > > The current solution uses hard regs, which works, but a proper
> > > implementation would use register constraints.  I tried that a while
> > > ago, and register constraints lead to a code bloat even in places that
> > > don't use these constraints due to the zillions of new register classes
> > > like R22_1, R22_2, R22_4, R20_1, R20_2, R20_4 etc. that were required.
> > > 
> > > Your approach would allow to use hard register constraints in insns,
> > > and so far the only problem is to determine how much hard regs are
> > > used by the constraint.  The gen tools that generates

Re: [PATCH] Hard register asm constraint

2024-06-27 Thread Stefan Schulze Frielinghaus

On Thu, Jun 27, 2024 at 09:45:32AM +0200, Georg-Johann Lay wrote:
> 
> 
> Am 24.05.24 um 11:13 Am 25.06.24 um 16:03 schrieb Paul Koning:
> > 
> > 
> > > On Jun 24, 2024, at 1:50 AM, Stefan Schulze Frielinghaus 
> > >  wrote:
> > > 
> > > Ping.
> > > 
> > > On Mon, Jun 10, 2024 at 07:19:19AM +0200, Stefan Schulze Frielinghaus 
> > > wrote:
> > > > Ping.
> > > > 
> > > > On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze Frielinghaus 
> > > > wrote:
> > > > > This implements hard register constraints for inline asm.  A hard 
> > > > > register
> > > > > constraint is of the form {regname} where regname is any valid 
> > > > > register.  This
> > > > > basically renders register asm superfluous.  For example, the snippet
> > > > > 
> > > > > int test (int x, int y)
> > > > > {
> > > > >   register int r4 asm ("r4") = x;
> > > > >   register int r5 asm ("r5") = y;
> > > > >   unsigned int copy = y;
> > > > >   asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy));
> > > > >   return r4;
> > > > > }
> > > > > 
> > > > > could be rewritten into
> > > > > 
> > > > > int test (int x, int y)
> > > > > {
> > > > >   asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
> > > > >   return x;
> > > > > }
> > 
> > I like this idea but I'm wondering: regular constraints specify what sort 
> > of value is needed, for example an int vs. a short int vs. a float.  The 
> > notation you've shown doesn't seem to have that aspect.
> > 
> > The other comment is that I didn't see documentation updates to reflect 
> > this new feature.
> > 
> > paul
> > 
>  Stefan Schulze Frielinghaus:
> > This implements hard register constraints for inline asm.  A hard register
> > constraint is of the form {regname} where regname is any valid register.  
> > This
> > basically renders register asm superfluous.  For example, the snippet
> > 
> > int test (int x, int y)
> > {
> >register int r4 asm ("r4") = x;
> >register int r5 asm ("r5") = y;
> >unsigned int copy = y;
> >asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy));
> >return r4;
> > }
> > 
> > could be rewritten into
> > 
> > int test (int x, int y)
> > {
> >asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
> >return x;
> > }
> 
> Hi, can this also be used in machine descriptions?
> 
> It would make some insn handling much simpler, for example in
> the avr backend.
> 
> That backend has insns that represent assembly sequences in libgcc
> which have a smaller register footprint than plain calls.  However
> this requires that such insns have explicit description of which regs
> go in and out.
> 
> The current solution uses hard regs, which works, but a proper
> implementation would use register constraints.  I tried that a while
> ago, and register constraints lead to a code bloat even in places that
> don't use these constraints due to the zillions of new register classes
> like R22_1, R22_2, R22_4, R20_1, R20_2, R20_4 etc. that were required.
> 
> Your approach would allow to use hard register constraints in insns,
> and so far the only problem is to determine how much hard regs are
> used by the constraint.  The gen tools that generates cc code from md
> would use the operand's machine mode to infer the number of hard regs.

I have this on my todo list but ignored it for the very first draft.  At
the moment this already fails because genoutput cannot parse the
constraint format.

In my "alpha draft" I implemented this feature by emitting moves to hard
registers during expand.  This had the limitation that I couldn't
support multiple alternatives in combination with hard-register
constraints.  I'm still not sure whether this is a feature we really
want or whether it should be rather denied.  Anyhow, with this kind of
implementation I doubt that this would be feasible for machine
descriptions.  I moved on with my current draft where the constraint
manifests during register allocation.  This also allows multiple
alternatives.  I think one of the (major?) advantages of doing it this
way is that operands are kept in pseudos which means they are
automagically saved/restored over function boundaries and what not.  Or
in other words, the register constraint manifests at the asm boundary
which is probably what users expect and should be less error prone
(again just thinking of implicit code which gets injected as e.g. by
sanitizers introducing calls etc.).

So long story short, I would like to look into this but currently it
doesn't work.  I'm also not sure to which extent this could be used.
However, once I have some more time I will have a look at the avr
backend for examples.

Cheers,
Stefan

Re: [PATCH] Hard register asm constraint

2024-06-26 Thread Stefan Schulze Frielinghaus

On Wed, Jun 26, 2024 at 11:10:38AM -0400, Paul Koning wrote:
> 
> 
> > On Jun 26, 2024, at 8:54 AM, Stefan Schulze Frielinghaus 
> >  wrote:
> > 
> > On Tue, Jun 25, 2024 at 01:02:39PM -0400, Paul Koning wrote:
> >> 
> >> 
> >>> On Jun 25, 2024, at 12:04 PM, Stefan Schulze Frielinghaus 
> >>>  wrote:
> >>> 
> >>> On Tue, Jun 25, 2024 at 10:03:34AM -0400, Paul Koning wrote:
> >>>> 
> >>>>>>> ...
> >>>>>>> could be rewritten into
> >>>>>>> 
> >>>>>>> int test (int x, int y)
> >>>>>>> {
> >>>>>>> asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
> >>>>>>> return x;
> >>>>>>> }
> >>>> 
> >>>> I like this idea but I'm wondering: regular constraints specify what 
> >>>> sort of value is needed, for example an int vs. a short int vs. a float. 
> >>>>  The notation you've shown doesn't seem to have that aspect.
> >>> 
> >>> As Maciej already pointed out the type of the expression should suffice.
> >>> My assumption was that an asm can deal with a value as is or its
> >>> promoted value.  At least for integer values this should be fine and
> >>> AFAICS is also the case for simple constraints like "r" which do not
> >>> define any mode.  I've probably overlooked something but which constraint
> >>> differentiates between int vs short?  However, you have a good point
> >>> with this and I should test this more.
> >> 
> >> I thought there was but I may be confused.  On the other hand, there 
> >> definitely are (machine dependent) constraints that distinguish, say, 
> >> float from integer registers; pdp11 is an example.  If you were to use an 
> >> "a" constraint, that means a floating point register and the compiler will 
> >> detect attempts to pass non-float operands ("Inconsistent operand 
> >> constraints...").
> >> 
> >> I see that the existing "register int ..." syntax appears to check that 
> >> the register is the right type for the data type given for it, so for 
> >> example on pdp11, 
> >> 
> >>register int ac1 asm ("ac1") = i;
> >> 
> >> fails ("register ... isn't suitable for data type").  I assume your new 
> >> syntax would perform the same check and produce roughly the same error 
> >> message.  You might verify that.  On pdp11, trying to use, for example, 
> >> "r0" for a float, or "ac0" for an int, would produce that error.
> > 
> > Right, so far I don't error out here which I will change.  It basically
> > results in bit casting floats to ints currently.
> 
> That would be bad.  For one thing, a PDP11 float doesn't fit in an integer 
> register.
> 
> That also brings up another point (which applies to more mainstream targets 
> as well): for data types that require multiple registers, say a register pair 
> for a double length value, how is that handled?  One possible answer is to 
> reject that.  Another would be to load a register pair.
> 
> This case applies to a "long int" on pdp11, or 32 bit MIPS, and probably a 
> bunch of others.

Absolutely, also on mainstream targets you could think of 128-bit integers
or long doubles which typically don't fit in (single) GPRs.  I should
definitely add error handling for this.  Similarly, I don't error out for
non-primitive data types.

I will give register pairs a try.

Thanks for all your comments so far :)

Cheers,
Stefan

Re: [PATCH] s390: Check for ADDR_REGS in s390_decompose_addrstyle_without_index

2024-06-26 Thread Stefan Schulze Frielinghaus

On Wed, Jun 26, 2024 at 02:15:18PM +0200, Stefan Schulze Frielinghaus wrote:
> An explicit check for address registers was not required so far since
> during register allocation the processing of address constraints was
> sufficient.  However, address constraints themselves do not check for
> REGNO_OK_FOR_{BASE,INDEX}_P.  Thus, with the newly introduced
> late-combine pass in r15-1579-g792f97b44ffc5e we generate new insns with
> invalid address registers which aren't fixed up afterwards.
> 
> Fixed by explicitly checking for address registers in
> s390_decompose_addrstyle_without_index such that those new insns are
> rejected.
> 
> gcc/ChangeLog:
> 
>   target/PR115634
>   * config/s390/s390.cc (s390_decompose_addrstyle_without_index):
>   Check for ADDR_REGS in s390_decompose_addrstyle_without_index.
> ---
>  This restores bootstrap on s390.  I ran the testsuite against mainline
>  and of course there is some fallout which is most likely coming from
>  the new pass or other changes.  I have another job running comparing
>  pre r15-1579-g792f97b44ffc5e with and without this patch.  Assuming
>  this goes well, ok for mainline?

Bootstrap and regtest of this test also went fine.

> 
>  gcc/config/s390/s390.cc | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
> index c65421de831..05a0fde7fb0 100644
> --- a/gcc/config/s390/s390.cc
> +++ b/gcc/config/s390/s390.cc
> @@ -3347,7 +3347,9 @@ s390_decompose_addrstyle_without_index (rtx op, rtx 
> *base,
>while (op && GET_CODE (op) == SUBREG)
>  op = SUBREG_REG (op);
>  
> -  if (op && GET_CODE (op) != REG)
> +  if (op && (!REG_P (op)
> +  || (reload_completed
> +  && !REGNO_OK_FOR_BASE_P (REGNO (op)
>  return false;
>  
>if (offset)
> -- 
> 2.45.1
>

Re: [PATCH] Hard register asm constraint

2024-06-26 Thread Stefan Schulze Frielinghaus

On Tue, Jun 25, 2024 at 01:02:39PM -0400, Paul Koning wrote:
> 
> 
> > On Jun 25, 2024, at 12:04 PM, Stefan Schulze Frielinghaus 
> >  wrote:
> > 
> > On Tue, Jun 25, 2024 at 10:03:34AM -0400, Paul Koning wrote:
> >> 
> >>>>> ...
> >>>>> could be rewritten into
> >>>>> 
> >>>>> int test (int x, int y)
> >>>>> {
> >>>>> asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
> >>>>> return x;
> >>>>> }
> >> 
> >> I like this idea but I'm wondering: regular constraints specify what sort 
> >> of value is needed, for example an int vs. a short int vs. a float.  The 
> >> notation you've shown doesn't seem to have that aspect.
> > 
> > As Maciej already pointed out the type of the expression should suffice.
> > My assumption was that an asm can deal with a value as is or its
> > promoted value.  At least for integer values this should be fine and
> > AFAICS is also the case for simple constraints like "r" which do not
> > define any mode.  I've probably overlooked something but which constraint
> > differentiates between int vs short?  However, you have a good point
> > with this and I should test this more.
> 
> I thought there was but I may be confused.  On the other hand, there 
> definitely are (machine dependent) constraints that distinguish, say, float 
> from integer registers; pdp11 is an example.  If you were to use an "a" 
> constraint, that means a floating point register and the compiler will detect 
> attempts to pass non-float operands ("Inconsistent operand constraints...").
> 
> I see that the existing "register int ..." syntax appears to check that the 
> register is the right type for the data type given for it, so for example on 
> pdp11, 
> 
>   register int ac1 asm ("ac1") = i;
> 
> fails ("register ... isn't suitable for data type").  I assume your new 
> syntax would perform the same check and produce roughly the same error 
> message.  You might verify that.  On pdp11, trying to use, for example, "r0" 
> for a float, or "ac0" for an int, would produce that error.

Right, so far I don't error out here which I will change.  It basically
results in bit casting floats to ints currently.

Just one thing to note: this is not a novel feature but pretty similar
to Rust's explicit register operands:
https://doc.rust-lang.org/rust-by-example/unsafe/asm.html#explicit-register-operands

Cheers,
Stefan

[PATCH] s390: Check for ADDR_REGS in s390_decompose_addrstyle_without_index

2024-06-26 Thread Stefan Schulze Frielinghaus

An explicit check for address registers was not required so far since
during register allocation the processing of address constraints was
sufficient.  However, address constraints themselves do not check for
REGNO_OK_FOR_{BASE,INDEX}_P.  Thus, with the newly introduced
late-combine pass in r15-1579-g792f97b44ffc5e we generate new insns with
invalid address registers which aren't fixed up afterwards.

Fixed by explicitly checking for address registers in
s390_decompose_addrstyle_without_index such that those new insns are
rejected.

gcc/ChangeLog:

PR target/115634
* config/s390/s390.cc (s390_decompose_addrstyle_without_index):
Check for ADDR_REGS in s390_decompose_addrstyle_without_index.
---
 This restores bootstrap on s390.  I ran the testsuite against mainline
 and of course there is some fallout which is most likely coming from
 the new pass or other changes.  I have another job running comparing
 pre r15-1579-g792f97b44ffc5e with and without this patch.  Assuming
 this goes well, ok for mainline?

 gcc/config/s390/s390.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index c65421de831..05a0fde7fb0 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -3347,7 +3347,9 @@ s390_decompose_addrstyle_without_index (rtx op, rtx *base,
   while (op && GET_CODE (op) == SUBREG)
 op = SUBREG_REG (op);
 
-  if (op && GET_CODE (op) != REG)
+  if (op && (!REG_P (op)
+|| (reload_completed
+&& !REGNO_OK_FOR_BASE_P (REGNO (op)
 return false;
 
   if (offset)
-- 
2.45.1

Re: [PATCH] Hard register asm constraint

2024-06-25 Thread Stefan Schulze Frielinghaus

On Tue, Jun 25, 2024 at 10:03:34AM -0400, Paul Koning wrote:
> 
> 
> > On Jun 24, 2024, at 1:50 AM, Stefan Schulze Frielinghaus 
> >  wrote:
> > 
> > Ping.
> > 
> > On Mon, Jun 10, 2024 at 07:19:19AM +0200, Stefan Schulze Frielinghaus wrote:
> >> Ping.
> >> 
> >> On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze Frielinghaus 
> >> wrote:
> >>> This implements hard register constraints for inline asm.  A hard register
> >>> constraint is of the form {regname} where regname is any valid register.  
> >>> This
> >>> basically renders register asm superfluous.  For example, the snippet
> >>> 
> >>> int test (int x, int y)
> >>> {
> >>>  register int r4 asm ("r4") = x;
> >>>  register int r5 asm ("r5") = y;
> >>>  unsigned int copy = y;
> >>>  asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy));
> >>>  return r4;
> >>> }
> >>> 
> >>> could be rewritten into
> >>> 
> >>> int test (int x, int y)
> >>> {
> >>>  asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
> >>>  return x;
> >>> }
> 
> I like this idea but I'm wondering: regular constraints specify what sort of 
> value is needed, for example an int vs. a short int vs. a float.  The 
> notation you've shown doesn't seem to have that aspect.

As Maciej already pointed out the type of the expression should suffice.
My assumption was that an asm can deal with a value as is or its
promoted value.  At least for integer values this should be fine and
AFAICS is also the case for simple constraints like "r" which do not
define any mode.  I've probably overlooked something but which constraint
differentiates between int vs short?  However, you have a good point
with this and I should test this more.

> The other comment is that I didn't see documentation updates to reflect this 
> new feature.

I didn't come up with documentation yet since I was not sure whether
such a proposal would be accepted at all, i.e., just wanted to hear
whether you see some show stoppers or not.  Assuming this goes well I
guess it should be documented under simple constraints
https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html

Thanks,
Stefan

Re: [PATCH] Hard register asm constraint

2024-06-24 Thread Stefan Schulze Frielinghaus

Ping.

On Mon, Jun 10, 2024 at 07:19:19AM +0200, Stefan Schulze Frielinghaus wrote:
> Ping.
> 
> On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze Frielinghaus wrote:
> > This implements hard register constraints for inline asm.  A hard register
> > constraint is of the form {regname} where regname is any valid register.  
> > This
> > basically renders register asm superfluous.  For example, the snippet
> > 
> > int test (int x, int y)
> > {
> >   register int r4 asm ("r4") = x;
> >   register int r5 asm ("r5") = y;
> >   unsigned int copy = y;
> >   asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy));
> >   return r4;
> > }
> > 
> > could be rewritten into
> > 
> > int test (int x, int y)
> > {
> >   asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
> >   return x;
> > }
> > 
> > As a side-effect this also solves the problem of call-clobbered registers.
> > That being said, I was wondering whether we could utilize this feature in 
> > order
> > to get rid of local register asm automatically?  For example, converting
> > 
> > // Result will be in r2 on s390
> > extern int bar (void);
> > 
> > void test (void)
> > {
> >   register int x asm ("r2") = 42;
> >   bar ();
> >   asm ("foo %0\n" :: "r" (x));
> > }
> > 
> > into
> > 
> > void test (void)
> > {
> >   int x = 42;
> >   bar ();
> >   asm ("foo %0\n" :: "{r2}" (x));
> > }
> > 
> > in order to get rid of the limitation of call-clobbered registers which may
> > lead to subtle bugs---especially if you think of non-obvious calls e.g.
> > introduced by sanitizer/tracer/whatever.  Since such a transformation has 
> > the
> > potential to break existing code do you see any edge cases where this might 
> > be
> > problematic or even show stoppers?  Currently, even
> > 
> > int test (void)
> > {
> >   register int x asm ("r2") = 42;
> >   register int y asm ("r2") = 24;
> >   asm ("foo %0,%1\n" :: "r" (x), "r" (y));
> > }
> > 
> > is allowed which seems error prone to me.  Thus, if 100% backwards
> > compatibility would be required, then automatically converting every 
> > register
> > asm to the new mechanism isn't viable.  Still quite a lot could be 
> > transformed.
> > Any thoughts?
> > 
> > Currently I allow multiple alternatives as demonstrated by
> > gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c.  However, since a hard 
> > register
> > constraint is pretty specific I could also think of erroring out in case of
> > alternatives.  Are there any real use cases out there for multiple
> > alternatives where one would like to use hard register constraints?
> > 
> > With the current implementation we have a "user visible change" in the sense
> > that for
> > 
> > void test (void)
> > {
> >   register int x asm ("r2") = 42;
> >   register int y asm ("r2") = 24;
> >   asm ("foo %0,%1\n" : "=r" (x), "=r" (y));
> > }
> > 
> > we do not get the error
> > 
> >   "invalid hard register usage between output operands"
> > 
> > anymore but rather
> > 
> >   "multiple outputs to hard register: %r2"
> > 
> > This is due to the error handling in gimplify_asm_expr ().  Speaking of 
> > errors,
> > I also error out earlier than before which means that e.g. in pr87600-2.c only
> > the first error is reported and processing is stopped afterwards which means
> > the subsequent tests fail.
> > 
> > I've been skimming through all targets and it looks to me as if none is 
> > using
> > curly brackets for their constraints.  Of course, I may have missed 
> > something.
> > 
> > Cheers,
> > Stefan
> > 
> > PS: Current state for Clang: https://reviews.llvm.org/D105142
> > 
> > ---
> >  gcc/cfgexpand.cc  |  42 ---
> >  gcc/genpreds.cc   |   4 +-
> >  gcc/gimplify.cc   | 115 +-
> >  gcc/lra-constraints.cc|  17 +++
> >  gcc/recog.cc  |  14 ++-
> >  gcc/stmt.cc   | 102 +++-
&

Re: [PATCH] s390: define single step vector casts

2024-06-20 Thread Stefan Schulze Frielinghaus

On Thu, Jun 20, 2024 at 09:06:11AM +0200, Juergen Christ wrote:
> Some casts were missing leading to missed or bad vectorizations where
> casting was done scalar followed by a vector creation from the
> individual elements.
> 
> gcc/ChangeLog:
> 
>   * config/s390/vector.md (VEC_HALF_NARROWED): New mode iterator.
>   (vec_half_narrowed): ditto.
>   (trunc2): New pattern.
>   (vec_pack_ufix_trunc_v2df): ditto.
>   (vec_pack_sfix_trunc_v2df): ditto.
>   (vec_unpack_sfix_trunc_lo_v4sf): ditto.
>   (vec_unpack_sfix_trunc_hi_v4sf): ditto.
>   (vec_unpack_ufix_trunc_lo_v4sf): ditto.
>   (vec_unpack_ufix_trunc_hi_v4sf): ditto.
>   (floatv2siv2sf2): ditto.
>   (floatunsv2siv2sf2): ditto.
>   (vec_unpacks_float_hi_v4si): ditto.
>   (vec_unpacks_float_lo_v4si): ditto.
>   (vec_unpacku_float_hi_v4si): ditto.
>   (vec_unpacku_float_lo_v4si): ditto.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/s390/vector/vec-cast-single.c: New test.
>   * gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c: New test.
> 
> Bootstrapped and regtested on s390x.  Ok for trunk?
> 
> Signed-off-by: Juergen Christ 
> ---
>  gcc/config/s390/vector.md | 170 ++-
>  .../gcc.target/s390/vector/vec-cast-single.c  | 271 ++
>  .../s390/vector/vec_pack_ufix_trunc_v2df.c|  30 ++
>  3 files changed, 463 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
>  create mode 100644 
> gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
> 
> diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
> index 40de0c75a7cf..356f25d26deb 100644
> --- a/gcc/config/s390/vector.md
> +++ b/gcc/config/s390/vector.md
> @@ -89,6 +89,8 @@
>  
>  (define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI])
>  
> +(define_mode_iterator VI_TRUNC [V2HI V2SI V2DI V4HI V4SI])
> +
>  ; Empty string for all but TImode.  This is used to hide the TImode
>  ; expander name in case it is defined already.  See addti3 for an
>  ; example.
> @@ -211,6 +213,14 @@
>  (V1SF "v1df") (V2SF "v2df") (V4SF "v4df")
>  (V1DF "v1tf") (V2DF "v2tf")])
>  
> +; Vector with narrowed element size and the same number of elements.
> +(define_mode_attr VEC_HALF_NARROWED [(V1HI "V1QI") (V2HI "V2QI") (V4HI 
> "V4QI") (V8HI "V8QI")
> +   (V1SI "V1HI") (V2SI "V2HI") (V4SI "V4HI")
> +(V1DI "V1DI") (V2DI "V2SI")])
> +(define_mode_attr vec_half_narrowed [(V1HI "v1qi") (V2HI "v2qi") (V4HI 
> "v4qi") (V8HI "v8qi")
> +   (V1SI "v1hi") (V2SI "v2hi") (V4SI "v4hi")
> +(V1DI "v1di") (V2DI "v2si")])
> +
>  ; Vector with half the element size AND half the number of elements.
>  (define_mode_attr vec_halfhalf
>[(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
> @@ -2422,6 +2432,17 @@
>operands[2] = gen_reg_rtx (V4SFmode);
>  })
>  
> +;; vector truncate
> +
> +; downcasts
> +
> +(define_insn "trunc2"
> +  [(set (match_operand: 0 "register_operand" "=v")
> +(truncate: (match_operand:VI_TRUNC 1 
> "register_operand" "v")))]
> +  "TARGET_VX"
> +  "vpk\t %0,%1,%1"
  ^
whitespace

> +  [(set_attr "op_type" "VRR")])
> +
>  ;; vector unpack v16qi
>  
>  ; signed
> @@ -3177,17 +3198,150 @@
>emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2]));
>emit_insn (gen_vstlv16qi (operands[1], len, mem));
>DONE;
> -});;
> +})
> +
> +(define_expand "vec_pack_ufix_trunc_v2df"
> +  [(match_operand:V4SI 0 "register_operand")
> +   (match_operand:V2DF 1 "register_operand")
> +   (match_operand:V2DF 2 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r1 = gen_reg_rtx (V2DImode);
> +  rtx r2 = gen_reg_rtx (V2DImode);
> +
> +  emit_insn (gen_fixuns_truncv2dfv2di2 (r1, operands[1]));
> +  emit_insn (gen_fixuns_truncv2dfv2di2 (r2, operands[2]));
> +  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2));
> +  DONE;
> +})

I haven't really wrapped my head around this, however, this two step
conversion could miss an IEEE-inexact-exception if a double fits into a
64-bit integer but not in a 32-bit integer.  What does the IL/vectorizer
say about exceptions?  Ok to miss some or do we have to guard this by
no-trapping-math et al.?

> +
> +(define_expand "vec_pack_sfix_trunc_v2df"
> +  [(match_operand:V4SI 0 "register_operand")
> +   (match_operand:V2DF 1 "register_operand")
> +   (match_operand:V2DF 2 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r1 = gen_reg_rtx (V2DImode);
> +  rtx r2 = gen_reg_rtx (V2DImode);
> +
> +  emit_insn (gen_fix_truncv2dfv2di2 (r1, operands[1]));
> +  emit_insn (gen_fix_truncv2dfv2di2 (r2, operands[2]));
> +  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2));
> +  DONE;
> +})

same as above

> +
> +; v4sf -> v2di
> +(define_expand "vec_unpack_sfix_trunc_lo_v4sf"
> +  [(match_operand:V2DI 0 "register_operand")
> +   (match_

Re: [PATCH] middle-end/114189 - drop uses of vcond{,u,eq}_optab

2024-06-17 Thread Stefan Schulze Frielinghaus

On Mon, Jun 17, 2024 at 08:16:34AM +0200, Richard Biener wrote:
> On Mon, 17 Jun 2024, Kewen.Lin wrote:
> 
> > Hi Richi,
> > 
> > on 2024/6/14 18:31, Richard Biener wrote:
> > > The following retires vcond{,u,eq} optabs by stopping to use them
> > > from the middle-end.  Targets instead (should) implement vcond_mask
> > > and vec_cmp{,u,eq} optabs.  The PR this change refers to lists
> > > possibly affected targets - those implementing these patterns,
> > > and in particular it lists mips, sparc and ia64 as targets that
> > > most definitely will regress while others might simply remove
> > > their vcond{,u,eq} patterns.
> > > 
> > > I'd appreciate testing, I do not expect fallout for x86 or arm/aarch64.
> > > I know riscv doesn't implement any of the legacy optabs.  But less
> > > maintained vector targets might need adjustments.
> > 
> > Thanks for making this change, this patch can be bootstrapped on ppc64{,le}
> > but both have one failure on gcc/testsuite/gcc.target/powerpc/pr66144-3.c,
> > by looking into it, I found it just exposed one oversight in the current
> > rs6000 vcond_mask support (the condition mask location is wrong), so I think
> > this change is fine for rs6000 port, I'll also test SPEC2017 for this (with
> > rs6000 vcond_mask change) soon.
> 
> Btw, for those targets where the patch works out fine it would be nice
> to delete their vcond{,u,eq} expanders (and double-check that doesn't
> cause issues on its own).
> 
> Can target maintainers note whether their targets support all condition
> codes for their vector comparisons (including FP variants)?  And 
> whether they choose to implement all condition codes in vec_cmp
> and adjust with inversion / operand swapping for not supported cases?

On s390 we support all comparison operations with inverse / operand
swapping via s390_expand_vec_compare.  However, we still have some
failures for which I opened PR115519.  Currently it is unclear to me
what precisely is missing and will have a further look.  vcond_mask
expander is also implemented for all modes.

Cheers,
Stefan

> 
> Thanks,
> Richard.
> 
> > BR,
> > Kewen
> > 
> > > 
> > > I want to get rid of those optabs for GCC 15.  If I don't hear from
> > > you I will assume your target is fine.
> > > 
> > > Thanks,
> > > Richard.
> > > 
> > >   PR middle-end/114189
> > >   * optabs-query.h (get_vcond_icode): Always return CODE_FOR_nothing.
> > >   (get_vcond_eq_icode): Likewise.
> > > ---
> > >  gcc/optabs-query.h | 13 -
> > >  1 file changed, 4 insertions(+), 9 deletions(-)
> > > 
> > > diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
> > > index 0cb2c21ba85..31fbce80175 100644
> > > --- a/gcc/optabs-query.h
> > > +++ b/gcc/optabs-query.h
> > > @@ -112,14 +112,9 @@ get_vec_cmp_eq_icode (machine_mode vmode, 
> > > machine_mode mask_mode)
> > > mode CMODE, unsigned if UNS is true, resulting in a value of mode 
> > > VMODE.  */
> > >  
> > >  inline enum insn_code
> > > -get_vcond_icode (machine_mode vmode, machine_mode cmode, bool uns)
> > > +get_vcond_icode (machine_mode, machine_mode, bool)
> > >  {
> > > -  enum insn_code icode = CODE_FOR_nothing;
> > > -  if (uns)
> > > -icode = convert_optab_handler (vcondu_optab, vmode, cmode);
> > > -  else
> > > -icode = convert_optab_handler (vcond_optab, vmode, cmode);
> > > -  return icode;
> > > +  return CODE_FOR_nothing;
> > >  }
> > >  
> > >  /* Return insn code for a conditional operator with a mask mode
> > > @@ -135,9 +130,9 @@ get_vcond_mask_icode (machine_mode vmode, 
> > > machine_mode mmode)
> > > mode CMODE (only EQ/NE), resulting in a value of mode VMODE.  */
> > >  
> > >  inline enum insn_code
> > > -get_vcond_eq_icode (machine_mode vmode, machine_mode cmode)
> > > +get_vcond_eq_icode (machine_mode, machine_mode)
> > >  {
> > > -  return convert_optab_handler (vcondeq_optab, vmode, cmode);
> > > +  return CODE_FOR_nothing;
> > >  }
> > >  
> > >  /* Enumerates the possible extraction_insn operations.  */
> > 
> > 
> 
> -- 
> Richard Biener 
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Re: [PATCH] s390: testsuite: Fix ifcvt-one-insn-bool.c

2024-06-13 Thread Stefan Schulze Frielinghaus

Ping.

On Wed, Jun 05, 2024 at 08:00:15AM +0200, Stefan Schulze Frielinghaus wrote:
> With the change of r15-787-g57e04879389f9c I forgot to also update this
> test.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/s390/ifcvt-one-insn-bool.c: Fix loc.
> ---
>  Ok for mainline?  Ok for GCC 14 if the corresponding backport is also
>  approved?
> 
>  gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c 
> b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c
> index 0c8c2f879a6..4ae29dbd6b6 100644
> --- a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c
> +++ b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c
> @@ -3,7 +3,7 @@
>  /* { dg-do compile { target { s390*-*-* } } } */
>  /* { dg-options "-O2 -march=z13 -mzarch" } */
>  
> -/* { dg-final { scan-assembler "lochinh\t%r.?,1" } } */
> +/* { dg-final { scan-assembler "lochile\t%r.?,1" } } */
>  #include 
>  
>  int foo (int *a, unsigned int n)
> -- 
> 2.45.1
>

Re: [PATCH v2] s390: Implement TARGET_NOCE_CONVERSION_PROFITABLE_P [PR109549]

2024-06-13 Thread Stefan Schulze Frielinghaus

Ping.

On Sun, Jun 02, 2024 at 02:07:24PM +0200, Stefan Schulze Frielinghaus wrote:
> Since the patch works fine so far for mainline, ok to backport to GCC 14?
> 
> On Fri, May 17, 2024 at 08:59:05AM +0200, Stefan Schulze Frielinghaus wrote:
> > I've adapted the patch as follows and will push.
> > 
> > Thanks,
> > Stefan
> > 
> > --
> > 
> > Consider a NOCE conversion as profitable if there is at least one
> > conditional move.
> > 
> > gcc/ChangeLog:
> > 
> > * config/s390/s390.cc (TARGET_NOCE_CONVERSION_PROFITABLE_P):
> > Define.
> > (s390_noce_conversion_profitable_p): Implement.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > > * gcc.target/s390/ccor.c: Order of loads is reversed now; as a
> > > consequence the condition has to be reversed.
> > ---
> >  gcc/config/s390/s390.cc  | 32 
> >  gcc/testsuite/gcc.target/s390/ccor.c |  4 ++--
> >  2 files changed, 34 insertions(+), 2 deletions(-)
> > 
> > diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
> > index bf46eab2d63..7f8f1681c2a 100644
> > --- a/gcc/config/s390/s390.cc
> > +++ b/gcc/config/s390/s390.cc
> > @@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "tree-pass.h"
> >  #include "context.h"
> >  #include "builtins.h"
> > +#include "ifcvt.h"
> >  #include "rtl-iter.h"
> >  #include "intl.h"
> >  #include "tm-constrs.h"
> > @@ -18037,6 +18038,34 @@ s390_vectorize_vec_perm_const (machine_mode vmode, 
> > machine_mode op_mode,
> >return vectorize_vec_perm_const_1 (d);
> >  }
> >  
> > +/* Consider a NOCE conversion as profitable if there is at least one
> > +   conditional move.  */
> > +
> > +static bool
> > +s390_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info 
> > *if_info)
> > +{
> > +  if (if_info->speed_p)
> > +{
> > +  for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
> > +   {
> > + rtx set = single_set (insn);
> > + if (set == NULL)
> > +   continue;
> > + if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
> > +   continue;
> > + rtx src = SET_SRC (set);
> > + machine_mode mode = GET_MODE (src);
> > + if (GET_MODE_CLASS (mode) != MODE_INT
> > + && GET_MODE_CLASS (mode) != MODE_FLOAT)
> > +   continue;
> > + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> > +   continue;
> > + return true;
> > +   }
> > +}
> > +  return default_noce_conversion_profitable_p (seq, if_info);
> > +}
> > +
> >  /* Initialize GCC target structure.  */
> >  
> >  #undef  TARGET_ASM_ALIGNED_HI_OP
> > @@ -18350,6 +18379,9 @@ s390_vectorize_vec_perm_const (machine_mode vmode, 
> > machine_mode op_mode,
> >  #undef TARGET_VECTORIZE_VEC_PERM_CONST
> >  #define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const
> >  
> > +#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
> > +#define TARGET_NOCE_CONVERSION_PROFITABLE_P 
> > s390_noce_conversion_profitable_p
> > +
> >  struct gcc_target targetm = TARGET_INITIALIZER;
> >  
> >  #include "gt-s390.h"
> > diff --git a/gcc/testsuite/gcc.target/s390/ccor.c 
> > b/gcc/testsuite/gcc.target/s390/ccor.c
> > index 31f30f60314..36a3c3a999a 100644
> > --- a/gcc/testsuite/gcc.target/s390/ccor.c
> > +++ b/gcc/testsuite/gcc.target/s390/ccor.c
> > @@ -42,7 +42,7 @@ GENFUN1(2)
> >  
> >  GENFUN1(3)
> >  
> > -/* { dg-final { scan-assembler {locrno} } } */
> > +/* { dg-final { scan-assembler {locro} } } */
> >  
> >  GENFUN2(0,1)
> >  
> > @@ -58,7 +58,7 @@ GENFUN2(0,3)
> >  
> >  GENFUN2(1,2)
> >  
> > -/* { dg-final { scan-assembler {locrnlh} } } */
> > +/* { dg-final { scan-assembler {locrlh} } } */
> >  
> >  GENFUN2(1,3)
> >  
> > -- 
> > 2.45.0
> >

Re: [PATCH] s390: testsuite: Fix nobp-table-jump-*.c

2024-06-13 Thread Stefan Schulze Frielinghaus

Ping.

On Mon, Jun 03, 2024 at 03:43:39PM +0200, Stefan Schulze Frielinghaus wrote:
> Starting with r14-5628-g53ba8d669550d3 interprocedural VRP became strong
> enough to render these tests useless.  Fixed by disabling IPA.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/s390/nobp-table-jump-inline-z10.c: Do not perform
>   IPA.
>   * gcc.target/s390/nobp-table-jump-inline-z900.c: Ditto.
>   * gcc.target/s390/nobp-table-jump-z10.c: Ditto.
>   * gcc.target/s390/nobp-table-jump-z900.c: Ditto.
> ---
>  Ok for mainline?
> 
>  .../s390/nobp-table-jump-inline-z10.c | 42 +--
>  .../s390/nobp-table-jump-inline-z900.c| 42 +--
>  .../gcc.target/s390/nobp-table-jump-z10.c | 42 +--
>  .../gcc.target/s390/nobp-table-jump-z900.c| 42 +--
>  4 files changed, 84 insertions(+), 84 deletions(-)
> 
> diff --git a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c 
> b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c
> index 8dfd7e4c786..121751166d0 100644
> --- a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c
> +++ b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c
> @@ -4,29 +4,29 @@
>  /* case-values-threshold will be set to 20 by the back-end when jump
> thunk are requested.  */
>  
> -int __attribute__((noinline,noclone)) foo1 (void) { return 1; }
> -int __attribute__((noinline,noclone)) foo2 (void) { return 2; }
> -int __attribute__((noinline,noclone)) foo3 (void) { return 3; }
> -int __attribute__((noinline,noclone)) foo4 (void) { return 4; }
> -int __attribute__((noinline,noclone)) foo5 (void) { return 5; }
> -int __attribute__((noinline,noclone)) foo6 (void) { return 6; }
> -int __attribute__((noinline,noclone)) foo7 (void) { return 7; }
> -int __attribute__((noinline,noclone)) foo8 (void) { return 8; }
> -int __attribute__((noinline,noclone)) foo9 (void) { return 9; }
> -int __attribute__((noinline,noclone)) foo10 (void) { return 10; }
> -int __attribute__((noinline,noclone)) foo11 (void) { return 11; }
> -int __attribute__((noinline,noclone)) foo12 (void) { return 12; }
> -int __attribute__((noinline,noclone)) foo13 (void) { return 13; }
> -int __attribute__((noinline,noclone)) foo14 (void) { return 14; }
> -int __attribute__((noinline,noclone)) foo15 (void) { return 15; }
> -int __attribute__((noinline,noclone)) foo16 (void) { return 16; }
> -int __attribute__((noinline,noclone)) foo17 (void) { return 17; }
> -int __attribute__((noinline,noclone)) foo18 (void) { return 18; }
> -int __attribute__((noinline,noclone)) foo19 (void) { return 19; }
> -int __attribute__((noinline,noclone)) foo20 (void) { return 20; }
> +int __attribute__((noipa)) foo1 (void) { return 1; }
> +int __attribute__((noipa)) foo2 (void) { return 2; }
> +int __attribute__((noipa)) foo3 (void) { return 3; }
> +int __attribute__((noipa)) foo4 (void) { return 4; }
> +int __attribute__((noipa)) foo5 (void) { return 5; }
> +int __attribute__((noipa)) foo6 (void) { return 6; }
> +int __attribute__((noipa)) foo7 (void) { return 7; }
> +int __attribute__((noipa)) foo8 (void) { return 8; }
> +int __attribute__((noipa)) foo9 (void) { return 9; }
> +int __attribute__((noipa)) foo10 (void) { return 10; }
> +int __attribute__((noipa)) foo11 (void) { return 11; }
> +int __attribute__((noipa)) foo12 (void) { return 12; }
> +int __attribute__((noipa)) foo13 (void) { return 13; }
> +int __attribute__((noipa)) foo14 (void) { return 14; }
> +int __attribute__((noipa)) foo15 (void) { return 15; }
> +int __attribute__((noipa)) foo16 (void) { return 16; }
> +int __attribute__((noipa)) foo17 (void) { return 17; }
> +int __attribute__((noipa)) foo18 (void) { return 18; }
> +int __attribute__((noipa)) foo19 (void) { return 19; }
> +int __attribute__((noipa)) foo20 (void) { return 20; }
>  
>  
> -int __attribute__((noinline,noclone))
> +int __attribute__((noipa))
>  bar (int a)
>  {
>int ret = 0;
> diff --git a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c 
> b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c
> index 46d2c54bcff..5ad0c72afc3 100644
> --- a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c
> +++ b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c
> @@ -4,29 +4,29 @@
>  /* case-values-threshold will be set to 20 by the back-end when jump
> thunk are requested.  */
>  
> -int __attribute__((noinline,noclone)) foo1 (void) { return 1; }
> -int __attribute__((noinline,noclone)) foo2 (void) { return 2; }
> -int __attribute__((noinline,noclone)) foo3 (void) { return 3; }
> -int __attribute__((noinline,noclone)) foo4 (void) { return 4; }
> -int __attribute__((noinline,noclone)) foo5 (void)

Re: [PATCH] s390: Extend two element float vector

2024-06-11 Thread Stefan Schulze Frielinghaus

On Tue, Jun 11, 2024 at 10:42:26AM +0200, Andreas Krebbel wrote:
> On 6/11/24 10:26, Stefan Schulze Frielinghaus wrote:
> > This implements a V2SF -> V2DF extend.
> > 
> > gcc/ChangeLog:
> > 
> > * config/s390/vector.md (*vmrhf): New.
> > (extendv2sfv2df2): New.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > * gcc.target/s390/vector/vec-extend-3.c: New test.
> 
> Since we already have a *vmrhf pattern, should we perhaps add something to
> the name to make it easier to distinguish in the rtl dumps? You have added
> the mode already, but perhaps something like *vmrhf_half or something
> like this?

I like the one with _half added which I will push soon.

Thanks,
Stefan

> 
> Ok with or without that change. Thanks!
> 
> 
> Andreas
> 
>

[PATCH] s390: Extend two element float vector

2024-06-11 Thread Stefan Schulze Frielinghaus

This implements a V2SF -> V2DF extend.

gcc/ChangeLog:

* config/s390/vector.md (*vmrhf): New.
(extendv2sfv2df2): New.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vec-extend-3.c: New test.
---
 Bootstrapped and regtested on s390.  Ok for mainline?

 gcc/config/s390/vector.md | 28 +++
 .../gcc.target/s390/vector/vec-extend-3.c | 18 
 2 files changed, 46 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-extend-3.c

diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index a931a4b1b17..d8657fae56d 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -895,6 +895,17 @@
   "vmrhf\t%0,%1,%2";
   [(set_attr "op_type" "VRR")])
 
+(define_insn "*vmrhf"
+  [(set (match_operand:V_HW_40 
"register_operand" "=v")
+   (vec_select:V_HW_4
+(vec_concat:V_HW_4 (match_operand: 1 
"register_operand"  "v")
+   (match_operand: 2 
"register_operand"  "v"))
+(parallel [(const_int 0) (const_int 2)
+   (const_int 1) (const_int 3)])))]
+  "TARGET_VX"
+  "vmrhf\t%0,%1,%2";
+  [(set_attr "op_type" "VRR")])
+
 (define_insn "*vmrlf"
   [(set (match_operand:V_HW_4  0 
"register_operand" "=v")
 (vec_select:V_HW_4
@@ -2394,6 +2405,23 @@
   "vuph\t%0,%1"
   [(set_attr "op_type" "VRR")])
 
+(define_expand "extendv2sfv2df2"
+  [(set (match_dup 2)
+   (vec_select:V4SF
+(vec_concat:V4SF (match_operand:V2SF 1 "register_operand")
+ (match_dup 1))
+(parallel [(const_int 0) (const_int 2)
+   (const_int 1) (const_int 3)])))
+   (set (match_operand:V2DF 0 "register_operand")
+   (float_extend:V2DF
+(vec_select:V2SF
+ (match_dup 2)
+ (parallel [(const_int 0) (const_int 2)]]
+  "TARGET_VX"
+{
+  operands[2] = gen_reg_rtx (V4SFmode);
+})
+
 ;; vector unpack v16qi
 
 ; signed
diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-extend-3.c 
b/gcc/testsuite/gcc.target/s390/vector/vec-extend-3.c
new file mode 100644
index 000..2b02e7bf9f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vec-extend-3.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=z13 -mzarch" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef float v2sf __attribute__ ((vector_size (8)));
+typedef double v2df __attribute__ ((vector_size (16)));
+
+/*
+** extendv2sfv2df2:
+** vmrhf   %v24,%v24,%v24
+** vldeb   %v24,%v24
+** br  %r14
+*/
+
+v2df extendv2sfv2df2 (v2sf x)
+{
+  return __builtin_convertvector (x, v2df);
+}
-- 
2.45.1

[PATCH] s390: Extend two/four element integer vectors

2024-06-11 Thread Stefan Schulze Frielinghaus

For the moment I deliberately left out one-element QHS vectors since it
is unclear whether these are pathological cases or whether they are
really used.  If we ever get an extend for V1DI -> V1TI we should
reconsider this.

As a side-effect this fixes PR115261.

gcc/ChangeLog:

PR target/115261
* config/s390/s390.md (any_extend,extend_insn,zero_extend):
New code attributes and code iterator.
* config/s390/vector.md (VI_EXTEND): New mode iterator.
(<extend_insn><mode><vec_2x_wide>2): New insn.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vec-extend-1.c: New test.
* gcc.target/s390/vector/vec-extend-2.c: New test.
---
 Bootstrapped and regtested on s390.  Ok for mainline?

 gcc/config/s390/s390.md   |  4 +
 gcc/config/s390/vector.md | 29 +--
 .../gcc.target/s390/vector/vec-extend-1.c | 79 +++
 .../gcc.target/s390/vector/vec-extend-2.c | 55 +
 4 files changed, 162 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-extend-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-extend-2.c

diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index c607dce3cf0..1311a5f01cf 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -602,6 +602,10 @@
 
 (define_attr "relative_long" "no,yes" (const_string "no"))
 
+(define_code_attr extend_insn [(sign_extend "extend") (zero_extend 
"zero_extend")])
+(define_code_attr zero_extend [(sign_extend "") (zero_extend "l")])
+(define_code_iterator any_extend [sign_extend zero_extend])
+
 ;; Pipeline description for z900.
 (include "2064.md")
 
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index ed4742d93c9..a931a4b1b17 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -87,6 +87,8 @@
 ; 32 bit int<->fp vector conversion instructions are available since VXE2 
(z15).
 (define_mode_iterator VX_VEC_CONV_BFP [V2DF (V4SF "TARGET_VXE2")])
 
+(define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI])
+
 ; Empty string for all but TImode.  This is used to hide the TImode
 ; expander name in case it is defined already.  See addti3 for an
 ; example.
@@ -195,13 +197,20 @@
(V1DF "V2DF") (V2DF "V4DF")])
 
 ; Vector with widened element size and the same number of elements.
-(define_mode_attr vec_2x_wide [(V1QI "V1HI") (V2QI "V2HI") (V4QI "V4HI") (V8QI 
"V8HI") (V16QI "V16HI")
+(define_mode_attr VEC_2X_WIDE [(V1QI "V1HI") (V2QI "V2HI") (V4QI "V4HI") (V8QI 
"V8HI") (V16QI "V16HI")
   (V1HI "V1SI") (V2HI "V2SI") (V4HI "V4SI") (V8HI 
"V8SI")
   (V1SI "V1DI") (V2SI "V2DI") (V4SI "V4DI")
   (V1DI "V1TI") (V2DI "V2TI")
   (V1SF "V1DF") (V2SF "V2DF") (V4SF "V4DF")
   (V1DF "V1TF") (V2DF "V2TF")])
 
+(define_mode_attr vec_2x_wide [(V1QI "v1hi") (V2QI "v2hi") (V4QI "v4hi") (V8QI 
"v8hi") (V16QI "v16hi")
+  (V1HI "v1si") (V2HI "v2si") (V4HI "v4si") (V8HI 
"v8si")
+  (V1SI "v1di") (V2SI "v2di") (V4SI "v4di")
+  (V1DI "v1ti") (V2DI "v2ti")
+  (V1SF "v1df") (V2SF "v2df") (V4SF "v4df")
+  (V1DF "v1tf") (V2DF "v2tf")])
+
 ; Vector with half the element size AND half the number of elements.
 (define_mode_attr vec_halfhalf
   [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
@@ -1604,7 +1613,7 @@
 UNSPEC_VEC_UMULT_ODD))
(set (match_operand: 0 "register_operand" "")
 (vec_select:
-(vec_concat: (match_dup 3) (match_dup 4))
+(vec_concat: (match_dup 3) (match_dup 4))
 (match_dup 5)))]
   "TARGET_VX"
  {
@@ -1623,7 +1632,7 @@
 UNSPEC_VEC_UMULT_ODD))
(set (match_operand: 0 "register_operand" "")
 (vec_select:
-(vec_concat: (match_dup 3) (match_dup 4))
+(vec_concat: (match_dup 3) (match_dup 4))
 (match_dup 5)))]
   "TARGET_VX"
  {
@@ -1642,7 +1651,7 @@
 UNSPEC_VEC_SMULT_ODD))
(set (match_operand: 0 "register_operand" "")
 (vec_select:
-(vec_concat: (match_dup 3) (match_dup 4))
+(vec_concat: (match_dup 3) (match_dup 4))
 (match_dup 5)))]
   "TARGET_VX"
  {
@@ -1661,7 +1670,7 @@
 UNSPEC_VEC_SMULT_ODD))
(set (match_operand: 0 "register_operand" "")
 (vec_select:
-(vec_concat: (match_dup 3) (match_dup 4))
+(vec_concat: (match_dup 3) (match_dup 4))
 (match_dup 5)))]
   "TARGET_VX"
  {
@@ -2375,6 +2384,16 @@
   "vpkls\t%0,%1,%2"
   [(set_attr "op_type" "VRR")])
 
+;; vector unpack / extend
+
+(define_insn "2"
+  [(set (match_operand: 0 "register_operand" "=v")
+   (any_extend:
+

Re: [PATCH] Hard register asm constraint

2024-06-09 Thread Stefan Schulze Frielinghaus

Ping.

On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze Frielinghaus wrote:
> This implements hard register constraints for inline asm.  A hard register
> constraint is of the form {regname} where regname is any valid register.  This
> basically renders register asm superfluous.  For example, the snippet
> 
> int test (int x, int y)
> {
>   register int r4 asm ("r4") = x;
>   register int r5 asm ("r5") = y;
>   unsigned int copy = y;
>   asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy));
>   return r4;
> }
> 
> could be rewritten into
> 
> int test (int x, int y)
> {
>   asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
>   return x;
> }
> 
> As a side-effect this also solves the problem of call-clobbered registers.
> That being said, I was wondering whether we could utilize this feature in 
> order
> to get rid of local register asm automatically?  For example, converting
> 
> // Result will be in r2 on s390
> extern int bar (void);
> 
> void test (void)
> {
>   register int x asm ("r2") = 42;
>   bar ();
>   asm ("foo %0\n" :: "r" (x));
> }
> 
> into
> 
> void test (void)
> {
>   int x = 42;
>   bar ();
>   asm ("foo %0\n" :: "{r2}" (x));
> }
> 
> in order to get rid of the limitation of call-clobbered registers which may
> lead to subtle bugs---especially if you think of non-obvious calls e.g.
> introduced by sanitizer/tracer/whatever.  Since such a transformation has the
> potential to break existing code do you see any edge cases where this might be
> problematic or even show stoppers?  Currently, even
> 
> int test (void)
> {
>   register int x asm ("r2") = 42;
>   register int y asm ("r2") = 24;
>   asm ("foo %0,%1\n" :: "r" (x), "r" (y));
> }
> 
> is allowed which seems error prone to me.  Thus, if 100% backwards
> compatibility would be required, then automatically converting every register
> asm to the new mechanism isn't viable.  Still quite a lot could be 
> transformed.
> Any thoughts?
> 
> Currently I allow multiple alternatives as demonstrated by
> gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c.  However, since a hard 
> register
> constraint is pretty specific I could also think of erroring out in case of
> alternatives.  Are there any real use cases out there for multiple
> alternatives where one would like to use hard register constraints?
> 
> With the current implementation we have a "user visible change" in the sense
> that for
> 
> void test (void)
> {
>   register int x asm ("r2") = 42;
>   register int y asm ("r2") = 24;
>   asm ("foo   %0,%1\n" : "=r" (x), "=r" (y));
> }
> 
> we do not get the error
> 
>   "invalid hard register usage between output operands"
> 
> anymore but rather
> 
>   "multiple outputs to hard register: %r2"
> 
> This is due to the error handling in gimplify_asm_expr ().  Speaking of 
> errors,
> I also error out earlier than before which means that e.g. in pr87600-2.c only
> the first error is reported and processing is stopped afterwards which means
> the subsequent tests fail.
> 
> I've been skimming through all targets and it looks to me as if none is using
> curly brackets for their constraints.  Of course, I may have missed something.
> 
> Cheers,
> Stefan
> 
> PS: Current state for Clang: https://reviews.llvm.org/D105142
> 
> ---
>  gcc/cfgexpand.cc  |  42 ---
>  gcc/genpreds.cc   |   4 +-
>  gcc/gimplify.cc   | 115 +-
>  gcc/lra-constraints.cc|  17 +++
>  gcc/recog.cc  |  14 ++-
>  gcc/stmt.cc   | 102 +++-
>  gcc/stmt.h|  10 +-
>  .../gcc.target/s390/asm-hard-reg-1.c  | 103 
>  .../gcc.target/s390/asm-hard-reg-2.c  |  29 +
>  .../gcc.target/s390/asm-hard-reg-3.c  |  24 
>  gcc/testsuite/lib/scanasm.exp |   4 +
>  11 files changed, 407 insertions(+), 57 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c
> 
> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
> index 557cb28733b..47f71a2e803 100644
> --- a/gcc/cfgexpand.cc

[PATCH] s390: testsuite: Fix ifcvt-one-insn-bool.c

2024-06-04 Thread Stefan Schulze Frielinghaus

With the change of r15-787-g57e04879389f9c I forgot to also update this
test.

gcc/testsuite/ChangeLog:

* gcc.target/s390/ifcvt-one-insn-bool.c: Fix loc.
---
 Ok for mainline?  Ok for GCC 14 if the corresponding backport is also
 approved?

 gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c 
b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c
index 0c8c2f879a6..4ae29dbd6b6 100644
--- a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c
+++ b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c
@@ -3,7 +3,7 @@
 /* { dg-do compile { target { s390*-*-* } } } */
 /* { dg-options "-O2 -march=z13 -mzarch" } */
 
-/* { dg-final { scan-assembler "lochinh\t%r.?,1" } } */
+/* { dg-final { scan-assembler "lochile\t%r.?,1" } } */
 #include 
 
 int foo (int *a, unsigned int n)
-- 
2.45.1

[PATCH] s390: testsuite: Fix nobp-table-jump-*.c

2024-06-03 Thread Stefan Schulze Frielinghaus

Starting with r14-5628-g53ba8d669550d3 interprocedural VRP became strong
enough to render these tests useless.  Fixed by disabling IPA.

gcc/testsuite/ChangeLog:

* gcc.target/s390/nobp-table-jump-inline-z10.c: Do not perform
IPA.
* gcc.target/s390/nobp-table-jump-inline-z900.c: Ditto.
* gcc.target/s390/nobp-table-jump-z10.c: Ditto.
* gcc.target/s390/nobp-table-jump-z900.c: Ditto.
---
 Ok for mainline?

 .../s390/nobp-table-jump-inline-z10.c | 42 +--
 .../s390/nobp-table-jump-inline-z900.c| 42 +--
 .../gcc.target/s390/nobp-table-jump-z10.c | 42 +--
 .../gcc.target/s390/nobp-table-jump-z900.c| 42 +--
 4 files changed, 84 insertions(+), 84 deletions(-)

diff --git a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c 
b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c
index 8dfd7e4c786..121751166d0 100644
--- a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c
+++ b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c
@@ -4,29 +4,29 @@
 /* case-values-threshold will be set to 20 by the back-end when jump
thunk are requested.  */
 
-int __attribute__((noinline,noclone)) foo1 (void) { return 1; }
-int __attribute__((noinline,noclone)) foo2 (void) { return 2; }
-int __attribute__((noinline,noclone)) foo3 (void) { return 3; }
-int __attribute__((noinline,noclone)) foo4 (void) { return 4; }
-int __attribute__((noinline,noclone)) foo5 (void) { return 5; }
-int __attribute__((noinline,noclone)) foo6 (void) { return 6; }
-int __attribute__((noinline,noclone)) foo7 (void) { return 7; }
-int __attribute__((noinline,noclone)) foo8 (void) { return 8; }
-int __attribute__((noinline,noclone)) foo9 (void) { return 9; }
-int __attribute__((noinline,noclone)) foo10 (void) { return 10; }
-int __attribute__((noinline,noclone)) foo11 (void) { return 11; }
-int __attribute__((noinline,noclone)) foo12 (void) { return 12; }
-int __attribute__((noinline,noclone)) foo13 (void) { return 13; }
-int __attribute__((noinline,noclone)) foo14 (void) { return 14; }
-int __attribute__((noinline,noclone)) foo15 (void) { return 15; }
-int __attribute__((noinline,noclone)) foo16 (void) { return 16; }
-int __attribute__((noinline,noclone)) foo17 (void) { return 17; }
-int __attribute__((noinline,noclone)) foo18 (void) { return 18; }
-int __attribute__((noinline,noclone)) foo19 (void) { return 19; }
-int __attribute__((noinline,noclone)) foo20 (void) { return 20; }
+int __attribute__((noipa)) foo1 (void) { return 1; }
+int __attribute__((noipa)) foo2 (void) { return 2; }
+int __attribute__((noipa)) foo3 (void) { return 3; }
+int __attribute__((noipa)) foo4 (void) { return 4; }
+int __attribute__((noipa)) foo5 (void) { return 5; }
+int __attribute__((noipa)) foo6 (void) { return 6; }
+int __attribute__((noipa)) foo7 (void) { return 7; }
+int __attribute__((noipa)) foo8 (void) { return 8; }
+int __attribute__((noipa)) foo9 (void) { return 9; }
+int __attribute__((noipa)) foo10 (void) { return 10; }
+int __attribute__((noipa)) foo11 (void) { return 11; }
+int __attribute__((noipa)) foo12 (void) { return 12; }
+int __attribute__((noipa)) foo13 (void) { return 13; }
+int __attribute__((noipa)) foo14 (void) { return 14; }
+int __attribute__((noipa)) foo15 (void) { return 15; }
+int __attribute__((noipa)) foo16 (void) { return 16; }
+int __attribute__((noipa)) foo17 (void) { return 17; }
+int __attribute__((noipa)) foo18 (void) { return 18; }
+int __attribute__((noipa)) foo19 (void) { return 19; }
+int __attribute__((noipa)) foo20 (void) { return 20; }
 
 
-int __attribute__((noinline,noclone))
+int __attribute__((noipa))
 bar (int a)
 {
   int ret = 0;
diff --git a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c 
b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c
index 46d2c54bcff..5ad0c72afc3 100644
--- a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c
+++ b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c
@@ -4,29 +4,29 @@
 /* case-values-threshold will be set to 20 by the back-end when jump
thunk are requested.  */
 
-int __attribute__((noinline,noclone)) foo1 (void) { return 1; }
-int __attribute__((noinline,noclone)) foo2 (void) { return 2; }
-int __attribute__((noinline,noclone)) foo3 (void) { return 3; }
-int __attribute__((noinline,noclone)) foo4 (void) { return 4; }
-int __attribute__((noinline,noclone)) foo5 (void) { return 5; }
-int __attribute__((noinline,noclone)) foo6 (void) { return 6; }
-int __attribute__((noinline,noclone)) foo7 (void) { return 7; }
-int __attribute__((noinline,noclone)) foo8 (void) { return 8; }
-int __attribute__((noinline,noclone)) foo9 (void) { return 9; }
-int __attribute__((noinline,noclone)) foo10 (void) { return 10; }
-int __attribute__((noinline,noclone)) foo11 (void) { return 11; }
-int __attribute__((noinline,noclone)) foo12 (void) { return 12; }
-int __attribute__((noinline,noclone)) foo13 (

Re: [PATCH v2] s390: Implement TARGET_NOCE_CONVERSION_PROFITABLE_P [PR109549]

2024-06-02 Thread Stefan Schulze Frielinghaus

Since the patch works fine so far for mainline, ok to backport to GCC 14?

On Fri, May 17, 2024 at 08:59:05AM +0200, Stefan Schulze Frielinghaus wrote:
> I've adapted the patch as follows and will push.
> 
> Thanks,
> Stefan
> 
> --
> 
> Consider a NOCE conversion as profitable if there is at least one
> conditional move.
> 
> gcc/ChangeLog:
> 
>   * config/s390/s390.cc (TARGET_NOCE_CONVERSION_PROFITABLE_P):
>   Define.
>   (s390_noce_conversion_profitable_p): Implement.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/s390/ccor.c: Order of loads is reversed now; as a
>   consequence the condition has to be reversed.
> ---
>  gcc/config/s390/s390.cc  | 32 
>  gcc/testsuite/gcc.target/s390/ccor.c |  4 ++--
>  2 files changed, 34 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
> index bf46eab2d63..7f8f1681c2a 100644
> --- a/gcc/config/s390/s390.cc
> +++ b/gcc/config/s390/s390.cc
> @@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-pass.h"
>  #include "context.h"
>  #include "builtins.h"
> +#include "ifcvt.h"
>  #include "rtl-iter.h"
>  #include "intl.h"
>  #include "tm-constrs.h"
> @@ -18037,6 +18038,34 @@ s390_vectorize_vec_perm_const (machine_mode vmode, 
> machine_mode op_mode,
>return vectorize_vec_perm_const_1 (d);
>  }
>  
> +/* Consider a NOCE conversion as profitable if there is at least one
> +   conditional move.  */
> +
> +static bool
> +s390_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info 
> *if_info)
> +{
> +  if (if_info->speed_p)
> +{
> +  for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
> + {
> +   rtx set = single_set (insn);
> +   if (set == NULL)
> + continue;
> +   if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
> + continue;
> +   rtx src = SET_SRC (set);
> +   machine_mode mode = GET_MODE (src);
> +   if (GET_MODE_CLASS (mode) != MODE_INT
> +   && GET_MODE_CLASS (mode) != MODE_FLOAT)
> + continue;
> +   if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> + continue;
> +   return true;
> + }
> +}
> +  return default_noce_conversion_profitable_p (seq, if_info);
> +}
> +
>  /* Initialize GCC target structure.  */
>  
>  #undef  TARGET_ASM_ALIGNED_HI_OP
> @@ -18350,6 +18379,9 @@ s390_vectorize_vec_perm_const (machine_mode vmode, 
> machine_mode op_mode,
>  #undef TARGET_VECTORIZE_VEC_PERM_CONST
>  #define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const
>  
> +#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
> +#define TARGET_NOCE_CONVERSION_PROFITABLE_P s390_noce_conversion_profitable_p
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>  
>  #include "gt-s390.h"
> diff --git a/gcc/testsuite/gcc.target/s390/ccor.c 
> b/gcc/testsuite/gcc.target/s390/ccor.c
> index 31f30f60314..36a3c3a999a 100644
> --- a/gcc/testsuite/gcc.target/s390/ccor.c
> +++ b/gcc/testsuite/gcc.target/s390/ccor.c
> @@ -42,7 +42,7 @@ GENFUN1(2)
>  
>  GENFUN1(3)
>  
> -/* { dg-final { scan-assembler {locrno} } } */
> +/* { dg-final { scan-assembler {locro} } } */
>  
>  GENFUN2(0,1)
>  
> @@ -58,7 +58,7 @@ GENFUN2(0,3)
>  
>  GENFUN2(1,2)
>  
> -/* { dg-final { scan-assembler {locrnlh} } } */
> +/* { dg-final { scan-assembler {locrlh} } } */
>  
>  GENFUN2(1,3)
>  
> -- 
> 2.45.0
>

Re: [PATCH] ifcvt: Clarify if_info.original_cost.

2024-06-01 Thread Stefan Schulze Frielinghaus

On Fri, May 31, 2024 at 10:05:55PM -0600, Jeff Law wrote:
> 
> 
> On 5/31/24 9:03 AM, Robin Dapp wrote:
> > Hi,
> > 
> > before noce_find_if_block processes a block it sets up an if_info
> > structure that holds the original costs.  At that point the costs of
> > the then/else blocks have not been added so we only care about the
> > "if" cost.
> > 
> > The code originally used BRANCH_COST for that but was then changed
> > to COST_N_INSNS (2) - a compare and a jump.
> > This patch computes the jump costs via
> >insn_cost (if_info.jump, ...)
> > which is supposed to incorporate the branch costs and, in case of a CC
> > comparison,
> >pattern_cost (if_info.cond, ...)
> > which is supposed to account for the CC creation.
> > 
> > For compare_and_jump patterns insn_cost should have already computed
> > the right cost.
> > 
> > Does this "split" make sense, generally?
> > 
> > Bootstrapped and regtested on x86, aarch64 and power10.  Regtested
> > on riscv.
> > 
> > Regards
> >   Robin
> > 
> > gcc/ChangeLog:
> > 
> > * ifcvt.cc (noce_process_if_block): Subtract condition pattern
> > cost if applicable.
> > (noce_find_if_block): Use insn_cost and pattern_cost for
> > original cost.
> OK.  Obviously we'll need to be on the lookout for regressions.  My bet is
> on s390 since you already tested the x86, aarch64 & p10 targets :-)

I just gave it a try on s390 where bootstrap and regtest were successful.

Cheers,
Stefan

> 
> 
> jeff
>

[PATCH] Hard register asm constraint

2024-05-24 Thread Stefan Schulze Frielinghaus

This implements hard register constraints for inline asm.  A hard register
constraint is of the form {regname} where regname is any valid register.  This
basically renders register asm superfluous.  For example, the snippet

int test (int x, int y)
{
  register int r4 asm ("r4") = x;
  register int r5 asm ("r5") = y;
  unsigned int copy = y;
  asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy));
  return r4;
}

could be rewritten into

int test (int x, int y)
{
  asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y));
  return x;
}

As a side-effect this also solves the problem of call-clobbered registers.
That being said, I was wondering whether we could utilize this feature in order
to get rid of local register asm automatically?  For example, converting

// Result will be in r2 on s390
extern int bar (void);

void test (void)
{
  register int x asm ("r2") = 42;
  bar ();
  asm ("foo %0\n" :: "r" (x));
}

into

void test (void)
{
  int x = 42;
  bar ();
  asm ("foo %0\n" :: "{r2}" (x));
}

in order to get rid of the limitation of call-clobbered registers which may
lead to subtle bugs---especially if you think of non-obvious calls e.g.
introduced by sanitizer/tracer/whatever.  Since such a transformation has the
potential to break existing code do you see any edge cases where this might be
problematic or even show stoppers?  Currently, even

int test (void)
{
  register int x asm ("r2") = 42;
  register int y asm ("r2") = 24;
  asm ("foo %0,%1\n" :: "r" (x), "r" (y));
}

is allowed which seems error prone to me.  Thus, if 100% backwards
compatibility would be required, then automatically converting every register
asm to the new mechanism isn't viable.  Still quite a lot could be transformed.
Any thoughts?

Currently I allow multiple alternatives as demonstrated by
gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c.  However, since a hard register
constraint is pretty specific I could also think of erroring out in case of
alternatives.  Are there any real use cases out there for multiple
alternatives where one would like to use hard register constraints?

With the current implementation we have a "user visible change" in the sense
that for

void test (void)
{
  register int x asm ("r2") = 42;
  register int y asm ("r2") = 24;
  asm ("foo %0,%1\n" : "=r" (x), "=r" (y));
}

we do not get the error

  "invalid hard register usage between output operands"

anymore but rather

  "multiple outputs to hard register: %r2"

This is due to the error handling in gimplify_asm_expr ().  Speaking of errors,
I also error out earlier than before, which means that e.g. in pr87600-2.c only
the first error is reported and processing stops afterwards, so the
subsequent tests fail.

I've been skimming through all targets and it looks to me as if none is using
curly brackets for their constraints.  Of course, I may have missed something.

Cheers,
Stefan

PS: Current state for Clang: https://reviews.llvm.org/D105142

---
 gcc/cfgexpand.cc  |  42 ---
 gcc/genpreds.cc   |   4 +-
 gcc/gimplify.cc   | 115 +-
 gcc/lra-constraints.cc|  17 +++
 gcc/recog.cc  |  14 ++-
 gcc/stmt.cc   | 102 +++-
 gcc/stmt.h|  10 +-
 .../gcc.target/s390/asm-hard-reg-1.c  | 103 
 .../gcc.target/s390/asm-hard-reg-2.c  |  29 +
 .../gcc.target/s390/asm-hard-reg-3.c  |  24 
 gcc/testsuite/lib/scanasm.exp |   4 +
 11 files changed, 407 insertions(+), 57 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c
 create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 557cb28733b..47f71a2e803 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -2955,44 +2955,6 @@ expand_asm_loc (tree string, int vol, location_t locus)
   emit_insn (body);
 }
 
-/* Return the number of times character C occurs in string S.  */
-static int
-n_occurrences (int c, const char *s)
-{
-  int n = 0;
-  while (*s)
-n += (*s++ == c);
-  return n;
-}
-
-/* A subroutine of expand_asm_operands.  Check that all operands have
-   the same number of alternatives.  Return true if so.  */
-
-static bool
-check_operand_nalternatives (const vec<const char *> &constraints)
-{
-  unsigned len = constraints.length();
-  if (len > 0)
-{
-  int nalternatives = n_occurrences (',', constraints[0]);
-
-  if (nal

[PATCH v2] s390: Implement TARGET_NOCE_CONVERSION_PROFITABLE_P [PR109549]

2024-05-17 Thread Stefan Schulze Frielinghaus

I've adapted the patch as follows and will push.

Thanks,
Stefan

--

Consider a NOCE conversion as profitable if there is at least one
conditional move.

gcc/ChangeLog:

* config/s390/s390.cc (TARGET_NOCE_CONVERSION_PROFITABLE_P):
Define.
(s390_noce_conversion_profitable_p): Implement.

gcc/testsuite/ChangeLog:

* gcc.target/s390/ccor.c: Order of loads is reversed now; as a
consequence the condition has to be reversed.
---
 gcc/config/s390/s390.cc  | 32 
 gcc/testsuite/gcc.target/s390/ccor.c |  4 ++--
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index bf46eab2d63..7f8f1681c2a 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-pass.h"
 #include "context.h"
 #include "builtins.h"
+#include "ifcvt.h"
 #include "rtl-iter.h"
 #include "intl.h"
 #include "tm-constrs.h"
@@ -18037,6 +18038,34 @@ s390_vectorize_vec_perm_const (machine_mode vmode, 
machine_mode op_mode,
   return vectorize_vec_perm_const_1 (d);
 }
 
+/* Consider a NOCE conversion as profitable if there is at least one
+   conditional move.  */
+
+static bool
+s390_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
+{
+  if (if_info->speed_p)
+{
+  for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
+   {
+ rtx set = single_set (insn);
+ if (set == NULL)
+   continue;
+ if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
+   continue;
+ rtx src = SET_SRC (set);
+ machine_mode mode = GET_MODE (src);
+ if (GET_MODE_CLASS (mode) != MODE_INT
+ && GET_MODE_CLASS (mode) != MODE_FLOAT)
+   continue;
+ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+   continue;
+ return true;
+   }
+}
+  return default_noce_conversion_profitable_p (seq, if_info);
+}
+
 /* Initialize GCC target structure.  */
 
 #undef  TARGET_ASM_ALIGNED_HI_OP
@@ -18350,6 +18379,9 @@ s390_vectorize_vec_perm_const (machine_mode vmode, 
machine_mode op_mode,
 #undef TARGET_VECTORIZE_VEC_PERM_CONST
 #define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const
 
+#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
+#define TARGET_NOCE_CONVERSION_PROFITABLE_P s390_noce_conversion_profitable_p
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-s390.h"
diff --git a/gcc/testsuite/gcc.target/s390/ccor.c 
b/gcc/testsuite/gcc.target/s390/ccor.c
index 31f30f60314..36a3c3a999a 100644
--- a/gcc/testsuite/gcc.target/s390/ccor.c
+++ b/gcc/testsuite/gcc.target/s390/ccor.c
@@ -42,7 +42,7 @@ GENFUN1(2)
 
 GENFUN1(3)
 
-/* { dg-final { scan-assembler {locrno} } } */
+/* { dg-final { scan-assembler {locro} } } */
 
 GENFUN2(0,1)
 
@@ -58,7 +58,7 @@ GENFUN2(0,3)
 
 GENFUN2(1,2)
 
-/* { dg-final { scan-assembler {locrnlh} } } */
+/* { dg-final { scan-assembler {locrlh} } } */
 
 GENFUN2(1,3)
 
-- 
2.45.0

[PATCH] s390: Implement TARGET_NOCE_CONVERSION_PROFITABLE_P [PR109549]

2024-05-08 Thread Stefan Schulze Frielinghaus

Consider a NOCE conversion as profitable if there is at least one
conditional move.

gcc/ChangeLog:

* config/s390/s390.cc (TARGET_NOCE_CONVERSION_PROFITABLE_P):
Define.
(s390_noce_conversion_profitable_p): Implement.

gcc/testsuite/ChangeLog:

* gcc.target/s390/ccor.c: Order of loads is reversed now; as a
consequence the condition has to be reversed.
---
 Bootstrapped and regtested on s390.  Ok for mainline?

 gcc/config/s390/s390.cc  | 32 
 gcc/testsuite/gcc.target/s390/ccor.c |  4 ++--
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index bf46eab2d63..23b18b5c506 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-pass.h"
 #include "context.h"
 #include "builtins.h"
+#include "ifcvt.h"
 #include "rtl-iter.h"
 #include "intl.h"
 #include "tm-constrs.h"
@@ -18037,6 +18038,37 @@ s390_vectorize_vec_perm_const (machine_mode vmode, 
machine_mode op_mode,
   return vectorize_vec_perm_const_1 (d);
 }
 
+/* Consider a NOCE conversion as profitable if there is at least one
+   conditional move.  */
+
+#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
+#define TARGET_NOCE_CONVERSION_PROFITABLE_P s390_noce_conversion_profitable_p
+
+static bool
+s390_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
+{
+  if (if_info->speed_p)
+{
+  for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
+   {
+ rtx set = single_set (insn);
+ if (set == NULL)
+   continue;
+ if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
+   continue;
+ rtx src = SET_SRC (set);
+ machine_mode mode = GET_MODE (src);
+ if (GET_MODE_CLASS (mode) != MODE_INT
+ && GET_MODE_CLASS (mode) != MODE_FLOAT)
+   continue;
+ if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (Pmode))
+   continue;
+ return true;
+   }
+}
+  return default_noce_conversion_profitable_p (seq, if_info);
+}
+
 /* Initialize GCC target structure.  */
 
 #undef  TARGET_ASM_ALIGNED_HI_OP
diff --git a/gcc/testsuite/gcc.target/s390/ccor.c 
b/gcc/testsuite/gcc.target/s390/ccor.c
index 31f30f60314..36a3c3a999a 100644
--- a/gcc/testsuite/gcc.target/s390/ccor.c
+++ b/gcc/testsuite/gcc.target/s390/ccor.c
@@ -42,7 +42,7 @@ GENFUN1(2)
 
 GENFUN1(3)
 
-/* { dg-final { scan-assembler {locrno} } } */
+/* { dg-final { scan-assembler {locro} } } */
 
 GENFUN2(0,1)
 
@@ -58,7 +58,7 @@ GENFUN2(0,3)
 
 GENFUN2(1,2)
 
-/* { dg-final { scan-assembler {locrnlh} } } */
+/* { dg-final { scan-assembler {locrlh} } } */
 
 GENFUN2(1,3)
 
-- 
2.44.0

[PATCH] tree-ssa-loop-prefetch.cc: Honour -fno-unroll-loops

2024-05-08 Thread Stefan Schulze Frielinghaus

On s390 the following tests fail

FAIL: gcc.dg/vect/pr109011-1.c -flto -ffat-lto-objects  scan-tree-dump-times 
optimized " = .CLZ (vect" 1
FAIL: gcc.dg/vect/pr109011-1.c -flto -ffat-lto-objects  scan-tree-dump-times 
optimized " = .POPCOUNT (vect" 1
FAIL: gcc.dg/vect/pr109011-1.c scan-tree-dump-times optimized " = .CLZ 
(vect" 1
FAIL: gcc.dg/vect/pr109011-1.c scan-tree-dump-times optimized " = .POPCOUNT 
(vect" 1
FAIL: gcc.dg/vect/pr109011-2.c -flto -ffat-lto-objects  scan-tree-dump-times 
optimized " = .CTZ (vect" 2
FAIL: gcc.dg/vect/pr109011-2.c -flto -ffat-lto-objects  scan-tree-dump-times 
optimized " = .POPCOUNT (vect" 1
FAIL: gcc.dg/vect/pr109011-2.c scan-tree-dump-times optimized " = .CTZ 
(vect" 2
FAIL: gcc.dg/vect/pr109011-2.c scan-tree-dump-times optimized " = .POPCOUNT 
(vect" 1
FAIL: gcc.dg/vect/pr109011-4.c -flto -ffat-lto-objects  scan-tree-dump-times 
optimized " = .CTZ (vect" 2
FAIL: gcc.dg/vect/pr109011-4.c -flto -ffat-lto-objects  scan-tree-dump-times 
optimized " = .POPCOUNT (vect" 1
FAIL: gcc.dg/vect/pr109011-4.c scan-tree-dump-times optimized " = .CTZ 
(vect" 2
FAIL: gcc.dg/vect/pr109011-4.c scan-tree-dump-times optimized " = .POPCOUNT 
(vect" 1

because aprefetch unrolls loops even if -fno-unroll-loops is used.
Accordingly, the scan patterns match more than one time.

Could also be fixed by using -fno-prefetch-loop-arrays for the tests.
Though, I tend to prefer if aprefetch honours -fno-unroll-loops.  Any
preferences?

Bootstrapped and regtested on x86_64 and s390.  Ok for mainline?

gcc/ChangeLog:

* tree-ssa-loop-prefetch.cc (determine_unroll_factor): Honour
-fno-unroll-loops.
---
 gcc/tree-ssa-loop-prefetch.cc | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/tree-ssa-loop-prefetch.cc b/gcc/tree-ssa-loop-prefetch.cc
index 70073cc4fe4..bb5d5dec779 100644
--- a/gcc/tree-ssa-loop-prefetch.cc
+++ b/gcc/tree-ssa-loop-prefetch.cc
@@ -1401,6 +1401,10 @@ determine_unroll_factor (class loop *loop, struct 
mem_ref_group *refs,
   struct mem_ref_group *agp;
   struct mem_ref *ref;
 
+  /* Bail out early in case we must not unroll loops.  */
+  if (!flag_unroll_loops)
+return 1;
+
   /* First check whether the loop is not too large to unroll.  We ignore
  PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
  from unrolling them enough to make exactly one cache line covered by each
-- 
2.44.0

Re: [PATCH] tree-optimization/110490 - bitcount for narrow modes

2024-05-07 Thread Stefan Schulze Frielinghaus

Ping.  Ok for mainline?

On Thu, Apr 25, 2024 at 09:26:45AM +0200, Stefan Schulze Frielinghaus wrote:
> Bitcount operations popcount, clz, and ctz are emulated for narrow modes
> in case an operation is only supported for wider modes.  Beside that ctz
> may be emulated via clz in expand_ctz.  Reflect this in
> expression_expensive_p.
> 
> I considered the emulation of ctz via clz as not expensive since this
> basically reduces to ctz (x) = c - (clz (x & -x)) where c is the mode
> precision minus 1 which should be faster than a loop.
> 
> Bootstrapped and regtested on x86_64 and s390.  Though, this is probably
> stage1 material?
> 
> gcc/ChangeLog:
> 
>   PR tree-optimization/110490
>   * tree-scalar-evolution.cc (expression_expensive_p): Also
>   consider mode widening for popcount, clz, and ctz.
> ---
>  gcc/tree-scalar-evolution.cc | 23 +++
>  1 file changed, 23 insertions(+)
> 
> diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> index b0a5e09a77c..622c7246c1b 100644
> --- a/gcc/tree-scalar-evolution.cc
> +++ b/gcc/tree-scalar-evolution.cc
> @@ -3458,6 +3458,28 @@ bitcount_call:
> && (optab_handler (optab, word_mode)
> != CODE_FOR_nothing))
> break;
> +   /* If popcount is available for a wider mode, we emulate the
> +  operation for a narrow mode by first zero-extending the value
> +  and then computing popcount in the wider mode.  Analogue for
> +  ctz.  For clz we do the same except that we additionally have
> +  to subtract the difference of the mode precisions from the
> +  result.  */
> +   if (is_a <scalar_int_mode> (mode, &int_mode))
> + {
> +   machine_mode wider_mode_iter;
> +   FOR_EACH_WIDER_MODE (wider_mode_iter, mode)
> + if (optab_handler (optab, wider_mode_iter)
> + != CODE_FOR_nothing)
> +   goto check_call_args;
> +   /* Operation ctz may be emulated via clz in expand_ctz.  */
> +   if (optab == ctz_optab)
> + {
> +   FOR_EACH_WIDER_MODE_FROM (wider_mode_iter, mode)
> + if (optab_handler (clz_optab, wider_mode_iter)
> + != CODE_FOR_nothing)
> +   goto check_call_args;
> + }
> + }
> return true;
>   }
> break;
> @@ -3469,6 +3491,7 @@ bitcount_call:
> break;
>   }
>  
> +check_call_args:
>FOR_EACH_CALL_EXPR_ARG (arg, iter, expr)
>   if (expression_expensive_p (arg, cond_overflow_p, cache, op_cost))
> return true;
> -- 
> 2.44.0
>

[PATCH] s390: testsuite: Fix risbg-ll-2.c

2024-04-30 Thread Stefan Schulze Frielinghaus

Starting with r14-2047-gd0e891406b16dc we see through subregs which
means for f10 in risbg-ll-2.c we do not end up with rosbg_si_noshift but
rather rosbg_di_noshift which materializes in a slightly different start
index.  This saves us an extend.

gcc/testsuite/ChangeLog:

* gcc.target/s390/risbg-ll-2.c: Fix start offset for rosbg of
f10.
---
 Ok for mainline?

 gcc/testsuite/gcc.target/s390/risbg-ll-2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/s390/risbg-ll-2.c 
b/gcc/testsuite/gcc.target/s390/risbg-ll-2.c
index 8bf1a0ff88b..ca80602a83f 100644
--- a/gcc/testsuite/gcc.target/s390/risbg-ll-2.c
+++ b/gcc/testsuite/gcc.target/s390/risbg-ll-2.c
@@ -113,7 +113,7 @@ i32 f9 (i64 v_x, i32 v_y)
 // ands with incompatible masks.
 i32 f10 (i64 v_x, i32 v_y)
 {
-  /* { dg-final { scan-assembler 
"f10:\n\tsrlg\t%r2,%r2,48\n\trosbg\t%r2,%r3,32,39,0" { target { lp64 } } } } */
+  /* { dg-final { scan-assembler 
"f10:\n\tsrlg\t%r2,%r2,48\n\trosbg\t%r2,%r3,0,39,0" { target { lp64 } } } } */
   /* { dg-final { scan-assembler 
"f10:\n\tnilf\t%r4,4278190080\n\trosbg\t%r4,%r2,48,63,48" { target { ! lp64 } } 
} } */
   i64 v_shr6 = ((ui64)v_x) >> 48;
   i32 v_conv = (ui32)v_shr6;
-- 
2.44.0

[PATCH] s390: testsuite: Fix zero_bits_compound-1.c

2024-04-30 Thread Stefan Schulze Frielinghaus

Starting with r12-2731-g96146e61cd7aee we do not generate code like

_5 = (unsigned int) c_2(D);
i_6 = _5 << 8;
_7 = _5 << 20;
i_8 = i_6 | _7;

anymore but instead

_5 = (unsigned int) c_2(D);
_3 = _5 * 1048832;

which leads finally to slightly different assembly code where we
previously ended up for z10 or newer with

lr  %r1,%r2
sll %r1,8
rosbg   %r1,%r2,32,43,20
llgfr   %r2,%r1
br  %r14

and now

lr  %r1,%r2
sll %r1,12
ar  %r2,%r1
risbg   %r2,%r2,35,128+55,8
br  %r14

The zero-extend materializes via risbg for which the pattern contains an
"and" which is why the test fails.  Thus, instead of scanning for RTL
expressions rather scan for assembler instructions for s390.
---
 Ok for mainline?

 gcc/testsuite/gcc.dg/zero_bits_compound-1.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/zero_bits_compound-1.c 
b/gcc/testsuite/gcc.dg/zero_bits_compound-1.c
index e71594911b2..f1e267e0fb0 100644
--- a/gcc/testsuite/gcc.dg/zero_bits_compound-1.c
+++ b/gcc/testsuite/gcc.dg/zero_bits_compound-1.c
@@ -39,4 +39,5 @@ unsigned long bar (unsigned char c)
 }
 
 /* Check that no pattern containing an AND expression was used.  */
-/* { dg-final { scan-assembler-not "\\(and:" } } */
+/* { dg-final { scan-assembler-not "\\(and:" { target { ! { s390*-*-* } } } } 
} */
+/* { dg-final { scan-assembler-not "\\tng?rk?\\t" { target { s390*-*-* } } } } 
*/
-- 
2.44.0

[PATCH] tree-optimization/110490 - bitcount for narrow modes

2024-04-25 Thread Stefan Schulze Frielinghaus

Bitcount operations popcount, clz, and ctz are emulated for narrow modes
in case an operation is only supported for wider modes.  Besides that, ctz
may be emulated via clz in expand_ctz.  Reflect this in
expression_expensive_p.

I considered the emulation of ctz via clz as not expensive since this
basically reduces to ctz (x) = c - (clz (x & -x)) where c is the mode
precision minus 1 which should be faster than a loop.

Bootstrapped and regtested on x86_64 and s390.  Though, this is probably
stage1 material?

gcc/ChangeLog:

PR tree-optimization/110490
* tree-scalar-evolution.cc (expression_expensive_p): Also
consider mode widening for popcount, clz, and ctz.
---
 gcc/tree-scalar-evolution.cc | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index b0a5e09a77c..622c7246c1b 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -3458,6 +3458,28 @@ bitcount_call:
  && (optab_handler (optab, word_mode)
  != CODE_FOR_nothing))
  break;
+ /* If popcount is available for a wider mode, we emulate the
+operation for a narrow mode by first zero-extending the value
+and then computing popcount in the wider mode.  Analogue for
+ctz.  For clz we do the same except that we additionally have
+to subtract the difference of the mode precisions from the
+result.  */
+ if (is_a <scalar_int_mode> (mode, &int_mode))
+   {
+ machine_mode wider_mode_iter;
+ FOR_EACH_WIDER_MODE (wider_mode_iter, mode)
+   if (optab_handler (optab, wider_mode_iter)
+   != CODE_FOR_nothing)
+ goto check_call_args;
+ /* Operation ctz may be emulated via clz in expand_ctz.  */
+ if (optab == ctz_optab)
+   {
+ FOR_EACH_WIDER_MODE_FROM (wider_mode_iter, mode)
+   if (optab_handler (clz_optab, wider_mode_iter)
+   != CODE_FOR_nothing)
+ goto check_call_args;
+   }
+   }
  return true;
}
  break;
@@ -3469,6 +3491,7 @@ bitcount_call:
  break;
}
 
+check_call_args:
   FOR_EACH_CALL_EXPR_ARG (arg, iter, expr)
if (expression_expensive_p (arg, cond_overflow_p, cache, op_cost))
  return true;
-- 
2.44.0

[PATCH] s390: testsuite: Xfail forwprop-4{0,1}.c

2024-04-22 Thread Stefan Schulze Frielinghaus

Hi Andreas,

Ok then I will proceed with the patch as is.  Opened PR114802.

Cheers,
Stefan

--

The tests fail on s390 since can_vec_perm_const_p fails and therefore
the bit insert/ref survive which r14-3381-g27de9aa152141e aims for.
Strictly speaking, the tests only fail in case the target supports
vectors, i.e., for targets prior to z13 or in case of -mesa the emulated
vector operations are optimized out.

Set to xfail and tracked by PR114802.
---
 gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c |  4 ++--
 gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c |  4 ++--
 gcc/testsuite/lib/target-supports.exp   | 14 ++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c 
b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
index 7513497f552..0c5233a68f4 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
@@ -10,5 +10,5 @@ vector int g(vector int a)
   return a;
 }
 
-/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" { xfail 
s390_mvx } } } Xfail: PR114802 */
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" { xfail 
s390_mvx } } } Xfail: PR114802 */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c 
b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
index b1e75797a90..a1f08289dd6 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
@@ -11,6 +11,6 @@ vector int g(vector int a, int c)
   return a;
 }
 
-/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" { xfail 
s390_mvx } } } Xfail PR114802 */
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" { xfail 
s390_mvx } } } Xfail PR114802 */
 /* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 0 "optimized" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 3a5713d9869..3a55b2a4159 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -12392,6 +12392,20 @@ proc check_effective_target_profile_update_atomic {} {
 } "-fprofile-update=atomic -fprofile-generate"]
 }
 
+# Return 1 if the target has a vector facility.
+proc check_effective_target_s390_mvx { } {
+if ![istarget s390*-*-*] then {
+   return 0;
+}
+
+return [check_no_compiler_messages_nocache s390_mvx assembly {
+   #if !defined __VX__
+   #error no vector facility.
+   #endif
+   int dummy;
+} [current_compiler_flags]]
+}
+
 # Return 1 if vector (va - vector add) instructions are understood by
 # the assembler and can be executed.  This also covers checking for
 # the VX kernel feature.  A kernel without that feature does not
-- 
2.44.0

[PATCH] s390: testsuite: Fix forwprop-4{0,1}.c

2024-04-21 Thread Stefan Schulze Frielinghaus

The tests fail on s390 since can_vec_perm_const_p fails and therefore
the bit insert/ref survive which r14-3381-g27de9aa152141e aims for.
Strictly speaking, the tests only fail in case the target supports
vectors, i.e., for targets prior to z13 or in case of -mesa the emulated
vector operations are optimized out.

Easiest would be to skip the entire test for s390.  Another solution
would be to xfail in case of vector support hoping that eventually we
end up with an xpass for a future machine generation or if gcc advances.
That is implemented by this patch.  In order to do so I implemented a
new target test s390_mvx which tests whether vector support is available
or not.  Maybe this is already over-engineered for a simple test?  Any
thoughts?
---
 gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c |  4 ++--
 gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c |  4 ++--
 gcc/testsuite/lib/target-supports.exp   | 14 ++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c 
b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
index 7513497f552..b67e3e93a7f 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
@@ -10,5 +10,5 @@ vector int g(vector int a)
   return a;
 }
 
-/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" { xfail 
s390_mvx } } } */
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" { xfail 
s390_mvx } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c 
b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
index b1e75797a90..0f119675207 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
@@ -11,6 +11,6 @@ vector int g(vector int a, int c)
   return a;
 }
 
-/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" { xfail 
s390_mvx } } } */
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" { xfail 
s390_mvx } } } */
 /* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 0 "optimized" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index edce672c0e2..5a692baa8ef 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -12380,6 +12380,20 @@ proc check_effective_target_profile_update_atomic {} {
 } "-fprofile-update=atomic -fprofile-generate"]
 }
 
+# Return 1 if the target has a vector facility.
+proc check_effective_target_s390_mvx { } {
+if ![istarget s390*-*-*] then {
+   return 0;
+}
+
+return [check_no_compiler_messages_nocache s390_mvx assembly {
+   #if !defined __VX__
+   #error no vector facility.
+   #endif
+   int dummy;
+} [current_compiler_flags]]
+}
+
 # Return 1 if vector (va - vector add) instructions are understood by
 # the assembler and can be executed.  This also covers checking for
 # the VX kernel feature.  A kernel without that feature does not
-- 
2.44.0

[PATCH] s390: testsuite: Remove xfail for vpopct{b,h}

2024-04-21 Thread Stefan Schulze Frielinghaus

Starting with r14-9316-g7890836de20912 patterns for vpopct{b,h} are also
detected.  Thus, remove xfails.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vxe/popcount-1.c: Remove xfail.
---
 Ok for mainline?

 gcc/testsuite/gcc.target/s390/vxe/popcount-1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c 
b/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c
index 9ea835a1cf0..25ef354f963 100644
--- a/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c
+++ b/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c
@@ -21,7 +21,7 @@ vpopctb (uv16qi a)
 
   return r;
 }
-/* { dg-final { scan-assembler "vpopctb\t%v24,%v24" { xfail *-*-* } } } */
+/* { dg-final { scan-assembler "vpopctb\t%v24,%v24" } } */
 
 uv8hi __attribute__((noinline))
 vpopcth (uv8hi a)
@@ -34,7 +34,7 @@ vpopcth (uv8hi a)
 
   return r;
 }
-/* { dg-final { scan-assembler "vpopcth\t%v24,%v24" { xfail *-*-* } } } */
+/* { dg-final { scan-assembler "vpopcth\t%v24,%v24" } } */
 
 uv4si __attribute__((noinline))
 vpopctf (uv4si a)
-- 
2.44.0

[PATCH] s390: testsuite: Xfail range-sincos.c and vrp-float-abs-1.c

2024-04-12 Thread Stefan Schulze Frielinghaus

As mentioned in PR114678 those failures will be fixed by
https://gcc.gnu.org/pipermail/gcc-patches/2024-March/648303.html
For GCC 14 just xfail them which should be reverted once the patch is
applied.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/range-sincos.c: Xfail for s390.
* gcc.dg/tree-ssa/vrp-float-abs-1.c: Ditto.
---
 Ok for mainline?

 gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c| 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c 
b/gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c
index 337f9cda02f..35b38c3c914 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c
@@ -40,4 +40,4 @@ stool (double x)
 link_error ();
 }
 
-// { dg-final { scan-tree-dump-not "link_error" "evrp" { target { { *-*-linux* 
} && { glibc } } } } }
+// { dg-final { scan-tree-dump-not "link_error" "evrp" { target { { *-*-linux* 
} && { glibc } } xfail s390*-*-* } } } xfail: PR114678
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c 
b/gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c
index 4b7b75833e0..a814a973963 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c
@@ -14,4 +14,4 @@ foo (double x, double y)
 }
 }
 
-// { dg-final { scan-tree-dump-not "link_error" "evrp" } }
+// { dg-final { scan-tree-dump-not "link_error" "evrp" { xfail s390*-*-* } } } 
xfail: PR114678
-- 
2.43.0

[PATCH] testsuite: Fix loop-interchange-16.c

2024-04-11 Thread Stefan Schulze Frielinghaus

Yes, that works, too.  Will commit.

Thanks,
Stefan

--

Prevent loop unrolling of the innermost loop because otherwise we are
left with no loop interchange for targets like s390 which have a more
aggressive loop unrolling strategy.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/loop-interchange-16.c: Prevent loop unrolling
of the innermost loop.
---
 gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c 
b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c
index 781555e085d..bbcb14f9c6c 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c
@@ -11,6 +11,7 @@ double s231(int iterations)
 //loop with data dependency
 for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) {
 for (int i = 0; i < LEN_2D; ++i) {
+#pragma GCC unroll 0
 for (int j = 1; j < LEN_2D; j++) {
 aa[j][i] = aa[j - 1][i] + bb[j][i];
 }
-- 
2.43.0

[PATCH] s390: testsuite: Fix loop-interchange-16.c

2024-04-11 Thread Stefan Schulze Frielinghaus

Revert parameter max-completely-peel-times to 16, otherwise, the
innermost loop is removed and we are left with no loop interchange which
this test is all about.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/loop-interchange-16.c: Revert parameter
max-completely-peel-times for s390.
---
 Ok for mainline?

 gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c 
b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c
index 781555e085d..2530ec84bc0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c
@@ -1,6 +1,7 @@
 /* PR/101280 */
 /* { dg-do compile } */
 /* { dg-options "-O3 -fdump-tree-linterchange-details" } */
+/* { dg-additional-options "--param max-completely-peel-times=16" { target 
s390*-*-* } } */
 
 void dummy (double *, double *);
 #define LEN_2D 32
-- 
2.43.0

Re: [PATCH] s390x: Optimize vector permute with constant indexes

2024-04-09 Thread Stefan Schulze Frielinghaus

On Tue, Apr 02, 2024 at 09:56:01AM +0200, Juergen Christ wrote:
> Loop vectorizer can generate vector permutes with constant indexes
> where all indexes are equal.  Optimize this case to use vector
> replicate instead of vector permute.
> 
> gcc/ChangeLog:
> 
>   * config/s390/s390.cc (expand_perm_as_replicate): Implement.
>   (vectorize_vec_perm_const_1): Call new function.
>   * config/s390/vx-builtins.md (vec_splat): Change to...
>   (@vec_splat): ...this.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/s390/vector/vec-expand-replicate.c: New test.
> 
> Bootstrapped and regtested on s390x.  Ok for trunk?
> 
> Signed-off-by: Juergen Christ 
> ---
>  gcc/config/s390/s390.cc   | 32 +++
>  gcc/config/s390/vx-builtins.md|  2 +-
>  .../s390/vector/vec-expand-replicate.c| 30 +
>  3 files changed, 63 insertions(+), 1 deletion(-)
>  create mode 100644 
> gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
> 
> diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
> index 372a23244032..4b4014ebe444 100644
> --- a/gcc/config/s390/s390.cc
> +++ b/gcc/config/s390/s390.cc
> @@ -17923,6 +17923,35 @@ expand_perm_as_a_vlbr_vstbr_candidate (const struct 
> expand_vec_perm_d &d)
>return false;
>  }
>  
> +static bool expand_perm_as_replicate (const struct expand_vec_perm_d &d)
   ^~~~
Function names start on a new line.

> +{
> +  unsigned char i;
> +  unsigned char elem;
> +  rtx base = d.op0;
> +  rtx insn;
> +  /* Needed to silence maybe-uninitialized warning.  */
> +  gcc_assert(d.nelt > 0);
 ~~^~~~
Between function name and open bracket whitespace is missing.

Curiously enough, the error is about d which is a reference and cannot
be null.  If you are eager you could reduce this and open a PR.

s390.cc:17935:8: warning: ‘d’ may be used uninitialized [-Wmaybe-uninitialized]
17935 |   elem = d.perm[0];
  |   ~^~~

> +  elem = d.perm[0];
> +  for (i = 1; i < d.nelt; ++i)
> +if (d.perm[i] != elem)
> +  return false;
> +  if (!d.testing_p)
> +{
> +  if (elem >= d.nelt)
> + {
> +   base = d.op1;
> +   elem -= d.nelt;
> + }
> +  insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem));
> +  if (insn == NULL_RTX)
> + return false;
> +  emit_insn (insn);
> +  return true;
> +}
> +  else
> +return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing;
> +}
> +
>  /* Try to find the best sequence for the vector permute operation
> described by D.  Return true if the operation could be
> expanded.  */
> @@ -17941,6 +17970,9 @@ vectorize_vec_perm_const_1 (const struct 
> expand_vec_perm_d &d)
>if (expand_perm_as_a_vlbr_vstbr_candidate (d))
>  return true;
>  
> +  if (expand_perm_as_replicate(d))
 ^~~
Between function name and open bracket whitespace is missing.

> +return true;
> +
>return false;
>  }
>  
> diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
> index 432d81a719fc..93c0d408a43e 100644
> --- a/gcc/config/s390/vx-builtins.md
> +++ b/gcc/config/s390/vx-builtins.md
> @@ -424,7 +424,7 @@
>  
>  
>  ; Replicate from vector element
> -(define_expand "vec_splat"
> +(define_expand "@vec_splat"
>[(set (match_operand:V_HW  0 "register_operand"  "")
>   (vec_duplicate:V_HW (vec_select:
>(match_operand:V_HW 1 "register_operand"  "")
> diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c 
> b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
> new file mode 100644
> index ..27563a00f22b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
> @@ -0,0 +1,30 @@
> +/* Check that the vectorize_vec_perm_const expander correctly deals with
> +   replication.  Extracted from spec "nab".  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */
> +
> +
> +#define REAL_T  double
> +typedef REAL_T  MATRIX_T[ 4 ][ 4 ];
> +
> +int concat_mat_i, concat_mat_j;
> +static void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3);
> +MATRIX_T *rot4p() {
> +  MATRIX_T mat3, mat4;
> +  static MATRIX_T mat5;
> +  concat_mat(mat4, mat3, mat5);
> +}
> +void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3) {
> +  int k;
> +  for (;; concat_mat_i++) {
> +concat_mat_j = 0;
> +for (; 4; concat_mat_j++) {
> +  k = 0;
> +  for (; k < 4; k++)
> +m3[concat_mat_i][concat_mat_j] += m1[concat_mat_i][k];
> +}

Just nitpicking, if we could come up with a test case which does not
involve integer overflows due to non-terminating loops, I would prefer
that.

Cheers,
Stefan

> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "vperm" } } */
> -- 
> 2.39.3
>

[PATCH] testsuite: Fix copy-headers-8.c

2024-03-26 Thread Stefan Schulze Frielinghaus

This fixes the test on s390x.  I'm also seeing test failures for
riscv64-suse-linux-gnu, m68k-unknown-linux-gnu, pru-unknown-elf, and
powerpc64le-unknown-linux-gnu.  However, I didn't check them so this
might or might not fix those, too.

OK for mainline?

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/copy-headers-8.c: Set
LOGICAL_OP_NON_SHORT_CIRCUIT to true.
---
 gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c 
b/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c
index 8b4b5e7ea81..28b4d15d87f 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c
@@ -1,5 +1,8 @@
+/* For targets where LOGICAL_OP_NON_SHORT_CIRCUIT evaluates to false, two
+   conditional jumps are emitted instead of a combined conditional which this
+   test is all about.  Thus, set it to true.  */
 /* { dg-do compile } */
-/* { dg-options "-O2 -fdump-tree-ch2-details" } */
+/* { dg-options "-O2 -fdump-tree-ch2-details --param 
logical-op-non-short-circuit=1" } */
 
 int is_sorted(int *a, int n, int m, int k)
 {
-- 
2.43.0

[PATCH] s390: testsuite: Fix backprop-6.c

2024-03-22 Thread Stefan Schulze Frielinghaus

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/backprop-6.c: On s390 we also have a copysign
optab for long double.  Thus, scan 3 instead of 2 times for it.
---
 OK for mainline?

 gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c 
b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
index 4087ba93018..dbde681e383 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
@@ -27,8 +27,9 @@ TEST_FUNCTION (float, f)
 TEST_FUNCTION (double, )
 TEST_FUNCTION (long double, l)
 
-/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 4 "backprop" { 
target ifn_copysign } } } */
-/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 2 
"backprop" { target ifn_copysign } } } */
-/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 1 
"backprop" { target ifn_copysign } } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 4 "backprop" { 
target { ifn_copysign && { ! { s390*-*-* } } } } } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 2 
"backprop" { target { ifn_copysign && { ! { s390*-*-* } } } } } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 1 
"backprop" { target { ifn_copysign && { ! { s390*-*-* } } } } } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 3 
"backprop" { target { ifn_copysign && s390*-*-* } } } } */
 /* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 6 "backprop" { 
target { ! ifn_copysign } } } } */
 /* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 3 
"backprop" { target { ! ifn_copysign } } } } */
-- 
2.43.0

[PATCH] s390: testsuite: Fix abs-4.c

2024-03-21 Thread Stefan Schulze Frielinghaus

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/abs-4.c: On s390 we also have a copysign optab
for long double.  Thus, scan 3 instead of 2 times for it.
---
 Ok for mainline?

 gcc/testsuite/gcc.dg/tree-ssa/abs-4.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c 
b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
index 80fa448df12..4144d1cd954 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
@@ -10,8 +10,9 @@ long double abs_ld(long double x) { return 
__builtin_signbit(x) ? x : -x; }
 
 /* __builtin_signbit(x) ? x : -x. Should be convert into - ABS_EXP */
 /* { dg-final { scan-tree-dump-not "signbit" "optimized"} } */
-/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "optimized" { target 
ifn_copysign } } } */
-/* { dg-final { scan-tree-dump-times "= -" 1 "optimized" { target ifn_copysign 
} } } */
-/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 2 "optimized" { target 
ifn_copysign } } } */
+/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "optimized" { target { 
ifn_copysign && { ! { s390*-*-* } } } } } } */
+/* { dg-final { scan-tree-dump-times "= -" 1 "optimized" { target { 
ifn_copysign && { ! { s390*-*-* } } } } } } */
+/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 2 "optimized" { target { 
ifn_copysign && { ! { s390*-*-* } } } } } } */
+/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 3 "optimized" { target { 
ifn_copysign && s390*-*-* } } } } */
 /* { dg-final { scan-tree-dump-times "= ABS_EXPR" 3 "optimized" { target { ! 
ifn_copysign } } } } */
 /* { dg-final { scan-tree-dump-times "= -" 3 "optimized" { target { ! 
ifn_copysign } } } } */
-- 
2.43.0

Re: [PATCH] analyzer: Bail out on function pointer for -Wanalyzer-allocation-size

2024-03-21 Thread Stefan Schulze Frielinghaus

On Tue, Mar 19, 2024 at 12:38:34PM -0400, David Malcolm wrote:
> On Tue, 2024-03-19 at 16:10 +0100, Stefan Schulze Frielinghaus wrote:
> > On s390 pr94688.c is failing due to excess error
> > 
> > pr94688.c:6:5: warning: allocated buffer size is not a multiple of
> > the pointee's size [CWE-131] [-Wanalyzer-allocation-size]
> > 
> > This is because on s390 functions are by default aligned to an 8-byte
> > boundary and during function type construction size is set to
> > function
> > boundary.  Thus, for the assignment
> > 
> > a.0_1 = (void (*) ()) &a;
> > 
> > we have that the right-hand side is pointing to a 4-byte memory
> > region
> > whereas the size of the function pointer is 8 byte and a warning is
> > emitted.
> 
> FWIW the test case in question is a regression test for an ICE seen in
> the GCC 10 implementation of the analyzer, which was fixed by the big
> rewrite in r11-2694-g808f4dfeb3a95f.
> 
> So the code in the test doesn't make a great deal of sense.
> 
> > 
> > I could follow and skip this test as done in PR112705, or we could
> > bail
> > out early in the analyzer for function pointers.  My intuition so far
> > is that -Wanalyzer-allocation-size shouldn't care about function
> > pointer.  Therefore, I went for bailing out early.  If you believe
> > this
> > is wrong I can still go by skipping this test on s390.  Any thoughts?
> 
> I tried imagining a situation where we're analyzing a function
> generated at run-time, but it strikes me that the buffer allocated for
> such a function can be of arbitrary size.  So -Wanalyzer-allocation-
> size is meaningless for functions.
> 
> There's probably a case for checking for mismatches between pointers to
> code vs pointers to data (e.g. alignments, Harvard architecture
> machines, etc), but -Wanalyzer-allocation-size doesn't do that.
> 
> So I think your patch is correct.
> 
> OK to push it if it passes bootstrap&regression testing.

Bootstrapped and regtested on x64 and s390x.

Thanks,
Stefan

> 
> Thanks
> Dave
> 
> > ---
> >  gcc/analyzer/region-model.cc | 4 
> >  1 file changed, 4 insertions(+)
> > 
> > diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-
> > model.cc
> > index f079d1fb37e..1b43443d168 100644
> > --- a/gcc/analyzer/region-model.cc
> > +++ b/gcc/analyzer/region-model.cc
> > @@ -3514,6 +3514,10 @@ region_model::check_region_size (const region
> > *lhs_reg, const svalue *rhs_sval,
> >    || TYPE_SIZE_UNIT (pointee_type) == NULL_TREE)
> >  return;
> >  
> > +  /* Bail out early on function pointers.  */
> > +  if (TREE_CODE (pointee_type) == FUNCTION_TYPE)
> > +    return;
> > +
> >    /* Bail out early on pointers to structs where we can
> >   not deduce whether the buffer size is compatible.  */
> >    bool is_struct = RECORD_OR_UNION_TYPE_P (pointee_type);
>

[PATCH] analyzer: Bail out on function pointer for -Wanalyzer-allocation-size

2024-03-19 Thread Stefan Schulze Frielinghaus

On s390 pr94688.c is failing due to excess error

pr94688.c:6:5: warning: allocated buffer size is not a multiple of the 
pointee's size [CWE-131] [-Wanalyzer-allocation-size]

This is because on s390 functions are by default aligned to an 8-byte
boundary and during function type construction size is set to function
boundary.  Thus, for the assignment

a.0_1 = (void (*) ()) &a;

we have that the right-hand side is pointing to a 4-byte memory region
whereas the size of the function pointer is 8 bytes and a warning is
emitted.

I could follow and skip this test as done in PR112705, or we could bail
out early in the analyzer for function pointers.  My intuition so far
is that -Wanalyzer-allocation-size shouldn't care about function
pointers.  Therefore, I went for bailing out early.  If you believe this
is wrong I can still go by skipping this test on s390.  Any thoughts?
---
 gcc/analyzer/region-model.cc | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc
index f079d1fb37e..1b43443d168 100644
--- a/gcc/analyzer/region-model.cc
+++ b/gcc/analyzer/region-model.cc
@@ -3514,6 +3514,10 @@ region_model::check_region_size (const region *lhs_reg, 
const svalue *rhs_sval,
   || TYPE_SIZE_UNIT (pointee_type) == NULL_TREE)
 return;
 
+  /* Bail out early on function pointers.  */
+  if (TREE_CODE (pointee_type) == FUNCTION_TYPE)
+return;
+
   /* Bail out early on pointers to structs where we can
  not deduce whether the buffer size is compatible.  */
   bool is_struct = RECORD_OR_UNION_TYPE_P (pointee_type);
-- 
2.43.0

Re: RFC: New mechanism for hard reg operands to inline asm

2024-03-15 Thread Stefan Schulze Frielinghaus

On Fri, Jun 04, 2021 at 06:02:27PM +0000, Andreas Krebbel via Gcc wrote:
> Hi,
> 
> I wonder if we could replace the register asm construct for
> inline assemblies with something a bit nicer and more obvious.
> E.g. turning this (real world example from IBM Z kernel code):
> 
> int diag8_response(int cmdlen, char *response, int *rlen)
> {
> register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf;
> register unsigned long reg3 asm ("3") = (addr_t) response;
> register unsigned long reg4 asm ("4") = cmdlen | 0x4000L;
> register unsigned long reg5 asm ("5") = *rlen; /* <-- */
> asm volatile(
> "   diag%2,%0,0x8\n"
> "   brc 8,1f\n"
> "   agr %1,%4\n"
> "1:\n"
> : "+d" (reg4), "+d" (reg5)
> : "d" (reg2), "d" (reg3), "d" (*rlen): "cc");
> *rlen = reg5;
> return reg4;
> }
> 
> into this:
> 
> int diag8_response(int cmdlen, char *response, int *rlen)
> {
> unsigned long len = cmdlen | 0x4000L;
> 
> asm volatile(
> "   diag%2,%0,0x8\n"
> "   brc 8,1f\n"
> "   agr %1,%4\n"
> "1:\n"
> : "+{r4}" (len), "+{r5}" (*rlen)
> : "{r2}" ((addr_t)cpcmd_buf), "{r3}" ((addr_t)response), "d" 
> (*rlen): "cc");
> return len;
> }
> 
> Apart from being much easier to read because the hard regs become part
> of the inline assembly it solves also a couple of other issues:
> 
> - function calls might clobber register asm variables see BZ100908
> - the constraints for the register asm operands are superfluous
> - one register asm variable cannot be used for 2 different inline
>   assemblies if the value is expected in different hard regs
> 
> I've started with a hackish implementation for IBM Z using the
> TARGET_MD_ASM_ADJUST hook and let all the places parsing constraints
> skip over the {} parts.  But perhaps it would be useful to make this a
> generic mechanism for all targets?!
> 
> Andreas

Hi all,

I would like to resurrect this topic
https://gcc.gnu.org/pipermail/gcc/2021-June/236269.html and have been
coming up with a first implementation in order to discuss this further.

Basically, I see two ways to implement this.  First is by letting LRA
assign the registers and the second one by introducing extra moves just
before/after asm statements.  Currently I went for the latter and emit
extra moves during expand into hard regs as specified by the
input/output constraints.

Before going forward I would like to get some feedback whether this approach
makes sense to you at all or whether you see some show stoppers.  I was
wondering whether my current approach is robust enough in the sense that no
other pass could potentially remove the extra moves I introduced before.
In particular I was first worried about code motion.  Initially I thought I
have to make use not only of hard regs but hard regs which are flagged as
register-asms in order to prevent optimizations from fiddling around with those
moves.  However, after some more investigation I tend to conclude that this is
not necessary.  Any thoughts about this approach?

With the current approach I can at least handle cases like:

int __attribute__ ((noipa))
foo (int x) { return x; }

int test (int x)
{
  asm ("foo %0,%1\n" :: "{r3}" (foo (x + 1)), "{r2}" (x));
  return x;
}

Note, this is written with the s390 ABI in mind where the first int argument
and return value are passed in register r2.  The point here is that r2 needs to
be altered and restored multiple times until we reach } of function test().
Luckily, during expand we get all this basically for free.

This brings me to the general question what should be allowed and what not?
Evaluation order of input expressions is probably unspecified similar to
function arguments.  However, what about this one:

int test (int x)
{
  register int y asm ("r5") = x + 1;
  asm ("foo %0,%1\n" : "={r4}" (y) : "{r1}" (y));
  return y;
}

IMHO the input is just fine but the output constraint is misleading and it is
not obvious in which register variable y resides after the asm statement.
With my current implementation, where I don't bail out, it is register r4
contrary to the decl.  Interestingly, the other way around where one register
is "aliased" by multiple variables is accepted by vanilla GCC:

int foo (int x, int y)
{
  register int a asm ("r1") = x;
  register int b asm ("r1") = y;
  return a + b;
}

Though, probably not intentionally.

Cheers,
Stefan

Re: [PATCH] s390: Fix test vector/long-double-to-i64.c

2024-03-12 Thread Stefan Schulze Frielinghaus

On Mon, Mar 11, 2024 at 11:14:04AM +0100, Andreas Krebbel wrote:
> On 2/29/24 13:15, Stefan Schulze Frielinghaus wrote:
> > Starting with r14-8319-g86de9b66480b71 fwprop improved so that vpdi is
> > no longer required.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > * gcc.target/s390/vector/long-double-to-i64.c: Fix scan
> > assembler directive.
> 
> Should we perhaps rather turn the scan-assembler directives into something 
> which checks for the
> absence of vpdi then? In order to get notified once this really useful 
> optimization breaks?

I thought about checking for the most optimal code which would be just
two loads and a convert instruction.  Thus if this fails, then we have a
regression.  Speaking of regressions, the old behaviour was restored by
r14-9412-g3e3e4156a5f93e which means we are back using vpdi.  Thus, I
will leave this patch on hold and have a second look.

Cheers,
Stefan

> 
> Andreas
> 
> > ---
> >  .../gcc.target/s390/vector/long-double-to-i64.c | 13 +
> >  1 file changed, 9 insertions(+), 4 deletions(-)
> > 
> > diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c 
> > b/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c
> > index 2dbbb5d1c03..ed89878e6ee 100644
> > --- a/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c
> > +++ b/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c
> > @@ -1,19 +1,24 @@
> >  /* { dg-do compile } */
> >  /* { dg-options "-O3 -march=z14 -mzarch --save-temps" } */
> >  /* { dg-do run { target { s390_z14_hw } } } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } } } */
> > +
> >  #include 
> >  #include 
> >  
> > +/*
> > +** long_double_to_i64:
> > +** ld  %f0,0\(%r2\)
> > +** ld  %f2,8\(%r2\)
> > +** cgxbr   %r2,5,%f0
> > +** br  %r14
> > +*/
> >  __attribute__ ((noipa)) static int64_t
> >  long_double_to_i64 (long double x)
> >  {
> >return x;
> >  }
> >  
> > -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,1\n} 1 } 
> > } */
> > -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,5\n} 1 } 
> > } */
> > -/* { dg-final { scan-assembler-times {\n\tcgxbr\t} 1 } } */
> > -
> >  int
> >  main (void)
> >  {
>

Re: [PATCH v3] RISC-V: Introduce gcc attribute riscv_rvv_vector_bits for RVV

2024-03-12 Thread Stefan O'Rear

On Tue, Mar 12, 2024, at 2:15 AM, pan2...@intel.com wrote:
> From: Pan Li 
>
> Update in v3:
> * Add pre-defined __riscv_v_fixed_vlen when zvl.
>
> Update in v2:
> * Cleanup some unused code.
> * Fix some typo of commit log.
>
> Original log:
>
> This patch would like to introduce one new gcc attribute for RVV.
> This attribute is used to define fixed-length variants of one
> existing sizeless RVV types.
>
> This attribute is valid if and only if the mrvv-vector-bits=zvl, the only
> one args should be the integer constant and its' value is terminated
> by the LMUL and the vector register bits in zvl*b.  For example:
>
> typedef vint32m2_t fixed_vint32m2_t 
> __attribute__((riscv_rvv_vector_bits(128)));
>
> The above type define is valid when -march=rv64gc_zve64d_zvl64b
> (aka 2(m2) * 64 = 128 for vin32m2_t), and will report error when
> -march=rv64gcv_zvl128b similar to below.
>
> "error: invalid RVV vector size '128', expected size is '256' based on
> LMUL of type and '-mrvv-vector-bits=zvl'"
>
> Meanwhile, a pre-define macro __riscv_v_fixed_vlen is introduced to
> represent the fixed vlen in a RVV vector register.

Shouldn't a major user-facing change like this be discussed in a PR against
https://github.com/riscv-non-isa/riscv-c-api-doc/ or
https://github.com/riscv-non-isa/rvv-intrinsic-doc before or concurrent with
compiler implementation?

-s

> For the vint*m*_t below operations are allowed.
> * The sizeof.
> * The global variable(s).
> * The element of union and struct.
> * The cast to other equalities.
> * CMP: >, <, ==, !=, <=, >=
> * ALU: +, -, *, /, %, &, |, ^, >>, <<, ~, -
>
> For the vfloat*m*_t below operations are allowed.
> * The sizeof.
> * The global variable(s).
> * The element of union and struct.
> * The cast to other equalities.
> * CMP: >, <, ==, !=, <=, >=
> * ALU: +, -, *, /, -
>
> For the vbool*_t types only below operations are allowed except
> the CMP and ALU. The CMP and ALU operations on vbool*_t is not
> well defined currently.
> * The sizeof.
> * The global variable(s).
> * The element of union and struct.
> * The cast to other equalities.
>
> For the vint*x*m*_t tuple types are not suppored in this patch
> which is compatible with clang.
>
> This patch passed the below testsuites.
> * The riscv fully regression tests.
>
> gcc/ChangeLog:
>
>   * config/riscv/riscv-c.cc (riscv_cpu_cpp_builtins): Add pre-define
>   macro __riscv_v_fixed_vlen when zvl.
>   * config/riscv/riscv.cc (riscv_handle_rvv_vector_bits_attribute):
>   New static func to take care of the RVV types decorated by
>   the attributes.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-1.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-10.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-11.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-12.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-13.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-14.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-15.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-16.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-17.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-2.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-3.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-4.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-5.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-6.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-7.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-8.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-9.c: New test.
>   * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits.h: New test.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/config/riscv/riscv-c.cc   |   3 +
>  gcc/config/riscv/riscv.cc |  87 +-
>  .../riscv/rvv/base/riscv_rvv_vector_bits-1.c  |   6 +
>  .../riscv/rvv/base/riscv_rvv_vector_bits-10.c |  53 +
>  .../riscv/rvv/base/riscv_rvv_vector_bits-11.c |  76 
>  .../riscv/rvv/base/riscv_rvv_vector_bits-12.c |  14 +++
>  .../riscv/rvv/base/riscv_rvv_vector_bits-13.c |  10 ++
>  .../riscv/rvv/base/riscv_rvv_vector_bits-14.c |  10 ++
>  .../riscv/rvv/base/riscv_rvv_vector_bits-15.c |  10 ++
>  .../riscv/rvv/base/riscv_rvv_vector_bits-16.c |  11 ++
>  .../riscv/rvv/base/riscv_rvv_vector_bits-17.c |  10 ++
>  .../riscv/rvv/base/riscv_rvv_vector_bits-2.c  |   6 +
>  .../riscv/rvv/base/riscv_rvv_vector_bits-3.c  |   6 +
>  .../riscv/rvv/base/riscv_rvv_vector_bits-4.c  |   6 +
>  .../riscv/rvv/base/riscv_rvv_vector_bits-5.c  |   6 +
>  .../riscv/rvv/base/riscv_rvv_vector_bits-6.c  |   6 +
>  .../riscv/rvv/base/riscv_rvv_vector_bits-7.c  |  76

Re: [PATCH] s390: Streamline NNPA builtins with POP mnemonics

2024-03-06 Thread Stefan Schulze Frielinghaus

Since there is no straightforward way to introduce an overload with
different return types where we would expand differently depending on an
immediate operand, let's drop this patch.

On Fri, Mar 01, 2024 at 04:18:31PM +0100, Stefan Schulze Frielinghaus wrote:
> At the moment there are no extended mnemonics for vclfn(h,l) and vcrnf
> defined in the Principles of Operation.  Thus, remove the suffix "s"
> from the builtins and expanders and introduce a further operand for the
> data type.
> 
> gcc/ChangeLog:
> 
>   * config/s390/s390-builtin-types.def: Update to reflect latest
>   changes.
>   * config/s390/s390-builtins.def: Remove suffix s from
>   s390_vclfn(h,l)s and s390_vcrnfs.
>   * config/s390/s390.md: Similar, remove suffix s from unspec
>   definitions.
>   * config/s390/vecintrin.h (vec_extend_to_fp32_hi): Redefine.
>   (vec_extend_to_fp32_lo): Redefine.
>   (vec_round_from_fp32): Redefine.
>   * config/s390/vx-builtins.md (vclfnhs_v8hi): Remove suffix s.
>   (vclfnh_v8hi): Add with extra operand.
>   (vclfnls_v8hi): Remove suffix s.
>   (vclfnl_v8hi): Add with extra operand.
>   (vcrnfs_v8hi): Remove suffix s.
>   (vcrnf_v8hi): Add with extra operand.
> ---
> OK for mainline?
> 
>  gcc/config/s390/s390-builtin-types.def |  4 ++--
>  gcc/config/s390/s390-builtins.def  |  6 +++---
>  gcc/config/s390/s390.md|  6 +++---
>  gcc/config/s390/vecintrin.h|  6 +++---
>  gcc/config/s390/vx-builtins.md | 27 ++
>  5 files changed, 26 insertions(+), 23 deletions(-)
> 
> diff --git a/gcc/config/s390/s390-builtin-types.def 
> b/gcc/config/s390/s390-builtin-types.def
> index ce51ae8cd3f..c3d09b42835 100644
> --- a/gcc/config/s390/s390-builtin-types.def
> +++ b/gcc/config/s390/s390-builtin-types.def
> @@ -273,7 +273,6 @@ DEF_FN_TYPE_2 (BT_FN_V2DI_V2DF_V2DF, BT_V2DI, BT_V2DF, 
> BT_V2DF)
>  DEF_FN_TYPE_2 (BT_FN_V2DI_V2DI_V2DI, BT_V2DI, BT_V2DI, BT_V2DI)
>  DEF_FN_TYPE_2 (BT_FN_V2DI_V4SI_V4SI, BT_V2DI, BT_V4SI, BT_V4SI)
>  DEF_FN_TYPE_2 (BT_FN_V4SF_FLT_INT, BT_V4SF, BT_FLT, BT_INT)
> -DEF_FN_TYPE_2 (BT_FN_V4SF_UV8HI_UINT, BT_V4SF, BT_UV8HI, BT_UINT)
>  DEF_FN_TYPE_2 (BT_FN_V4SF_V4SF_UCHAR, BT_V4SF, BT_V4SF, BT_UCHAR)
>  DEF_FN_TYPE_2 (BT_FN_V4SF_V4SF_V4SF, BT_V4SF, BT_V4SF, BT_V4SF)
>  DEF_FN_TYPE_2 (BT_FN_V4SI_BV4SI_V4SI, BT_V4SI, BT_BV4SI, BT_V4SI)
> @@ -324,7 +323,6 @@ DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_USHORT_INT, BT_UV8HI, 
> BT_UV8HI, BT_USHORT, BT_I
>  DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_INT, BT_UV8HI, BT_UV8HI, BT_UV8HI, 
> BT_INT)
>  DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_INTPTR, BT_UV8HI, BT_UV8HI, BT_UV8HI, 
> BT_INTPTR)
>  DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI, 
> BT_UV8HI)
> -DEF_FN_TYPE_3 (BT_FN_UV8HI_V4SF_V4SF_UINT, BT_UV8HI, BT_V4SF, BT_V4SF, 
> BT_UINT)
>  DEF_FN_TYPE_3 (BT_FN_V16QI_UV16QI_UV16QI_INTPTR, BT_V16QI, BT_UV16QI, 
> BT_UV16QI, BT_INTPTR)
>  DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_INTPTR, BT_V16QI, BT_V16QI, BT_V16QI, 
> BT_INTPTR)
>  DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_V16QI, BT_V16QI, BT_V16QI, BT_V16QI, 
> BT_V16QI)
> @@ -340,6 +338,7 @@ DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_INT_INTPTR, BT_V2DI, 
> BT_V2DF, BT_INT, BT_INTPTR)
>  DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_V2DF_INTPTR, BT_V2DI, BT_V2DF, BT_V2DF, 
> BT_INTPTR)
>  DEF_FN_TYPE_3 (BT_FN_V2DI_V2DI_V2DI_INTPTR, BT_V2DI, BT_V2DI, BT_V2DI, 
> BT_INTPTR)
>  DEF_FN_TYPE_3 (BT_FN_V2DI_V4SI_V4SI_V2DI, BT_V2DI, BT_V4SI, BT_V4SI, BT_V2DI)
> +DEF_FN_TYPE_3 (BT_FN_V4SF_UV8HI_UINT_UINT, BT_V4SF, BT_UV8HI, BT_UINT, 
> BT_UINT)
>  DEF_FN_TYPE_3 (BT_FN_V4SF_V2DF_INT_INT, BT_V4SF, BT_V2DF, BT_INT, BT_INT)
>  DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_FLT_INT, BT_V4SF, BT_V4SF, BT_FLT, BT_INT)
>  DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_UCHAR_UCHAR, BT_V4SF, BT_V4SF, BT_UCHAR, 
> BT_UCHAR)
> @@ -377,6 +376,7 @@ DEF_FN_TYPE_4 
> (BT_FN_UV4SI_UV4SI_UV4SI_UINTCONSTPTR_UCHAR, BT_UV4SI, BT_UV4SI, B
>  DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI_INT, BT_UV4SI, BT_UV4SI, 
> BT_UV4SI, BT_UV4SI, BT_INT)
>  DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_INT_INTPTR, BT_UV8HI, BT_UV8HI, 
> BT_UV8HI, BT_INT, BT_INTPTR)
>  DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI_INT, BT_UV8HI, BT_UV8HI, 
> BT_UV8HI, BT_UV8HI, BT_INT)
> +DEF_FN_TYPE_4 (BT_FN_UV8HI_V4SF_V4SF_UINT_UINT, BT_UV8HI, BT_V4SF, BT_V4SF, 
> BT_UINT, BT_UINT)
>  DEF_FN_TYPE_4 (BT_FN_VOID_UV2DI_UV2DI_ULONGLONGPTR_ULONGLONG, BT_VOID, 
> BT_UV2DI, BT_UV2DI, BT_ULONGLONGPTR, BT_ULONGLONG)
>  DEF_FN_TYPE_4 (BT_FN_VOID_UV4SI_UV4SI_UINTPTR_ULONGLONG, BT_VOID, BT_UV4SI, 
> BT_UV4SI, BT_UINTPTR, BT_ULONGLONG)
>  DEF_FN_TYPE_4 (BT_FN_VOID_V4SI_V4SI_INTPTR_ULONGLONG, BT_VOID, BT_V4SI, 
> BT_V4SI, BT_I

[PATCH] s390: Deprecate some vector builtins

2024-03-01 Thread Stefan Schulze Frielinghaus

According to IBM Open XL C/C++ for z/OS version 1.1 builtins

- vec_permi
- vec_ctd
- vec_ctsl
- vec_ctul
- vec_ld2f
- vec_st2f

are deprecated.  Also deprecate helper builtins vec_ctd_s64 and
vec_ctd_u64.

Furthermore, the overloads of vec_insert which make use of a bool vector
are deprecated, too.

gcc/ChangeLog:

* config/s390/s390-builtins.def (vec_permi): Deprecate.
(vec_ctd): Deprecate.
(vec_ctd_s64): Deprecate.
(vec_ctd_u64): Deprecate.
(vec_ctsl): Deprecate.
(vec_ctul): Deprecate.
(vec_ld2f): Deprecate.
(vec_st2f): Deprecate.
(vec_insert): Deprecate overloads with bool vectors.
---
 Ok for mainline?

 gcc/config/s390/s390-builtins.def | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/gcc/config/s390/s390-builtins.def 
b/gcc/config/s390/s390-builtins.def
index 680a038fa4b..54f400ceb5a 100644
--- a/gcc/config/s390/s390-builtins.def
+++ b/gcc/config/s390/s390-builtins.def
@@ -416,16 +416,16 @@ B_DEF  (s390_vec_splat_s64, vec_splatsv2di,   
  0,
 OB_DEF (s390_vec_insert,s390_vec_insert_s8, 
s390_vec_insert_dbl,B_VX,   BT_FN_OV4SI_INT_OV4SI_INT)
 OB_DEF_VAR (s390_vec_insert_s8, s390_vlvgb, 0, 
 O3_ELEM,BT_OV_V16QI_SCHAR_V16QI_INT)
 OB_DEF_VAR (s390_vec_insert_u8, s390_vlvgb, 0, 
 O3_ELEM,BT_OV_UV16QI_UCHAR_UV16QI_INT)
-OB_DEF_VAR (s390_vec_insert_b8, s390_vlvgb, 0, 
 O3_ELEM,BT_OV_UV16QI_UCHAR_BV16QI_INT)
+OB_DEF_VAR (s390_vec_insert_b8, s390_vlvgb, B_DEP, 
 O3_ELEM,BT_OV_UV16QI_UCHAR_BV16QI_INT)
 OB_DEF_VAR (s390_vec_insert_s16,s390_vlvgh, 0, 
 O3_ELEM,BT_OV_V8HI_SHORT_V8HI_INT)
 OB_DEF_VAR (s390_vec_insert_u16,s390_vlvgh, 0, 
 O3_ELEM,BT_OV_UV8HI_USHORT_UV8HI_INT)
-OB_DEF_VAR (s390_vec_insert_b16,s390_vlvgh, 0, 
 O3_ELEM,BT_OV_UV8HI_USHORT_BV8HI_INT)
+OB_DEF_VAR (s390_vec_insert_b16,s390_vlvgh, B_DEP, 
 O3_ELEM,BT_OV_UV8HI_USHORT_BV8HI_INT)
 OB_DEF_VAR (s390_vec_insert_s32,s390_vlvgf, 0, 
 O3_ELEM,BT_OV_V4SI_INT_V4SI_INT)
 OB_DEF_VAR (s390_vec_insert_u32,s390_vlvgf, 0, 
 O3_ELEM,BT_OV_UV4SI_UINT_UV4SI_INT)
-OB_DEF_VAR (s390_vec_insert_b32,s390_vlvgf, 0, 
 O3_ELEM,BT_OV_UV4SI_UINT_BV4SI_INT)
+OB_DEF_VAR (s390_vec_insert_b32,s390_vlvgf, B_DEP, 
 O3_ELEM,BT_OV_UV4SI_UINT_BV4SI_INT)
 OB_DEF_VAR (s390_vec_insert_s64,s390_vlvgg, 0, 
 O3_ELEM,BT_OV_V2DI_LONGLONG_V2DI_INT)
 OB_DEF_VAR (s390_vec_insert_u64,s390_vlvgg, 0, 
 O3_ELEM,BT_OV_UV2DI_ULONGLONG_UV2DI_INT)
-OB_DEF_VAR (s390_vec_insert_b64,s390_vlvgg, 0, 
 O3_ELEM,BT_OV_UV2DI_ULONGLONG_BV2DI_INT)
+OB_DEF_VAR (s390_vec_insert_b64,s390_vlvgg, B_DEP, 
 O3_ELEM,BT_OV_UV2DI_ULONGLONG_BV2DI_INT)
 OB_DEF_VAR (s390_vec_insert_flt,s390_vlvgf_flt, B_VXE, 
 O3_ELEM,BT_OV_V4SF_FLT_V4SF_INT) /* vlvgf */
 OB_DEF_VAR (s390_vec_insert_dbl,s390_vlvgg_dbl, 0, 
 O3_ELEM,BT_OV_V2DF_DBL_V2DF_INT) /* vlvgg */
 
@@ -658,7 +658,7 @@ OB_DEF_VAR (s390_vec_perm_dbl,  s390_vperm, 
0,
 
 B_DEF  (s390_vperm, vec_permv16qi,  0, 
 B_VX,   0,  BT_FN_UV16QI_UV16QI_UV16QI_UV16QI)
 
-OB_DEF (s390_vec_permi, s390_vec_permi_s64, 
s390_vec_permi_dbl, B_VX,   BT_FN_OV4SI_OV4SI_OV4SI_INT)
+OB_DEF (s390_vec_permi, s390_vec_permi_s64, 
s390_vec_permi_dbl, B_DEP | B_VX,   BT_FN_OV4SI_OV4SI_OV4SI_INT)
 OB_DEF_VAR (s390_vec_permi_s64, s390_vpdi,  0, 
 O3_U2,  BT_OV_V2DI_V2DI_V2DI_INT)
 OB_DEF_VAR (s390_vec_permi_b64, s390_vpdi,  0, 
 O3_U2,  BT_OV_BV2DI_BV2DI_BV2DI_INT)
 OB_DEF_VAR (s390_vec_permi_u64, s390_vpdi,  0, 
 O3_U2,  BT_OV_UV2DI_UV2DI_UV2DI_INT)
@@ -2806,7 +2806,7 @@ OB_DEF (s390_vec_any_ngt,   
s390_vec_any_ngt_flt,s390_vec_any_ngt_db
 OB_DEF_VAR (s390_vec_any_ngt_flt,   vec_any_unlev4sf,   B_VXE, 
 0,  BT_OV_INT_V4SF_V4SF)
 OB_DEF_VAR (s390_vec_any_ngt_dbl,   vec_any_unlev2df,   0, 
 0,  BT_OV_INT_V2DF_V2DF)
 
-OB_DEF (s390_vec_ctd,   s390_vec_ctd_s64,   s390_vec_ctd_u64,  
 B_VX,   BT_FN

[PATCH] s390: Streamline NNPA builtins with POP mnemonics

2024-03-01 Thread Stefan Schulze Frielinghaus

At the moment there are no extended mnemonics for vclfn(h,l) and vcrnf
defined in the Principles of Operation.  Thus, remove the suffix "s"
from the builtins and expanders and introduce a further operand for the
data type.

gcc/ChangeLog:

* config/s390/s390-builtin-types.def: Update to reflect latest
changes.
* config/s390/s390-builtins.def: Remove suffix s from
s390_vclfn(h,l)s and s390_vcrnfs.
* config/s390/s390.md: Similarly, remove suffix s from unspec
definitions.
* config/s390/vecintrin.h (vec_extend_to_fp32_hi): Redefine.
(vec_extend_to_fp32_lo): Redefine.
(vec_round_from_fp32): Redefine.
* config/s390/vx-builtins.md (vclfnhs_v8hi): Remove suffix s.
(vclfnh_v8hi): Add with extra operand.
(vclfnls_v8hi): Remove suffix s.
(vclfnl_v8hi): Add with extra operand.
(vcrnfs_v8hi): Remove suffix s.
(vcrnf_v8hi): Add with extra operand.
---
OK for mainline?

 gcc/config/s390/s390-builtin-types.def |  4 ++--
 gcc/config/s390/s390-builtins.def  |  6 +++---
 gcc/config/s390/s390.md|  6 +++---
 gcc/config/s390/vecintrin.h|  6 +++---
 gcc/config/s390/vx-builtins.md | 27 +++++++++++++++------------
 5 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/gcc/config/s390/s390-builtin-types.def 
b/gcc/config/s390/s390-builtin-types.def
index ce51ae8cd3f..c3d09b42835 100644
--- a/gcc/config/s390/s390-builtin-types.def
+++ b/gcc/config/s390/s390-builtin-types.def
@@ -273,7 +273,6 @@ DEF_FN_TYPE_2 (BT_FN_V2DI_V2DF_V2DF, BT_V2DI, BT_V2DF, 
BT_V2DF)
 DEF_FN_TYPE_2 (BT_FN_V2DI_V2DI_V2DI, BT_V2DI, BT_V2DI, BT_V2DI)
 DEF_FN_TYPE_2 (BT_FN_V2DI_V4SI_V4SI, BT_V2DI, BT_V4SI, BT_V4SI)
 DEF_FN_TYPE_2 (BT_FN_V4SF_FLT_INT, BT_V4SF, BT_FLT, BT_INT)
-DEF_FN_TYPE_2 (BT_FN_V4SF_UV8HI_UINT, BT_V4SF, BT_UV8HI, BT_UINT)
 DEF_FN_TYPE_2 (BT_FN_V4SF_V4SF_UCHAR, BT_V4SF, BT_V4SF, BT_UCHAR)
 DEF_FN_TYPE_2 (BT_FN_V4SF_V4SF_V4SF, BT_V4SF, BT_V4SF, BT_V4SF)
 DEF_FN_TYPE_2 (BT_FN_V4SI_BV4SI_V4SI, BT_V4SI, BT_BV4SI, BT_V4SI)
@@ -324,7 +323,6 @@ DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_USHORT_INT, BT_UV8HI, 
BT_UV8HI, BT_USHORT, BT_I
 DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_INT, BT_UV8HI, BT_UV8HI, BT_UV8HI, 
BT_INT)
 DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_INTPTR, BT_UV8HI, BT_UV8HI, BT_UV8HI, 
BT_INTPTR)
 DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI, 
BT_UV8HI)
-DEF_FN_TYPE_3 (BT_FN_UV8HI_V4SF_V4SF_UINT, BT_UV8HI, BT_V4SF, BT_V4SF, BT_UINT)
 DEF_FN_TYPE_3 (BT_FN_V16QI_UV16QI_UV16QI_INTPTR, BT_V16QI, BT_UV16QI, 
BT_UV16QI, BT_INTPTR)
 DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_INTPTR, BT_V16QI, BT_V16QI, BT_V16QI, 
BT_INTPTR)
 DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_V16QI, BT_V16QI, BT_V16QI, BT_V16QI, 
BT_V16QI)
@@ -340,6 +338,7 @@ DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_INT_INTPTR, BT_V2DI, 
BT_V2DF, BT_INT, BT_INTPTR)
 DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_V2DF_INTPTR, BT_V2DI, BT_V2DF, BT_V2DF, 
BT_INTPTR)
 DEF_FN_TYPE_3 (BT_FN_V2DI_V2DI_V2DI_INTPTR, BT_V2DI, BT_V2DI, BT_V2DI, 
BT_INTPTR)
 DEF_FN_TYPE_3 (BT_FN_V2DI_V4SI_V4SI_V2DI, BT_V2DI, BT_V4SI, BT_V4SI, BT_V2DI)
+DEF_FN_TYPE_3 (BT_FN_V4SF_UV8HI_UINT_UINT, BT_V4SF, BT_UV8HI, BT_UINT, BT_UINT)
 DEF_FN_TYPE_3 (BT_FN_V4SF_V2DF_INT_INT, BT_V4SF, BT_V2DF, BT_INT, BT_INT)
 DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_FLT_INT, BT_V4SF, BT_V4SF, BT_FLT, BT_INT)
 DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_UCHAR_UCHAR, BT_V4SF, BT_V4SF, BT_UCHAR, 
BT_UCHAR)
@@ -377,6 +376,7 @@ DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_UINTCONSTPTR_UCHAR, 
BT_UV4SI, BT_UV4SI, B
 DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI_INT, BT_UV4SI, BT_UV4SI, 
BT_UV4SI, BT_UV4SI, BT_INT)
 DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_INT_INTPTR, BT_UV8HI, BT_UV8HI, 
BT_UV8HI, BT_INT, BT_INTPTR)
 DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI_INT, BT_UV8HI, BT_UV8HI, 
BT_UV8HI, BT_UV8HI, BT_INT)
+DEF_FN_TYPE_4 (BT_FN_UV8HI_V4SF_V4SF_UINT_UINT, BT_UV8HI, BT_V4SF, BT_V4SF, 
BT_UINT, BT_UINT)
 DEF_FN_TYPE_4 (BT_FN_VOID_UV2DI_UV2DI_ULONGLONGPTR_ULONGLONG, BT_VOID, 
BT_UV2DI, BT_UV2DI, BT_ULONGLONGPTR, BT_ULONGLONG)
 DEF_FN_TYPE_4 (BT_FN_VOID_UV4SI_UV4SI_UINTPTR_ULONGLONG, BT_VOID, BT_UV4SI, 
BT_UV4SI, BT_UINTPTR, BT_ULONGLONG)
 DEF_FN_TYPE_4 (BT_FN_VOID_V4SI_V4SI_INTPTR_ULONGLONG, BT_VOID, BT_V4SI, 
BT_V4SI, BT_INTPTR, BT_ULONGLONG)
diff --git a/gcc/config/s390/s390-builtins.def 
b/gcc/config/s390/s390-builtins.def
index 02ff516c677..0d4e20ea425 100644
--- a/gcc/config/s390/s390-builtins.def
+++ b/gcc/config/s390/s390-builtins.def
@@ -3025,10 +3025,10 @@ B_DEF  (s390_vstrszf,vstrszv4si,
0,
 
 /* arch 14 builtins */
 
-B_DEF  (s390_vclfnhs,vclfnhs_v8hi,  0, 
 B_NNPA, O2_U4,  BT_FN_V4SF_UV8HI_UINT)
-B_DEF  (s390_vclfnls,vclfnls_v8hi,  0, 
 B_NNPA, O2_U4,  BT_FN_V4SF_UV8HI_UINT)
+B_DEF  (s390_vclfnh, vclfnh_v8hi,   0, 
 B

[PATCH] s390: Streamline vector builtins with LLVM

2024-03-01 Thread Stefan Schulze Frielinghaus

Similar to s390_lcbb, s390_vll, s390_vstl, et al., make use of a
signed vector type for vlbb.  Furthermore, use a const void pointer for
the address and an integer for the mask, since both seem more common.

For s390_vfi(s,d)b make use of integers for masks, too.

Use unsigned integers for all s390_vlbr/vstbr variants.

Make use of type UV16QI for the length operand of s390_vstrs(,z)(h,f).

Following the Principles of Operation, change from signed to unsigned
type for s390_va(c,cc,ccc)q and s390_vs(,c,bc)biq and s390_vmslg.

Make use of scalar type UINT128 instead of UV16QI for s390_vgfm(,a)g,
and s390_vsumq(f,g).

Ok for mainline?

gcc/ChangeLog:

* config/s390/s390-builtin-types.def: Update to reflect latest
changes.
* config/s390/s390-builtins.def: Streamline vector builtins with
LLVM.
---
 gcc/config/s390/s390-builtin-types.def | 29 +++-
 gcc/config/s390/s390-builtins.def  | 48 +-
 2 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/gcc/config/s390/s390-builtin-types.def 
b/gcc/config/s390/s390-builtin-types.def
index 556104e0e23..ce51ae8cd3f 100644
--- a/gcc/config/s390/s390-builtin-types.def
+++ b/gcc/config/s390/s390-builtin-types.def
@@ -58,6 +58,7 @@ DEF_TYPE (BT_FLT, float_type_node, 0)
 DEF_TYPE (BT_FLTCONST, float_type_node, 1)
 DEF_TYPE (BT_INT, integer_type_node, 0)
 DEF_TYPE (BT_INT128, intTI_type_node, 0)
+DEF_TYPE (BT_INT128CONST, intTI_type_node, 1)
 DEF_TYPE (BT_INTCONST, integer_type_node, 1)
 DEF_TYPE (BT_LONG, long_integer_type_node, 0)
 DEF_TYPE (BT_LONGLONG, long_long_integer_type_node, 0)
@@ -69,6 +70,8 @@ DEF_TYPE (BT_SHORTCONST, short_integer_type_node, 1)
 DEF_TYPE (BT_UCHAR, unsigned_char_type_node, 0)
 DEF_TYPE (BT_UCHARCONST, unsigned_char_type_node, 1)
 DEF_TYPE (BT_UINT, unsigned_type_node, 0)
+DEF_TYPE (BT_UINT128, unsigned_intTI_type_node, 0)
+DEF_TYPE (BT_UINT128CONST, unsigned_intTI_type_node, 1)
 DEF_TYPE (BT_UINT64, c_uint64_type_node, 0)
 DEF_TYPE (BT_UINTCONST, unsigned_type_node, 1)
 DEF_TYPE (BT_ULONG, long_unsigned_type_node, 0)
@@ -79,6 +82,7 @@ DEF_TYPE (BT_USHORTCONST, short_unsigned_type_node, 1)
 DEF_TYPE (BT_VOID, void_type_node, 0)
 DEF_TYPE (BT_VOIDCONST, void_type_node, 1)
 DEF_VECTOR_TYPE (BT_UV16QI, BT_UCHAR, 16)
+DEF_VECTOR_TYPE (BT_UV1TI, BT_UINT128, 1)
 DEF_VECTOR_TYPE (BT_UV2DI, BT_ULONGLONG, 2)
 DEF_VECTOR_TYPE (BT_UV4SI, BT_UINT, 4)
 DEF_VECTOR_TYPE (BT_UV8HI, BT_USHORT, 8)
@@ -93,6 +97,8 @@ DEF_POINTER_TYPE (BT_DBLCONSTPTR, BT_DBLCONST)
 DEF_POINTER_TYPE (BT_DBLPTR, BT_DBL)
 DEF_POINTER_TYPE (BT_FLTCONSTPTR, BT_FLTCONST)
 DEF_POINTER_TYPE (BT_FLTPTR, BT_FLT)
+DEF_POINTER_TYPE (BT_INT128CONSTPTR, BT_INT128CONST)
+DEF_POINTER_TYPE (BT_INT128PTR, BT_INT128)
 DEF_POINTER_TYPE (BT_INTCONSTPTR, BT_INTCONST)
 DEF_POINTER_TYPE (BT_INTPTR, BT_INT)
 DEF_POINTER_TYPE (BT_LONGLONGCONSTPTR, BT_LONGLONGCONST)
@@ -103,6 +109,8 @@ DEF_POINTER_TYPE (BT_SHORTCONSTPTR, BT_SHORTCONST)
 DEF_POINTER_TYPE (BT_SHORTPTR, BT_SHORT)
 DEF_POINTER_TYPE (BT_UCHARCONSTPTR, BT_UCHARCONST)
 DEF_POINTER_TYPE (BT_UCHARPTR, BT_UCHAR)
+DEF_POINTER_TYPE (BT_UINT128CONSTPTR, BT_UINT128CONST)
+DEF_POINTER_TYPE (BT_UINT128PTR, BT_UINT128)
 DEF_POINTER_TYPE (BT_UINT64PTR, BT_UINT64)
 DEF_POINTER_TYPE (BT_UINTCONSTPTR, BT_UINTCONST)
 DEF_POINTER_TYPE (BT_UINTPTR, BT_UINT)
@@ -114,9 +122,11 @@ DEF_POINTER_TYPE (BT_VOIDCONSTPTR, BT_VOIDCONST)
 DEF_POINTER_TYPE (BT_VOIDPTR, BT_VOID)
 DEF_DISTINCT_TYPE (BT_BCHAR, BT_UCHAR)
 DEF_DISTINCT_TYPE (BT_BINT, BT_UINT)
+DEF_DISTINCT_TYPE (BT_BINT128, BT_UINT128)
 DEF_DISTINCT_TYPE (BT_BLONGLONG, BT_ULONGLONG)
 DEF_DISTINCT_TYPE (BT_BSHORT, BT_USHORT)
 DEF_OPAQUE_VECTOR_TYPE (BT_BV16QI, BT_BCHAR, 16)
+DEF_OPAQUE_VECTOR_TYPE (BT_BV1TI, BT_BINT128, 1)
 DEF_OPAQUE_VECTOR_TYPE (BT_BV2DI, BT_BLONGLONG, 2)
 DEF_OPAQUE_VECTOR_TYPE (BT_BV4SI, BT_BINT, 4)
 DEF_OPAQUE_VECTOR_TYPE (BT_BV8HI, BT_BSHORT, 8)
@@ -131,6 +141,7 @@ DEF_FN_TYPE_1 (BT_FN_INT_VOIDPTR, BT_INT, BT_VOIDPTR)
 DEF_FN_TYPE_1 (BT_FN_OV4SI_INT, BT_OV4SI, BT_INT)
 DEF_FN_TYPE_1 (BT_FN_OV4SI_INTCONSTPTR, BT_OV4SI, BT_INTCONSTPTR)
 DEF_FN_TYPE_1 (BT_FN_OV4SI_OV4SI, BT_OV4SI, BT_OV4SI)
+DEF_FN_TYPE_1 (BT_FN_UINT128_UINT128, BT_UINT128, BT_UINT128)
 DEF_FN_TYPE_1 (BT_FN_UV16QI_UCHAR, BT_UV16QI, BT_UCHAR)
 DEF_FN_TYPE_1 (BT_FN_UV16QI_UCHARCONSTPTR, BT_UV16QI, BT_UCHARCONSTPTR)
 DEF_FN_TYPE_1 (BT_FN_UV16QI_USHORT, BT_UV16QI, BT_USHORT)
@@ -154,7 +165,6 @@ DEF_FN_TYPE_1 (BT_FN_UV8HI_UV8HI, BT_UV8HI, BT_UV8HI)
 DEF_FN_TYPE_1 (BT_FN_V16QI_SCHAR, BT_V16QI, BT_SCHAR)
 DEF_FN_TYPE_1 (BT_FN_V16QI_UCHAR, BT_V16QI, BT_UCHAR)
 DEF_FN_TYPE_1 (BT_FN_V16QI_V16QI, BT_V16QI, BT_V16QI)
-DEF_FN_TYPE_1 (BT_FN_V1TI_V1TI, BT_V1TI, BT_V1TI)
 DEF_FN_TYPE_1 (BT_FN_V2DF_DBL, BT_V2DF, BT_DBL)
 DEF_FN_TYPE_1 (BT_FN_V2DF_DBLCONSTPTR, BT_V2DF, BT_DBLCONSTPTR)
 DEF_FN_TYPE_1 (BT_FN_V2DF_FLTCONSTPTR, BT_V2DF, BT_FLTCONSTPTR)
@@ -207,18 +217,18 @@ DEF_FN_TYPE_2 (BT_FN_OV4SI_OV4SI_OV4SI, BT_OV4SI, 
BT_OV4SI, BT_OV4SI)
 DEF_FN_TYPE_2 (BT_FN_OV4SI_OV4SI_UCHAR, BT_OV4SI, BT_OV4SI, B

Re: [PATCH] s390: Fix TARGET_SECONDARY_RELOAD for non-SYMBOL_REFs

2024-02-29 Thread Stefan Schulze Frielinghaus

On Thu, Feb 29, 2024 at 01:26:54PM +0100, Andreas Schwab wrote:
> On Feb 29 2024, Stefan Schulze Frielinghaus wrote:
> 
> > RTX X must not necessarily be a SYMBOL_REF and may e.g. be an
> 
> False friend: s/must not/need not/

Argh I always fall for this ;-) Thanks for pointing this out.  Changed
for the final commit.

Cheers,
Stefan

[PATCH] s390: Fix test vector/long-double-to-i64.c

2024-02-29 Thread Stefan Schulze Frielinghaus

Starting with r14-8319-g86de9b66480b71 fwprop improved so that vpdi is
no longer required.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/long-double-to-i64.c: Fix scan
assembler directive.
---
 .../gcc.target/s390/vector/long-double-to-i64.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c 
b/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c
index 2dbbb5d1c03..ed89878e6ee 100644
--- a/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c
+++ b/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c
@@ -1,19 +1,24 @@
 /* { dg-do compile } */
 /* { dg-options "-O3 -march=z14 -mzarch --save-temps" } */
 /* { dg-do run { target { s390_z14_hw } } } */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } } } */
+
 #include <assert.h>
 #include <stdint.h>
 
+/*
+** long_double_to_i64:
+** ld  %f0,0\(%r2\)
+** ld  %f2,8\(%r2\)
+** cgxbr   %r2,5,%f0
+** br  %r14
+*/
 __attribute__ ((noipa)) static int64_t
 long_double_to_i64 (long double x)
 {
   return x;
 }
 
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,1\n} 1 } } */
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,5\n} 1 } } */
-/* { dg-final { scan-assembler-times {\n\tcgxbr\t} 1 } } */
-
 int
 main (void)
 {
-- 
2.43.0

[PATCH] s390: Fix tests rosbg_si_srl and rxsbg_si_srl

2024-02-29 Thread Stefan Schulze Frielinghaus

Starting with r14-2047-gd0e891406b16dc two SI mode tests are optimized
into DI mode.  Thus, the scan-assembler directives fail.  For example
RTL expression

(ior:SI (subreg:SI (lshiftrt:DI (reg:DI 69)
(const_int 2 [0x2])) 4)
(subreg:SI (reg:DI 68) 4))

is optimized into

(ior:DI (lshiftrt:DI (reg:DI 69)
(const_int 2 [0x2]))
(reg:DI 68))

Fixed by moving operands into memory in order to enforce SI mode
computation.

Furthermore, in r9-6056-g290dfd9bc7bea2 the starting bit position of the
scan-assembler directive for rosbg was incorrectly set to 32 which
actually should be 32+SHIFT_AMOUNT, i.e., in this particular case 34.

gcc/testsuite/ChangeLog:

* gcc.target/s390/md/rXsbg_mode_sXl.c: Fix tests rosbg_si_srl
and rxsbg_si_srl.
---
 .../gcc.target/s390/md/rXsbg_mode_sXl.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c 
b/gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c
index ede813818ff..cf454d2783c 100644
--- a/gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c
+++ b/gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c
@@ -22,6 +22,8 @@
 { dg-skip-if "" { *-*-* } { "*" } { "-march=*" } }
 */
 
+unsigned int a, b;
+
 __attribute__ ((noinline)) unsigned int
 si_sll (unsigned int x)
 {
@@ -42,11 +44,11 @@ rosbg_si_sll (unsigned int a, unsigned int b)
 /* { dg-final { scan-assembler-times "rosbg\t%r.,%r.,32,62,1" 1 } } */
 
 __attribute__ ((noinline)) unsigned int
-rosbg_si_srl (unsigned int a, unsigned int b)
+rosbg_si_srl (void)
 {
   return a | (b >> 2);
 }
-/* { dg-final { scan-assembler-times "rosbg\t%r.,%r.,32,63,62" 1 } } */
+/* { dg-final { scan-assembler-times "rosbg\t%r.,%r.,34,63,62" 1 } } */
 
 __attribute__ ((noinline)) unsigned int
 rxsbg_si_sll (unsigned int a, unsigned int b)
@@ -56,11 +58,11 @@ rxsbg_si_sll (unsigned int a, unsigned int b)
 /* { dg-final { scan-assembler-times "rxsbg\t%r.,%r.,32,62,1" 1 } } */
 
 __attribute__ ((noinline)) unsigned int
-rxsbg_si_srl (unsigned int a, unsigned int b)
+rxsbg_si_srl (void)
 {
   return a ^ (b >> 2);
 }
-/* { dg-final { scan-assembler-times "rxsbg\t%r.,%r.,32,63,62" 1 } } */
+/* { dg-final { scan-assembler-times "rxsbg\t%r.,%r.,34,63,62" 1 } } */
 
 __attribute__ ((noinline)) unsigned long long
 di_sll (unsigned long long x)
@@ -108,21 +110,21 @@ main (void)
   /* SIMode */
   {
 unsigned int r;
-unsigned int a = 0x12488421u;
-unsigned int b = 0xffffffffu;
+a = 0x12488421u;
+b = 0xffffffffu;
 unsigned int csll = si_sll (b);
 unsigned int csrl = si_srl (b);
 
 r = rosbg_si_sll (a, b);
 if (r != (a | csll))
   __builtin_abort ();
-r = rosbg_si_srl (a, b);
+r = rosbg_si_srl ();
 if (r != (a | csrl))
   __builtin_abort ();
 r = rxsbg_si_sll (a, b);
 if (r != (a ^ csll))
   __builtin_abort ();
-r = rxsbg_si_srl (a, b);
+r = rxsbg_si_srl ();
 if (r != (a ^ csrl))
   __builtin_abort ();
   }
-- 
2.43.0

[PATCH] s390: Fix TARGET_SECONDARY_RELOAD for non-SYMBOL_REFs

2024-02-29 Thread Stefan Schulze Frielinghaus

RTX X need not necessarily be a SYMBOL_REF and may, e.g., be an
UNSPEC_GOTENT for which SYMBOL_FLAG_NOTALIGN2_P fails.

gcc/ChangeLog:

* config/s390/s390.cc (s390_secondary_reload): Guard
SYMBOL_FLAG_NOTALIGN2_P.
---
 gcc/config/s390/s390.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 943fc9bfd72..12430d77786 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -4778,7 +4778,7 @@ s390_secondary_reload (bool in_p, rtx x, reg_class_t 
rclass_i,
   if (in_p
  && s390_loadrelative_operand_p (x, &symref, &offset)
  && mode == Pmode
- && !SYMBOL_FLAG_NOTALIGN2_P (symref)
+ && (!SYMBOL_REF_P (symref) || !SYMBOL_FLAG_NOTALIGN2_P (symref))
  && (offset & 1) == 1)
sri->icode = ((mode == DImode) ? CODE_FOR_reloaddi_larl_odd_addend_z10
  : CODE_FOR_reloadsi_larl_odd_addend_z10);
-- 
2.43.0

1 2 3 4 >

1 - 100 of 306 matches

Mail list logo