[ABOUT-TO-PUSH PATCH] MAINTAINERS (s390 port): Add myself
ChangeLog: * MAINTAINERS (s390 port): Add myself. --- I hope the overflow into the email column doesn't break any tooling. I will leave the patch as is for some time before pushing. MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9257b33ff08..f76d12f7f3f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -120,6 +120,7 @@ rs6000 vector extns Aldy Hernandez rx port Nick Clifton s390 port Ulrich Weigand s390 port Andreas Krebbel +s390 port Stefan Schulze Frielinghaus sh port Alexandre Oliva sh port Oleg Endo sparc port David S. Miller -- 2.45.2
[PATCH] s390: Remove -m{,no-}lra option
I have been missing the two test cases and removed them since they depend on -mno-lra. -- 8< -- Since the old reload pass is about to be removed and we defaulted to LRA for over a decade, remove option -m{,no-}lra. PR target/113953 gcc/ChangeLog: * config/s390/s390.cc (s390_lra_p): Remove. (TARGET_LRA_P): Remove. * config/s390/s390.opt (mlra): Remove. * config/s390/s390.opt.urls (mlra): Remove. gcc/testsuite/ChangeLog: * gcc.target/s390/TI-constants-nolra.c: Removed. * gcc.target/s390/pr79895.c: Removed. --- gcc/config/s390/s390.cc | 10 gcc/config/s390/s390.opt | 4 -- gcc/config/s390/s390.opt.urls | 2 - .../gcc.target/s390/TI-constants-nolra.c | 47 --- gcc/testsuite/gcc.target/s390/pr79895.c | 9 5 files changed, 72 deletions(-) delete mode 100644 gcc/testsuite/gcc.target/s390/TI-constants-nolra.c delete mode 100644 gcc/testsuite/gcc.target/s390/pr79895.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index c9172d1153a..25d43ae3e13 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -11342,13 +11342,6 @@ s390_can_change_mode_class (machine_mode from_mode, return true; } -/* Return true if we use LRA instead of reload pass. */ -static bool -s390_lra_p (void) -{ - return s390_lra_flag; -} - /* Return true if register FROM can be eliminated via register TO. */ static bool @@ -18444,9 +18437,6 @@ s390_c_mode_for_floating_type (enum tree_index ti) #undef TARGET_LEGITIMATE_CONSTANT_P #define TARGET_LEGITIMATE_CONSTANT_P s390_legitimate_constant_p -#undef TARGET_LRA_P -#define TARGET_LRA_P s390_lra_p - #undef TARGET_CAN_ELIMINATE #define TARGET_CAN_ELIMINATE s390_can_eliminate diff --git a/gcc/config/s390/s390.opt b/gcc/config/s390/s390.opt index a5b5aa95a12..23ea4b8232d 100644 --- a/gcc/config/s390/s390.opt +++ b/gcc/config/s390/s390.opt @@ -229,10 +229,6 @@ Set the branch costs for conditional branch instructions. Reasonable values are small, non-negative integers. The default branch cost is 1. 
-mlra -Target Var(s390_lra_flag) Init(1) Save -Use LRA instead of reload. - mpic-data-is-text-relative Target Var(s390_pic_data_is_text_relative) Init(TARGET_DEFAULT_PIC_DATA_IS_TEXT_RELATIVE) Assume data segments are relative to text segment. diff --git a/gcc/config/s390/s390.opt.urls b/gcc/config/s390/s390.opt.urls index ab1e761efa8..bc772d2ffc7 100644 --- a/gcc/config/s390/s390.opt.urls +++ b/gcc/config/s390/s390.opt.urls @@ -74,8 +74,6 @@ UrlSuffix(gcc/S_002f390-and-zSeries-Options.html#index-mzarch) ; skipping UrlSuffix for 'mbranch-cost=' due to finding no URLs -; skipping UrlSuffix for 'mlra' due to finding no URLs - ; skipping UrlSuffix for 'mpic-data-is-text-relative' due to finding no URLs ; skipping UrlSuffix for 'mindirect-branch=' due to finding no URLs diff --git a/gcc/testsuite/gcc.target/s390/TI-constants-nolra.c b/gcc/testsuite/gcc.target/s390/TI-constants-nolra.c deleted file mode 100644 index b9948fc4aa5..000 --- a/gcc/testsuite/gcc.target/s390/TI-constants-nolra.c +++ /dev/null @@ -1,47 +0,0 @@ -/* { dg-do compile { target int128 } } */ -/* { dg-options "-O3 -mno-lra" } */ - -/* 2x lghi */ -__int128 a() { - return 0; -} - -/* 2x lghi */ -__int128 b() { - return -1; -} - -/* 2x lghi */ -__int128 c() { - return -2; -} - -/* lghi + llilh */ -__int128 d() { - return 16000 << 16; -} - -/* lghi + llihf */ -__int128 e() { - return (unsigned long long)8 << 32; -} - -/* lghi + llihf */ -__int128 f() { - return (unsigned __int128)8 << 96; -} - -/* llihf + llihf - this is handled via movti_bigconst pattern */ -__int128 g() { - return ((unsigned __int128)8 << 96) | ((unsigned __int128)8 << 32); -} - -/* Literal pool */ -__int128 h() { - return ((unsigned __int128)8 << 32) | 1; -} - -/* Literal pool */ -__int128 i() { - return (((unsigned __int128)8 << 32) | 1) << 64; -} diff --git a/gcc/testsuite/gcc.target/s390/pr79895.c b/gcc/testsuite/gcc.target/s390/pr79895.c deleted file mode 100644 index 02374e4b8a8..000 --- a/gcc/testsuite/gcc.target/s390/pr79895.c 
+++ /dev/null @@ -1,9 +0,0 @@ -/* { dg-do compile { target int128 } } */ -/* { dg-options "-O1 -mno-lra" } */ - -unsigned __int128 g; -void -foo () -{ - g = (unsigned __int128)1 << 127; -} -- 2.45.2
[PATCH] s390: Remove -m{,no-}lra option
Since the old reload pass is about to be removed and we defaulted to LRA for over a decade, remove option -m{,no-}lra. PR target/113953 gcc/ChangeLog: * config/s390/s390.cc (s390_lra_p): Remove. (TARGET_LRA_P): Remove. * config/s390/s390.opt (mlra): Remove. * config/s390/s390.opt.urls (mlra): Remove. --- Assuming that bootstrap and regtest (which are still running) finish successfully, ok for mainline? gcc/config/s390/s390.cc | 10 -- gcc/config/s390/s390.opt | 4 gcc/config/s390/s390.opt.urls | 2 -- 3 files changed, 16 deletions(-) diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index c9172d1153a..25d43ae3e13 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -11342,13 +11342,6 @@ s390_can_change_mode_class (machine_mode from_mode, return true; } -/* Return true if we use LRA instead of reload pass. */ -static bool -s390_lra_p (void) -{ - return s390_lra_flag; -} - /* Return true if register FROM can be eliminated via register TO. */ static bool @@ -18444,9 +18437,6 @@ s390_c_mode_for_floating_type (enum tree_index ti) #undef TARGET_LEGITIMATE_CONSTANT_P #define TARGET_LEGITIMATE_CONSTANT_P s390_legitimate_constant_p -#undef TARGET_LRA_P -#define TARGET_LRA_P s390_lra_p - #undef TARGET_CAN_ELIMINATE #define TARGET_CAN_ELIMINATE s390_can_eliminate diff --git a/gcc/config/s390/s390.opt b/gcc/config/s390/s390.opt index a5b5aa95a12..23ea4b8232d 100644 --- a/gcc/config/s390/s390.opt +++ b/gcc/config/s390/s390.opt @@ -229,10 +229,6 @@ Set the branch costs for conditional branch instructions. Reasonable values are small, non-negative integers. The default branch cost is 1. -mlra -Target Var(s390_lra_flag) Init(1) Save -Use LRA instead of reload. - mpic-data-is-text-relative Target Var(s390_pic_data_is_text_relative) Init(TARGET_DEFAULT_PIC_DATA_IS_TEXT_RELATIVE) Assume data segments are relative to text segment. 
diff --git a/gcc/config/s390/s390.opt.urls b/gcc/config/s390/s390.opt.urls index ab1e761efa8..bc772d2ffc7 100644 --- a/gcc/config/s390/s390.opt.urls +++ b/gcc/config/s390/s390.opt.urls @@ -74,8 +74,6 @@ UrlSuffix(gcc/S_002f390-and-zSeries-Options.html#index-mzarch) ; skipping UrlSuffix for 'mbranch-cost=' due to finding no URLs -; skipping UrlSuffix for 'mlra' due to finding no URLs - ; skipping UrlSuffix for 'mpic-data-is-text-relative' due to finding no URLs ; skipping UrlSuffix for 'mindirect-branch=' due to finding no URLs -- 2.45.2
[PATCH] s390: Add expander for uaddc/usubc optabs
Bootstrapped and regtested on s390. Both expanders are constrained to z196 because of the conditional moves. I guess this is reasonable nowadays. Would be great if you could have a second look that setting the carry/borrow bit (bit 18 of the PSW) is indeed correct. Brain twisted me at first ;-) -- 8< -- gcc/ChangeLog: * config/s390/s390.md (*add3_carry1_cc): Renamed to ... (add3_carry1_cc): this and in order to use the corresponding gen function, encode CC mode into pattern. (*sub3_borrow_cc): Renamed to ... (sub3_borrow_cc): this and in order to use the corresponding gen function, encode CC mode into pattern. (*add3_alc_carry1_cc): Renamed to ... (add3_alc_carry1_cc): this and in order to use the corresponding gen function, encode CC mode into pattern. (sub3_slb_borrow1_cc): New. (uaddc5): New. (usubc5): New. gcc/testsuite/ChangeLog: * gcc.target/s390/uaddc-1.c: New test. * gcc.target/s390/uaddc-2.c: New test. * gcc.target/s390/usubc-1.c: New test. * gcc.target/s390/usubc-2.c: New test. 
--- gcc/config/s390/s390.md | 103 +++- gcc/testsuite/gcc.target/s390/uaddc-1.c | 80 ++ gcc/testsuite/gcc.target/s390/uaddc-2.c | 25 ++ gcc/testsuite/gcc.target/s390/usubc-1.c | 80 ++ gcc/testsuite/gcc.target/s390/usubc-2.c | 25 ++ 5 files changed, 295 insertions(+), 18 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/uaddc-1.c create mode 100644 gcc/testsuite/gcc.target/s390/uaddc-2.c create mode 100644 gcc/testsuite/gcc.target/s390/usubc-1.c create mode 100644 gcc/testsuite/gcc.target/s390/usubc-2.c diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 4a225ae24f3..6fd3f943fe1 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -6001,14 +6001,14 @@ z10_super_E1,z10_super_E1,z10_super_E1")]) ; alr, alfi, slfi, al, aly, alrk, alhsik, algr, algfi, slgfi, alg, alsi, algsi, algrk, alghsik -(define_insn "*add3_carry1_cc" - [(set (reg CC_REGNUM) -(compare (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,d, 0, 0,d,0,0,0") - (match_operand:GPR 2 "general_operand" " d,d,Op,On,K,R,T,C")) - (match_dup 1))) - (set (match_operand:GPR 0 "nonimmediate_operand""=d,d, d, d,d,d,d,d") +(define_insn "add3_carry1_cc" + [(set (reg:CCL1 CC_REGNUM) +(compare:CCL1 (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,d, 0, 0,d,0,0,0") + (match_operand:GPR 2 "general_operand" " d,d,Op,On,K,R,T,C")) + (match_dup 1))) + (set (match_operand:GPR 0 "nonimmediate_operand" "=d,d, d, d,d,d,d,d") (plus:GPR (match_dup 1) (match_dup 2)))] - "s390_match_ccmode (insn, CCL1mode)" + "" "@ alr\t%0,%2 alrk\t%0,%1,%2 @@ -6541,14 +6541,14 @@ (set_attr "z10prop" "z10_super_c_E1,*,z10_super_E1,z10_super_E1")]) ; slr, sl, sly, slgr, slg, slrk, slgrk -(define_insn "*sub3_borrow_cc" - [(set (reg CC_REGNUM) -(compare (minus:GPR (match_operand:GPR 1 "register_operand" "0,d,0,0") - (match_operand:GPR 2 "general_operand" "d,d,R,T")) - (match_dup 1))) - (set (match_operand:GPR 0 "register_operand""=d,d,d,d") +(define_insn "sub3_borrow_cc" + [(set (reg:CCL2 
CC_REGNUM) +(compare:CCL2 (minus:GPR (match_operand:GPR 1 "register_operand" "0,d,0,0") +(match_operand:GPR 2 "general_operand" "d,d,R,T")) + (match_dup 1))) + (set (match_operand:GPR 0 "register_operand" "=d,d,d,d") (minus:GPR (match_dup 1) (match_dup 2)))] - "s390_match_ccmode (insn, CCL2mode)" + "" "@ slr\t%0,%2 slrk\t%0,%1,%2 @@ -6754,22 +6754,50 @@ ; add(di|si)cc instruction pattern(s). ; +(define_expand "uaddc5" + [(match_operand:GPR 0 "register_operand") + (match_operand:GPR 1 "nonimmediate_operand") + (match_operand:GPR 2 "nonimmediate_operand") + (match_operand:GPR 3 "nonimmediate_operand") + (match_operand:GPR 4 "general_operand")] + "TARGET_Z196 && (mode != DImode || TARGET_64BIT)" +{ + rtx cond = gen_rtx_LTU (mode, gen_rtx_REG (CCL1mode, CC_REGNUM), const0_rtx); + if (operands[4] == const0_rtx) +emit_insn (gen_add3_carry1_cc (operands[0], operands[2], operands[3])); + else +{ + rtx tmp; + if (CONSTANT_P (operands[4])) + { + tmp = gen_reg_rtx (SImode); + emit_move_insn (tmp, operands[4]); + } + else + tmp = operands[4]; + s390_emit_compare (LTU, tmp, const0_rtx); + emit_insn (gen_add3_alc_carry1_cc (operands[0], operands[2], operands[3], cond)); +} + emit_insn (gen_movcc (operands[1], cond, const1_rtx, const0_rtx)); + DONE; +}) + ;
Re: [RFC 0/4] Hard Register Constraints
On Wed, Sep 18, 2024 at 03:53:37PM +0200, Michael Matz wrote: > Hello, > > On Thu, 12 Sep 2024, Stefan Schulze Frielinghaus wrote: > > > > > #define call_on_stack(stack, func, asm_call, argconstr...) > > > > \ > > > > { > > > > \ > > > > register void *tos asm("r11"); > > > > \ > > > > > > > > \ > > > > tos = ((void *)(stack)); > > > > \ > > > > > > > > \ > > > > asm_inline volatile( > > > > \ > > > > "movq %%rsp, (%[tos]) \n" > > > > \ > > > > "movq %[tos], %%rsp \n" > > > > \ > > > > > > > > \ > > > > asm_call > > > > \ > > > > > > > > \ > > > > "popq %%rsp \n" > > > > \ > > > > > > > > \ > > > > : "+r" (tos), ASM_CALL_CONSTRAINT > > > > \ > > > > : [__func] "i" (func), [tos] "r" (tos) argconstr > > > > \ > > > > : "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", > > > > \ > > > >"memory" > > > > \ > > > > ); > > > > \ > > > > } > > > > I didn't find documentation how "digit references" behave in combination > > with register asm. > > Because noone thought of that corner case while documenting stuff :-) > > As you say: it only works because the involved inputs/outputs are the same > expression. If they weren't the inconsistency would be detected in > reload/LRA when the necessary reloads would need to be generated and the > pass would find that that's impossible. > > Now, question is, what to do in this case in the light of a new feature. > I would say that while perhaps sometimes convenient it's more likely to be > a programmers fault, so for your new hardreg constraints it seems better > to ... > > > Anyway, I digress. I haven't made up my mind how hard register > > constraints should behave in those cases, i.e., in cases where multiple > > inputs share the same register. If the inputs are different or may be > > different, then we can reject those programs. 
> > > > asm ("" : "={r4}" (x) : "{r5}" (42), "{r5}" (24)); > > > > Whereas if the operands are provable equal (assuming y is not volatile) > > > > asm ("" : "={r4}" (x) : "{r5}" (y), "{r5}" (y)); > > > > we could accept those programs. Currently, I error out even for > > programs of the latter form which may be a bit to restrictive. > > ... do exactly this, error out unconditionally. I wouldn't change > behaviour for existing features, i.e. register-asm vars plus matching > constraints (if for inout operands, or explicit matching constraints > doesn't matter) because there's existing usage that happens to work fine. > > Why unconditionally and not just "when expressions are different"? > Because the latter is inherently hard to see when optimizations are > involved: is "(a + 0)" the same as "(a)"? At which optimization levels? > What if the "0" is an expression that needs further analysis to see that > it's actually zero? And so on. Thanks for sharing this. I also tend to error out in those cases as it rather looks like a programming error. I came up with an updated version just a few minutes ago where I added some documentation which also discusses this. I also added some discussion for output operands where I also tend to error out because those look like programming errors. > > If it's not easily possible to error out only for the new hard-reg > constraints, and accept whatever is there for register-asm vars and > matching constraints, then I would opt to also _not_ error out for the new > feature, though. Essentially that's saying that if the user writes > wacky code then its their responsibility that everything works out, which > is exactly what the current implementation does: if at the end no reloads > are required, it's fine (because it indeed adheres to all given > constraints!), otherwise we give an error. I will have a look and try to distinguish between both mechanisms during error checking. Thanks! Stefan
[PATCH v2 3/4] genoutput: Verify hard register constraints
Since genoutput has no information about hard register names we cannot statically verify those names in constraints of the machine description. Therefore, we have to do it at runtime. Although verification shouldn't be too expensive, restrict it to checking builds. This should be sufficient since hard register constraints in machine descriptions probably change rarely, and each commit should be tested with checking anyway, or at the very least before a release is taken. --- gcc/genoutput.cc | 46 ++ gcc/output.h | 2 ++ gcc/toplev.cc| 4 3 files changed, 52 insertions(+) diff --git a/gcc/genoutput.cc b/gcc/genoutput.cc index 2ffb2fb28d2..4f4fde83608 100644 --- a/gcc/genoutput.cc +++ b/gcc/genoutput.cc @@ -200,6 +200,8 @@ static const char indep_constraints[] = ",=+%*?!^$#&g"; static class constraint_data * constraints_by_letter_table[1 << CHAR_BIT]; +static hash_set used_reg_names; + static int mdep_constraint_len (const char *, file_location, int); static void note_constraint (md_rtx_info *); @@ -1156,6 +1158,45 @@ main (int argc, const char **argv) output_insn_data (); output_get_insn_name (); + /* Since genoutput has no information about hard register names we cannot + statically verify hard register names in constraints of the machine + description. Therefore, we have to do it at runtime. Although + verification shouldn't be too expensive, restrict it to checking builds. 
+ */ + printf ("\n\n#if CHECKING_P\n"); + if (used_reg_names.is_empty ()) +printf ("void verify_reg_names_in_constraints () { }\n"); + else +{ + size_t max_len = 0; + for (auto it = used_reg_names.begin (); it != used_reg_names.end (); ++it) + { + size_t len = strlen (*it); + if (len > max_len) + max_len = len; + } + printf ("void\nverify_reg_names_in_constraints ()\n{\n"); + printf (" static const char hregnames[%zu][%zu] = {\n", + used_reg_names.elements (), max_len + 1); + auto it = used_reg_names.begin (); + while (it != used_reg_names.end ()) + { + printf ("\"%s\"", *it); + ++it; + if (it != used_reg_names.end ()) + printf (","); + printf ("\n"); + } + printf (" };\n"); + printf (" for (size_t i = 0; i < %zu; ++i)\n", + used_reg_names.elements ()); + printf ("if (decode_reg_name (hregnames[i]) < 0)\n"); + printf (" internal_error (\"invalid register %%qs used in " + "constraint of machine description\", hregnames[i]);\n"); + printf ("}\n"); +} + printf ("#endif\n"); + fflush (stdout); return (ferror (stdout) != 0 || have_error ? FATAL_EXIT_CODE : SUCCESS_EXIT_CODE); @@ -1294,6 +1335,11 @@ mdep_constraint_len (const char *s, file_location loc, int opno) ptrdiff_t len = end - s; if (*end == '}' && len > 1 && len < 31) { + char *regname = new char[len]; + memcpy (regname, s + 1, len - 1); + regname[len - 1] = '\0'; + if (used_reg_names.add (regname)) + delete[] regname; return len + 1; } } diff --git a/gcc/output.h b/gcc/output.h index 46b0033b221..5f0f8a6098c 100644 --- a/gcc/output.h +++ b/gcc/output.h @@ -636,4 +636,6 @@ extern int default_address_cost (rtx, machine_mode, addr_space_t, bool); /* Stack usage. */ extern void output_stack_usage (void); +extern void verify_reg_names_in_constraints (); + #endif /* ! 
GCC_OUTPUT_H */ diff --git a/gcc/toplev.cc b/gcc/toplev.cc index bc442a08c63..34c372ad1a2 100644 --- a/gcc/toplev.cc +++ b/gcc/toplev.cc @@ -1817,6 +1817,10 @@ backend_init_target (void) static void backend_init (void) { +#if CHECKING_P + verify_reg_names_in_constraints (); +#endif + init_emit_once (); init_rtlanal (); -- 2.45.2
[PATCH v2 2/4] Error handling for hard register constraints
This implements some basic error handling for hard register constraints including potential conflicts with register asm operands. In contrast to register asm operands, hard register constraints allow more than just one register per operand. Even more than just one register per alternative. For example, a valid constraint for an operand is "{r0}{r1}m,{r2}". However, this also means that we have to make sure that each register is used at most once in each alternative over all outputs and likewise over all inputs. For asm statements this is done by this patch during gimplification. For hard register constraints used in machine description, error handling is still a todo and I haven't investigated this so far and consider this rather a low priority. There are 9/10 call sites for parse_{input,output}_constraint() which I didn't dare to touch in the first run. If this patch is about to be accepted I could change those call sites and explicitly pass a null pointer instead of overloading those functions as it is done right now. I consider this an implementation nit and didn't want to clutter the patch for reviewing. 
--- gcc/cfgexpand.cc | 42 gcc/gimplify.cc | 73 +- gcc/gimplify_reg_info.h | 130 ++ gcc/stmt.cc | 229 +- gcc/stmt.h| 8 +- gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c | 83 +++ gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c | 20 ++ gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c | 21 ++ gcc/testsuite/gcc.dg/pr87600-2.c | 30 +-- gcc/testsuite/gcc.dg/pr87600-3.c | 35 +++ gcc/testsuite/gcc.dg/pr87600-3.s | 0 .../gcc.target/s390/asm-hard-reg-1.c | 103 .../gcc.target/s390/asm-hard-reg-2.c | 43 .../gcc.target/s390/asm-hard-reg-3.c | 42 gcc/testsuite/lib/scanasm.exp | 4 + 15 files changed, 779 insertions(+), 84 deletions(-) create mode 100644 gcc/gimplify_reg_info.h create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.c create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.s create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc index 13f8c08d295..fdbbd93f1b5 100644 --- a/gcc/cfgexpand.cc +++ b/gcc/cfgexpand.cc @@ -2966,44 +2966,6 @@ expand_asm_loc (tree string, int vol, location_t locus) emit_insn (body); } -/* Return the number of times character C occurs in string S. */ -static int -n_occurrences (int c, const char *s) -{ - int n = 0; - while (*s) -n += (*s++ == c); - return n; -} - -/* A subroutine of expand_asm_operands. Check that all operands have - the same number of alternatives. Return true if so. 
*/ - -static bool -check_operand_nalternatives (const vec &constraints) -{ - unsigned len = constraints.length(); - if (len > 0) -{ - int nalternatives = n_occurrences (',', constraints[0]); - - if (nalternatives + 1 > MAX_RECOG_ALTERNATIVES) - { - error ("too many alternatives in %"); - return false; - } - - for (unsigned i = 1; i < len; ++i) - if (n_occurrences (',', constraints[i]) != nalternatives) - { - error ("operand constraints for % differ " - "in number of alternatives"); - return false; - } -} - return true; -} - /* Check for overlap between registers marked in CLOBBERED_REGS and anything inappropriate in T. Emit error and return the register variable definition for error, NULL_TREE for ok. */ @@ -3169,10 +3131,6 @@ expand_asm_stmt (gasm *stmt) = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (t))); } - /* ??? Diagnose during gimplification? */ - if (! check_operand_nalternatives (constraints)) -return; - /* Count the number of meaningful clobbered registers, ignoring what we would ignore later. */ auto_vec clobber_rvec; diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index 26a216e151d..08e0b5d047b 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -70,6 +70,10 @@ along with GCC; see the file COPYING3. If not see #include "omp-offload.h" #include "context.h" #include "tree-nested.h" +#include "insn-config.h" +#include "recog.h" +#include "output.h" +#include "gimplify_reg_info.h" /* Identifier for a basic condition, mapping it to other basic conditions of its Boolean expression. Basic conditions given the same uid (in the same @@ -7009,6 +7013,42 @@ gimplify_addr_expr (tree *expr_p, gimple_seq *pre_p, gimple_seq *
[PATCH v2 4/4] Rewrite register asm into hard register constraints
Currently a register asm already materializes during expand. This means, a hard register is allocated for the very first access of a register asm as e.g. in an assignment. As a consequence this might lead to suboptimal register allocation if the assignment and the using asm statement are spread far apart. Even more problematic are function calls in between register asm assignments and its using asm statement since hard registers may be clobbered by a call. The former may be solved by pulling register asm assignments and asm statements close by. However, the latter is not easily solved since sometimes function calls are implicit. For example int foo (int *x) { register int y asm ("0") = 42; register int z asm ("1") = *x; asm ("bar\t%0,%1" : "+r" (z) : "r" (y)); return z; } If compiled with address sanitizer, then a function call is introduced for the memory load which in turn may interfere with the initialization of register asm y. Likewise, for some targets and configurations even an operation like an addition may lead to an implicit library call. In contrast hard register constraints materialize during register allocation and therefore do not suffer from this, i.e., asm operands are kept in pseudos until RA. This patch adds the feature of rewriting local register asm into code which exploits hard register constraints. For example register int global asm ("r3"); int foo (int x0) { register int x asm ("r4") = x0; register int y asm ("r5"); asm ("bar\t%0,%1,%2" : "=r" (x) : "0" (x), "r" (global)); x += 42; asm ("baz\t%0,%1" : "=r" (y) : "r" (x)); return y; } is rewritten during gimplification into register int global asm ("r3"); int foo (int x0) { int x = x0; int y; asm ("bar\t%0,%1,%2" : "={r4}" (x) : "0" (x), "r" (global)); x += 42; asm ("baz\t%0,%1" : "={r5}" (y) : "{r4}" (x)); return y; } The resulting code solely relies on hard register constraints modulo global register asm. 
Since I consider this as an experimental feature it is hidden behind new flag -fdemote-register-asm (I'm open for other naming suggestions). --- gcc/common.opt| 4 + gcc/gimplify.cc | 78 +++ .../gcc.dg/asm-hard-reg-demotion-1.c | 19 + .../gcc.dg/asm-hard-reg-demotion-2.c | 19 + gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h | 52 + 5 files changed, 172 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-1.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-2.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h diff --git a/gcc/common.opt b/gcc/common.opt index ea39f87ae71..859a735a0b7 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -3422,6 +3422,10 @@ fverbose-asm Common Var(flag_verbose_asm) Add extra commentary to assembler output. +fdemote-register-asm +Common Var(flag_demote_register_asm) Init(0) +Demote local register asm and use hard register constraints instead + fvisibility= Common Joined RejectNegative Enum(symbol_visibility) Var(default_visibility) Init(VISIBILITY_DEFAULT) -fvisibility=[default|internal|hidden|protected] Set the default symbol visibility. diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index 08e0b5d047b..c9bd1769c28 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -7049,6 +7049,73 @@ num_alternatives (const_tree link) return num + 1; } +static hash_set demote_register_asm; + +static void +gimplify_demote_register_asm (tree link) +{ + if (!flag_demote_register_asm) +return; + tree op = TREE_VALUE (link); + if (!VAR_P (op) || !DECL_HARD_REGISTER (op) || is_global_var (op)) +return; + tree id = DECL_ASSEMBLER_NAME (op); + const char *regname = IDENTIFIER_POINTER (id); + ++regname; + int regno = decode_reg_name (regname); + if (regno < 0) +/* This indicates an error and we error out later on. 
*/ +return; + const char *constraint = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (link))); + auto_vec constraint_new; + for (const char *p = constraint; *p; ) +{ + bool pushed = false; + switch (*p) + { + case '+': case '=': case '%': case '?': case '!': case '*': case '&': + case '#': case '$': case '^': case '{': case 'E': case 'F': case 'G': + case 'H': case 's': case 'i': case 'n': case 'I': case 'J': case 'K': + case 'L': case 'M': case 'N': case 'O': case 'P': case ',': case '0': + case '1': case '2': case '3': case '4': case '5': case '6': case '7': + case '8': case '9': case '[': case '<': case '>': case 'g': case 'X': + break; + + default: + if (!ISALPHA (*p)) + break; + enum constraint_num cn = lookup_constraint (p); + enum reg_class rclass = reg_class_for_constraint (cn); + if (rclass != NO_REGS || insn_extra_address_constraint (cn)) + { + gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (regno), r
[PATCH v2 0/4] Hard Register Constraints
This is a follow-up to https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662725.html I basically added only some documentation to the first patch. If you think that gcc/doc/extend.texi isn't the right place (especially the discussion part which will be dropped/rephrased in the end anyway), then just let me know. Stefan Schulze Frielinghaus (4): Hard register constraints Error handling for hard register constraints genoutput: Verify hard register constraints Rewrite register asm into hard register constraints gcc/cfgexpand.cc | 42 --- gcc/common.opt| 4 + gcc/doc/extend.texi | 189 gcc/doc/md.texi | 6 + gcc/function.cc | 116 gcc/genoutput.cc | 60 gcc/genpreds.cc | 4 +- gcc/gimplify.cc | 151 +- gcc/gimplify_reg_info.h | 130 + gcc/ira.cc| 79 +- gcc/lra-constraints.cc| 13 + gcc/output.h | 2 + gcc/recog.cc | 11 +- gcc/stmt.cc | 268 +- gcc/stmt.h| 9 +- gcc/testsuite/gcc.dg/asm-hard-reg-1.c | 85 ++ gcc/testsuite/gcc.dg/asm-hard-reg-2.c | 33 +++ gcc/testsuite/gcc.dg/asm-hard-reg-3.c | 25 ++ gcc/testsuite/gcc.dg/asm-hard-reg-4.c | 50 gcc/testsuite/gcc.dg/asm-hard-reg-5.c | 36 +++ gcc/testsuite/gcc.dg/asm-hard-reg-6.c | 60 gcc/testsuite/gcc.dg/asm-hard-reg-7.c | 41 +++ gcc/testsuite/gcc.dg/asm-hard-reg-8.c | 49 .../gcc.dg/asm-hard-reg-demotion-1.c | 19 ++ .../gcc.dg/asm-hard-reg-demotion-2.c | 19 ++ gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h | 52 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c | 83 ++ gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c | 20 ++ gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c | 21 ++ gcc/testsuite/gcc.dg/pr87600-2.c | 30 +- gcc/testsuite/gcc.dg/pr87600-3.c | 35 +++ gcc/testsuite/gcc.dg/pr87600-3.s | 0 .../gcc.target/s390/asm-hard-reg-1.c | 103 +++ .../gcc.target/s390/asm-hard-reg-2.c | 43 +++ .../gcc.target/s390/asm-hard-reg-3.c | 42 +++ gcc/testsuite/lib/scanasm.exp | 4 + gcc/toplev.cc | 4 + 37 files changed, 1851 insertions(+), 87 deletions(-) create mode 100644 gcc/gimplify_reg_info.h create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-1.c create 
mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-2.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-3.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-4.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-5.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-6.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-7.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-8.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-1.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-2.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.c create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.s create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c -- 2.45.2
[PATCH v2 1/4] Hard register constraints
Implement hard register constraints of the form {regname} where regname must be any valid register name for the target. Such constraints may be used in asm statements as a replacement for register asm and in machine descriptions. Due to optimizations it is not unexpected if two or more inputs require the same value, then those also share a common pseudo. However, this in turn may lead to unsatisfiable asm where multiple inputs with different hard register constraints share the same pseudo. Therefore, we have to introduce copies of such a pseudo and use these for conflicting inputs. This is done prior to RA during asmcons in match_asm_constraints_2(). While IRA tries to reduce live ranges, it also replaces some register-register moves. That in turn might undo those copies of a pseudo which we just introduced during asmcons. Thus, check in decrease_live_ranges_number() via valid_replacement_for_asm_input_p() whether it is valid to perform a replacement. The remainder of the patch mostly deals with parsing and decoding hard register constraints. The actual work is done by LRA in process_alt_operands() where a register filter, according to the constraint, is installed. For the sake of "reviewability" and in order to show the beauty of LRA, error handling (which gets pretty involved) is spread out into a subsequent patch. Limitation: Currently, a fixed register cannot be used as hard register constraint. For example, accessing the stack pointer on x86_64 via void * foo (void) { void *y; __asm__ ("" : "={rsp}" (y)); return y; } leads to an error. This is unfortunate since register asm does not have this limitation. The culprit seems to be that during reload ira_class_hard_regs_num[rclass] does not even include fixed registers which is why lra_assign() ultimately fails. Does anyone have an idea how to lift this limitation? Maybe there is even a shortcut in order to force a pseudo into a hard reg? 
--- gcc/doc/extend.texi | 189 ++ gcc/doc/md.texi | 6 + gcc/function.cc | 116 gcc/genoutput.cc | 14 ++ gcc/genpreds.cc | 4 +- gcc/ira.cc| 79 ++- gcc/lra-constraints.cc| 13 ++ gcc/recog.cc | 11 +- gcc/stmt.cc | 39 ++ gcc/stmt.h| 1 + gcc/testsuite/gcc.dg/asm-hard-reg-1.c | 85 gcc/testsuite/gcc.dg/asm-hard-reg-2.c | 33 + gcc/testsuite/gcc.dg/asm-hard-reg-3.c | 25 gcc/testsuite/gcc.dg/asm-hard-reg-4.c | 50 +++ gcc/testsuite/gcc.dg/asm-hard-reg-5.c | 36 + gcc/testsuite/gcc.dg/asm-hard-reg-6.c | 60 gcc/testsuite/gcc.dg/asm-hard-reg-7.c | 41 ++ gcc/testsuite/gcc.dg/asm-hard-reg-8.c | 49 +++ 18 files changed, 848 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-1.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-2.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-3.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-4.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-5.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-6.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-7.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-8.c diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 0ea7a87053c..f8cbbabeba7 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -12549,6 +12549,7 @@ the two, as explained in the sections below. @menu * Global Register Variables:: Variables declared at global scope. * Local Register Variables::Variables declared within a function. +* Hard Register Constraints:: Operands forced into specific machine registers. @end menu @node Global Register Variables @@ -12754,6 +12755,194 @@ with slightly different characteristics (@pxref{MIPS Coprocessors,, Defining coprocessor specifics for MIPS targets, gccint, GNU Compiler Collection (GCC) Internals}). 
+@node Hard Register Constraints +@subsubsection Hard Register Constraints + +Similar to register @code{asm} but still distinct, hard register constraints +are another way to force operands of inline @code{asm} into specific machine +registers. In contrast to register @code{asm} where a variable is bound to a +machine register, a hard register constraint loads an @code{asm} operand into a +machine register. Assume in the following that @code{r4} is a general-purpose +register, @code{f5} a floating-point register, and @code{v6} a vector register +for some target. + +@smallexample +int x; +int y __attribute__ ((vector_size (16))); +@dots{} +asm ("some instructions" + : "=@{r4@}" (x) + : "@{f5@}" (42.0), "@{v6@}" (y)); +@end smallexample + +For the inline @code{asm}, variable @code{x} is loaded into register @code{r4}, +and @code{y} into @code{v6}. Furthermore, constant @co
[PATCH] s390: Fix TF to FPRX2 conversion [PR115860]
Bootstrapped and regtested on s390. Approved offlist and as also discussed offlist I went for removing format specifier %V. This fixes FAIL: g++.dg/cpp23/ext-floating14.C -std=gnu++23 execution test FAIL: g++.dg/cpp23/ext-floating14.C -std=gnu++26 execution test FAIL: c-c++-common/ubsan/float-cast-overflow-7.c -O2 execution test FAIL: c-c++-common/ubsan/float-cast-overflow-7.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test FAIL: c-c++-common/ubsan/float-cast-overflow-7.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O0 execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O1 execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O2 execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O3 -g execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -Os execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O0 execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O1 execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O2 execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O3 -g execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -Os execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O0 execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O1 execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O2 
execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O3 -g execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -Os execution test FAIL: gfortran.dg/pr96711.f90 -O0 execution test FAIL: libffi.closures/nested_struct5.c -W -Wall -Wno-psabi -O2 output pattern test FAIL: libphobos.phobos/std/algorithm/mutation.d execution test FAIL: libphobos.phobos/std/conv.d execution test FAIL: libphobos.phobos/std/internal/math/errorfunction.d execution test FAIL: libphobos.phobos/std/variant.d execution test FAIL: libphobos.phobos_shared/std/algorithm/mutation.d execution test FAIL: libphobos.phobos_shared/std/conv.d execution test FAIL: libphobos.phobos_shared/std/internal/math/errorfunction.d execution test FAIL: libphobos.phobos_shared/std/variant.d execution test I will push shortly. -- >8 -- Currently subregs originating from *tf_to_fprx2_0 and *tf_to_fprx2_1 survive register allocation. This in turn leads to wrong register renaming. Keeping the current approach would mean we need two insns for *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively. Something along the lines (define_insn "*tf_to_fprx2_0" [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "=f") 0) (unspec:DF [(match_operand:TF 1 "general_operand" "v")] UNSPEC_TF_TO_FPRX2_0))] "TARGET_VXE" "#") (define_insn "*tf_to_fprx2_0" [(set (match_operand:DF 0 "nonimmediate_operand" "=f") (unspec:DF [(match_operand:TF 1 "general_operand" "v")] UNSPEC_TF_TO_FPRX2_0))] "TARGET_VXE" "vpdi\t%v0,%v1,%v0,1 [(set_attr "op_type" "VRR")]) and similar for *tf_to_fprx2_1. Note, pre register allocation operand 0 has mode FPRX2 and afterwards DF once subregs have been eliminated. 
Since we always copy a whole vector register into a floating-point register pair, another way to fix this is to merge *tf_to_fprx2_0 and *tf_to_fprx2_1 into a single insn which means we don't have to use subregs at all. The downside of this is that the assembler template contains two instructions, now. The upside is that we don't have to come up with some artificial insn before RA which might be more readable/maintainable. That is implemented by this patch. In commit r11-4872-ge627cda5686592, the output operand specifier %V was introduced which is used in tf_to_fprx2 only, now. Instead of coming up with its counterpart %F for floating-point registers, which would also only be used in tf_to_fprx2, I print the operands directly. This renders %V unused which is why it is removed by this patch. gcc/ChangeLog: PR 115860 * config/s390/s390.cc (print_operand): Remove operand specifier %V. * config/s390
[PATCH] s390: Fix AQ and AR constraints
Ensure for AQ and AR constraints that the resulting displacement after adding any positive offset less than the size of the object being referenced is still valid. Bootstrapped and regtested on s390. As approved by https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662865.html I will push shortly. gcc/ChangeLog: * config/s390/s390.cc (s390_mem_constraint): Check displacement for AQ and AR constraints. --- gcc/config/s390/s390.cc | 12 1 file changed, 12 insertions(+) diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 7aea776da2f..ae1f369e19d 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -3714,6 +3714,18 @@ s390_mem_constraint (const char *str, rtx op) if ((reload_completed || reload_in_progress) ? !offsettable_memref_p (op) : !offsettable_nonstrict_memref_p (op)) return 0; + /* offsettable_memref_p ensures only that any positive offset added to +the address forms a valid general address. For AQ and AR constraints +we also have to verify that the resulting displacement after adding +any positive offset less than the size of the object being referenced +is still valid. */ + if (str[1] == 'Q' || str[1] == 'R') + { + int o = GET_MODE_SIZE (GET_MODE (op)) - 1; + rtx tmp = adjust_address (op, QImode, o); + if (!s390_check_qrst_address (str[1], XEXP (tmp, 0), true)) + return 0; + } return s390_check_qrst_address (str[1], XEXP (op, 0), true); case 'B': /* Check for non-literal-pool variants of memory constraints. */ -- 2.45.2
Re: [RFC 0/4] Hard Register Constraints
On Thu, Sep 12, 2024 at 04:03:33PM +0200, Georg-Johann Lay wrote: > > > Am 10.09.24 um 16:20 schrieb Stefan Schulze Frielinghaus: > > This series introduces hard register constraints. The first patch > > enables hard register constraints for asm statements and for > > machine descriptions. The subsequent patch adds some basic error > > handling for asm statements. The third patch adds some verification of > > register names used in machine description. The fourth and last patch > > adds the feature of rewriting local register asm into hard register > > constraints. > > > > This series was bootstrapped and regtested on s390. Furthermore, the > > new dg-compile tests were verified via cross compilers for the enabled > > targets. There is still some fallout if -fdemote-register-asm is used > > since a couple of features are missing as e.g. erroring out during > > gimplification if the clobber set of registers intersects with > > input/output registers. > > > > As a larger test vehicle I've compiled and regtested glibc on s390 using > > -fdemote-register-asm without any fallout. On x86_64 this fails due to > > the limitation that fixed registers are currently not supported for hard > > register constraints (see commit message of the first patch). This is > > also the reason why I'm posting this series already since I was hoping > > to get some feedback about this limitation. > > > > Furthermore, I've compiled the Linux kernel on s390 and x86_64 with > > -fdemote-register-asm. Interestingly, the Linux kernel for x86_64 makes > > use of the following asm statement: > > > > #define call_on_stack(stack, func, asm_call, argconstr...) 
\ > > { \ > > register void *tos asm("r11"); \ > > \ > > tos = ((void *)(stack));\ > > \ > > asm_inline volatile(\ > > "movq %%rsp, (%[tos]) \n" \ > > "movq %[tos], %%rsp \n" \ > > \ > > asm_call\ > > \ > > "popq %%rsp \n" \ > > \ > > : "+r" (tos), ASM_CALL_CONSTRAINT \ > > : [__func] "i" (func), [tos] "r" (tos) argconstr\ > > : "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", \ > >"memory" \ > > ); \ > > } > > > > Note the output > >"+r" (tos) > > and the input > >[tos] "r" (tos) > > Currently I error out for this since I consider this as two inputs using > > the same hard register. One time an implicit input via '+' and a second > > time via the explicit input. Thus, actually I would expect a '=' > > Would you explain why the two operands are supposed to live in the same > hard register? > > From my understanding of asm semantics, this gives you two copies of > tos: The 1st one may be altered by the asm, and the 2nd one may not be > changed. As the operands neither refer to each other by "0" nor don't > they use the same (single-register) register constraint, there is no > reason / requirement to allocate the two operands to the same reg, no? During gimplification an inout operand is canonicalized into one output and one input operand. The input operand refers via a digit to the output operand. For example asm ("" : "+r" (x)); is rewritten into asm ("" : "=r" (x) : "0" (x)); I didn't find documentation how "digit references" behave in combination with register asm. At least it is not defined here https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#index-0-in-constraint
Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]
On Wed, Sep 11, 2024 at 08:57:23PM +0200, Ilya Leoshkevich wrote: > On Wed, 2024-09-11 at 16:44 +0200, Stefan Schulze Frielinghaus wrote: > > On Wed, Sep 11, 2024 at 01:59:48PM +0200, Ilya Leoshkevich wrote: > > > On Wed, 2024-09-11 at 13:34 +0200, Stefan Schulze Frielinghaus > > > wrote: > > > > On Wed, Sep 11, 2024 at 01:22:30PM +0200, Ilya Leoshkevich wrote: > > > > > On Wed, 2024-09-11 at 12:35 +0200, Stefan Schulze Frielinghaus > > > > > wrote: > > > > > > On Wed, Sep 11, 2024 at 11:47:54AM +0200, Ilya Leoshkevich > > > > > > wrote: > > > > > > > On Fri, 2024-08-16 at 09:41 +0200, Stefan Schulze > > > > > > > Frielinghaus > > > > > > > wrote: > > > > > > > > Currently subregs originating from *tf_to_fprx2_0 and > > > > > > > > *tf_to_fprx2_1 > > > > > > > > survive register allocation. This in turn leads to wrong > > > > > > > > register > > > > > > > > renaming. Keeping the current approach would mean we > > > > > > > > need > > > > > > > > two > > > > > > > > insns > > > > > > > > for > > > > > > > > *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively. > > > > > > > > Something > > > > > > > > along > > > > > > > > the > > > > > > > > lines > > > > > > > > > > > > > > > > (define_insn "*tf_to_fprx2_0" > > > > > > > > [(set (subreg:DF (match_operand:FPRX2 0 > > > > > > > > "nonimmediate_operand" > > > > > > > > "=f") 0) > > > > > > > > (unspec:DF [(match_operand:TF 1 "general_operand" > > > > > > > > "v")] > > > > > > > > UNSPEC_TF_TO_FPRX2_0))] > > > > > > > > "TARGET_VXE" > > > > > > > > "#") > > > > > > > > > > > > > > > > (define_insn "*tf_to_fprx2_0" > > > > > > > > [(set (match_operand:DF 0 "nonimmediate_operand" "=f") > > > > > > > > (unspec:DF [(match_operand:TF 1 "general_operand" > > > > > > > > "v")] > > > > > > > > UNSPEC_TF_TO_FPRX2_0))] > > > > > > > > "TARGET_VXE" > > > > > > > > "vpdi\t%v0,%v1,%v0,1 > > > > > > > > [(set_attr "op_type" "VRR")]) > > > > > > > > > > > > > > > > and similar for *tf_to_fprx2_1. 
Note, pre register > > > > > > > > allocation > > > > > > > > operand 0 > > > > > > > > has mode FPRX2 and afterwards DF once subregs have been > > > > > > > > eliminated. > > > > > > > > > > > > > > > > Since we always copy a whole vector register into a > > > > > > > > floating- > > > > > > > > point > > > > > > > > register pair, another way to fix this is to merge > > > > > > > > *tf_to_fprx2_0 > > > > > > > > and > > > > > > > > *tf_to_fprx2_1 into a single insn which means we don't > > > > > > > > have > > > > > > > > to > > > > > > > > use > > > > > > > > subregs at all. The downside of this is that the > > > > > > > > assembler > > > > > > > > template > > > > > > > > contains two instructions, now. The upside is that we > > > > > > > > don't > > > > > > > > have > > > > > > > > to > > > > > > > > come up with some artificial insn before RA which might > > > > > > > > be > > > > > > > > more > > > > > > > > readable/maintainable. That is implemented by this > > > > > > > > patch. > > > > > > > > > > > > > > > > In commit r11-4872-ge627cda5686592, the output operand
Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]
On Wed, Sep 11, 2024 at 01:59:48PM +0200, Ilya Leoshkevich wrote: > On Wed, 2024-09-11 at 13:34 +0200, Stefan Schulze Frielinghaus wrote: > > On Wed, Sep 11, 2024 at 01:22:30PM +0200, Ilya Leoshkevich wrote: > > > On Wed, 2024-09-11 at 12:35 +0200, Stefan Schulze Frielinghaus > > > wrote: > > > > On Wed, Sep 11, 2024 at 11:47:54AM +0200, Ilya Leoshkevich wrote: > > > > > On Fri, 2024-08-16 at 09:41 +0200, Stefan Schulze Frielinghaus > > > > > wrote: > > > > > > Currently subregs originating from *tf_to_fprx2_0 and > > > > > > *tf_to_fprx2_1 > > > > > > survive register allocation. This in turn leads to wrong > > > > > > register > > > > > > renaming. Keeping the current approach would mean we need > > > > > > two > > > > > > insns > > > > > > for > > > > > > *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively. Something > > > > > > along > > > > > > the > > > > > > lines > > > > > > > > > > > > (define_insn "*tf_to_fprx2_0" > > > > > > [(set (subreg:DF (match_operand:FPRX2 0 > > > > > > "nonimmediate_operand" > > > > > > "=f") 0) > > > > > > (unspec:DF [(match_operand:TF 1 "general_operand" > > > > > > "v")] > > > > > > UNSPEC_TF_TO_FPRX2_0))] > > > > > > "TARGET_VXE" > > > > > > "#") > > > > > > > > > > > > (define_insn "*tf_to_fprx2_0" > > > > > > [(set (match_operand:DF 0 "nonimmediate_operand" "=f") > > > > > > (unspec:DF [(match_operand:TF 1 "general_operand" > > > > > > "v")] > > > > > > UNSPEC_TF_TO_FPRX2_0))] > > > > > > "TARGET_VXE" > > > > > > "vpdi\t%v0,%v1,%v0,1 > > > > > > [(set_attr "op_type" "VRR")]) > > > > > > > > > > > > and similar for *tf_to_fprx2_1. Note, pre register > > > > > > allocation > > > > > > operand 0 > > > > > > has mode FPRX2 and afterwards DF once subregs have been > > > > > > eliminated. 
> > > > > > > > > > > > Since we always copy a whole vector register into a floating- > > > > > > point > > > > > > register pair, another way to fix this is to merge > > > > > > *tf_to_fprx2_0 > > > > > > and > > > > > > *tf_to_fprx2_1 into a single insn which means we don't have > > > > > > to > > > > > > use > > > > > > subregs at all. The downside of this is that the assembler > > > > > > template > > > > > > contains two instructions, now. The upside is that we don't > > > > > > have > > > > > > to > > > > > > come up with some artificial insn before RA which might be > > > > > > more > > > > > > readable/maintainable. That is implemented by this patch. > > > > > > > > > > > > In commit r11-4872-ge627cda5686592, the output operand > > > > > > specifier > > > > > > %V > > > > > > was > > > > > > introduced which is used in tf_to_fprx2 only, now. I didn't > > > > > > come > > > > > > up > > > > > > with its counterpart like %F for floating-point registers. > > > > > > Instead I > > > > > > printed the register pair in the output function directly. > > > > > > This > > > > > > spares > > > > > > us a new and "rare" format specifier for a single insn. I > > > > > > don't > > > > > > have > > > > > > a > > > > > > strong opinion which option to choose, however, we should > > > > > > either > > > > > > add > > > > > > %F > > > > > > in order to mimic the same behaviour as %V or getting rid of > > > > > > %V > > > > > > and > > > > > > inline the logic in the
Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]
On Wed, Sep 11, 2024 at 01:22:30PM +0200, Ilya Leoshkevich wrote: > On Wed, 2024-09-11 at 12:35 +0200, Stefan Schulze Frielinghaus wrote: > > On Wed, Sep 11, 2024 at 11:47:54AM +0200, Ilya Leoshkevich wrote: > > > On Fri, 2024-08-16 at 09:41 +0200, Stefan Schulze Frielinghaus > > > wrote: > > > > Currently subregs originating from *tf_to_fprx2_0 and > > > > *tf_to_fprx2_1 > > > > survive register allocation. This in turn leads to wrong > > > > register > > > > renaming. Keeping the current approach would mean we need two > > > > insns > > > > for > > > > *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively. Something along > > > > the > > > > lines > > > > > > > > (define_insn "*tf_to_fprx2_0" > > > > [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" > > > > "=f") 0) > > > > (unspec:DF [(match_operand:TF 1 "general_operand" "v")] > > > > UNSPEC_TF_TO_FPRX2_0))] > > > > "TARGET_VXE" > > > > "#") > > > > > > > > (define_insn "*tf_to_fprx2_0" > > > > [(set (match_operand:DF 0 "nonimmediate_operand" "=f") > > > > (unspec:DF [(match_operand:TF 1 "general_operand" "v")] > > > > UNSPEC_TF_TO_FPRX2_0))] > > > > "TARGET_VXE" > > > > "vpdi\t%v0,%v1,%v0,1 > > > > [(set_attr "op_type" "VRR")]) > > > > > > > > and similar for *tf_to_fprx2_1. Note, pre register allocation > > > > operand 0 > > > > has mode FPRX2 and afterwards DF once subregs have been > > > > eliminated. > > > > > > > > Since we always copy a whole vector register into a floating- > > > > point > > > > register pair, another way to fix this is to merge *tf_to_fprx2_0 > > > > and > > > > *tf_to_fprx2_1 into a single insn which means we don't have to > > > > use > > > > subregs at all. The downside of this is that the assembler > > > > template > > > > contains two instructions, now. The upside is that we don't have > > > > to > > > > come up with some artificial insn before RA which might be more > > > > readable/maintainable. That is implemented by this patch. 
> > > > > > > > In commit r11-4872-ge627cda5686592, the output operand specifier > > > > %V > > > > was > > > > introduced which is used in tf_to_fprx2 only, now. I didn't come > > > > up > > > > with its counterpart like %F for floating-point registers. > > > > Instead I > > > > printed the register pair in the output function directly. This > > > > spares > > > > us a new and "rare" format specifier for a single insn. I don't > > > > have > > > > a > > > > strong opinion which option to choose, however, we should either > > > > add > > > > %F > > > > in order to mimic the same behaviour as %V or getting rid of %V > > > > and > > > > inline the logic in the output function. I lean towards the > > > > latter. > > > > Any preferences? > > > > --- > > > > gcc/config/s390/s390.md | 2 + > > > > gcc/config/s390/vector.md | 66 +++- > > > > > > > > -- > > > > gcc/testsuite/gcc.target/s390/pr115860-1.c | 26 + > > > > 3 files changed, 60 insertions(+), 34 deletions(-) > > > > create mode 100644 gcc/testsuite/gcc.target/s390/pr115860-1.c > > > > > > [...] > > > > > > > + char buf[64]; > > > > + switch (which_alternative) > > > > + { > > > > + case 0: > > > > + if (REGNO (operands[0]) == REGNO (operands[1])) > > > > + return "vpdi\t%V0,%v1,%V0,5"; > > > > + else > > > > + return "ldr\t%f0,%f1;vpdi\t%V0,%v1,%V0,5"; > > > > + case 1: > > > > + { > > > > + const char *reg_pair = reg_names[REGNO (operands[0]) + > > > > 1]; > > > > + snprintf (buf, sizeof (buf), > > > > "ld\t%%f0,%%1;ld\t%%%s,8+%%1", > > >
Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]
On Wed, Sep 11, 2024 at 11:47:54AM +0200, Ilya Leoshkevich wrote: > On Fri, 2024-08-16 at 09:41 +0200, Stefan Schulze Frielinghaus wrote: > > Currently subregs originating from *tf_to_fprx2_0 and *tf_to_fprx2_1 > > survive register allocation. This in turn leads to wrong register > > renaming. Keeping the current approach would mean we need two insns > > for > > *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively. Something along the > > lines > > > > (define_insn "*tf_to_fprx2_0" > > [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" > > "=f") 0) > > (unspec:DF [(match_operand:TF 1 "general_operand" "v")] > > UNSPEC_TF_TO_FPRX2_0))] > > "TARGET_VXE" > > "#") > > > > (define_insn "*tf_to_fprx2_0" > > [(set (match_operand:DF 0 "nonimmediate_operand" "=f") > > (unspec:DF [(match_operand:TF 1 "general_operand" "v")] > > UNSPEC_TF_TO_FPRX2_0))] > > "TARGET_VXE" > > "vpdi\t%v0,%v1,%v0,1 > > [(set_attr "op_type" "VRR")]) > > > > and similar for *tf_to_fprx2_1. Note, pre register allocation > > operand 0 > > has mode FPRX2 and afterwards DF once subregs have been eliminated. > > > > Since we always copy a whole vector register into a floating-point > > register pair, another way to fix this is to merge *tf_to_fprx2_0 and > > *tf_to_fprx2_1 into a single insn which means we don't have to use > > subregs at all. The downside of this is that the assembler template > > contains two instructions, now. The upside is that we don't have to > > come up with some artificial insn before RA which might be more > > readable/maintainable. That is implemented by this patch. > > > > In commit r11-4872-ge627cda5686592, the output operand specifier %V > > was > > introduced which is used in tf_to_fprx2 only, now. I didn't come up > > with its counterpart like %F for floating-point registers. Instead I > > printed the register pair in the output function directly. This > > spares > > us a new and "rare" format specifier for a single insn. 
I don't have > > a > > strong opinion which option to choose, however, we should either add > > %F > > in order to mimic the same behaviour as %V or getting rid of %V and > > inline the logic in the output function. I lean towards the latter. > > Any preferences? > > --- > > gcc/config/s390/s390.md | 2 + > > gcc/config/s390/vector.md | 66 +++- > > -- > > gcc/testsuite/gcc.target/s390/pr115860-1.c | 26 + > > 3 files changed, 60 insertions(+), 34 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/s390/pr115860-1.c > > [...] > > > + char buf[64]; > > + switch (which_alternative) > > + { > > + case 0: > > + if (REGNO (operands[0]) == REGNO (operands[1])) > > + return "vpdi\t%V0,%v1,%V0,5"; > > + else > > + return "ldr\t%f0,%f1;vpdi\t%V0,%v1,%V0,5"; > > + case 1: > > + { > > + const char *reg_pair = reg_names[REGNO (operands[0]) + 1]; > > + snprintf (buf, sizeof (buf), "ld\t%%f0,%%1;ld\t%%%s,8+%%1", > > reg_pair); > > I wonder if there is a corner case where 8+ does not fit into short > displacement? That is covered by constraint AR, i.e., for short displacement, and AT for long displacement.
[RFC 3/4] genoutput: Verify hard register constraints
Since genoutput has no information about hard register names we cannot statically verify those names in constraints of the machine description. Therefore, we have to do it at runtime. Although verification shouldn't be too expensive, restrict it to checking builds. This should be sufficient since hard register constraints in machine descriptions probably change rarely, and each commit should be tested with checking anyway, or at the very least before a release is taken. --- gcc/genoutput.cc | 46 ++ gcc/output.h | 2 ++ gcc/toplev.cc| 4 3 files changed, 52 insertions(+) diff --git a/gcc/genoutput.cc b/gcc/genoutput.cc index 2ffb2fb28d2..4f4fde83608 100644 --- a/gcc/genoutput.cc +++ b/gcc/genoutput.cc @@ -200,6 +200,8 @@ static const char indep_constraints[] = ",=+%*?!^$#&g"; static class constraint_data * constraints_by_letter_table[1 << CHAR_BIT]; +static hash_set used_reg_names; + static int mdep_constraint_len (const char *, file_location, int); static void note_constraint (md_rtx_info *); @@ -1156,6 +1158,45 @@ main (int argc, const char **argv) output_insn_data (); output_get_insn_name (); + /* Since genoutput has no information about hard register names we cannot + statically verify hard register names in constraints of the machine + description. Therefore, we have to do it at runtime. Although + verification shouldn't be too expensive, restrict it to checking builds. 
+ */ + printf ("\n\n#if CHECKING_P\n"); + if (used_reg_names.is_empty ()) +printf ("void verify_reg_names_in_constraints () { }\n"); + else +{ + size_t max_len = 0; + for (auto it = used_reg_names.begin (); it != used_reg_names.end (); ++it) + { + size_t len = strlen (*it); + if (len > max_len) + max_len = len; + } + printf ("void\nverify_reg_names_in_constraints ()\n{\n"); + printf (" static const char hregnames[%zu][%zu] = {\n", + used_reg_names.elements (), max_len + 1); + auto it = used_reg_names.begin (); + while (it != used_reg_names.end ()) + { + printf ("\"%s\"", *it); + ++it; + if (it != used_reg_names.end ()) + printf (","); + printf ("\n"); + } + printf (" };\n"); + printf (" for (size_t i = 0; i < %zu; ++i)\n", + used_reg_names.elements ()); + printf ("if (decode_reg_name (hregnames[i]) < 0)\n"); + printf (" internal_error (\"invalid register %%qs used in " + "constraint of machine description\", hregnames[i]);\n"); + printf ("}\n"); +} + printf ("#endif\n"); + fflush (stdout); return (ferror (stdout) != 0 || have_error ? FATAL_EXIT_CODE : SUCCESS_EXIT_CODE); @@ -1294,6 +1335,11 @@ mdep_constraint_len (const char *s, file_location loc, int opno) ptrdiff_t len = end - s; if (*end == '}' && len > 1 && len < 31) { + char *regname = new char[len]; + memcpy (regname, s + 1, len - 1); + regname[len - 1] = '\0'; + if (used_reg_names.add (regname)) + delete[] regname; return len + 1; } } diff --git a/gcc/output.h b/gcc/output.h index 46b0033b221..5f0f8a6098c 100644 --- a/gcc/output.h +++ b/gcc/output.h @@ -636,4 +636,6 @@ extern int default_address_cost (rtx, machine_mode, addr_space_t, bool); /* Stack usage. */ extern void output_stack_usage (void); +extern void verify_reg_names_in_constraints (); + #endif /* ! 
GCC_OUTPUT_H */ diff --git a/gcc/toplev.cc b/gcc/toplev.cc index bc442a08c63..34c372ad1a2 100644 --- a/gcc/toplev.cc +++ b/gcc/toplev.cc @@ -1817,6 +1817,10 @@ backend_init_target (void) static void backend_init (void) { +#if CHECKING_P + verify_reg_names_in_constraints (); +#endif + init_emit_once (); init_rtlanal (); -- 2.45.2
[RFC 1/4] Hard register constraints
Implement hard register constraints of the form {regname} where regname must be any valid register name for the target. Such constraints may be used in asm statements as a replacement for register asm and in machine descriptions. Due to optimizations, it is not unexpected that if two or more inputs require the same value, they also share a common pseudo. However, this in turn may lead to unsatisfiable asm where multiple inputs with different hard register constraints share the same pseudo. Therefore, we have to introduce copies of such a pseudo and use these for conflicting inputs. This is done prior to RA during asmcons in match_asm_constraints_2(). While IRA tries to reduce live ranges, it also replaces some register-register moves. That in turn might undo those copies of a pseudo which we just introduced during asmcons. Thus, check in decrease_live_ranges_number() via valid_replacement_for_asm_input_p() whether it is valid to perform a replacement. The remainder of the patch mostly deals with parsing and decoding hard register constraints. The actual work is done by LRA in process_alt_operands() where a register filter, according to the constraint, is installed. For the sake of "reviewability" and in order to show the beauty of LRA, error handling (which gets pretty involved) is spread out into a subsequent patch. Limitation: Currently, a fixed register cannot be used as a hard register constraint. For example, accessing the stack pointer on x86_64 via void * foo (void) { void *y; __asm__ ("" : "={rsp}" (y)); return y; } leads to an error. This is unfortunate since register asm does not have this limitation. The culprit seems to be that during reload ira_class_hard_regs_num[rclass] does not even include fixed registers which is why lra_assign() ultimately fails. Does anyone have an idea how to lift this limitation? Maybe there is even a shortcut in order to force a pseudo into a hard reg? 
--- gcc/function.cc | 116 ++ gcc/genoutput.cc | 14 gcc/genpreds.cc | 4 +- gcc/ira.cc| 79 +- gcc/lra-constraints.cc| 13 +++ gcc/recog.cc | 11 ++- gcc/stmt.cc | 39 + gcc/stmt.h| 1 + gcc/testsuite/gcc.dg/asm-hard-reg-1.c | 85 +++ gcc/testsuite/gcc.dg/asm-hard-reg-2.c | 33 gcc/testsuite/gcc.dg/asm-hard-reg-3.c | 25 ++ gcc/testsuite/gcc.dg/asm-hard-reg-4.c | 50 +++ gcc/testsuite/gcc.dg/asm-hard-reg-5.c | 36 gcc/testsuite/gcc.dg/asm-hard-reg-6.c | 60 + gcc/testsuite/gcc.dg/asm-hard-reg-7.c | 41 + gcc/testsuite/gcc.dg/asm-hard-reg-8.c | 49 +++ 16 files changed, 653 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-1.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-2.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-3.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-4.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-5.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-6.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-7.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-8.c diff --git a/gcc/function.cc b/gcc/function.cc index a6f6de34942..bf5992f2b06 100644 --- a/gcc/function.cc +++ b/gcc/function.cc @@ -6974,6 +6974,115 @@ match_asm_constraints_1 (rtx_insn *insn, rtx *p_sets, int noutputs) df_insn_rescan (insn); } +/* It is expected and desired that optimizations coalesce multiple pseudos into + one whenever possible. However, in case of hard register constraints we may + have to undo this and introduce copies since otherwise we could constrain a + single pseudo to different hard registers. For example, during register + allocation the following insn would be unsatisfiable since pseudo 60 is + constrained to hard register r5 and r6 at the same time. 
+ + (insn 7 5 0 2 (asm_operands/v ("foo") ("") 0 [ + (reg:DI 60) repeated x2 + ] + [ + (asm_input:DI ("{r5}") t.c:4) + (asm_input:DI ("{r6}") t.c:4) + ] + [] t.c:4) "t.c":4:3 -1 + (expr_list:REG_DEAD (reg:DI 60) + (nil))) + + Therefore, introduce a copy of pseudo 60 and transform it into + + (insn 10 5 7 2 (set (reg:DI 62) + (reg:DI 60)) "t.c":4:3 1503 {*movdi_64} + (nil)) + (insn 7 10 11 2 (asm_operands/v ("foo") ("") 0 [ + (reg:DI 60) + (reg:DI 62) + ] + [ + (asm_input:DI ("{r5}") t.c:4) + (asm_input:DI ("{r6}") t.c:4) + ] + [] t.c:4) "t.c":4:3 -1 + (expr_list:REG_DEAD (reg:DI 62) + (expr_list:REG_DEAD (reg:DI 60) + (nil + + Now, LRA can assign pseudo 60 to r5, and pseudo 62 to r6. + +
[RFC 4/4] Rewrite register asm into hard register constraints
Currently a register asm already materializes during expand. This means, a hard register is allocated for the very first access of a register asm as e.g. in an assignment. As a consequence this might lead to suboptimal register allocation if the assignment and the using asm statement are spread far apart. Even more problematic are function calls in between a register asm assignment and its using asm statement since hard registers may be clobbered by a call. The former may be solved by pulling register asm assignments and asm statements close together. However, the latter is not easily solved since sometimes function calls are implicit. For example int foo (int *x) { register int y asm ("0") = 42; register int z asm ("1") = *x; asm ("bar\t%0,%1" : "+r" (z) : "r" (y)); return z; } If compiled with address sanitizer, then a function call is introduced for the memory load which in turn may interfere with the initialization of register asm y. Likewise, for some targets and configurations even an operation like an addition may lead to an implicit library call. In contrast hard register constraints materialize during register allocation and therefore do not suffer from this, i.e., asm operands are kept in pseudos until RA. This patch adds the feature of rewriting local register asm into code which exploits hard register constraints. For example register int global asm ("r3"); int foo (int x0) { register int x asm ("r4") = x0; register int y asm ("r5"); asm ("bar\t%0,%1,%2" : "=r" (x) : "0" (x), "r" (global)); x += 42; asm ("baz\t%0,%1" : "=r" (y) : "r" (x)); return y; } is rewritten during gimplification into register int global asm ("r3"); int foo (int x0) { int x = x0; int y; asm ("bar\t%0,%1,%2" : "={r4}" (x) : "0" (x), "r" (global)); x += 42; asm ("baz\t%0,%1" : "={r5}" (y) : "{r4}" (x)); return y; } The resulting code solely relies on hard register constraints modulo global register asm. 
Since I consider this as an experimental feature it is hidden behind new flag -fdemote-register-asm (I'm open for other naming suggestions). --- gcc/common.opt| 4 + gcc/gimplify.cc | 78 +++ .../gcc.dg/asm-hard-reg-demotion-1.c | 19 + .../gcc.dg/asm-hard-reg-demotion-2.c | 19 + gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h | 52 + 5 files changed, 172 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-1.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion-2.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-demotion.h diff --git a/gcc/common.opt b/gcc/common.opt index ea39f87ae71..859a735a0b7 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -3422,6 +3422,10 @@ fverbose-asm Common Var(flag_verbose_asm) Add extra commentary to assembler output. +fdemote-register-asm +Common Var(flag_demote_register_asm) Init(0) +Demote local register asm and use hard register constraints instead + fvisibility= Common Joined RejectNegative Enum(symbol_visibility) Var(default_visibility) Init(VISIBILITY_DEFAULT) -fvisibility=[default|internal|hidden|protected] Set the default symbol visibility. diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index 08e0b5d047b..c9bd1769c28 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -7049,6 +7049,73 @@ num_alternatives (const_tree link) return num + 1; } +static hash_set demote_register_asm; + +static void +gimplify_demote_register_asm (tree link) +{ + if (!flag_demote_register_asm) +return; + tree op = TREE_VALUE (link); + if (!VAR_P (op) || !DECL_HARD_REGISTER (op) || is_global_var (op)) +return; + tree id = DECL_ASSEMBLER_NAME (op); + const char *regname = IDENTIFIER_POINTER (id); + ++regname; + int regno = decode_reg_name (regname); + if (regno < 0) +/* This indicates an error and we error out later on. 
*/ +return; + const char *constraint = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (link))); + auto_vec constraint_new; + for (const char *p = constraint; *p; ) +{ + bool pushed = false; + switch (*p) + { + case '+': case '=': case '%': case '?': case '!': case '*': case '&': + case '#': case '$': case '^': case '{': case 'E': case 'F': case 'G': + case 'H': case 's': case 'i': case 'n': case 'I': case 'J': case 'K': + case 'L': case 'M': case 'N': case 'O': case 'P': case ',': case '0': + case '1': case '2': case '3': case '4': case '5': case '6': case '7': + case '8': case '9': case '[': case '<': case '>': case 'g': case 'X': + break; + + default: + if (!ISALPHA (*p)) + break; + enum constraint_num cn = lookup_constraint (p); + enum reg_class rclass = reg_class_for_constraint (cn); + if (rclass != NO_REGS || insn_extra_address_constraint (cn)) + { + gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (regno), r
[RFC 2/4] Error handling for hard register constraints
This implements some basic error handling for hard register constraints including potential conflicts with register asm operands. In contrast to register asm operands, hard register constraints allow more than just one register per operand, and even more than just one register per alternative. For example, a valid constraint for an operand is "{r0}{r1}m,{r2}". However, this also means that we have to make sure that each register is used at most once in each alternative over all outputs and likewise over all inputs. For asm statements this is done by this patch during gimplification. For hard register constraints used in machine description, error handling is still a todo and I haven't investigated this so far and consider this rather a low priority. There are 9/10 call sites for parse_{input,output}_constraint() which I didn't dare to touch in the first run. If this patch is about to be accepted I could change those call sites and explicitly pass a null pointer instead of overloading those functions as it is done right now. I consider this an implementation nit and didn't want to clutter the patch for reviewing. 
--- gcc/cfgexpand.cc | 42 gcc/gimplify.cc | 73 +- gcc/gimplify_reg_info.h | 130 ++ gcc/stmt.cc | 229 +- gcc/stmt.h| 8 +- gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c | 83 +++ gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c | 20 ++ gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c | 21 ++ gcc/testsuite/gcc.dg/pr87600-2.c | 30 +-- gcc/testsuite/gcc.dg/pr87600-3.c | 35 +++ gcc/testsuite/gcc.dg/pr87600-3.s | 0 .../gcc.target/s390/asm-hard-reg-1.c | 103 .../gcc.target/s390/asm-hard-reg-2.c | 43 .../gcc.target/s390/asm-hard-reg-3.c | 42 gcc/testsuite/lib/scanasm.exp | 4 + 15 files changed, 779 insertions(+), 84 deletions(-) create mode 100644 gcc/gimplify_reg_info.h create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-1.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-2.c create mode 100644 gcc/testsuite/gcc.dg/asm-hard-reg-error-3.c create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.c create mode 100644 gcc/testsuite/gcc.dg/pr87600-3.s create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc index 13f8c08d295..fdbbd93f1b5 100644 --- a/gcc/cfgexpand.cc +++ b/gcc/cfgexpand.cc @@ -2966,44 +2966,6 @@ expand_asm_loc (tree string, int vol, location_t locus) emit_insn (body); } -/* Return the number of times character C occurs in string S. */ -static int -n_occurrences (int c, const char *s) -{ - int n = 0; - while (*s) -n += (*s++ == c); - return n; -} - -/* A subroutine of expand_asm_operands. Check that all operands have - the same number of alternatives. Return true if so. 
*/ - -static bool -check_operand_nalternatives (const vec &constraints) -{ - unsigned len = constraints.length(); - if (len > 0) -{ - int nalternatives = n_occurrences (',', constraints[0]); - - if (nalternatives + 1 > MAX_RECOG_ALTERNATIVES) - { - error ("too many alternatives in %"); - return false; - } - - for (unsigned i = 1; i < len; ++i) - if (n_occurrences (',', constraints[i]) != nalternatives) - { - error ("operand constraints for % differ " - "in number of alternatives"); - return false; - } -} - return true; -} - /* Check for overlap between registers marked in CLOBBERED_REGS and anything inappropriate in T. Emit error and return the register variable definition for error, NULL_TREE for ok. */ @@ -3169,10 +3131,6 @@ expand_asm_stmt (gasm *stmt) = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (t))); } - /* ??? Diagnose during gimplification? */ - if (! check_operand_nalternatives (constraints)) -return; - /* Count the number of meaningful clobbered registers, ignoring what we would ignore later. */ auto_vec clobber_rvec; diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index 26a216e151d..08e0b5d047b 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -70,6 +70,10 @@ along with GCC; see the file COPYING3. If not see #include "omp-offload.h" #include "context.h" #include "tree-nested.h" +#include "insn-config.h" +#include "recog.h" +#include "output.h" +#include "gimplify_reg_info.h" /* Identifier for a basic condition, mapping it to other basic conditions of its Boolean expression. Basic conditions given the same uid (in the same @@ -7009,6 +7013,42 @@ gimplify_addr_expr (tree *expr_p, gimple_seq *pre_p, gimple_seq *
[RFC 0/4] Hard Register Constraints
This series introduces hard register constraints. The first patch enables hard register constraints for asm statements and for machine descriptions. The subsequent patch adds some basic error handling for asm statements. The third patch adds some verification of register names used in machine description. The fourth and last patch adds the feature of rewriting local register asm into hard register constraints. This series was bootstrapped and regtested on s390. Furthermore, the new dg-compile tests were verified via cross compilers for the enabled targets. There is still some fallout if -fdemote-register-asm is used since a couple of features are missing as e.g. erroring out during gimplification if the clobber set of registers intersects with input/output registers. As a larger test vehicle I've compiled and regtested glibc on s390 using -fdemote-register-asm without any fallout. On x86_64 this fails due to the limitation that fixed registers are currently not supported for hard register constraints (see commit message of the first patch). This is also the reason why I'm posting this series already since I was hoping to get some feedback about this limitation. Furthermore, I've compiled the Linux kernel on s390 and x86_64 with -fdemote-register-asm. Interestingly, the Linux kernel for x86_64 makes use of the following asm statement: #define call_on_stack(stack, func, asm_call, argconstr...) \ { \ register void *tos asm("r11"); \ \ tos = ((void *)(stack));\ \ asm_inline volatile(\ "movq %%rsp, (%[tos]) \n" \ "movq %[tos], %%rsp \n" \ \ asm_call\ \ "popq %%rsp \n" \ \ : "+r" (tos), ASM_CALL_CONSTRAINT \ : [__func] "i" (func), [tos] "r" (tos) argconstr\ : "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", \ "memory" \ ); \ } Note the output "+r" (tos) and the input [tos] "r" (tos) Currently I error out for this since I consider this as two inputs using the same hard register. One time an implicit input via '+' and a second time via the explicit input. 
Thus, actually I would expect a '=' instead of a '+' for the output constraint since the input is explicitly mentioned, or remove the input entirely and just use the inoutput [tos] "+r" (tos) If you consider this valid asm I would have to adjust the error handling. Either way, this is just about error handling and doesn't really affect code generation. Stefan Schulze Frielinghaus (4): Hard register constraints Error handling for hard register constraints genoutput: Verify hard register constraints Rewrite register asm into hard register constraints gcc/cfgexpand.cc | 42 --- gcc/common.opt| 4 + gcc/function.cc | 116 gcc/genoutput.cc | 60 gcc/genpreds.cc | 4 +- gcc/gimplify.cc | 151 +- gcc/gimplify_reg_info.h | 130 + gcc/ira.cc| 79 +- gcc/lra-constraints.cc| 13 + gcc/output.h | 2 + gcc/recog.cc | 11 +- gcc/stmt.cc | 268 +- gcc/stmt.h| 9 +- gcc/testsuite/gcc.dg/asm-hard-reg-1.c | 85 ++ gcc/testsuite/gcc.dg/asm-hard-reg-2.c | 33 +++ gcc/testsuite/gcc.dg/asm-hard-reg-3.c | 25 ++ gcc/testsuite/gcc.dg/asm-hard-reg-4.c | 50 gcc/testsuite/gcc.dg/asm-hard-reg-5.c | 36 +++ gcc/testsuite/gcc.dg/asm-hard-reg-6.c | 60 gcc/testsuite/gcc.dg/asm-hard-reg-7.c |
Re: [PATCH] s390: Fix TF to FPRX2 conversion [PR115860]
Ping On Fri, Aug 16, 2024 at 09:41:55AM +0200, Stefan Schulze Frielinghaus wrote: > Currently subregs originating from *tf_to_fprx2_0 and *tf_to_fprx2_1 > survive register allocation. This in turn leads to wrong register > renaming. Keeping the current approach would mean we need two insns for > *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively. Something along the > lines > > (define_insn "*tf_to_fprx2_0" > [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "=f") 0) > (unspec:DF [(match_operand:TF 1 "general_operand" "v")] >UNSPEC_TF_TO_FPRX2_0))] > "TARGET_VXE" > "#") > > (define_insn "*tf_to_fprx2_0" > [(set (match_operand:DF 0 "nonimmediate_operand" "=f") > (unspec:DF [(match_operand:TF 1 "general_operand" "v")] >UNSPEC_TF_TO_FPRX2_0))] > "TARGET_VXE" > "vpdi\t%v0,%v1,%v0,1 > [(set_attr "op_type" "VRR")]) > > and similar for *tf_to_fprx2_1. Note, pre register allocation operand 0 > has mode FPRX2 and afterwards DF once subregs have been eliminated. > > Since we always copy a whole vector register into a floating-point > register pair, another way to fix this is to merge *tf_to_fprx2_0 and > *tf_to_fprx2_1 into a single insn which means we don't have to use > subregs at all. The downside of this is that the assembler template > contains two instructions, now. The upside is that we don't have to > come up with some artificial insn before RA which might be more > readable/maintainable. That is implemented by this patch. > > In commit r11-4872-ge627cda5686592, the output operand specifier %V was > introduced which is used in tf_to_fprx2 only, now. I didn't come up > with its counterpart like %F for floating-point registers. Instead I > printed the register pair in the output function directly. This spares > us a new and "rare" format specifier for a single insn. 
I don't have a > strong opinion which option to choose, however, we should either add %F > in order to mimic the same behaviour as %V or getting rid of %V and > inline the logic in the output function. I lean towards the latter. > Any preferences? > --- > gcc/config/s390/s390.md| 2 + > gcc/config/s390/vector.md | 66 +++--- > gcc/testsuite/gcc.target/s390/pr115860-1.c | 26 + > 3 files changed, 60 insertions(+), 34 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/s390/pr115860-1.c > > diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md > index 3d5759d6252..31240899934 100644 > --- a/gcc/config/s390/s390.md > +++ b/gcc/config/s390/s390.md > @@ -241,6 +241,8 @@ > UNSPEC_VEC_VFMIN > UNSPEC_VEC_VFMAX > > + UNSPEC_TF_TO_FPRX2 > + > UNSPEC_NNPA_VCLFNHS_V8HI > UNSPEC_NNPA_VCLFNLS_V8HI > UNSPEC_NNPA_VCRNFS_V8HI > diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md > index a75b7cb5825..561182e0c2c 100644 > --- a/gcc/config/s390/vector.md > +++ b/gcc/config/s390/vector.md > @@ -907,36 +907,36 @@ >"vmrlg\t%0,%1,%2"; >[(set_attr "op_type" "VRR")]) > > - > -(define_insn "*tf_to_fprx2_0" > - [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) > - (subreg:DF (match_operand:TF1 "general_operand" "v") 0))] > - "TARGET_VXE" > - ; M4 == 1 corresponds to %v0[0] = %v1[0]; %v0[1] = %v0[1]; > - "vpdi\t%v0,%v1,%v0,1" > - [(set_attr "op_type" "VRR")]) > - > -(define_insn "*tf_to_fprx2_1" > - [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8) > - (subreg:DF (match_operand:TF1 "general_operand" "v") 8))] > +(define_insn "tf_to_fprx2" > + [(set (match_operand:FPRX2 0 "register_operand" "=f,f ,f") > + (unspec:FPRX2 [(match_operand:TF 1 "general_operand" "v,AR,AT")] > + UNSPEC_TF_TO_FPRX2))] >"TARGET_VXE" > - ; M4 == 5 corresponds to %V0[0] = %v1[1]; %V0[1] = %V0[1]; > - "vpdi\t%V0,%v1,%V0,5" > - [(set_attr "op_type" "VRR")]) > - > -(define_insn_and_split "tf_to_fprx2" > - [(set (match_operand:FPRX20 
"nonimmediate_operand" "=f,f") > - (subreg:FPRX2 (match_operand:TF 1 "general_operand" "v,AR") 0))] > - "TARGET_VXE" > - "#&q
Re: [PATCH] s390: Fix strict_low_part generation
Ping On Fri, Aug 16, 2024 at 09:14:02AM +0200, Stefan Schulze Frielinghaus wrote: > In s390_expand_insv(), if generating code for ICM et al. src is a MEM > and gen_lowpart might force src into a register such that we end up with > patterns which do not match anymore. Use adjust_address() instead in > order to preserve a MEM. > > Furthermore, it is not straight forward to enforce a subreg. For > example, in case of a paradoxical subreg, gen_lowpart() may return a > register. In order to compensate this, s390_gen_lowpart_subreg() emits > a reference to a pseudo which does not coincide with its definition > which is wrong. Additionally, if dest is a paradoxical subreg, then do > not try to emit a strict_low_part since it could mean that dest was not > initialized even though this might be fixed up later by init-regs. > > Splitter for insn *get_tp_64, *zero_extendhisi2_31, > *zero_extendqisi2_31, *zero_extendqihi2_31 are applied after reload. > Thus, operands[0] is a hard register and gen_lowpart (m, operands[0]) > just returns the hard register for mode m which is fine to use as an > argument for strict_low_part, i.e., we do not need to enforce subregs > here since after reload subregs are supposed to be eliminated anyway. > > This fixes gcc.dg/torture/pr111821.c. > > gcc/ChangeLog: > > * config/s390/s390-protos.h (s390_gen_lowpart_subreg): Remove. > * config/s390/s390.cc (s390_gen_lowpart_subreg): Remove. > (s390_expand_insv): Use adjust_address() and emit a > strict_low_part only in case of a natural subreg. > * config/s390/s390.md: Use gen_lowpart() instead of > s390_gen_lowpart_subreg(). > --- > Bootstrapped and regtested on s390. Ok for mainline,gcc12,gcc13,gcc14? 
> > gcc/config/s390/s390-protos.h | 1 - > gcc/config/s390/s390.cc | 47 +++ > gcc/config/s390/s390.md | 13 +- > 3 files changed, 20 insertions(+), 41 deletions(-) > > diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h > index b4646ccb606..e7ac59d17da 100644 > --- a/gcc/config/s390/s390-protos.h > +++ b/gcc/config/s390/s390-protos.h > @@ -50,7 +50,6 @@ extern void s390_set_has_landing_pad_p (bool); > extern bool s390_hard_regno_rename_ok (unsigned int, unsigned int); > extern int s390_class_max_nregs (enum reg_class, machine_mode); > extern bool s390_return_addr_from_memory(void); > -extern rtx s390_gen_lowpart_subreg (machine_mode, rtx); > extern bool s390_fma_allowed_p (machine_mode); > #if S390_USE_TARGET_ATTRIBUTE > extern tree s390_valid_target_attribute_tree (tree args, > diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc > index 7aea776da2f..7cdcebfc08b 100644 > --- a/gcc/config/s390/s390.cc > +++ b/gcc/config/s390/s390.cc > @@ -516,31 +516,6 @@ s390_return_addr_from_memory () >return cfun_gpr_save_slot(RETURN_REGNUM) == SAVE_SLOT_STACK; > } > > -/* Generate a SUBREG for the MODE lowpart of EXPR. > - > - In contrast to gen_lowpart it will always return a SUBREG > - expression. This is useful to generate STRICT_LOW_PART > - expressions. */ > -rtx > -s390_gen_lowpart_subreg (machine_mode mode, rtx expr) > -{ > - rtx lowpart = gen_lowpart (mode, expr); > - > - /* There might be no SUBREG in case it could be applied to the hard > - REG rtx or it could be folded with a paradoxical subreg. Bring > - it back. */ > - if (!SUBREG_P (lowpart)) > -{ > - machine_mode reg_mode = TARGET_ZARCH ? DImode : SImode; > - gcc_assert (REG_P (lowpart)); > - lowpart = gen_lowpart_SUBREG (mode, > - gen_rtx_REG (reg_mode, > - REGNO (lowpart))); > -} > - > - return lowpart; > -} > - > /* Return nonzero if it's OK to use fused multiply-add for MODE. 
*/ > bool > s390_fma_allowed_p (machine_mode mode) > @@ -7112,15 +7087,21 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src) >/* Emit a strict_low_part pattern if possible. */ >if (smode_bsize == bitsize && bitpos == mode_bsize - smode_bsize) > { > - rtx low_dest = s390_gen_lowpart_subreg (smode, dest); > - rtx low_src = gen_lowpart (smode, src); > - > - switch (smode) > + rtx low_dest = gen_lowpart (smode, dest); > + if (SUBREG_P (low_dest) && !paradoxical_subreg_p (low_dest)) > { > - case E_QImode: emit_insn (gen_movstrictqi (low_dest, low_src)); > return true; > - case E_HImode: emit_insn (gen_movstricthi (low_dest, low_src)); > return true; > - case E_SImode: emit_insn (gen_movstrictsi (low_de
[PATCH] s390: Fix TF to FPRX2 conversion [PR115860]
Currently subregs originating from *tf_to_fprx2_0 and *tf_to_fprx2_1 survive register allocation. This in turn leads to wrong register renaming. Keeping the current approach would mean we need two insns for *tf_to_fprx2_0 and *tf_to_fprx2_1, respectively. Something along the lines (define_insn "*tf_to_fprx2_0" [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "=f") 0) (unspec:DF [(match_operand:TF 1 "general_operand" "v")] UNSPEC_TF_TO_FPRX2_0))] "TARGET_VXE" "#") (define_insn "*tf_to_fprx2_0" [(set (match_operand:DF 0 "nonimmediate_operand" "=f") (unspec:DF [(match_operand:TF 1 "general_operand" "v")] UNSPEC_TF_TO_FPRX2_0))] "TARGET_VXE" "vpdi\t%v0,%v1,%v0,1" [(set_attr "op_type" "VRR")]) and similar for *tf_to_fprx2_1. Note, pre register allocation operand 0 has mode FPRX2 and afterwards DF once subregs have been eliminated. Since we always copy a whole vector register into a floating-point register pair, another way to fix this is to merge *tf_to_fprx2_0 and *tf_to_fprx2_1 into a single insn which means we don't have to use subregs at all. The downside of this is that the assembler template contains two instructions, now. The upside is that we don't have to come up with some artificial insn before RA which might be more readable/maintainable. That is implemented by this patch. In commit r11-4872-ge627cda5686592, the output operand specifier %V was introduced which is used in tf_to_fprx2 only, now. I didn't come up with its counterpart like %F for floating-point registers. Instead I printed the register pair in the output function directly. This spares us a new and "rare" format specifier for a single insn. I don't have a strong opinion which option to choose, however, we should either add %F in order to mimic the same behaviour as %V or get rid of %V and inline the logic in the output function. I lean towards the latter. Any preferences? 
--- gcc/config/s390/s390.md| 2 + gcc/config/s390/vector.md | 66 +++--- gcc/testsuite/gcc.target/s390/pr115860-1.c | 26 + 3 files changed, 60 insertions(+), 34 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/pr115860-1.c diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 3d5759d6252..31240899934 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -241,6 +241,8 @@ UNSPEC_VEC_VFMIN UNSPEC_VEC_VFMAX + UNSPEC_TF_TO_FPRX2 + UNSPEC_NNPA_VCLFNHS_V8HI UNSPEC_NNPA_VCLFNLS_V8HI UNSPEC_NNPA_VCRNFS_V8HI diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index a75b7cb5825..561182e0c2c 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -907,36 +907,36 @@ "vmrlg\t%0,%1,%2"; [(set_attr "op_type" "VRR")]) - -(define_insn "*tf_to_fprx2_0" - [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) - (subreg:DF (match_operand:TF1 "general_operand" "v") 0))] - "TARGET_VXE" - ; M4 == 1 corresponds to %v0[0] = %v1[0]; %v0[1] = %v0[1]; - "vpdi\t%v0,%v1,%v0,1" - [(set_attr "op_type" "VRR")]) - -(define_insn "*tf_to_fprx2_1" - [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8) - (subreg:DF (match_operand:TF1 "general_operand" "v") 8))] +(define_insn "tf_to_fprx2" + [(set (match_operand:FPRX2 0 "register_operand" "=f,f ,f") + (unspec:FPRX2 [(match_operand:TF 1 "general_operand" "v,AR,AT")] + UNSPEC_TF_TO_FPRX2))] "TARGET_VXE" - ; M4 == 5 corresponds to %V0[0] = %v1[1]; %V0[1] = %V0[1]; - "vpdi\t%V0,%v1,%V0,5" - [(set_attr "op_type" "VRR")]) - -(define_insn_and_split "tf_to_fprx2" - [(set (match_operand:FPRX20 "nonimmediate_operand" "=f,f") - (subreg:FPRX2 (match_operand:TF 1 "general_operand" "v,AR") 0))] - "TARGET_VXE" - "#" - "!(MEM_P (operands[1]) && MEM_VOLATILE_P (operands[1]))" - [(set (match_dup 2) (match_dup 3)) - (set (match_dup 4) (match_dup 5))] { - operands[2] = simplify_gen_subreg (DFmode, operands[0], FPRX2mode, 0); - operands[3] = simplify_gen_subreg 
(DFmode, operands[1], TFmode, 0); - operands[4] = simplify_gen_subreg (DFmode, operands[0], FPRX2mode, 8); - operands[5] = simplify_gen_subreg (DFmode, operands[1], TFmode, 8); + char buf[64]; + switch (which_alternative) +{ +case 0: + if (REGNO (operands[0]) == REGNO (operands[1])) + return "vpdi\t%V0,%v1,%V0,5"; + else + return "ldr\t%f0,%f1;vpdi\t%V0,%v1,%V0,5"; +case 1: + { + const char *reg_pair = reg_names[REGNO (operands[0]) + 1]; + snprintf (buf, sizeof (buf), "ld\t%%f0,%%1;ld\t%%%s,8+%%1", reg_pair); + output_asm_insn (buf, operands); + return ""; + } +case 2: + { + const char *reg_pair = reg_names[REGNO (operands[0]) + 1]; + snprintf (buf, sizeof (buf), "ldy\t%%f0,%%1;ldy\t%%%s,8+%%1", reg_pair); +
[PATCH] s390: Fix strict_low_part generation
In s390_expand_insv(), if generating code for ICM et al. src is a MEM and gen_lowpart might force src into a register such that we end up with patterns which do not match anymore. Use adjust_address() instead in order to preserve a MEM. Furthermore, it is not straightforward to enforce a subreg. For example, in case of a paradoxical subreg, gen_lowpart() may return a register. In order to compensate for this, s390_gen_lowpart_subreg() emits a reference to a pseudo which does not coincide with its definition which is wrong. Additionally, if dest is a paradoxical subreg, then do not try to emit a strict_low_part since it could mean that dest was not initialized even though this might be fixed up later by init-regs. Splitters for insns *get_tp_64, *zero_extendhisi2_31, *zero_extendqisi2_31, *zero_extendqihi2_31 are applied after reload. Thus, operands[0] is a hard register and gen_lowpart (m, operands[0]) just returns the hard register for mode m which is fine to use as an argument for strict_low_part, i.e., we do not need to enforce subregs here since after reload subregs are supposed to be eliminated anyway. This fixes gcc.dg/torture/pr111821.c. gcc/ChangeLog: * config/s390/s390-protos.h (s390_gen_lowpart_subreg): Remove. * config/s390/s390.cc (s390_gen_lowpart_subreg): Remove. (s390_expand_insv): Use adjust_address() and emit a strict_low_part only in case of a natural subreg. * config/s390/s390.md: Use gen_lowpart() instead of s390_gen_lowpart_subreg(). --- Bootstrapped and regtested on s390. Ok for mainline,gcc12,gcc13,gcc14? 
gcc/config/s390/s390-protos.h | 1 - gcc/config/s390/s390.cc | 47 +++ gcc/config/s390/s390.md | 13 +- 3 files changed, 20 insertions(+), 41 deletions(-) diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index b4646ccb606..e7ac59d17da 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -50,7 +50,6 @@ extern void s390_set_has_landing_pad_p (bool); extern bool s390_hard_regno_rename_ok (unsigned int, unsigned int); extern int s390_class_max_nregs (enum reg_class, machine_mode); extern bool s390_return_addr_from_memory(void); -extern rtx s390_gen_lowpart_subreg (machine_mode, rtx); extern bool s390_fma_allowed_p (machine_mode); #if S390_USE_TARGET_ATTRIBUTE extern tree s390_valid_target_attribute_tree (tree args, diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 7aea776da2f..7cdcebfc08b 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -516,31 +516,6 @@ s390_return_addr_from_memory () return cfun_gpr_save_slot(RETURN_REGNUM) == SAVE_SLOT_STACK; } -/* Generate a SUBREG for the MODE lowpart of EXPR. - - In contrast to gen_lowpart it will always return a SUBREG - expression. This is useful to generate STRICT_LOW_PART - expressions. */ -rtx -s390_gen_lowpart_subreg (machine_mode mode, rtx expr) -{ - rtx lowpart = gen_lowpart (mode, expr); - - /* There might be no SUBREG in case it could be applied to the hard - REG rtx or it could be folded with a paradoxical subreg. Bring - it back. */ - if (!SUBREG_P (lowpart)) -{ - machine_mode reg_mode = TARGET_ZARCH ? DImode : SImode; - gcc_assert (REG_P (lowpart)); - lowpart = gen_lowpart_SUBREG (mode, - gen_rtx_REG (reg_mode, -REGNO (lowpart))); -} - - return lowpart; -} - /* Return nonzero if it's OK to use fused multiply-add for MODE. */ bool s390_fma_allowed_p (machine_mode mode) @@ -7112,15 +7087,21 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src) /* Emit a strict_low_part pattern if possible. 
*/ if (smode_bsize == bitsize && bitpos == mode_bsize - smode_bsize) { - rtx low_dest = s390_gen_lowpart_subreg (smode, dest); - rtx low_src = gen_lowpart (smode, src); - - switch (smode) + rtx low_dest = gen_lowpart (smode, dest); + if (SUBREG_P (low_dest) && !paradoxical_subreg_p (low_dest)) { - case E_QImode: emit_insn (gen_movstrictqi (low_dest, low_src)); return true; - case E_HImode: emit_insn (gen_movstricthi (low_dest, low_src)); return true; - case E_SImode: emit_insn (gen_movstrictsi (low_dest, low_src)); return true; - default: break; + poly_int64 offset = GET_MODE_SIZE (mode) - GET_MODE_SIZE (smode); + rtx low_src = adjust_address (src, smode, offset); + switch (smode) + { + case E_QImode: emit_insn (gen_movstrictqi (low_dest, low_src)); + return true; + case E_HImode: emit_insn (gen_movstricthi (low_dest, low_src)); + return true; + case E_SImode: emit_insn (gen_movstrictsi (low_dest, low_src)); + retu
[PATCH] s390: Remove vector intrinsics
The following intrinsics are not implemented. Thus, remove them. Ok for mainline? gcc/ChangeLog: * config/s390/vecintrin.h (vec_vstbrh): Remove. (vec_vstbrf): Remove. (vec_vstbrg): Remove. (vec_vstbrq): Remove. (vec_vstbrf_flt): Remove. (vec_vstbrg_dbl): Remove. (vec_vsterb): Remove. (vec_vsterh): Remove. (vec_vsterf): Remove. (vec_vsterg): Remove. (vec_vsterf_flt): Remove. (vec_vsterg_dbl): Remove. --- gcc/config/s390/vecintrin.h | 14 -- 1 file changed, 14 deletions(-) diff --git a/gcc/config/s390/vecintrin.h b/gcc/config/s390/vecintrin.h index daeed91ef97..de29f913637 100644 --- a/gcc/config/s390/vecintrin.h +++ b/gcc/config/s390/vecintrin.h @@ -160,20 +160,6 @@ __lcbb(const void *ptr, int bndry) cc != 3 ? 1 : 0; \ }) -#define vec_vstbrh vec_vlbrh -#define vec_vstbrf vec_vlbrf -#define vec_vstbrg vec_vlbrg -#define vec_vstbrq vec_vlbrq -#define vec_vstbrf_flt vec_vlbrf_flt -#define vec_vstbrg_dbl vec_vlbrg_dbl - -#define vec_vsterb vec_vlerb -#define vec_vsterh vec_vlerh -#define vec_vsterf vec_vlerh -#define vec_vsterg vec_vlerh -#define vec_vsterf_flt vec_vlerf_flt -#define vec_vsterg_dbl vec_vlerg_dbl - #define vec_extend_to_fp32_hi __builtin_s390_vclfnhs #define vec_extend_to_fp32_lo __builtin_s390_vclfnls #define vec_round_from_fp32 __builtin_s390_vcrnfs -- 2.45.2
[PATCH] s390: Fix high-level builtins vec_gfmsum{,_accum}_128
Starting with r14-9449-g9f2b16ce1efef0 builtins were streamlined with those in LLVM. In particular s390_vgfm{,a}g have been changed from UV16QI to UINT128 in order to match those in LLVM. However, these low-level builtins are directly used by the high-level builtins vec_gfmsum{,_accum}_128 which expect UV16QI instead. Therefore, introduce new low-level builtins s390_vgfm{,a}g_128 and make use of them, respectively. Bootstrapped on s390. Ok for mainline and releases/gcc-14? gcc/ChangeLog: * config/s390/s390-builtin-types.def (BT_FN_UV16QI_UV2DI_UV2DI): New. (BT_FN_UV16QI_UV2DI_UV2DI_UV16QI): New. * config/s390/s390-builtins.def (s390_vgfmg_128): New. (s390_vgfmag_128): New. * config/s390/vecintrin.h (vec_gfmsum_128): Use s390_vgfmg_128. (vec_gfmsum_accum_128): Use s390_vgfmag_128. --- gcc/config/s390/s390-builtin-types.def | 2 ++ gcc/config/s390/s390-builtins.def | 2 ++ gcc/config/s390/vecintrin.h| 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gcc/config/s390/s390-builtin-types.def b/gcc/config/s390/s390-builtin-types.def index d70eaade8ea..e6f5631ed7a 100644 --- a/gcc/config/s390/s390-builtin-types.def +++ b/gcc/config/s390/s390-builtin-types.def @@ -221,6 +221,7 @@ DEF_FN_TYPE_2 (BT_FN_UV16QI_UCHAR_UCHAR, BT_UV16QI, BT_UCHAR, BT_UCHAR) DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_INTPTR, BT_UV16QI, BT_UV16QI, BT_INTPTR) DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_UCHAR, BT_UV16QI, BT_UV16QI, BT_UCHAR) DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI) +DEF_FN_TYPE_2 (BT_FN_UV16QI_UV2DI_UV2DI, BT_UV16QI, BT_UV2DI, BT_UV2DI) DEF_FN_TYPE_2 (BT_FN_UV16QI_UV8HI_UV8HI, BT_UV16QI, BT_UV8HI, BT_UV8HI) DEF_FN_TYPE_2 (BT_FN_UV2DI_UCHAR_UCHAR, BT_UV2DI, BT_UCHAR, BT_UCHAR) DEF_FN_TYPE_2 (BT_FN_UV2DI_ULONGLONG_INT, BT_UV2DI, BT_ULONGLONG, BT_INT) @@ -299,6 +300,7 @@ DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UCHAR_INT, BT_UV16QI, BT_UV16QI, BT_UCHAR, BT DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UV16QI_INT, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INT) DEF_FN_TYPE_3 
(BT_FN_UV16QI_UV16QI_UV16QI_INTPTR, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UV16QI_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI) +DEF_FN_TYPE_3 (BT_FN_UV16QI_UV2DI_UV2DI_UV16QI, BT_UV16QI, BT_UV2DI, BT_UV2DI, BT_UV16QI) DEF_FN_TYPE_3 (BT_FN_UV16QI_UV8HI_UV8HI_INTPTR, BT_UV16QI, BT_UV8HI, BT_UV8HI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_ULONGLONG_INT, BT_UV2DI, BT_UV2DI, BT_ULONGLONG, BT_INT) DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_UV2DI_INT, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_INT) diff --git a/gcc/config/s390/s390-builtins.def b/gcc/config/s390/s390-builtins.def index 3a63213e571..7f6190fa810 100644 --- a/gcc/config/s390/s390-builtins.def +++ b/gcc/config/s390/s390-builtins.def @@ -1666,6 +1666,7 @@ B_DEF (s390_vgfmb, vec_gfmsumv16qi, 0, B_DEF (s390_vgfmh, vec_gfmsumv8hi, 0, B_VX, 0, BT_FN_UV4SI_UV8HI_UV8HI) B_DEF (s390_vgfmf, vec_gfmsumv4si, 0, B_VX, 0, BT_FN_UV2DI_UV4SI_UV4SI) B_DEF (s390_vgfmg, vec_gfmsum_128, 0, B_VX, 0, BT_FN_UINT128_UV2DI_UV2DI) +B_DEF (s390_vgfmg_128, vec_gfmsum_128, 0, B_VX, 0, BT_FN_UV16QI_UV2DI_UV2DI) OB_DEF (s390_vec_gfmsum_accum, s390_vec_gfmsum_accum_u8,s390_vec_gfmsum_accum_u32,B_VX, BT_FN_OV4SI_OV4SI_OV4SI_OV4SI) OB_DEF_VAR (s390_vec_gfmsum_accum_u8, s390_vgfmab,0, 0, BT_OV_UV8HI_UV16QI_UV16QI_UV8HI) @@ -1676,6 +1677,7 @@ B_DEF (s390_vgfmab, vec_gfmsum_accumv16qi,0, B_DEF (s390_vgfmah,vec_gfmsum_accumv8hi,0, B_VX, 0, BT_FN_UV4SI_UV8HI_UV8HI_UV4SI) B_DEF (s390_vgfmaf,vec_gfmsum_accumv4si,0, B_VX, 0, BT_FN_UV2DI_UV4SI_UV4SI_UV2DI) B_DEF (s390_vgfmag,vec_gfmsum_accum_128,0, B_VX, 0, BT_FN_UINT128_UV2DI_UV2DI_UINT128) +B_DEF (s390_vgfmag_128,vec_gfmsum_accum_128,0, B_VX, 0, BT_FN_UV16QI_UV2DI_UV2DI_UV16QI) OB_DEF (s390_vec_abs, s390_vec_abs_s8,s390_vec_abs_dbl, B_VX, BT_FN_OV4SI_OV4SI) OB_DEF_VAR (s390_vec_abs_s8,s390_vlpb, 0, 0, BT_OV_V16QI_V16QI) diff --git a/gcc/config/s390/vecintrin.h b/gcc/config/s390/vecintrin.h index 9abbd761067..daeed91ef97 100644 --- a/gcc/config/s390/vecintrin.h +++ 
b/gcc/config/s390/vecintrin.h @@ -98,8 +98,8 @@ __lcbb(const void *ptr, int bndry) #define vec_splat_u64 __builtin_s390
Re: [PATCH v2] Hard register constraints
On Mon, Aug 05, 2024 at 02:19:50PM +0200, Georg-Johann Lay wrote: > Am 05.08.24 um 12:28 schrieb Stefan Schulze Frielinghaus: > > This is a follow-up of > > https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654013.html > > > > What has changed? > > > > - Rebased and fixed an issue in constrain_operands which manifested > > after late-combine. > > > > - Introduced new test cases for Arm, Intel, POWER, RISCV, S/390 for 32- > > and 64-bit where appropriate (including register pairs etc.). Test > > gcc.dg/asm-hard-reg-7.c is a bit controversial since I'm testing for an > > anti feature here, i.e., I'm testing for register asm in conjunction > > with calls. I'm fine with removing it in the end but I wanted to keep > > it in for demonstration purposes at least during discussion of this > > patch. > > > > - Split test pr87600-2.c into pr87600-2.c and pr87600-3.c since test0 > > errors out early, now. Otherwise, the remaining errors would not be > > reported. Beside that the error message has slightly changed. > > > > - Modified genoutput.cc in order to allow hard register constraints in > > machine descriptions. For example, on s390 the instruction mvcrl makes > > As I already said, such a feature would be great. Some questions: > > Which pass is satisfying that constraint? AFAIK for local reg vars, > it is asmcons, but for register constraints in md it it the register > allocator. This is done by reload during process_alt_operands(). Basically every other change in gimplify.cc, stmt.cc etc. is only there in order to do some error checking and have some proper diagnostics. > The avr backend has many insns that use explicit hard regs in order to > model some libcalls (ones with footprints smaller than ABI, or that > deviate from the ABI). A proper way would be to add a register > constraint for each possible hard reg, e.g. R20_1 for QImode in R20, > R20_2 for HImode in R20, etc. 
This would require a dozen or more > new register classes, and the problem with that is that register > allocation produces less efficient code even for cases that do > not use these new constraints. So I gave up that approach. > > How does your feature work? Does it imply that for each hreg > constraint there must be an according register class? No. During reload I limit the set of registers by installing a filter and let RA solve it. > > Obviously local reg vars don't require respective reg classes, > so I thought about representing such insns as asm_input or > whatever, but that's pure hack and would never pass a review... > > > use of the implicit register r0 which we currently deal with as follows: > > > > (define_insn "*mvcrl" > >[(set (match_operand:BLK 0 "memory_operand" "=Q") > > (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q") > > (reg:SI GPR0_REGNUM)] > > UNSPEC_MVCRL))] > >"TARGET_Z15" > >"mvcrl\t%0,%1" > >[(set_attr "op_type" "SSE")]) > > > > (define_expand "mvcrl" > >[(set (reg:SI GPR0_REGNUM) (match_operand:SI 2 "general_operand")) > > (set (match_operand:BLK 0 "memory_operand" "=Q") > > (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q") > > (reg:SI GPR0_REGNUM)] > > UNSPEC_MVCRL))] > >"TARGET_Z15" > >"") > > > > In the expander we ensure that GPR0 is setup correctly. With this patch > > we could simply write > > > > (define_insn "mvcrl" > >[(set (match_operand:BLK 0 "memory_operand" "=Q") > > (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q") > > (match_operand:SI 2 "general_operand" "{r0}")] > > UNSPEC_MVCRL))] > >"TARGET_Z15" > >"mvcrl\t%0,%1" > >[(set_attr "op_type" "SSE")]) > > > > What I dislike is that I didn't find a way to verify hard register names > > Are plain register numbers also supported? Like "{0}" ? > (Provided regno(r0) == 0). Basically whatever passes decode_reg_name() is allowed. > > > during genoutput, i.e., ensuring that the name is valid after all. 
This > > is due to the fact how reg_names is defined which cannot be accessed by > > genoutput. The same holds true for REGISTER_NAMES et al. which may > > referenc
[PATCH v2] Hard register constraints
This is a follow-up of https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654013.html What has changed? - Rebased and fixed an issue in constrain_operands which manifested after late-combine. - Introduced new test cases for Arm, Intel, POWER, RISCV, S/390 for 32- and 64-bit where appropriate (including register pairs etc.). Test gcc.dg/asm-hard-reg-7.c is a bit controversial since I'm testing for an anti feature here, i.e., I'm testing for register asm in conjunction with calls. I'm fine with removing it in the end but I wanted to keep it in for demonstration purposes at least during discussion of this patch. - Split test pr87600-2.c into pr87600-2.c and pr87600-3.c since test0 errors out early, now. Otherwise, the remaining errors would not be reported. Beside that the error message has slightly changed. - Modified genoutput.cc in order to allow hard register constraints in machine descriptions. For example, on s390 the instruction mvcrl makes use of the implicit register r0 which we currently deal with as follows: (define_insn "*mvcrl" [(set (match_operand:BLK 0 "memory_operand" "=Q") (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q") (reg:SI GPR0_REGNUM)] UNSPEC_MVCRL))] "TARGET_Z15" "mvcrl\t%0,%1" [(set_attr "op_type" "SSE")]) (define_expand "mvcrl" [(set (reg:SI GPR0_REGNUM) (match_operand:SI 2 "general_operand")) (set (match_operand:BLK 0 "memory_operand" "=Q") (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q") (reg:SI GPR0_REGNUM)] UNSPEC_MVCRL))] "TARGET_Z15" "") In the expander we ensure that GPR0 is setup correctly. With this patch we could simply write (define_insn "mvcrl" [(set (match_operand:BLK 0 "memory_operand" "=Q") (unspec:BLK [(match_operand:BLK 1 "memory_operand" "Q") (match_operand:SI 2 "general_operand" "{r0}")] UNSPEC_MVCRL))] "TARGET_Z15" "mvcrl\t%0,%1" [(set_attr "op_type" "SSE")]) What I dislike is that I didn't find a way to verify hard register names during genoutput, i.e., ensuring that the name is valid after all. 
This is due to how reg_names is defined which cannot be accessed by genoutput. The same holds true for REGISTER_NAMES et al. which may reference some target specific variable (see e.g. POWER). Thus, in case of an invalid register name in a machine description file we do not end up with a genoutput-time error but instead fail at run-time in process_alt_operands(): case '{': { int regno = parse_constraint_regname (p); gcc_assert (regno >= 0); cl = REGNO_REG_CLASS (regno); CLEAR_HARD_REG_SET (hregset); SET_HARD_REG_BIT (hregset, regno); cl_filter = &hregset; goto reg; } This is rather unfortunate but I couldn't find a way to validate register names during genoutput. If no one else has an idea I will replace gcc_assert with a more expressive error message. What's next? I was thinking about replacing register asm with the new hard register constraint. This would solve problems like demonstrated by gcc.dg/asm-hard-reg-7.c. For example, we could replace the constraint register int x asm ("r5") = 42; asm ("foo %0" :: "r" (x)); with register int x asm ("r5") = 42; asm ("foo %0" :: "{r5}" (x)); and ignore any further effect of the register asm. However, I haven't really thought this through and there are certainly cases which are currently allowed which cannot trivially be converted as e.g. here: register int x asm ("r5") = 42; asm ("foo %0" :: "rd" (x)); Multiple alternatives are kind of strange in combination with register asm. For example, on s390 the two constraints "r" and "d" restrict both to GPRs. That is not a show stopper but certainly something which needs some consideration. If you can think of some wild combinations/edge cases I would be happy to hear about them. Anyhow, this is something for a further patch. Last but not least, if there is enough consent to accept this feature, I will start writing up some documentation. Bootstrapped and regtested on Arm, Intel, POWER, RISCV, S/390.
I have only verified the 32-bit tests via cross compilers and didn't execute them in contrast to 64-bit targets. --- gcc/cfgexpand.cc | 42 - gcc/genoutput.cc | 12 ++ gcc/genpreds.cc | 4 +- gcc/gimplify.cc | 134 ++- gcc/lra-constraints.cc| 13 ++ gcc/recog.cc | 11 +- gcc/stmt.cc | 155 +- gcc/stmt.h| 12 +- gcc/testsuite/gcc.dg/asm-hard-reg-1.c | 85 ++ gcc/testsuite/gcc.dg/asm-hard-reg-2.c | 33 gcc/testsuite/gcc.dg/asm-hard-reg-3.c | 25 +
Re: [PATCH] regrename: Skip renaming register pairs [PR115860]
On Tue, Jul 23, 2024 at 11:40:00AM -0600, Jeff Law wrote: > > > On 7/23/24 9:45 AM, Stefan Schulze Frielinghaus wrote: > > > > > > They come from: > > > ``` > > > (define_insn "*tf_to_fprx2_0" > > >[(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) > > > (subreg:DF (match_operand:TF1 "general_operand" "v") > > > 0))] > > > ... > > > (define_insn "*tf_to_fprx2_1" > > >[(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8) > > > (subreg:DF (match_operand:TF1 "general_operand" "v") > > > 8))] > > > > > > ``` > > > > > > I am not sure if that is a valid thing to do. s390 backend is the only > > > one that has insn patterns like this. all that uses "+" use either > > > strict_lowpart of zero_extract for the lhs or just a pure set. > > > Maybe there is a better way of representing this. Maybe using unspec here? > > > > I gave unspec a try and came up with > > > > (define_insn "*tf_to_fprx2_0" > >[(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) > > (unspec:DF [(match_operand:TF1 "general_operand" "v")] > > UNSPEC_TF_TO_FPRX2_0))] > >"TARGET_VXE" > >; M4 == 1 corresponds to %v0[0] = %v1[0]; %v0[1] = %v0[1]; > >"vpdi\t%v0,%v1,%v0,1" > >[(set_attr "op_type" "VRR")]) > > > > (define_insn "*tf_to_fprx2_1" > >[(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8) > > (unspec:DF [(match_operand:TF1 "general_operand" "v")] > > UNSPEC_TF_TO_FPRX2_1))] > >"TARGET_VXE" > >; M4 == 5 corresponds to %V0[0] = %v1[1]; %V0[1] = %V0[1]; > >"vpdi\t%V0,%v1,%V0,5" > >[(set_attr "op_type" "VRR")]) > > > > which seems to work. 
However, I'm still getting subregs at final: > > > > (insn 3 18 7 (set (reg/v:TF 18 %f4 [orig:62 x ] [62]) > > (mem/c:TF (reg:DI 2 %r2 [65]) [1 x+0 S16 A64])) "t.c":3:1 421 > > {movtf_vr} > > (expr_list:REG_DEAD (reg:DI 2 %r2 [65]) > > (nil))) > > (insn 7 3 8 (set (subreg:DF (reg:FPRX2 16 %f0 [64]) 0) > > (unspec:DF [ > > (reg/v:TF 18 %f4 [orig:62 x ] [62]) > > ] UNSPEC_TF_TO_FPRX2_0)) "t.c":4:10 569 {*tf_to_fprx2_0} > > (nil)) > > (insn 8 7 14 (set (subreg:DF (reg:FPRX2 16 %f0 [64]) 8) > > (unspec:DF [ > > (reg/v:TF 18 %f4 [orig:62 x ] [62]) > > ] UNSPEC_TF_TO_FPRX2_1)) "t.c":4:10 570 {*tf_to_fprx2_1} > > (expr_list:REG_DEAD (reg/v:TF 18 %f4 [orig:62 x ] [62]) > > (nil))) > > > > Thus, I'm not sure whether this really solves the problem or rather > > shifts around it. I'm still a bit puzzled why the initial RTL is > > invalid. If I understood you correctly Jeff, then we are missing a > > pattern which would match once the subregs are eliminated. Since none > > exists the subregs survive and regrename gets confused. This basically > > means that subregs of register pairs must not survive RA and the unspec > > solution from above is no real solution. > I'd tend to agree. The routine in question is cleanup_subreg_operands and > from a quick looksie it's not going to work for the insn in question because > cleanup_subreg_operands actually looks down into the recog data structures > for each operand. In the case above the subreg is explicit in the RTL > rather than matched by the operand predicate. Right, I did some further tests over night where I also added patterns in order to match variants where the subregs are eliminated and that seems to work. I still haven't made up my mind which route would be best. Anyhow, it is clear that this patch should be dropped and I will come up with a solution for the target. Thank you Andrew and Jeff for pointing this out. Some myths about subregs have been revealed for me :) Cheers, Stefan
Re: [PATCH] regrename: Skip renaming register pairs [PR115860]
On Mon, Jul 22, 2024 at 08:16:16PM -0700, Andrew Pinski wrote: > On Sun, Jul 21, 2024 at 11:47 PM Stefan Schulze Frielinghaus > > diff --git a/gcc/regrename.cc b/gcc/regrename.cc > > index 054e601740b..6ae5a2309d0 100644 > > --- a/gcc/regrename.cc > > +++ b/gcc/regrename.cc > > @@ -1113,6 +1113,10 @@ scan_rtx_reg (rtx_insn *insn, rtx *loc, enum > > reg_class cl, enum scan_actions act > > > > c = create_new_chain (this_regno, this_nregs, loc, insn, cl); > > > > + /* Give up early in case of register pairs. */ > > + if (this_nregs != 1) > > + c->cannot_rename = 1; > > > I am a bit worried this will make TImode (and DImode for 32bit targets) worse. > And it might make aarch64's vector struct types much worse than they > are currently. > It is interesting how there is a subreg of a hardregister after reload > showing up here. Is that on purpose? Good catch. I don't think this was on purpose. When looking at the dump I rather thought this is valid RTL and didn't question it since subregs for register pairs got "expanded" during final. > They come from: > ``` > (define_insn "*tf_to_fprx2_0" > [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) > (subreg:DF (match_operand:TF1 "general_operand" "v") 0))] > ... > (define_insn "*tf_to_fprx2_1" > [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8) > (subreg:DF (match_operand:TF1 "general_operand" "v") 8))] > > ``` > > I am not sure if that is a valid thing to do. s390 backend is the only > one that has insn patterns like this. all that uses "+" use either > strict_lowpart of zero_extract for the lhs or just a pure set. > Maybe there is a better way of representing this. Maybe using unspec here? 
I gave unspec a try and came up with (define_insn "*tf_to_fprx2_0" [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) (unspec:DF [(match_operand:TF1 "general_operand" "v")] UNSPEC_TF_TO_FPRX2_0))] "TARGET_VXE" ; M4 == 1 corresponds to %v0[0] = %v1[0]; %v0[1] = %v0[1]; "vpdi\t%v0,%v1,%v0,1" [(set_attr "op_type" "VRR")]) (define_insn "*tf_to_fprx2_1" [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8) (unspec:DF [(match_operand:TF1 "general_operand" "v")] UNSPEC_TF_TO_FPRX2_1))] "TARGET_VXE" ; M4 == 5 corresponds to %V0[0] = %v1[1]; %V0[1] = %V0[1]; "vpdi\t%V0,%v1,%V0,5" [(set_attr "op_type" "VRR")]) which seems to work. However, I'm still getting subregs at final: (insn 3 18 7 (set (reg/v:TF 18 %f4 [orig:62 x ] [62]) (mem/c:TF (reg:DI 2 %r2 [65]) [1 x+0 S16 A64])) "t.c":3:1 421 {movtf_vr} (expr_list:REG_DEAD (reg:DI 2 %r2 [65]) (nil))) (insn 7 3 8 (set (subreg:DF (reg:FPRX2 16 %f0 [64]) 0) (unspec:DF [ (reg/v:TF 18 %f4 [orig:62 x ] [62]) ] UNSPEC_TF_TO_FPRX2_0)) "t.c":4:10 569 {*tf_to_fprx2_0} (nil)) (insn 8 7 14 (set (subreg:DF (reg:FPRX2 16 %f0 [64]) 8) (unspec:DF [ (reg/v:TF 18 %f4 [orig:62 x ] [62]) ] UNSPEC_TF_TO_FPRX2_1)) "t.c":4:10 570 {*tf_to_fprx2_1} (expr_list:REG_DEAD (reg/v:TF 18 %f4 [orig:62 x ] [62]) (nil))) Thus, I'm not sure whether this really solves the problem or rather shifts around it. I'm still a bit puzzled why the initial RTL is invalid. If I understood you correctly Jeff, then we are missing a pattern which would match once the subregs are eliminated. Since none exists the subregs survive and regrename gets confused. This basically means that subregs of register pairs must not survive RA and the unspec solution from above is no real solution. Since the only purpose of tf_to_fprx2_0 and tf_to_fprx2_1 are to move a long double from a vector register into a FP register pair one could also merge both insn into one and emit two instructions in the assembler template. 
This would at least circumvent the subreg issue. (define_insn "tf_to_fprx2" [(set (match_operand:FPRX2 0 "nonimmediate_operand" "=f") (unspec:FPRX2 [(match_operand:TF 1 "general_operand" "v")] UNSPEC_TF_TO_FPRX2))] "TARGET_VXE" "vpdi\t%v0,%v1,%v0,1;vpdi\t%V0,%v1,%V0,5" [(set_attr "length" "12") (set_attr "op_type" "VRR")]) I will give this a try tomorrow. Thanks, Stefan
[PATCH] regrename: Skip renaming register pairs [PR115860]
It is not trivial to decide when a write of a register pair terminates or starts a new chain. For example, prior to regrename we have (insn 91 38 36 5 (set (reg:FPRX2 16 %f0 [orig:76 x ] [76]) (const_double:FPRX2 0.0 [0x0.0p+0])) "float-cast-overflow-7-reduced.c":5:55 discrim 2 1507 {*movfprx2_64} (expr_list:REG_EQUAL (const_double:FPRX2 0.0 [0x0.0p+0]) (nil))) (insn 36 91 37 5 (set (subreg:DF (reg:FPRX2 16 %f0 [orig:76 x ] [76]) 0) (mem/c:DF (plus:DI (reg/f:DI 15 %r15) (const_int 160 [0xa0])) [7 %sfp+-32 S8 A64])) "float-cast-overflow-7-reduced.c":5:55 discrim 2 1512 {*movdf_64dfp} (nil)) (insn 37 36 43 5 (set (subreg:DF (reg:FPRX2 16 %f0 [orig:76 x ] [76]) 8) (mem/c:DF (plus:DI (reg/f:DI 15 %r15) (const_int 168 [0xa8])) [7 %sfp+-24 S8 A64])) "float-cast-overflow-7-reduced.c":5:55 discrim 2 1512 {*movdf_64dfp} (nil)) where insn 91 writes both registers of a register pair and it is clear that an existing chain must be terminated and a new one started. Insns 36 and 37 write only into one register of a corresponding register pair. For each write on its own it is not obvious when to terminate an existing chain and to start a new one. In other words, once insn 36 has materialized and insn 37 has not, we are kind of in a limbo state. Tracking this correctly is inherently hard and I'm not entirely sure whether optimizations could even lead to more complicated cases where it is even less clear when a chain terminates and a new one has to be started. Therefore, skip renaming of register pairs. Bootstrapped and regtested on x86_64, aarch64, powerpc64le, and s390. Ok for mainline?
This fixes on s390: FAIL: g++.dg/cpp23/ext-floating14.C -std=gnu++23 execution test FAIL: g++.dg/cpp23/ext-floating14.C -std=gnu++26 execution test FAIL: c-c++-common/ubsan/float-cast-overflow-7.c -O2 execution test FAIL: c-c++-common/ubsan/float-cast-overflow-7.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test FAIL: c-c++-common/ubsan/float-cast-overflow-7.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O0 execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O1 execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O2 execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -O3 -g execution test FAIL: gcc.dg/torture/fp-int-convert-float128-ieee-timode.c -Os execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O0 execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O1 execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O2 execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -O3 -g execution test FAIL: gcc.dg/torture/fp-int-convert-float64x-timode.c -Os execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O0 execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O1 execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O2 execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O2 -flto -fno-use-linker-plugin -flto-partition=none 
execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -O3 -g execution test FAIL: gcc.dg/torture/fp-int-convert-timode.c -Os execution test FAIL: gfortran.dg/pr96711.f90 -O0 execution test FAIL: TestSignalForwardingExternal FAIL: go test misc/cgo/testcarchive FAIL: libffi.closures/nested_struct5.c -W -Wall -Wno-psabi -O2 output pattern test FAIL: libphobos.phobos/std/algorithm/mutation.d execution test FAIL: libphobos.phobos/std/conv.d execution test FAIL: libphobos.phobos/std/internal/math/errorfunction.d execution test FAIL: libphobos.phobos/std/variant.d execution test FAIL: libphobos.phobos_shared/std/algorithm/mutation.d execution test FAIL: libphobos.phobos_shared/std/conv.d execution test FAIL: libphobos.phobos_shared/std/internal/math/errorfunction.d execution test FAIL: libphobos.phobos_shared/std/variant.d execution test gcc/ChangeLog: PR rtl-optimization/115860 * regrename.cc (scan_rtx_reg): Do not try to rename register pairs. --- gcc/regrename.cc | 4 1 file changed, 4 insertions(+) diff --git a/gcc/regrename.cc b/gcc/regrename.cc index 054e601740b..6ae5a2309d0 100644 ---
Re: [PATCH] s390: Fix unresolved iterators bhfgq and xdee
I'm pinging this early since I would like to make sure that it gets into 14.2 RC which is about to be done on Tuesday 23rd July. On Tue, Jul 16, 2024 at 04:50:29PM +0200, Stefan Schulze Frielinghaus wrote: > Code attribute bhfgq is missing a mapping for TF. This results in > unresolved iterators in assembler templates for *bswaptf. > > With the TF mapping added the base mnemonics vlbr and vstbr are not > "used" anymore but only the extended mnemonics (vlbr was > interpreted as vlbr; likewise for vstbr). Therefore, remove the base > mnemonics from the scheduling description, otherwise, genattrtab would > error about unknown mnemonics. > > Likewise, for movtf_vr only the extended mnemonics for vrepi are used, > now, which means the base mnemonic is "unused" and has to be removed > from the scheduling description. > > Similarly, we end up with unresolved iterators in assembler templates > for mulfprx23 since code attribute xdee is missing a mapping for FPRX2. > > Note, this is basically a cherry pick of commit r15-2060-ga4abda934aa426 > with the addition that vrepi is removed from the scheduling description, > too. > > Bootstrapped on s390. Ok for release branches 12, 13, and 14? > > gcc/ChangeLog: > > * config/s390/3931.md (vlbr, vstbr, vrepi): Remove. > * config/s390/s390.md (xdee): Add FPRX2 mapping. > * config/s390/vector.md (bhfgq): Add TF mapping. 
> --- > gcc/config/s390/3931.md | 7 --- > gcc/config/s390/s390.md | 2 +- > gcc/config/s390/vector.md | 2 +- > 3 files changed, 2 insertions(+), 9 deletions(-) > > diff --git a/gcc/config/s390/3931.md b/gcc/config/s390/3931.md > index bed1f6c21f1..9cb11b72bba 100644 > --- a/gcc/config/s390/3931.md > +++ b/gcc/config/s390/3931.md > @@ -404,7 +404,6 @@ vlvgg, > vlvgh, > vlvgp, > vst, > -vstbr, > vstbrf, > vstbrg, > vstbrh, > @@ -627,7 +626,6 @@ tm, > tmy, > vl, > vlbb, > -vlbr, > vlbrf, > vlbrg, > vlbrh, > @@ -661,7 +659,6 @@ vlreph, > vlrl, > vlrlr, > vst, > -vstbr, > vstbrf, > vstbrg, > vstbrh, > @@ -1077,7 +1074,6 @@ vrepb, > vrepf, > vrepg, > vreph, > -vrepi, > vrepib, > vrepif, > vrepig, > @@ -1930,7 +1926,6 @@ vrepb, > vrepf, > vrepg, > vreph, > -vrepi, > vrepib, > vrepif, > vrepig, > @@ -2156,7 +2151,6 @@ vistrfs, > vistrhs, > vl, > vlbb, > -vlbr, > vlbrf, > vlbrg, > vlbrh, > @@ -2248,7 +2242,6 @@ tbegin, > tbeginc, > tend, > vst, > -vstbr, > vstbrf, > vstbrg, > vstbrh, > diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md > index 50a828f2bbb..8edc1261c38 100644 > --- a/gcc/config/s390/s390.md > +++ b/gcc/config/s390/s390.md > @@ -744,7 +744,7 @@ > ;; In FP templates, a in "mr" will expand to "mxr" in > ;; TF/TDmode, "mdr" in DF/DDmode, "meer" in SFmode and "mer in > ;; SDmode. > -(define_mode_attr xdee [(TF "x") (DF "d") (SF "ee") (TD "x") (DD "d") (SD > "e")]) > +(define_mode_attr xdee [(TF "x") (FPRX2 "x") (DF "d") (SF "ee") (TD "x") (DD > "d") (SD "e")]) > > ;; The decimal floating point variants of add, sub, div and mul support 3 > ;; fp register operands. 
The following attributes allow to merge the bfp and > diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md > index 1bae1056951..f88e8b655fa 100644 > --- a/gcc/config/s390/vector.md > +++ b/gcc/config/s390/vector.md > @@ -134,7 +134,7 @@ > (V1TI "q") (TI "q") > (V1SF "f") (V2SF "f") (V4SF "f") > (V1DF "g") (V2DF "g") > - (V1TF "q")]) > + (V1TF "q") (TF "q")]) > > ; This is for vmalhw. It gets an 'w' attached to avoid confusion with > ; multiply and add logical high vmalh. > -- > 2.45.0 >
Re: [PATCH] s390: testsuite: Fix vcond-shift.c
On Thu, Jul 18, 2024 at 11:58:10PM -0700, Andrew Pinski wrote: > On Thu, Jul 18, 2024 at 10:31 PM Stefan Schulze Frielinghaus > wrote: > > > > Previously we optimized expressions of the form a < 0 ? -1 : 0 to > > (signed)a >> 31 during vcond expanding. Since r15-1741-g2ccdd0f22312a1 > > this is done in match.pd. The implementation in the back end as well as > > in match.pd are basically the same but still distinct. For the tests in > > vcond-shift.c the back end emitted > > > > (xx - (xx >> 31)) >> 1 > > > > whereas now via match.pd > > > > ((int) ((unsigned int) xx >> 31) + xx) >> 1 > > > > which is basically the same. We just have to adapt the scan-assembler > > directives w.r.t. signed/unsigned shifts which is done by this patch. > > Note I filed https://gcc.gnu.org/PR115999 because I noticed those 2 > form produce slightly different code generation for scalars (I assume > it will produce similar issues for vectors too). Thanks for the heads up. In that case we should probably wait a bit once a normal form or whatever has settled. Cheers, Stefan > > Thanks, > Andrew Pinski > > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/s390/vector/vcond-shift.c: Adapt to new match.pd > > rule and change scan-assembler-times for shifts. > > --- > > Regtested on s390. Ok for mainline? 
> > > > gcc/testsuite/gcc.target/s390/vector/vcond-shift.c | 12 ++-- > > 1 file changed, 6 insertions(+), 6 deletions(-) > > > > diff --git a/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c > > b/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c > > index a6b4e97aa50..b942f44039d 100644 > > --- a/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c > > +++ b/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c > > @@ -3,13 +3,13 @@ > > /* { dg-do compile { target { s390*-*-* } } } */ > > /* { dg-options "-O3 -march=z13 -mzarch" } */ > > > > -/* { dg-final { scan-assembler-times "vesraf\t%v.?,%v.?,31" 6 } } */ > > -/* { dg-final { scan-assembler-times "vesrah\t%v.?,%v.?,15" 6 } } */ > > -/* { dg-final { scan-assembler-times "vesrab\t%v.?,%v.?,7" 6 } } */ > > +/* { dg-final { scan-assembler-times "vesraf\t%v.?,%v.?,31" 4 } } */ > > +/* { dg-final { scan-assembler-times "vesrah\t%v.?,%v.?,15" 4 } } */ > > +/* { dg-final { scan-assembler-times "vesrab\t%v.?,%v.?,7" 4 } } */ > > /* { dg-final { scan-assembler-not "vzero\t*" } } */ > > -/* { dg-final { scan-assembler-times "vesrlf\t%v.?,%v.?,31" 4 } } */ > > -/* { dg-final { scan-assembler-times "vesrlh\t%v.?,%v.?,15" 4 } } */ > > -/* { dg-final { scan-assembler-times "vesrlb\t%v.?,%v.?,7" 4 } } */ > > +/* { dg-final { scan-assembler-times "vesrlf\t%v.?,%v.?,31" 6 } } */ > > +/* { dg-final { scan-assembler-times "vesrlh\t%v.?,%v.?,15" 6 } } */ > > +/* { dg-final { scan-assembler-times "vesrlb\t%v.?,%v.?,7" 6 } } */ > > > > /* Make it expand to two vector operations. */ > > #define ITER(X) (2 * (16 / sizeof (X[1]))) > > -- > > 2.45.2 > >
[PATCH] s390: testsuite: Fix vcond-shift.c
Previously we optimized expressions of the form a < 0 ? -1 : 0 to (signed)a >> 31 during vcond expanding. Since r15-1741-g2ccdd0f22312a1 this is done in match.pd. The implementation in the back end as well as in match.pd are basically the same but still distinct. For the tests in vcond-shift.c the back end emitted (xx - (xx >> 31)) >> 1 whereas now via match.pd ((int) ((unsigned int) xx >> 31) + xx) >> 1 which is basically the same. We just have to adapt the scan-assembler directives w.r.t. signed/unsigned shifts which is done by this patch. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vcond-shift.c: Adapt to new match.pd rule and change scan-assembler-times for shifts. --- Regtested on s390. Ok for mainline? gcc/testsuite/gcc.target/s390/vector/vcond-shift.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c b/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c index a6b4e97aa50..b942f44039d 100644 --- a/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c +++ b/gcc/testsuite/gcc.target/s390/vector/vcond-shift.c @@ -3,13 +3,13 @@ /* { dg-do compile { target { s390*-*-* } } } */ /* { dg-options "-O3 -march=z13 -mzarch" } */ -/* { dg-final { scan-assembler-times "vesraf\t%v.?,%v.?,31" 6 } } */ -/* { dg-final { scan-assembler-times "vesrah\t%v.?,%v.?,15" 6 } } */ -/* { dg-final { scan-assembler-times "vesrab\t%v.?,%v.?,7" 6 } } */ +/* { dg-final { scan-assembler-times "vesraf\t%v.?,%v.?,31" 4 } } */ +/* { dg-final { scan-assembler-times "vesrah\t%v.?,%v.?,15" 4 } } */ +/* { dg-final { scan-assembler-times "vesrab\t%v.?,%v.?,7" 4 } } */ /* { dg-final { scan-assembler-not "vzero\t*" } } */ -/* { dg-final { scan-assembler-times "vesrlf\t%v.?,%v.?,31" 4 } } */ -/* { dg-final { scan-assembler-times "vesrlh\t%v.?,%v.?,15" 4 } } */ -/* { dg-final { scan-assembler-times "vesrlb\t%v.?,%v.?,7" 4 } } */ +/* { dg-final { scan-assembler-times "vesrlf\t%v.?,%v.?,31" 6 } } */ +/* { dg-final { 
scan-assembler-times "vesrlh\t%v.?,%v.?,15" 6 } } */ +/* { dg-final { scan-assembler-times "vesrlb\t%v.?,%v.?,7" 6 } } */ /* Make it expand to two vector operations. */ #define ITER(X) (2 * (16 / sizeof (X[1]))) -- 2.45.2
[PATCH] s390: Fix unresolved iterators bhfgq and xdee
Code attribute bhfgq is missing a mapping for TF. This results in unresolved iterators in assembler templates for *bswaptf. With the TF mapping added the base mnemonics vlbr and vstbr are not "used" anymore but only the extended mnemonics (vlbr<bhfgq> was interpreted as vlbr; likewise for vstbr). Therefore, remove the base mnemonics from the scheduling description, otherwise, genattrtab would error about unknown mnemonics. Likewise, for movtf_vr only the extended mnemonics for vrepi are used, now, which means the base mnemonic is "unused" and has to be removed from the scheduling description. Similarly, we end up with unresolved iterators in assembler templates for mulfprx23 since code attribute xdee is missing a mapping for FPRX2. Note, this is basically a cherry pick of commit r15-2060-ga4abda934aa426 with the addition that vrepi is removed from the scheduling description, too. Bootstrapped on s390. Ok for release branches 12, 13, and 14? gcc/ChangeLog: * config/s390/3931.md (vlbr, vstbr, vrepi): Remove. * config/s390/s390.md (xdee): Add FPRX2 mapping. * config/s390/vector.md (bhfgq): Add TF mapping.
--- gcc/config/s390/3931.md | 7 --- gcc/config/s390/s390.md | 2 +- gcc/config/s390/vector.md | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/gcc/config/s390/3931.md b/gcc/config/s390/3931.md index bed1f6c21f1..9cb11b72bba 100644 --- a/gcc/config/s390/3931.md +++ b/gcc/config/s390/3931.md @@ -404,7 +404,6 @@ vlvgg, vlvgh, vlvgp, vst, -vstbr, vstbrf, vstbrg, vstbrh, @@ -627,7 +626,6 @@ tm, tmy, vl, vlbb, -vlbr, vlbrf, vlbrg, vlbrh, @@ -661,7 +659,6 @@ vlreph, vlrl, vlrlr, vst, -vstbr, vstbrf, vstbrg, vstbrh, @@ -1077,7 +1074,6 @@ vrepb, vrepf, vrepg, vreph, -vrepi, vrepib, vrepif, vrepig, @@ -1930,7 +1926,6 @@ vrepb, vrepf, vrepg, vreph, -vrepi, vrepib, vrepif, vrepig, @@ -2156,7 +2151,6 @@ vistrfs, vistrhs, vl, vlbb, -vlbr, vlbrf, vlbrg, vlbrh, @@ -2248,7 +2242,6 @@ tbegin, tbeginc, tend, vst, -vstbr, vstbrf, vstbrg, vstbrh, diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 50a828f2bbb..8edc1261c38 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -744,7 +744,7 @@ ;; In FP templates, a in "mr" will expand to "mxr" in ;; TF/TDmode, "mdr" in DF/DDmode, "meer" in SFmode and "mer in ;; SDmode. -(define_mode_attr xdee [(TF "x") (DF "d") (SF "ee") (TD "x") (DD "d") (SD "e")]) +(define_mode_attr xdee [(TF "x") (FPRX2 "x") (DF "d") (SF "ee") (TD "x") (DD "d") (SD "e")]) ;; The decimal floating point variants of add, sub, div and mul support 3 ;; fp register operands. The following attributes allow to merge the bfp and diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 1bae1056951..f88e8b655fa 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -134,7 +134,7 @@ (V1TI "q") (TI "q") (V1SF "f") (V2SF "f") (V4SF "f") (V1DF "g") (V2DF "g") - (V1TF "q")]) + (V1TF "q") (TF "q")]) ; This is for vmalhw. It gets an 'w' attached to avoid confusion with ; multiply and add logical high vmalh. -- 2.45.0
[PATCH] s390: Fix unresolved iterators bhfgq and xdee
Code attribute bhfgq is missing a mapping for TF. This results in unresolved iterators in assembler templates for *bswaptf. With the TF mapping added the base mnemonics vlbr and vstbr are not "used" anymore but only the extended mnemonics (vlbr was interpreted as vlbr; likewise for vstbr). Therefore, remove the base mnemonics from the scheduling description, otherwise, genattrtab would error about unknown mnemonics. Similarly, we end up with unresolved iterators in assembler templates for mulfprx23 since code attribute xdee is missing a mapping for FPRX2. gcc/ChangeLog: * config/s390/3931.md (vlbr, vstbr): Remove. * config/s390/s390.md (xdee): Add FPRX2 mapping. * config/s390/vector.md (bhfgq): Add TF mapping. --- Bootstrapped and regtested on s390. Ok for {mainline,12,13,14}? gcc/config/s390/3931.md | 5 - gcc/config/s390/s390.md | 2 +- gcc/config/s390/vector.md | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/gcc/config/s390/3931.md b/gcc/config/s390/3931.md index 632c2456b6a..9f7a4c58755 100644 --- a/gcc/config/s390/3931.md +++ b/gcc/config/s390/3931.md @@ -404,7 +404,6 @@ vlvgg, vlvgh, vlvgp, vst, -vstbr, vstbrf, vstbrg, vstbrh, @@ -627,7 +626,6 @@ tm, tmy, vl, vlbb, -vlbr, vlbrf, vlbrg, vlbrh, @@ -661,7 +659,6 @@ vlreph, vlrl, vlrlr, vst, -vstbr, vstbrf, vstbrg, vstbrh, @@ -2148,7 +2145,6 @@ vistrfs, vistrhs, vl, vlbb, -vlbr, vlbrf, vlbrg, vlbrh, @@ -2240,7 +2236,6 @@ tbegin, tbeginc, tend, vst, -vstbr, vstbrf, vstbrg, vstbrh, diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 303026f6af7..3d5759d6252 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -745,7 +745,7 @@ ;; In FP templates, a in "mr" will expand to "mxr" in ;; TF/TDmode, "mdr" in DF/DDmode, "meer" in SFmode and "mer in ;; SDmode. 
-(define_mode_attr xdee [(TF "x") (DF "d") (SF "ee") (TD "x") (DD "d") (SD "e")]) +(define_mode_attr xdee [(TF "x") (FPRX2 "x") (DF "d") (SF "ee") (TD "x") (DD "d") (SD "e")]) ;; The decimal floating point variants of add, sub, div and mul support 3 ;; fp register operands. The following attributes allow to merge the bfp and diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 63678859657..cca9e3556c9 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -136,7 +136,7 @@ (V1TI "q") (TI "q") (V1SF "f") (V2SF "f") (V4SF "f") (V1DF "g") (V2DF "g") - (V1TF "q")]) + (V1TF "q") (TF "q")]) ; This is for vmalhw. It gets an 'w' attached to avoid confusion with ; multiply and add logical high vmalh. -- 2.45.2
Re: [PATCH] s390: Align *cjump_64 and *icjump_64
On Thu, Jul 11, 2024 at 07:32:17PM +0200, Stefan Schulze Frielinghaus wrote: > On Thu, Jul 11, 2024 at 05:14:58PM +0200, Jakub Jelinek wrote: > > On Thu, Jul 11, 2024 at 05:09:41PM +0200, Stefan Schulze Frielinghaus wrote: > > > I didn't have the schedule for 11.5 RC in mind which is tomorrow and the > > > release a week afterwards. I hope this is still appropriate for 11.5? > > > > From my side, if Andreas or somebody else approves it, it is tested on 11 > > branch and committed by tomorrow, it can be added. > > But I'd like to know what patches I should wait for tomorrow and approximate > > ETA (and ideally before end of working day in Europe). Once rc1 is done, > > only > > severe blockers will be possible. > > The tester is running over night and will finish around 7 AM CEST. I > will let you know once it has finished. If anything goes wrong we can > skip this patch of course. The tester was extremely slow this time and still hasn't finished. I don't want to rush it and risk introducing last-minute problems for the 11.5 release. Since I'm testing for three different architectures and the first one hasn't finished, let's drop this patch for 11.5. Sorry for the noise, Stefan
Re: [PATCH] s390: Align *cjump_64 and *icjump_64
On Thu, Jul 11, 2024 at 05:14:58PM +0200, Jakub Jelinek wrote: > On Thu, Jul 11, 2024 at 05:09:41PM +0200, Stefan Schulze Frielinghaus wrote: > > I didn't have the schedule for 11.5 RC in mind which is tomorrow and the > > release a week afterwards. I hope this is still appropriate for 11.5? > > From my side, if Andreas or somebody else approves it, it is tested on 11 > branch and committed by tomorrow, it can be added. > But I'd like to know what patches I should wait for tomorrow and approximate > ETA (and ideally before end of working day in Europe). Once rc1 is done, only > severe blockers will be possible. The tester is running over night and will finish around 7 AM CEST. I will let you know once it has finished. If anything goes wrong we can skip this patch of course. Cheers, Stefan
Re: [PATCH] s390: Align *cjump_64 and *icjump_64
On Thu, Jul 11, 2024 at 04:29:19PM +0200, Stefan Schulze Frielinghaus wrote: > During machine reorg we optimize backward jumps and transform insns as > e.g. > > (jump_insn 118 117 119 (set (pc) > (if_then_else (ne (reg:CCRAW 33 %cc) > (const_int 8 [0x8])) > (label_ref 134) > (pc))) "dec_math_1.f90":204:8 discrim 1 2161 {*cjump_64} > (expr_list:REG_DEAD (reg:CCRAW 33 %cc) > (int_list:REG_BR_PROB 719407028 (nil))) > -> 134) > > into > > (jump_insn 118 117 432 (set (pc) > (if_then_else (ne (reg:CCRAW 33 %cc) > (const_int 8 [0x8])) > (pc) > (label_ref 433))) "dec_math_1.f90":204:8 discrim 1 -1 > (expr_list:REG_DEAD (reg:CCRAW 33 %cc) > (int_list:REG_BR_PROB 719407028 (nil))) > -> 433) > > The latter is not recognized anymore since *icjump_64 only matches > CC_REGNUM against zero. Fixed by aligning *cjump_64 and *icjump_64. > > gcc/ChangeLog: > > * config/s390/s390.md (*icjump_64): Allow raw CC comparisons, > i.e., any constant integer between 0 and 15 for CC comparisons. > --- > Bootstrap and regtest or still running. Assuming no regressions, ok > for {mainline,11,12,13,14}? Would be great to see this in 14.2 RC :) I didn't have the schedule for 11.5 RC in mind which is tomorrow and the release a week afterwards. I hope this is still appropriate for 11.5? Cheers, Stefan > > gcc/config/s390/s390.md | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md > index f5d7003dfad..d3931b09417 100644 > --- a/gcc/config/s390/s390.md > +++ b/gcc/config/s390/s390.md > @@ -9556,7 +9556,8 @@ > (define_insn "*icjump_64" >[(set (pc) > (if_then_else > - (match_operator 1 "s390_comparison" [(reg CC_REGNUM) (const_int > 0)]) > + (match_operator 1 "s390_comparison" [(reg CC_REGNUM) > +(match_operand 2 > "const_int_operand" "")]) >(pc) >(label_ref (match_operand 0 "" ""] >"" > -- > 2.45.2 >
[PATCH] s390: Align *cjump_64 and *icjump_64
During machine reorg we optimize backward jumps and transform insns as e.g. (jump_insn 118 117 119 (set (pc) (if_then_else (ne (reg:CCRAW 33 %cc) (const_int 8 [0x8])) (label_ref 134) (pc))) "dec_math_1.f90":204:8 discrim 1 2161 {*cjump_64} (expr_list:REG_DEAD (reg:CCRAW 33 %cc) (int_list:REG_BR_PROB 719407028 (nil))) -> 134) into (jump_insn 118 117 432 (set (pc) (if_then_else (ne (reg:CCRAW 33 %cc) (const_int 8 [0x8])) (pc) (label_ref 433))) "dec_math_1.f90":204:8 discrim 1 -1 (expr_list:REG_DEAD (reg:CCRAW 33 %cc) (int_list:REG_BR_PROB 719407028 (nil))) -> 433) The latter is not recognized anymore since *icjump_64 only matches CC_REGNUM against zero. Fixed by aligning *cjump_64 and *icjump_64. gcc/ChangeLog: * config/s390/s390.md (*icjump_64): Allow raw CC comparisons, i.e., any constant integer between 0 and 15 for CC comparisons. --- Bootstrap and regtest are still running. Assuming no regressions, ok for {mainline,11,12,13,14}? Would be great to see this in 14.2 RC :) gcc/config/s390/s390.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index f5d7003dfad..d3931b09417 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -9556,7 +9556,8 @@ (define_insn "*icjump_64" [(set (pc) (if_then_else - (match_operator 1 "s390_comparison" [(reg CC_REGNUM) (const_int 0)]) + (match_operator 1 "s390_comparison" [(reg CC_REGNUM) + (match_operand 2 "const_int_operand" "")]) (pc) (label_ref (match_operand 0 "" ""))))] "" -- 2.45.2
[PATCH] s390: Fully exploit vgm, vgbm, vrepi
Currently instructions vgm and vrepi are utilized only for constant vectors where the element mode equals the element mode of the corresponding instruction. This patch lifts this restriction by making use of those instructions for constant vectors even if element modes do not coincide. For example, the constant vector (v2di){0x7ffffffe7ffffffe, 0x7ffffffe7ffffffe} can be loaded via vgmf %v0,1,30. Similar, the constant vector (v4si){0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa} can be loaded via vrepiq %v0,-86. Analog, if the element mode of a constant vector is smaller than the element mode of a corresponding instruction, we still may make use of those instructions. For example, the constant vector (v4si){0x00007fff, 0xfffe0000, 0x00007fff, 0xfffe0000} can be loaded via vgmg %v0,17,46. Similar, the constant vector (v4si){-1, -16643, -1, -16643} can be loaded via vrepig %v0,-16643. Additionally this patch enables vgm, vgbm, vrepi for partial vectors, i.e., vectors of size less than 16 bytes. Basically this is done by treating a vector as a full vector resulting in replicating constants into the ignored bits whereas vgbm sets those to zero. Furthermore, there is no restriction to integer vectors anymore, i.e., supporting scalars of mode up to and including TI and TF and also floating-point vectors. Here are some numbers how often instructions are emitted for SPEC 2017 (w/o patch / w/ patch): vgbm 140 / 365; vgm 17508 / 24452; vrepi 1360 / 2775. I expect most (maybe even all) to save us a load from the literal pool. gcc/ChangeLog: * config/s390/2964.md: Remove extended mnemonics for vgm. * config/s390/3906.md: Remove extended mnemonics for vgm. * config/s390/3931.md: Remove extended mnemonics for vgm. * config/s390/8561.md: Remove extended mnemonics for vgm. * config/s390/constraints.md (jKK): Remove constraint. (jzz): Add constraint. * config/s390/s390-protos.h (s390_contiguous_bitmask_vector_p): Add prototype. (s390_constant_via_vgm_p): Add prototype. (s390_constant_via_vrepi_p): Add prototype. 
* config/s390/s390.cc (s390_contiguous_bitmask_vector_p): New function. (s390_constant_via_vgm_vrepi_helper): New function. (s390_constant_via_vgm_p): New function. (s390_constant_via_vgbm_p): For the sake of symmetry rename s390_bytemask_vector_p into s390_constant_via_vgbm_p. (s390_bytemask_vector_p): Deal with non-integer and partial vectors. (s390_constant_via_vrepi_p): New function. (s390_legitimate_constant_p): Allow partial vectors. (legitimate_reload_constant_p): Fix indentation. (legitimate_reload_vector_constant_p): Restrict to constraints j00, jm1, jxx, jyy, jzz only, i.e., allow partial vectors. (s390_expand_vec_init): Also make use of vrepi if possible. (print_operand): Add q,p,r for vgm,vrepi,vgbm, respectively. Remove e,s,t for constant vectors. * config/s390/s390.md (movti): Add variants utilizing vgbm,vgm,vrepi. * config/s390/vector.md (mov): Adapt variants for vgbm,vgm,vrepi for the new scheme. (mov): Adapt variants for vgbm,vgm for the new scheme and add vrepi variant for modes V_8,V_16,V_32,V_64. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-copysign.c: Change to non-extended mnemonic. * gcc.target/s390/vector/vec-genmask-1.c: Change to non-extended mnemonic. * gcc.target/s390/vector/vec-init-1.c: Change to non-extended mnemonic. * gcc.target/s390/vector/vec-vrepi-1.c: Change to non-extended mnemonic. * gcc.target/s390/zvector/autovec-double-quiet-uneq.c: Change to non-extended mnemonic. * gcc.target/s390/zvector/autovec-float-quiet-uneq.c: Change to non-extended mnemonic. * gcc.target/s390/zvector/vec-genmask-1.c: Change to non-extended mnemonic. * gcc.target/s390/zvector/vec-splat-1.c: Change to non-extended mnemonic. * gcc.target/s390/zvector/vec-splat-2.c: Change to non-extended mnemonic. * gcc.target/s390/vector/vgbm-double-1.c: New test. * gcc.target/s390/vector/vgbm-float-1.c: New test. * gcc.target/s390/vector/vgbm-int128-1.c: New test. * gcc.target/s390/vector/vgbm-integer-1.c: New test. 
* gcc.target/s390/vector/vgbm-longdouble-1.c: New test. * gcc.target/s390/vector/vgm-df-1.c: New test. * gcc.target/s390/vector/vgm-di-1.c: New test. * gcc.target/s390/vector/vgm-hi-1.c: New test. * gcc.target/s390/vector/vgm-int128-1.c: New test. * gcc.target/s390/vector/vgm-longdouble-1.c: New test. * gcc.target/s390/vector/vgm-qi-1.c: New test. * gcc.target/s390/vector/vgm-sf-1.c: New test. * gcc.target/s390/vector/vgm-si-1.c: N
[PATCH] s390: Fix output template for movv1qi
Although for instructions MVI and MVIY it does not make a difference whether the immediate is interpreted as signed or unsigned, GAS expects unsigned immediates for instruction format SI_URD. gcc/ChangeLog: * config/s390/vector.md (mov): Fix output template for movv1qi. --- Bootstrapped and regtested on s390. Ok for {mainline,11,12,13,14}? gcc/config/s390/vector.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 40de0c75a7c..26fd505f2cd 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -368,8 +368,8 @@ lr\t%0,%1 mvi\t%0,0 mviy\t%0,0 - mvi\t%0,-1 - mviy\t%0,-1 + mvi\t%0,255 + mviy\t%0,255 lhi\t%0,0 lhi\t%0,-1 llc\t%0,%1 -- 2.45.2
[PATCH 0/3] Prepare and drop vcond expanders
This drops the vcond expanders. The first patch "s390: Emulate vec_cmp{eq,gt,gtu} for 128-bit integers" is somewhat independent of the other two, since we already run into ICEs without it. However, after removing the vcond expanders the testsuite shows one additional fallout without this patch, which is why I would like to make sure that this patch lands first and have included it in this series. Stefan Schulze Frielinghaus (3): s390: Emulate vec_cmp{eq,gt,gtu} for 128-bit integers s390: Enable vcond_mask for 128-bit ops s390: Drop vcond{,u} expanders gcc/config/s390/vector.md | 156 -- .../gcc.target/s390/vector/vec-cmp-emu-1.c| 35 .../gcc.target/s390/vector/vec-cmp-emu-2.c| 18 ++ .../gcc.target/s390/vector/vec-cmp-emu-3.c| 17 ++ 4 files changed, 175 insertions(+), 51 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-1.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-2.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-3.c -- 2.45.2
[PATCH 3/3] s390: Drop vcond{,u} expanders
Optabs vcond{,u} will be removed for GCC 15. Since regtest shows no fallout, dropping the expanders, now. gcc/ChangeLog: PR target/114189 * config/s390/vector.md (V_HW2): Remove. (vcond): Remove. (vcondu): Remove. --- Bootstrapped and regtested on s390. Ok for mainline? gcc/config/s390/vector.md | 35 --- 1 file changed, 35 deletions(-) diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 0e57dd1650c..1caf732d1f9 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -27,14 +27,9 @@ V2SF V4SF V1DF V2DF V1TF V1TI TI]) ; All modes directly supported by the hardware having full vector reg size -; V_HW2 is for having two iterators expanding independently e.g. vcond. -; It's similar to V_HW, but not fully identical: V1TI is not included, because -; there are no 128-bit compares. (define_mode_iterator V_HW [V16QI V8HI V4SI V2DI V1TI TI V2DF (V4SF "TARGET_VXE") (V1TF "TARGET_VXE") (TF "TARGET_VXE")]) -(define_mode_iterator V_HW2 [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE") -(V1TF "TARGET_VXE") (TF "TARGET_VXE")]) (define_mode_iterator VT_HW_HSDT [V8HI V4SI V4SF V2DI V2DF V1TI V1TF TI TF]) (define_mode_iterator V_HW_HSD [V8HI V4SI (V4SF "TARGET_VXE") V2DI V2DF]) @@ -725,36 +720,6 @@ } }) -(define_expand "vcond" - [(set (match_operand:V_HW 0 "register_operand" "") - (if_then_else:V_HW -(match_operator 3 "vcond_comparison_operator" -[(match_operand:V_HW2 4 "register_operand" "") - (match_operand:V_HW2 5 "nonmemory_operand" "")]) -(match_operand:V_HW 1 "nonmemory_operand" "") -(match_operand:V_HW 2 "nonmemory_operand" "")))] - "TARGET_VX && GET_MODE_NUNITS (mode) == GET_MODE_NUNITS (mode)" -{ - s390_expand_vcond (operands[0], operands[1], operands[2], -GET_CODE (operands[3]), operands[4], operands[5]); - DONE; -}) - -(define_expand "vcondu" - [(set (match_operand:V_HW 0 "register_operand" "") - (if_then_else:V_HW -(match_operator 3 "comparison_operator" -[(match_operand:V_HW2 4 "register_operand" "") - (match_operand:V_HW2 5 
"nonmemory_operand" "")]) -(match_operand:V_HW 1 "nonmemory_operand" "") -(match_operand:V_HW 2 "nonmemory_operand" "")))] - "TARGET_VX && GET_MODE_NUNITS (mode) == GET_MODE_NUNITS (mode)" -{ - s390_expand_vcond (operands[0], operands[1], operands[2], -GET_CODE (operands[3]), operands[4], operands[5]); - DONE; -}) - (define_expand "vcond_mask_" [(set (match_operand:VT 0 "register_operand" "") (if_then_else:VT -- 2.45.2
[PATCH 1/3] s390: Emulate vec_cmp{eq,gt,gtu} for 128-bit integers
Mode iterator V_HW enables V1TI for target VXE which means vec_cmpv1tiv1ti becomes available which leads to an ICE since there is no corresponding insn. Fixed by emulating comparisons and enabling mode V1TI unconditionally for V_HW. For the sake of symmetry, I also added TI mode to V_HW since TF mode is already included. As a consequence the consumers of V_HW vec_{splat,slb,sld,sldw,sldb,srdb,srab,srb,test_mask_int,test_mask} also become available for 128-bit integers. This fixes gcc.c-torture/execute/pr105613.c and gcc.dg/pr106063.c. gcc/ChangeLog: * config/s390/vector.md (V_HW): Enable V1TI unconditionally and add TI. (vec_cmpu): Add 128-bit integer variants. (*vec_cmpeq_nocc_emu): Emulate operation. (*vec_cmpgt_nocc_emu): Emulate operation. (*vec_cmpgtu_nocc_emu): Emulate operation. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-cmp-emu-1.c: New test. * gcc.target/s390/vector/vec-cmp-emu-2.c: New test. * gcc.target/s390/vector/vec-cmp-emu-3.c: New test. --- Bootstrapped and regtested on s390. Ok for mainline and GCC 14? gcc/config/s390/vector.md | 113 -- .../gcc.target/s390/vector/vec-cmp-emu-1.c| 35 ++ .../gcc.target/s390/vector/vec-cmp-emu-2.c| 18 +++ .../gcc.target/s390/vector/vec-cmp-emu-3.c| 17 +++ 4 files changed, 171 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-1.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-2.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmp-emu-3.c diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 40de0c75a7c..032ec44542c 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -30,7 +30,7 @@ ; V_HW2 is for having two iterators expanding independently e.g. vcond. ; It's similar to V_HW, but not fully identical: V1TI is not included, because ; there are no 128-bit compares. 
-(define_mode_iterator V_HW [V16QI V8HI V4SI V2DI (V1TI "TARGET_VXE") V2DF +(define_mode_iterator V_HW [V16QI V8HI V4SI V2DI V1TI TI V2DF (V4SF "TARGET_VXE") (V1TF "TARGET_VXE") (TF "TARGET_VXE")]) (define_mode_iterator V_HW2 [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE") @@ -50,6 +50,7 @@ (define_mode_iterator VI_HW_HSDT [V8HI V4SI V2DI V1TI TI]) (define_mode_iterator VI_HW_HS [V8HI V4SI]) (define_mode_iterator VI_HW_QH [V16QI V8HI]) +(define_mode_iterator VI_HW_T [V1TI TI]) ; Directly supported vector modes with a certain number of elements (define_mode_iterator V_HW_2 [V2DI V2DF]) @@ -151,7 +152,7 @@ (V1HI "V1HI") (V2HI "V2HI") (V4HI "V4HI") (V8HI "V8HI") (V1SI "V1SI") (V2SI "V2SI") (V4SI "V4SI") (V1DI "V1DI") (V2DI "V2DI") - (V1TI "V1TI") + (V1TI "V1TI") (TI "V1TI") (V1SF "V1SI") (V2SF "V2SI") (V4SF "V4SI") (V1DF "V1DI") (V2DF "V2DI") (V1TF "V1TI") (TF "V1TI")]) @@ -160,7 +161,7 @@ (V1HI "v1hi") (V2HI "v2hi") (V4HI "v4hi") (V8HI "v8hi") (V1SI "v1si") (V2SI "v2si") (V4SI "v4si") (V1DI "v1di") (V2DI "v2di") - (V1TI "v1ti") + (V1TI "v1ti") (TI "v1ti") (V1SF "v1si") (V2SF "v2si") (V4SF "v4si") (V1DF "v1di") (V2DF "v2di") (V1TF "v1ti") (TF "v1ti")]) @@ -1956,11 +1957,11 @@ DONE; }) -(define_expand "vec_cmpu" - [(set (match_operand:VI_HW0 "register_operand" "") - (match_operator:VI_HW 1 "" - [(match_operand:VI_HW 2 "register_operand" "") - (match_operand:VI_HW 3 "register_operand" "")]))] +(define_expand "vec_cmpu" + [(set (match_operand:VIT_HW0 "register_operand" "") + (match_operator:VIT_HW 1 "" + [(match_operand:VIT_HW 2 "register_operand" "") + (match_operand:VIT_HW 3 "register_operand" "")]))] "TARGET_VX" { s390_expand_vec_compare (operands[0], GET_CODE(operands[1]), operands[2], operands[3]); @@ -1975,6 +1976,94 @@ "vc\t%v2,%v0,%v1" [(set_attr "op_type" "VRR")]) +(define_insn_and_split "*vec_cmpeq_nocc_emu" + [(set (match_operand:VI_HW_T 0 "register_operand" "=v") + (eq:VI_HW_T (match_operand:VI_HW_T 1 "register_operand" "v") + (match_operand:VI_HW_T 2 
"register_operand" "v")))] + "TARGET_VX" + "#" + "&& can_create_pseudo_p ()" + [(set (match_dup 3) + (eq:V2DI (match_dup 1) (match_dup 2))) + (set (match_dup 4) + (vec_select:V2DI (match_dup 3) (parallel [(const_int 1) (const_int 0)]))) + (set (match_dup 3) + (and:V2DI (match_dup 3) (match_dup 4))) + (set (m
[PATCH 2/3] s390: Enable vcond_mask for 128-bit ops
In preparation of dropping vcond{,u,eq} optabs https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654690.html enable 128-bit operands for vcond_mask---including integer as well as floating point. This fixes partially PR115519 w.r.t. autovec-long-double-signaling-*.c tests. gcc/ChangeLog: * config/s390/vector.md: Enable vcond_mask for 128-bit ops. --- Bootstrapped and regtested on s390. Ok for mainline? gcc/config/s390/vector.md | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 032ec44542c..0e57dd1650c 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -756,12 +756,12 @@ }) (define_expand "vcond_mask_" - [(set (match_operand:V 0 "register_operand" "") - (if_then_else:V + [(set (match_operand:VT 0 "register_operand" "") + (if_then_else:VT (eq (match_operand: 3 "register_operand" "") (match_dup 4)) -(match_operand:V 2 "register_operand" "") -(match_operand:V 1 "register_operand" "")))] +(match_operand:VT 2 "register_operand" "") +(match_operand:VT 1 "register_operand" "")))] "TARGET_VX" "operands[4] = CONST0_RTX (mode);") -- 2.45.2
Re: [PATCH] Hard register asm constraint
On Fri, Jun 28, 2024 at 11:46:08AM +0200, Georg-Johann Lay wrote: > Am 27.06.24 um 10:51 schrieb Stefan Schulze Frielinghaus: > > On Thu, Jun 27, 2024 at 09:45:32AM +0200, Georg-Johann Lay wrote: > > > Am 24.05.24 um 11:13 Am 25.06.24 um 16:03 schrieb Paul Koning: > > > > > On Jun 24, 2024, at 1:50 AM, Stefan Schulze Frielinghaus > > > > > wrote: > > > > > On Mon, Jun 10, 2024 at 07:19:19AM +0200, Stefan Schulze Frielinghaus > > > > > wrote: > > > > > > On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze > > > > > > Frielinghaus wrote: > > > > > > > This implements hard register constraints for inline asm. A hard > > > > > > > register > > > > > > > constraint is of the form {regname} where regname is any valid > > > > > > > register. This > > > > > > > basically renders register asm superfluous. For example, the > > > > > > > snippet > > > > > > > > > > > > > > int test (int x, int y) > > > > > > > { > > > > > > >register int r4 asm ("r4") = x; > > > > > > >register int r5 asm ("r5") = y; > > > > > > >unsigned int copy = y; > > > > > > >asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy)); > > > > > > >return r4; > > > > > > > } > > > > > > > > > > > > > > could be rewritten into > > > > > > > > > > > > > > int test (int x, int y) > > > > > > > { > > > > > > >asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); > > > > > > >return x; > > > > > > > } > > > > > > > > I like this idea but I'm wondering: regular constraints specify what > > > > sort of value is needed, for example an int vs. a short int vs. a > > > > float. The notation you've shown doesn't seem to have that aspect. > > > > > > > > The other comment is that I didn't see documentation updates to reflect > > > > this new feature. > > > > > > > > paul > > > > > > > Stefan Schulze Frielinghaus: > > > > This implements hard register constraints for inline asm. A hard > > > > register > > > > constraint is of the form {regname} where regname is any valid > > > > register. 
This > > > > basically renders register asm superfluous. For example, the snippet > > > > > > > > int test (int x, int y) > > > > { > > > > register int r4 asm ("r4") = x; > > > > register int r5 asm ("r5") = y; > > > > unsigned int copy = y; > > > > asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy)); > > > > return r4; > > > > } > > > > > > > > could be rewritten into > > > > > > > > int test (int x, int y) > > > > { > > > > asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); > > > > return x; > > > > } > > > > > > Hi, can this also be used in machine descriptions? > > > > > > It would make some insn handling much simpler, for example in > > > the avr backend. > > > > > > That backend has insns that represent assembly sequences in libgcc > > > which have a smaller register footprint than plain calls. However > > > this requires that such insns have explicit description of which regs > > > go in and out. > > > > > > The current solution uses hard regs, which works, but a proper > > > implementation would use register constraints. I tries that a while > > > ago, and register constraints lead to a code bloat even in places that > > > don't use these constraints due to the zillions of new register classes > > > like R22_1, R22;2, R22_4, R20_1, R20_2, R20_4 etc. that were required. > > > > > > Your approach would allow to use hard register constraints in insns, > > > and so far the only problem is to determine how much hard regs are > > > used by the constraint. The gen tools that generates
Re: [PATCH] Hard register asm constraint
On Thu, Jun 27, 2024 at 09:45:32AM +0200, Georg-Johann Lay wrote: > > > Am 24.05.24 um 11:13 Am 25.06.24 um 16:03 schrieb Paul Koning: > > > > > > > On Jun 24, 2024, at 1:50 AM, Stefan Schulze Frielinghaus > > > wrote: > > > > > > Ping. > > > > > > On Mon, Jun 10, 2024 at 07:19:19AM +0200, Stefan Schulze Frielinghaus > > > wrote: > > > > Ping. > > > > > > > > On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze Frielinghaus > > > > wrote: > > > > > This implements hard register constraints for inline asm. A hard > > > > > register > > > > > constraint is of the form {regname} where regname is any valid > > > > > register. This > > > > > basically renders register asm superfluous. For example, the snippet > > > > > > > > > > int test (int x, int y) > > > > > { > > > > > register int r4 asm ("r4") = x; > > > > > register int r5 asm ("r5") = y; > > > > > unsigned int copy = y; > > > > > asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy)); > > > > > return r4; > > > > > } > > > > > > > > > > could be rewritten into > > > > > > > > > > int test (int x, int y) > > > > > { > > > > > asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); > > > > > return x; > > > > > } > > > > I like this idea but I'm wondering: regular constraints specify what sort > > of value is needed, for example an int vs. a short int vs. a float. The > > notation you've shown doesn't seem to have that aspect. > > > > The other comment is that I didn't see documentation updates to reflect > > this new feature. > > > > paul > > > Stefan Schulze Frielinghaus: > > This implements hard register constraints for inline asm. A hard register > > constraint is of the form {regname} where regname is any valid register. > > This > > basically renders register asm superfluous. 
For example, the snippet > > > > int test (int x, int y) > > { > >register int r4 asm ("r4") = x; > >register int r5 asm ("r5") = y; > >unsigned int copy = y; > >asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy)); > >return r4; > > } > > > > could be rewritten into > > > > int test (int x, int y) > > { > >asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); > >return x; > > } > > Hi, can this also be used in machine descriptions? > > It would make some insn handling much simpler, for example in > the avr backend. > > That backend has insns that represent assembly sequences in libgcc > which have a smaller register footprint than plain calls. However > this requires that such insns have explicit description of which regs > go in and out. > > The current solution uses hard regs, which works, but a proper > implementation would use register constraints. I tries that a while > ago, and register constraints lead to a code bloat even in places that > don't use these constraints due to the zillions of new register classes > like R22_1, R22;2, R22_4, R20_1, R20_2, R20_4 etc. that were required. > > Your approach would allow to use hard register constraints in insns, > and so far the only problem is to determine how much hard regs are > used by the constraint. The gen tools that generates cc code from md > would use the operand's machine mode to infer the number of hard regs. I have this on my todo list but ignored it for the very first draft. At the moment this already fails because genoutput cannot parse the constraint format. In my "alpha draft" I implemented this feature by emitting moves to hard registers during expand. This had the limitation that I couldn't support multiple alternatives in combination with hard-register constraints. I'm still not sure whether this is a feature we really want or whether it should be rather denied. Anyhow, with this kind of implementation I doubt that this would be feasible for machine descriptions. 
I moved on with my current draft where the constraint manifests during register allocation. This also allows multiple alternatives. I think one of the (major?) advantages of doing it this way is that operands are kept in pseudos which means they are automagically saved/restored over function boundaries and what not. Or in other words, the register constraint manifests at the asm boundary which is probably what users expect and should be less error prone (again just thinking of implicit code which gets injected as e.g. by sanitizers introducing calls etc.). So long story short, I would like to look into this but currently it doesn't work. I'm also not sure to what extent this could be used. However, once I have some more time I will have a look at the avr backend for examples. Cheers, Stefan
Re: [PATCH] Hard register asm constraint
On Wed, Jun 26, 2024 at 11:10:38AM -0400, Paul Koning wrote: > > > > On Jun 26, 2024, at 8:54 AM, Stefan Schulze Frielinghaus > > wrote: > > > > On Tue, Jun 25, 2024 at 01:02:39PM -0400, Paul Koning wrote: > >> > >> > >>> On Jun 25, 2024, at 12:04 PM, Stefan Schulze Frielinghaus > >>> wrote: > >>> > >>> On Tue, Jun 25, 2024 at 10:03:34AM -0400, Paul Koning wrote: > >>>> > >>>>>>> ... > >>>>>>> could be rewritten into > >>>>>>> > >>>>>>> int test (int x, int y) > >>>>>>> { > >>>>>>> asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); > >>>>>>> return x; > >>>>>>> } > >>>> > >>>> I like this idea but I'm wondering: regular constraints specify what > >>>> sort of value is needed, for example an int vs. a short int vs. a float. > >>>> The notation you've shown doesn't seem to have that aspect. > >>> > >>> As Maciej already pointed out the type of the expression should suffice. > >>> My assumption was that an asm can deal with a value as is or its > >>> promoted value. At least for integer values this should be fine and > >>> AFAICS is also the case for simple constraints like "r" which do not > >>> define any mode. I've probably overseen something but which constraint > >>> differentiates between int vs short? However, you have a good point > >>> with this and I should test this more. > >> > >> I thought there was but I may be confused. On the other hand, there > >> definitely are (machine dependent) constraints that distinguish, say, > >> float from integer registers; pdp11 is an example. If you were to use an > >> "a" constraint, that means a floating point register and the compiler will > >> detect attempts to pass non-float operands ("Inconsistent operand > >> constraints..."). > >> > >> I see that the existing "register int ..." syntax appears to check that > >> the register is the right type for the data type given for it, so for > >> example on pdp11, > >> > >>register int ac1 asm ("ac1") = i; > >> > >> fails ("register ... isn't suitable for data type"). 
I assume your new > >> syntax would perform the same check and produce roughly the same error > >> message. You might verify that. On pdp11, trying to use, for example, > >> "r0" for a float, or "ac0" for an int, would produce that error. > > > > Right, so far I don't error out here which I will change. It basically > > results in bit casting floats to ints currently. > > That would be bad. For one thing, a PDP11 float doesn't fit in an integer > register. > > That also brings up another point (which applies to more mainstream targets > as well): for data types that require multiple registers, say a register pair > for a double length value, how is that handled? One possible answer is to > reject that. Another would be to load a register pair. > > This case applies to a "long int" on pdp11, or 32 bit MIPS, and probably a > bunch of others. Absolutely, also on mainstream targets you could think of 128-bit integers or long doubles which typically don't fit in (single) GPRs. I should definitely add error handling for this. Similarly, I don't error out for non-primitive data types. I will give register pairs a try. Thanks for all your comments so far :) Cheers, Stefan
Re: [PATCH] s390: Check for ADDR_REGS in s390_decompose_addrstyle_without_index
On Wed, Jun 26, 2024 at 02:15:18PM +0200, Stefan Schulze Frielinghaus wrote: > An explicit check for address registers was not required so far since > during register allocation the processing of address constraints was > sufficient. However, address constraints themself do not check for > REGNO_OK_FOR_{BASE,INDEX}_P. Thus, with the newly introduced > late-combine pass in r15-1579-g792f97b44ffc5e we generate new insns with > invalid address registers which aren't fixed up afterwards. > > Fixed by explicitly checking for address registers in > s390_decompose_addrstyle_without_index such that those new insns are > rejected. > > gcc/ChangeLog: > > target/PR115634 > * config/s390/s390.cc (s390_decompose_addrstyle_without_index): > Check for ADDR_REGS in s390_decompose_addrstyle_without_index. > --- > This restores bootstrap on s390. I ran the testsuite against mainline > and of course there is some fallout which is most likely coming from > the new pass or other changes. I have another job running comparing > pre r15-1579-g792f97b44ffc5e with and without this patch. Assuming > this goes well, ok for mainline? Bootstrap and regtest of this test went also fine. > > gcc/config/s390/s390.cc | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc > index c65421de831..05a0fde7fb0 100644 > --- a/gcc/config/s390/s390.cc > +++ b/gcc/config/s390/s390.cc > @@ -3347,7 +3347,9 @@ s390_decompose_addrstyle_without_index (rtx op, rtx > *base, >while (op && GET_CODE (op) == SUBREG) > op = SUBREG_REG (op); > > - if (op && GET_CODE (op) != REG) > + if (op && (!REG_P (op) > + || (reload_completed > + && !REGNO_OK_FOR_BASE_P (REGNO (op) > return false; > >if (offset) > -- > 2.45.1 >
Re: [PATCH] Hard register asm constraint
On Tue, Jun 25, 2024 at 01:02:39PM -0400, Paul Koning wrote: > > > > On Jun 25, 2024, at 12:04 PM, Stefan Schulze Frielinghaus > > wrote: > > > > On Tue, Jun 25, 2024 at 10:03:34AM -0400, Paul Koning wrote: > >> > >>>>> ... > >>>>> could be rewritten into > >>>>> > >>>>> int test (int x, int y) > >>>>> { > >>>>> asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); > >>>>> return x; > >>>>> } > >> > >> I like this idea but I'm wondering: regular constraints specify what sort > >> of value is needed, for example an int vs. a short int vs. a float. The > >> notation you've shown doesn't seem to have that aspect. > > > > As Maciej already pointed out the type of the expression should suffice. > > My assumption was that an asm can deal with a value as is or its > > promoted value. At least for integer values this should be fine and > > AFAICS is also the case for simple constraints like "r" which do not > > define any mode. I've probably overseen something but which constraint > > differentiates between int vs short? However, you have a good point > > with this and I should test this more. > > I thought there was but I may be confused. On the other hand, there > definitely are (machine dependent) constraints that distinguish, say, float > from integer registers; pdp11 is an example. If you were to use an "a" > constraint, that means a floating point register and the compiler will detect > attempts to pass non-float operands ("Inconsistent operand constraints..."). > > I see that the existing "register int ..." syntax appears to check that the > register is the right type for the data type given for it, so for example on > pdp11, > > register int ac1 asm ("ac1") = i; > > fails ("register ... isn't suitable for data type"). I assume your new > syntax would perform the same check and produce roughly the same error > message. You might verify that. On pdp11, trying to use, for example, "r0" > for a float, or "ac0" for an int, would produce that error. 
Right, so far I don't error out here which I will change. It basically results in bit casting floats to ints currently. Just one thing to note: this is not a novel feature but pretty similar to Rust's explicit register operands: https://doc.rust-lang.org/rust-by-example/unsafe/asm.html#explicit-register-operands Cheers, Stefan
[PATCH] s390: Check for ADDR_REGS in s390_decompose_addrstyle_without_index
An explicit check for address registers was not required so far since during register allocation the processing of address constraints was sufficient. However, address constraints themselves do not check for REGNO_OK_FOR_{BASE,INDEX}_P. Thus, with the newly introduced late-combine pass in r15-1579-g792f97b44ffc5e we generate new insns with invalid address registers which aren't fixed up afterwards. Fixed by explicitly checking for address registers in s390_decompose_addrstyle_without_index such that those new insns are rejected. gcc/ChangeLog: target/PR115634 * config/s390/s390.cc (s390_decompose_addrstyle_without_index): Check for ADDR_REGS in s390_decompose_addrstyle_without_index. --- This restores bootstrap on s390. I ran the testsuite against mainline and of course there is some fallout which is most likely coming from the new pass or other changes. I have another job running comparing pre r15-1579-g792f97b44ffc5e with and without this patch. Assuming this goes well, ok for mainline? gcc/config/s390/s390.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index c65421de831..05a0fde7fb0 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -3347,7 +3347,9 @@ s390_decompose_addrstyle_without_index (rtx op, rtx *base, while (op && GET_CODE (op) == SUBREG) op = SUBREG_REG (op); - if (op && GET_CODE (op) != REG) + if (op && (!REG_P (op) +|| (reload_completed +&& !REGNO_OK_FOR_BASE_P (REGNO (op))))) return false; if (offset) -- 2.45.1
Re: [PATCH] Hard register asm constraint
On Tue, Jun 25, 2024 at 10:03:34AM -0400, Paul Koning wrote: > > > > On Jun 24, 2024, at 1:50 AM, Stefan Schulze Frielinghaus > > wrote: > > > > Ping. > > > > On Mon, Jun 10, 2024 at 07:19:19AM +0200, Stefan Schulze Frielinghaus wrote: > >> Ping. > >> > >> On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze Frielinghaus > >> wrote: > >>> This implements hard register constraints for inline asm. A hard register > >>> constraint is of the form {regname} where regname is any valid register. > >>> This > >>> basically renders register asm superfluous. For example, the snippet > >>> > >>> int test (int x, int y) > >>> { > >>> register int r4 asm ("r4") = x; > >>> register int r5 asm ("r5") = y; > >>> unsigned int copy = y; > >>> asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy)); > >>> return r4; > >>> } > >>> > >>> could be rewritten into > >>> > >>> int test (int x, int y) > >>> { > >>> asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); > >>> return x; > >>> } > > I like this idea but I'm wondering: regular constraints specify what sort of > value is needed, for example an int vs. a short int vs. a float. The > notation you've shown doesn't seem to have that aspect. As Maciej already pointed out the type of the expression should suffice. My assumption was that an asm can deal with a value as is or its promoted value. At least for integer values this should be fine and AFAICS is also the case for simple constraints like "r" which do not define any mode. I've probably overlooked something but which constraint differentiates between int vs short? However, you have a good point with this and I should test this more. > The other comment is that I didn't see documentation updates to reflect this > new feature. I didn't come up with documentation yet since I was not sure whether such a proposal would be accepted at all, i.e., just wanted to hear whether you see some show stoppers or not.
Assuming this goes well I guess it should be documented under simple constraints https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html Thanks, Stefan
Re: [PATCH] Hard register asm constraint
Ping. On Mon, Jun 10, 2024 at 07:19:19AM +0200, Stefan Schulze Frielinghaus wrote: > Ping. > > On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze Frielinghaus wrote: > > This implements hard register constraints for inline asm. A hard register > > constraint is of the form {regname} where regname is any valid register. > > This > > basically renders register asm superfluous. For example, the snippet > > > > int test (int x, int y) > > { > > register int r4 asm ("r4") = x; > > register int r5 asm ("r5") = y; > > unsigned int copy = y; > > asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy)); > > return r4; > > } > > > > could be rewritten into > > > > int test (int x, int y) > > { > > asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); > > return x; > > } > > > > As a side-effect this also solves the problem of call-clobbered registers. > > That being said, I was wondering whether we could utilize this feature in > > order > > to get rid of local register asm automatically? For example, converting > > > > // Result will be in r2 on s390 > > extern int bar (void); > > > > void test (void) > > { > > register int x asm ("r2") = 42; > > bar (); > > asm ("foo %0\n" :: "r" (x)); > > } > > > > into > > > > void test (void) > > { > > int x = 42; > > bar (); > > asm ("foo %0\n" :: "{r2}" (x)); > > } > > > > in order to get rid of the limitation of call-clobbered registers which may > > lead to subtle bugs---especially if you think of non-obvious calls e.g. > > introduced by sanitizer/tracer/whatever. Since such a transformation has > > the > > potential to break existing code do you see any edge cases where this might > > be > > problematic or even show stoppers? Currently, even > > > > int test (void) > > { > > register int x asm ("r2") = 42; > > register int y asm ("r2") = 24; > > asm ("foo %0,%1\n" :: "r" (x), "r" (y)); > > } > > > > is allowed which seems error prone to me. 
Thus, if 100% backwards > > compatibility would be required, then automatically converting every > > register > > asm to the new mechanism isn't viable. Still quite a lot could be > > transformed. > > Any thoughts? > > > > Currently I allow multiple alternatives as demonstrated by > > gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c. However, since a hard > > register > > constraint is pretty specific I could also think of erroring out in case of > > alternatives. Are there any real use cases out there for multiple > > alternatives where one would like to use hard register constraints? > > > > With the current implementation we have a "user visible change" in the sense > > that for > > > > void test (void) > > { > > register int x asm ("r2") = 42; > > register int y asm ("r2") = 24; > > asm ("foo %0,%1\n" : "=r" (x), "=r" (y)); > > } > > > > we do not get the error > > > > "invalid hard register usage between output operands" > > > > anymore but rather > > > > "multiple outputs to hard register: %r2" > > > > This is due to the error handling in gimplify_asm_expr (). Speaking of > > errors, > > I also error out earlier as before which means that e.g. in pr87600-2.c only > > the first error is reported and processing is stopped afterwards which means > > the subsequent tests fail. > > > > I've been skimming through all targets and it looks to me as if none is > > using > > curly brackets for their constraints. Of course, I may have missed > > something. > > > > Cheers, > > Stefan > > > > PS: Current state for Clang: https://reviews.llvm.org/D105142 > > > > --- > > gcc/cfgexpand.cc | 42 --- > > gcc/genpreds.cc | 4 +- > > gcc/gimplify.cc | 115 +- > > gcc/lra-constraints.cc| 17 +++ > > gcc/recog.cc | 14 ++- > > gcc/stmt.cc | 102 +++- &
Re: [PATCH] s390: define single step vector casts
On Thu, Jun 20, 2024 at 09:06:11AM +0200, Juergen Christ wrote: > Some casts were missing leading to missed of bad vectorizations where > casting was done scalar followed by a vector creation from the > individual elements. > > gcc/ChangeLog: > > * config/s390/vector.md (VEC_HALF_NARROWED): New mode iterator. > (vec_half_narrowed): ditto. > (trunc2): New pattern. > (vec_pack_ufix_trunc_v2df): ditto. > (vec_pack_sfix_trunc_v2df): ditto. > (vec_unpack_sfix_trunc_lo_v4sf): ditto. > (vec_unpack_sfix_trunc_hi_v4sf): ditto. > (vec_unpack_ufix_trunc_lo_v4sf): ditto. > (vec_unpack_ufix_trunc_hi_v4sf): ditto. > (floatv2siv2sf2): ditto. > (floatunsv2siv2sf2): ditto. > (vec_unpacks_float_hi_v4si): ditto. > (vec_unpacks_float_lo_v4si): ditto. > (vec_unpacku_float_hi_v4si): ditto. > (vec_unpacku_float_lo_v4si): ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/s390/vector/vec-cast-single.c: New test. > * gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c: New test. > > Bootstrapped and regtested on s390x. Ok for trunk? > > Signed-off-by: Juergen Christ > --- > gcc/config/s390/vector.md | 170 ++- > .../gcc.target/s390/vector/vec-cast-single.c | 271 ++ > .../s390/vector/vec_pack_ufix_trunc_v2df.c| 30 ++ > 3 files changed, 463 insertions(+), 8 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c > create mode 100644 > gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c > > diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md > index 40de0c75a7cf..356f25d26deb 100644 > --- a/gcc/config/s390/vector.md > +++ b/gcc/config/s390/vector.md > @@ -89,6 +89,8 @@ > > (define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI]) > > +(define_mode_iterator VI_TRUNC [V2HI V2SI V2DI V4HI V4SI]) > + > ; Empty string for all but TImode. This is used to hide the TImode > ; expander name in case it is defined already. See addti3 for an > ; example. 
> @@ -211,6 +213,14 @@ > (V1SF "v1df") (V2SF "v2df") (V4SF "v4df") > (V1DF "v1tf") (V2DF "v2tf")]) > > +; Vector with narrowed element size and the same number of elements. > +(define_mode_attr VEC_HALF_NARROWED [(V1HI "V1QI") (V2HI "V2QI") (V4HI > "V4QI") (V8HI "V8QI") > + (V1SI "V1HI") (V2SI "V2HI") (V4SI "V4HI") > +(V1DI "V1DI") (V2DI "V2SI")]) > +(define_mode_attr vec_half_narrowed [(V1HI "v1qi") (V2HI "v2qi") (V4HI > "v4qi") (V8HI "v8qi") > + (V1SI "v1hi") (V2SI "v2hi") (V4SI "v4hi") > +(V1DI "v1di") (V2DI "v2si")]) > + > ; Vector with half the element size AND half the number of elements. > (define_mode_attr vec_halfhalf >[(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI") > @@ -2422,6 +2432,17 @@ >operands[2] = gen_reg_rtx (V4SFmode); > }) > > +;; vector truncate > + > +; downcasts > + > +(define_insn "trunc2" > + [(set (match_operand: 0 "register_operand" "=v") > +(truncate: (match_operand:VI_TRUNC 1 > "register_operand" "v")))] > + "TARGET_VX" > + "vpk\t %0,%1,%1" ^ whitespace > + [(set_attr "op_type" "VRR")]) > + > ;; vector unpack v16qi > > ; signed > @@ -3177,17 +3198,150 @@ >emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2])); >emit_insn (gen_vstlv16qi (operands[1], len, mem)); >DONE; > -});; > +}) > + > +(define_expand "vec_pack_ufix_trunc_v2df" > + [(match_operand:V4SI 0 "register_operand") > + (match_operand:V2DF 1 "register_operand") > + (match_operand:V2DF 2 "register_operand")] > + "TARGET_VX" > +{ > + rtx r1 = gen_reg_rtx (V2DImode); > + rtx r2 = gen_reg_rtx (V2DImode); > + > + emit_insn (gen_fixuns_truncv2dfv2di2 (r1, operands[1])); > + emit_insn (gen_fixuns_truncv2dfv2di2 (r2, operands[2])); > + emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2)); > + DONE; > +}) I haven't really wrapped my head around this, however, this two step conversion could miss an IEEE-inexact-exception if a double fits into a 64-bit integer but not in a 32-bit integer. What does the IL/vectorizer say about exceptions? 
Ok to miss some or do we have to guard this by no-trapping-math et al.? > + > +(define_expand "vec_pack_sfix_trunc_v2df" > + [(match_operand:V4SI 0 "register_operand") > + (match_operand:V2DF 1 "register_operand") > + (match_operand:V2DF 2 "register_operand")] > + "TARGET_VX" > +{ > + rtx r1 = gen_reg_rtx (V2DImode); > + rtx r2 = gen_reg_rtx (V2DImode); > + > + emit_insn (gen_fix_truncv2dfv2di2 (r1, operands[1])); > + emit_insn (gen_fix_truncv2dfv2di2 (r2, operands[2])); > + emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2)); > + DONE; > +}) same as above > + > +; v4sf -> v2di > +(define_expand "vec_unpack_sfix_trunc_lo_v4sf" > + [(match_operand:V2DI 0 "register_operand") > + (match_
Re: [PATCH] middle-end/114189 - drop uses of vcond{,u,eq}_optab
On Mon, Jun 17, 2024 at 08:16:34AM +0200, Richard Biener wrote: > On Mon, 17 Jun 2024, Kewen.Lin wrote: > > > Hi Richi, > > > > on 2024/6/14 18:31, Richard Biener wrote: > > > The following retires vcond{,u,eq} optabs by stopping to use them > > > from the middle-end. Targets instead (should) implement vcond_mask > > > and vec_cmp{,u,eq} optabs. The PR this change refers to lists > > > possibly affected targets - those implementing these patterns, > > > and in particular it lists mips, sparc and ia64 as targets that > > > most definitely will regress while others might simply remove > > > their vcond{,u,eq} patterns. > > > > > > I'd appreciate testing, I do not expect fallout for x86 or arm/aarch64. > > > I know riscv doesn't implement any of the legacy optabs. But less > > > maintained vector targets might need adjustments. > > > > Thanks for making this change, this patch can be bootstrapped on ppc64{,le} > > but both have one failure on gcc/testsuite/gcc.target/powerpc/pr66144-3.c, > > by looking into it, I found it just exposed one oversight in the current > > rs6000 vcond_mask support (the condition mask location is wrong), so I think > > this change is fine for rs6000 port, I'll also test SPEC2017 for this (with > > rs6000 vcond_mask change) soon. > > Btw, for those targets where the patch works out fine it would be nice > to delete their vcond{,u,eq} expanders (and double-check that doesn't > cause issues on its own). > > Can target maintainers note whether their targets support all condition > codes for their vector comparisons (including FP variants)? And > whether they choose to implement all condition codes in vec_cmp > and adjust with inversion / operand swapping for not supported cases? On s390 we support all comparison operations with inverse / operand swapping via s390_expand_vec_compare. However, we still have some failures for which I opened PR115519. Currently it is unclear to me what precisely is missing and will have a further look. 
vcond_mask expander is also implemented for all modes. Cheers, Stefan > > Thanks, > Richard. > > > BR, > > Kewen > > > > > > > > I want to get rid of those optabs for GCC 15. If I don't hear from > > > you I will assume your target is fine. > > > > > > Thanks, > > > Richard. > > > > > > PR middle-end/114189 > > > * optabs-query.h (get_vcond_icode): Always return CODE_FOR_nothing. > > > (get_vcond_eq_icode): Likewise. > > > --- > > > gcc/optabs-query.h | 13 - > > > 1 file changed, 4 insertions(+), 9 deletions(-) > > > > > > diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h > > > index 0cb2c21ba85..31fbce80175 100644 > > > --- a/gcc/optabs-query.h > > > +++ b/gcc/optabs-query.h > > > @@ -112,14 +112,9 @@ get_vec_cmp_eq_icode (machine_mode vmode, > > > machine_mode mask_mode) > > > mode CMODE, unsigned if UNS is true, resulting in a value of mode > > > VMODE. */ > > > > > > inline enum insn_code > > > -get_vcond_icode (machine_mode vmode, machine_mode cmode, bool uns) > > > +get_vcond_icode (machine_mode, machine_mode, bool) > > > { > > > - enum insn_code icode = CODE_FOR_nothing; > > > - if (uns) > > > -icode = convert_optab_handler (vcondu_optab, vmode, cmode); > > > - else > > > -icode = convert_optab_handler (vcond_optab, vmode, cmode); > > > - return icode; > > > + return CODE_FOR_nothing; > > > } > > > > > > /* Return insn code for a conditional operator with a mask mode > > > @@ -135,9 +130,9 @@ get_vcond_mask_icode (machine_mode vmode, > > > machine_mode mmode) > > > mode CMODE (only EQ/NE), resulting in a value of mode VMODE. */ > > > > > > inline enum insn_code > > > -get_vcond_eq_icode (machine_mode vmode, machine_mode cmode) > > > +get_vcond_eq_icode (machine_mode, machine_mode) > > > { > > > - return convert_optab_handler (vcondeq_optab, vmode, cmode); > > > + return CODE_FOR_nothing; > > > } > > > > > > /* Enumerates the possible extraction_insn operations. 
*/ > > > > > > -- > Richard Biener > SUSE Software Solutions Germany GmbH, > Frankenstrasse 146, 90461 Nuernberg, Germany; > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
Re: [PATCH] s390: testsuite: Fix ifcvt-one-insn-bool.c
Ping. On Wed, Jun 05, 2024 at 08:00:15AM +0200, Stefan Schulze Frielinghaus wrote: > With the change of r15-787-g57e04879389f9c I forgot to also update this > test. > > gcc/testsuite/ChangeLog: > > * gcc.target/s390/ifcvt-one-insn-bool.c: Fix loc. > --- > Ok for mainline? Ok for GCC 14 if the corresponding backport is also > approved? > > gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c > b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c > index 0c8c2f879a6..4ae29dbd6b6 100644 > --- a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c > +++ b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c > @@ -3,7 +3,7 @@ > /* { dg-do compile { target { s390*-*-* } } } */ > /* { dg-options "-O2 -march=z13 -mzarch" } */ > > -/* { dg-final { scan-assembler "lochinh\t%r.?,1" } } */ > +/* { dg-final { scan-assembler "lochile\t%r.?,1" } } */ > #include > > int foo (int *a, unsigned int n) > -- > 2.45.1 >
Re: [PATCH v2] s390: Implement TARGET_NOCE_CONVERSION_PROFITABLE_P [PR109549]
Ping. On Sun, Jun 02, 2024 at 02:07:24PM +0200, Stefan Schulze Frielinghaus wrote: > Since the patch works fine so far for mainline, ok to backport to GCC 14? > > On Fri, May 17, 2024 at 08:59:05AM +0200, Stefan Schulze Frielinghaus wrote: > > I've adapted the patch as follows and will push. > > > > Thanks, > > Stefan > > > > -- > > > > Consider a NOCE conversion as profitable if there is at least one > > conditional move. > > > > gcc/ChangeLog: > > > > * config/s390/s390.cc (TARGET_NOCE_CONVERSION_PROFITABLE_P): > > Define. > > (s390_noce_conversion_profitable_p): Implement. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/s390/ccor.c: Order of loads are reversed, now, as a > > consequence the condition has to be reversed. > > --- > > gcc/config/s390/s390.cc | 32 > > gcc/testsuite/gcc.target/s390/ccor.c | 4 ++-- > > 2 files changed, 34 insertions(+), 2 deletions(-) > > > > diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc > > index bf46eab2d63..7f8f1681c2a 100644 > > --- a/gcc/config/s390/s390.cc > > +++ b/gcc/config/s390/s390.cc > > @@ -78,6 +78,7 @@ along with GCC; see the file COPYING3. If not see > > #include "tree-pass.h" > > #include "context.h" > > #include "builtins.h" > > +#include "ifcvt.h" > > #include "rtl-iter.h" > > #include "intl.h" > > #include "tm-constrs.h" > > @@ -18037,6 +18038,34 @@ s390_vectorize_vec_perm_const (machine_mode vmode, > > machine_mode op_mode, > >return vectorize_vec_perm_const_1 (d); > > } > > > > +/* Consider a NOCE conversion as profitable if there is at least one > > + conditional move. 
*/ > > + > > +static bool > > +s390_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info > > *if_info) > > +{ > > + if (if_info->speed_p) > > +{ > > + for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn)) > > + { > > + rtx set = single_set (insn); > > + if (set == NULL) > > + continue; > > + if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE) > > + continue; > > + rtx src = SET_SRC (set); > > + machine_mode mode = GET_MODE (src); > > + if (GET_MODE_CLASS (mode) != MODE_INT > > + && GET_MODE_CLASS (mode) != MODE_FLOAT) > > + continue; > > + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) > > + continue; > > + return true; > > + } > > +} > > + return default_noce_conversion_profitable_p (seq, if_info); > > +} > > + > > /* Initialize GCC target structure. */ > > > > #undef TARGET_ASM_ALIGNED_HI_OP > > @@ -18350,6 +18379,9 @@ s390_vectorize_vec_perm_const (machine_mode vmode, > > machine_mode op_mode, > > #undef TARGET_VECTORIZE_VEC_PERM_CONST > > #define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const > > > > +#undef TARGET_NOCE_CONVERSION_PROFITABLE_P > > +#define TARGET_NOCE_CONVERSION_PROFITABLE_P > > s390_noce_conversion_profitable_p > > + > > struct gcc_target targetm = TARGET_INITIALIZER; > > > > #include "gt-s390.h" > > diff --git a/gcc/testsuite/gcc.target/s390/ccor.c > > b/gcc/testsuite/gcc.target/s390/ccor.c > > index 31f30f60314..36a3c3a999a 100644 > > --- a/gcc/testsuite/gcc.target/s390/ccor.c > > +++ b/gcc/testsuite/gcc.target/s390/ccor.c > > @@ -42,7 +42,7 @@ GENFUN1(2) > > > > GENFUN1(3) > > > > -/* { dg-final { scan-assembler {locrno} } } */ > > +/* { dg-final { scan-assembler {locro} } } */ > > > > GENFUN2(0,1) > > > > @@ -58,7 +58,7 @@ GENFUN2(0,3) > > > > GENFUN2(1,2) > > > > -/* { dg-final { scan-assembler {locrnlh} } } */ > > +/* { dg-final { scan-assembler {locrlh} } } */ > > > > GENFUN2(1,3) > > > > -- > > 2.45.0 > >
Re: [PATCH] s390: testsuite: Fix nobp-table-jump-*.c
Ping. On Mon, Jun 03, 2024 at 03:43:39PM +0200, Stefan Schulze Frielinghaus wrote: > Starting with r14-5628-g53ba8d669550d3 interprocedural VRP became strong > enough in order to render these tests useless. Fixed by disabling IPA. > > gcc/testsuite/ChangeLog: > > * gcc.target/s390/nobp-table-jump-inline-z10.c: Do not perform > IPA. > * gcc.target/s390/nobp-table-jump-inline-z900.c: Dito. > * gcc.target/s390/nobp-table-jump-z10.c: Dito. > * gcc.target/s390/nobp-table-jump-z900.c: Dito. > --- > Ok for mainline? > > .../s390/nobp-table-jump-inline-z10.c | 42 +-- > .../s390/nobp-table-jump-inline-z900.c| 42 +-- > .../gcc.target/s390/nobp-table-jump-z10.c | 42 +-- > .../gcc.target/s390/nobp-table-jump-z900.c| 42 +-- > 4 files changed, 84 insertions(+), 84 deletions(-) > > diff --git a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c > b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c > index 8dfd7e4c786..121751166d0 100644 > --- a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c > +++ b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c > @@ -4,29 +4,29 @@ > /* case-values-threshold will be set to 20 by the back-end when jump > thunk are requested. 
*/ > > -int __attribute__((noinline,noclone)) foo1 (void) { return 1; } > -int __attribute__((noinline,noclone)) foo2 (void) { return 2; } > -int __attribute__((noinline,noclone)) foo3 (void) { return 3; } > -int __attribute__((noinline,noclone)) foo4 (void) { return 4; } > -int __attribute__((noinline,noclone)) foo5 (void) { return 5; } > -int __attribute__((noinline,noclone)) foo6 (void) { return 6; } > -int __attribute__((noinline,noclone)) foo7 (void) { return 7; } > -int __attribute__((noinline,noclone)) foo8 (void) { return 8; } > -int __attribute__((noinline,noclone)) foo9 (void) { return 9; } > -int __attribute__((noinline,noclone)) foo10 (void) { return 10; } > -int __attribute__((noinline,noclone)) foo11 (void) { return 11; } > -int __attribute__((noinline,noclone)) foo12 (void) { return 12; } > -int __attribute__((noinline,noclone)) foo13 (void) { return 13; } > -int __attribute__((noinline,noclone)) foo14 (void) { return 14; } > -int __attribute__((noinline,noclone)) foo15 (void) { return 15; } > -int __attribute__((noinline,noclone)) foo16 (void) { return 16; } > -int __attribute__((noinline,noclone)) foo17 (void) { return 17; } > -int __attribute__((noinline,noclone)) foo18 (void) { return 18; } > -int __attribute__((noinline,noclone)) foo19 (void) { return 19; } > -int __attribute__((noinline,noclone)) foo20 (void) { return 20; } > +int __attribute__((noipa)) foo1 (void) { return 1; } > +int __attribute__((noipa)) foo2 (void) { return 2; } > +int __attribute__((noipa)) foo3 (void) { return 3; } > +int __attribute__((noipa)) foo4 (void) { return 4; } > +int __attribute__((noipa)) foo5 (void) { return 5; } > +int __attribute__((noipa)) foo6 (void) { return 6; } > +int __attribute__((noipa)) foo7 (void) { return 7; } > +int __attribute__((noipa)) foo8 (void) { return 8; } > +int __attribute__((noipa)) foo9 (void) { return 9; } > +int __attribute__((noipa)) foo10 (void) { return 10; } > +int __attribute__((noipa)) foo11 (void) { return 11; } > +int 
__attribute__((noipa)) foo12 (void) { return 12; } > +int __attribute__((noipa)) foo13 (void) { return 13; } > +int __attribute__((noipa)) foo14 (void) { return 14; } > +int __attribute__((noipa)) foo15 (void) { return 15; } > +int __attribute__((noipa)) foo16 (void) { return 16; } > +int __attribute__((noipa)) foo17 (void) { return 17; } > +int __attribute__((noipa)) foo18 (void) { return 18; } > +int __attribute__((noipa)) foo19 (void) { return 19; } > +int __attribute__((noipa)) foo20 (void) { return 20; } > > > -int __attribute__((noinline,noclone)) > +int __attribute__((noipa)) > bar (int a) > { >int ret = 0; > diff --git a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c > b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c > index 46d2c54bcff..5ad0c72afc3 100644 > --- a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c > +++ b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c > @@ -4,29 +4,29 @@ > /* case-values-threshold will be set to 20 by the back-end when jump > thunk are requested. */ > > -int __attribute__((noinline,noclone)) foo1 (void) { return 1; } > -int __attribute__((noinline,noclone)) foo2 (void) { return 2; } > -int __attribute__((noinline,noclone)) foo3 (void) { return 3; } > -int __attribute__((noinline,noclone)) foo4 (void) { return 4; } > -int __attribute__((noinline,noclone)) foo5 (void)
Re: [PATCH] s390: Extend two element float vector
On Tue, Jun 11, 2024 at 10:42:26AM +0200, Andreas Krebbel wrote: > On 6/11/24 10:26, Stefan Schulze Frielinghaus wrote: > > This implements a V2SF -> V2DF extend. > > > > gcc/ChangeLog: > > > > * config/s390/vector.md (*vmrhf): New. > > (extendv2sfv2df2): New. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/s390/vector/vec-extend-3.c: New test. > > Since we already have a *vmrhf pattern, should we perhaps add something to > the name to make it easier to distinguish in the rtl dumps? You have added > the mode already, but perhaps something like *vmrhf_half or something > like this? I like the one with _half added which I will push soon. Thanks, Stefan > > Ok with or without that change. Thanks! > > > Andreas > >
[PATCH] s390: Extend two element float vector
This implements a V2SF -> V2DF extend. gcc/ChangeLog: * config/s390/vector.md (*vmrhf): New. (extendv2sfv2df2): New. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-extend-3.c: New test. --- Bootstrap and regtested on s390. Ok for mainline? gcc/config/s390/vector.md | 28 +++ .../gcc.target/s390/vector/vec-extend-3.c | 18 2 files changed, 46 insertions(+) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-extend-3.c diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index a931a4b1b17..d8657fae56d 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -895,6 +895,17 @@ "vmrhf\t%0,%1,%2"; [(set_attr "op_type" "VRR")]) +(define_insn "*vmrhf" + [(set (match_operand:V_HW_40 "register_operand" "=v") + (vec_select:V_HW_4 +(vec_concat:V_HW_4 (match_operand: 1 "register_operand" "v") + (match_operand: 2 "register_operand" "v")) +(parallel [(const_int 0) (const_int 2) + (const_int 1) (const_int 3)])))] + "TARGET_VX" + "vmrhf\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + (define_insn "*vmrlf" [(set (match_operand:V_HW_4 0 "register_operand" "=v") (vec_select:V_HW_4 @@ -2394,6 +2405,23 @@ "vuph\t%0,%1" [(set_attr "op_type" "VRR")]) +(define_expand "extendv2sfv2df2" + [(set (match_dup 2) + (vec_select:V4SF +(vec_concat:V4SF (match_operand:V2SF 1 "register_operand") + (match_dup 1)) +(parallel [(const_int 0) (const_int 2) + (const_int 1) (const_int 3)]))) + (set (match_operand:V2DF 0 "register_operand") + (float_extend:V2DF +(vec_select:V2SF + (match_dup 2) + (parallel [(const_int 0) (const_int 2)]] + "TARGET_VX" +{ + operands[2] = gen_reg_rtx (V4SFmode); +}) + ;; vector unpack v16qi ; signed diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-extend-3.c b/gcc/testsuite/gcc.target/s390/vector/vec-extend-3.c new file mode 100644 index 000..2b02e7bf9f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-extend-3.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=z13 -mzarch" } */ +/* { dg-final { 
check-function-bodies "**" "" "" } } */ + +typedef float v2sf __attribute__ ((vector_size (8))); +typedef double v2df __attribute__ ((vector_size (16))); + +/* +** extendv2sfv2df2: +** vmrhf %v24,%v24,%v24 +** vldeb %v24,%v24 +** br %r14 +*/ + +v2df extendv2sfv2df2 (v2sf x) +{ + return __builtin_convertvector (x, v2df); +} -- 2.45.1
[PATCH] s390: Extend two/four element integer vectors
For the moment I deliberately left out one-element QHS vectors since it is unclear whether these are pathological cases or whether they are really used. If we ever get an extend for V1DI -> V1TI we should reconsider this. As a side-effect this fixes PR115261. gcc/ChangeLog: PR target/115261 * config/s390/s390.md (any_extend,extend_insn,zero_extend): New code attributes and code iterator. * config/s390/vector.md (VI_EXTEND): New mode iterator. (2): New insn. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-extend-1.c: New test. * gcc.target/s390/vector/vec-extend-2.c: New test. --- Bootstrapped and regtested on s390. Ok for mainline? gcc/config/s390/s390.md | 4 + gcc/config/s390/vector.md | 29 +-- .../gcc.target/s390/vector/vec-extend-1.c | 79 +++ .../gcc.target/s390/vector/vec-extend-2.c | 55 + 4 files changed, 162 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-extend-1.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-extend-2.c diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index c607dce3cf0..1311a5f01cf 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -602,6 +602,10 @@ (define_attr "relative_long" "no,yes" (const_string "no")) +(define_code_attr extend_insn [(sign_extend "extend") (zero_extend "zero_extend")]) +(define_code_attr zero_extend [(sign_extend "") (zero_extend "l")]) +(define_code_iterator any_extend [sign_extend zero_extend]) + ;; Pipeline description for z900. (include "2064.md") diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index ed4742d93c9..a931a4b1b17 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -87,6 +87,8 @@ ; 32 bit int<->fp vector conversion instructions are available since VXE2 (z15). (define_mode_iterator VX_VEC_CONV_BFP [V2DF (V4SF "TARGET_VXE2")]) +(define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI]) + ; Empty string for all but TImode. 
This is used to hide the TImode ; expander name in case it is defined already. See addti3 for an ; example. @@ -195,13 +197,20 @@ (V1DF "V2DF") (V2DF "V4DF")]) ; Vector with widened element size and the same number of elements. -(define_mode_attr vec_2x_wide [(V1QI "V1HI") (V2QI "V2HI") (V4QI "V4HI") (V8QI "V8HI") (V16QI "V16HI") +(define_mode_attr VEC_2X_WIDE [(V1QI "V1HI") (V2QI "V2HI") (V4QI "V4HI") (V8QI "V8HI") (V16QI "V16HI") (V1HI "V1SI") (V2HI "V2SI") (V4HI "V4SI") (V8HI "V8SI") (V1SI "V1DI") (V2SI "V2DI") (V4SI "V4DI") (V1DI "V1TI") (V2DI "V2TI") (V1SF "V1DF") (V2SF "V2DF") (V4SF "V4DF") (V1DF "V1TF") (V2DF "V2TF")]) +(define_mode_attr vec_2x_wide [(V1QI "v1hi") (V2QI "v2hi") (V4QI "v4hi") (V8QI "v8hi") (V16QI "v16hi") + (V1HI "v1si") (V2HI "v2si") (V4HI "v4si") (V8HI "v8si") + (V1SI "v1di") (V2SI "v2di") (V4SI "v4di") + (V1DI "v1ti") (V2DI "v2ti") + (V1SF "v1df") (V2SF "v2df") (V4SF "v4df") + (V1DF "v1tf") (V2DF "v2tf")]) + ; Vector with half the element size AND half the number of elements. 
(define_mode_attr vec_halfhalf [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI") @@ -1604,7 +1613,7 @@ UNSPEC_VEC_UMULT_ODD)) (set (match_operand: 0 "register_operand" "") (vec_select: -(vec_concat: (match_dup 3) (match_dup 4)) +(vec_concat: (match_dup 3) (match_dup 4)) (match_dup 5)))] "TARGET_VX" { @@ -1623,7 +1632,7 @@ UNSPEC_VEC_UMULT_ODD)) (set (match_operand: 0 "register_operand" "") (vec_select: -(vec_concat: (match_dup 3) (match_dup 4)) +(vec_concat: (match_dup 3) (match_dup 4)) (match_dup 5)))] "TARGET_VX" { @@ -1642,7 +1651,7 @@ UNSPEC_VEC_SMULT_ODD)) (set (match_operand: 0 "register_operand" "") (vec_select: -(vec_concat: (match_dup 3) (match_dup 4)) +(vec_concat: (match_dup 3) (match_dup 4)) (match_dup 5)))] "TARGET_VX" { @@ -1661,7 +1670,7 @@ UNSPEC_VEC_SMULT_ODD)) (set (match_operand: 0 "register_operand" "") (vec_select: -(vec_concat: (match_dup 3) (match_dup 4)) +(vec_concat: (match_dup 3) (match_dup 4)) (match_dup 5)))] "TARGET_VX" { @@ -2375,6 +2384,16 @@ "vpkls\t%0,%1,%2" [(set_attr "op_type" "VRR")]) +;; vector unpack / extend + +(define_insn "2" + [(set (match_operand: 0 "register_operand" "=v") + (any_extend: +
Re: [PATCH] Hard register asm constraint
Ping. On Fri, May 24, 2024 at 11:13:12AM +0200, Stefan Schulze Frielinghaus wrote: > This implements hard register constraints for inline asm. A hard register > constraint is of the form {regname} where regname is any valid register. This > basically renders register asm superfluous. For example, the snippet > > int test (int x, int y) > { > register int r4 asm ("r4") = x; > register int r5 asm ("r5") = y; > unsigned int copy = y; > asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy)); > return r4; > } > > could be rewritten into > > int test (int x, int y) > { > asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); > return x; > } > > As a side-effect this also solves the problem of call-clobbered registers. > That being said, I was wondering whether we could utilize this feature in > order > to get rid of local register asm automatically? For example, converting > > // Result will be in r2 on s390 > extern int bar (void); > > void test (void) > { > register int x asm ("r2") = 42; > bar (); > asm ("foo %0\n" :: "r" (x)); > } > > into > > void test (void) > { > int x = 42; > bar (); > asm ("foo %0\n" :: "{r2}" (x)); > } > > in order to get rid of the limitation of call-clobbered registers which may > lead to subtle bugs---especially if you think of non-obvious calls e.g. > introduced by sanitizer/tracer/whatever. Since such a transformation has the > potential to break existing code do you see any edge cases where this might be > problematic or even show stoppers? Currently, even > > int test (void) > { > register int x asm ("r2") = 42; > register int y asm ("r2") = 24; > asm ("foo %0,%1\n" :: "r" (x), "r" (y)); > } > > is allowed which seems error prone to me. Thus, if 100% backwards > compatibility would be required, then automatically converting every register > asm to the new mechanism isn't viable. Still quite a lot could be > transformed. > Any thoughts? 
> > Currently I allow multiple alternatives as demonstrated by > gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c. However, since a hard > register > constraint is pretty specific I could also think of erroring out in case of > alternatives. Are there any real use cases out there for multiple > alternatives where one would like to use hard register constraints? > > With the current implementation we have a "user visible change" in the sense > that for > > void test (void) > { > register int x asm ("r2") = 42; > register int y asm ("r2") = 24; > asm ("foo %0,%1\n" : "=r" (x), "=r" (y)); > } > > we do not get the error > > "invalid hard register usage between output operands" > > anymore but rather > > "multiple outputs to hard register: %r2" > > This is due to the error handling in gimplify_asm_expr (). Speaking of > errors, > I also error out earlier as before which means that e.g. in pr87600-2.c only > the first error is reported and processing is stopped afterwards which means > the subsequent tests fail. > > I've been skimming through all targets and it looks to me as if none is using > curly brackets for their constraints. Of course, I may have missed something. 
> > Cheers, > Stefan > > PS: Current state for Clang: https://reviews.llvm.org/D105142 > > --- > gcc/cfgexpand.cc | 42 --- > gcc/genpreds.cc | 4 +- > gcc/gimplify.cc | 115 +- > gcc/lra-constraints.cc| 17 +++ > gcc/recog.cc | 14 ++- > gcc/stmt.cc | 102 +++- > gcc/stmt.h| 10 +- > .../gcc.target/s390/asm-hard-reg-1.c | 103 > .../gcc.target/s390/asm-hard-reg-2.c | 29 + > .../gcc.target/s390/asm-hard-reg-3.c | 24 > gcc/testsuite/lib/scanasm.exp | 4 + > 11 files changed, 407 insertions(+), 57 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c > create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c > create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c > > diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc > index 557cb28733b..47f71a2e803 100644 > --- a/gcc/cfgexpand.cc
[PATCH] s390: testsuite: Fix ifcvt-one-insn-bool.c
With the change of r15-787-g57e04879389f9c I forgot to also update this test. gcc/testsuite/ChangeLog: * gcc.target/s390/ifcvt-one-insn-bool.c: Fix loc. --- Ok for mainline? Ok for GCC 14 if the corresponding backport is also approved? gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c index 0c8c2f879a6..4ae29dbd6b6 100644 --- a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c +++ b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c @@ -3,7 +3,7 @@ /* { dg-do compile { target { s390*-*-* } } } */ /* { dg-options "-O2 -march=z13 -mzarch" } */ -/* { dg-final { scan-assembler "lochinh\t%r.?,1" } } */ +/* { dg-final { scan-assembler "lochile\t%r.?,1" } } */ #include <stdbool.h> int foo (int *a, unsigned int n) -- 2.45.1
[PATCH] s390: testsuite: Fix nobp-table-jump-*.c
Starting with r14-5628-g53ba8d669550d3 interprocedural VRP became strong enough to render these tests useless. Fixed by disabling IPA. gcc/testsuite/ChangeLog: * gcc.target/s390/nobp-table-jump-inline-z10.c: Do not perform IPA. * gcc.target/s390/nobp-table-jump-inline-z900.c: Ditto. * gcc.target/s390/nobp-table-jump-z10.c: Ditto. * gcc.target/s390/nobp-table-jump-z900.c: Ditto. --- Ok for mainline? .../s390/nobp-table-jump-inline-z10.c | 42 +-- .../s390/nobp-table-jump-inline-z900.c| 42 +-- .../gcc.target/s390/nobp-table-jump-z10.c | 42 +-- .../gcc.target/s390/nobp-table-jump-z900.c| 42 +-- 4 files changed, 84 insertions(+), 84 deletions(-) diff --git a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c index 8dfd7e4c786..121751166d0 100644 --- a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c +++ b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z10.c @@ -4,29 +4,29 @@ /* case-values-threshold will be set to 20 by the back-end when jump thunk are requested. 
*/ -int __attribute__((noinline,noclone)) foo1 (void) { return 1; } -int __attribute__((noinline,noclone)) foo2 (void) { return 2; } -int __attribute__((noinline,noclone)) foo3 (void) { return 3; } -int __attribute__((noinline,noclone)) foo4 (void) { return 4; } -int __attribute__((noinline,noclone)) foo5 (void) { return 5; } -int __attribute__((noinline,noclone)) foo6 (void) { return 6; } -int __attribute__((noinline,noclone)) foo7 (void) { return 7; } -int __attribute__((noinline,noclone)) foo8 (void) { return 8; } -int __attribute__((noinline,noclone)) foo9 (void) { return 9; } -int __attribute__((noinline,noclone)) foo10 (void) { return 10; } -int __attribute__((noinline,noclone)) foo11 (void) { return 11; } -int __attribute__((noinline,noclone)) foo12 (void) { return 12; } -int __attribute__((noinline,noclone)) foo13 (void) { return 13; } -int __attribute__((noinline,noclone)) foo14 (void) { return 14; } -int __attribute__((noinline,noclone)) foo15 (void) { return 15; } -int __attribute__((noinline,noclone)) foo16 (void) { return 16; } -int __attribute__((noinline,noclone)) foo17 (void) { return 17; } -int __attribute__((noinline,noclone)) foo18 (void) { return 18; } -int __attribute__((noinline,noclone)) foo19 (void) { return 19; } -int __attribute__((noinline,noclone)) foo20 (void) { return 20; } +int __attribute__((noipa)) foo1 (void) { return 1; } +int __attribute__((noipa)) foo2 (void) { return 2; } +int __attribute__((noipa)) foo3 (void) { return 3; } +int __attribute__((noipa)) foo4 (void) { return 4; } +int __attribute__((noipa)) foo5 (void) { return 5; } +int __attribute__((noipa)) foo6 (void) { return 6; } +int __attribute__((noipa)) foo7 (void) { return 7; } +int __attribute__((noipa)) foo8 (void) { return 8; } +int __attribute__((noipa)) foo9 (void) { return 9; } +int __attribute__((noipa)) foo10 (void) { return 10; } +int __attribute__((noipa)) foo11 (void) { return 11; } +int __attribute__((noipa)) foo12 (void) { return 12; } +int 
__attribute__((noipa)) foo13 (void) { return 13; } +int __attribute__((noipa)) foo14 (void) { return 14; } +int __attribute__((noipa)) foo15 (void) { return 15; } +int __attribute__((noipa)) foo16 (void) { return 16; } +int __attribute__((noipa)) foo17 (void) { return 17; } +int __attribute__((noipa)) foo18 (void) { return 18; } +int __attribute__((noipa)) foo19 (void) { return 19; } +int __attribute__((noipa)) foo20 (void) { return 20; } -int __attribute__((noinline,noclone)) +int __attribute__((noipa)) bar (int a) { int ret = 0; diff --git a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c index 46d2c54bcff..5ad0c72afc3 100644 --- a/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c +++ b/gcc/testsuite/gcc.target/s390/nobp-table-jump-inline-z900.c @@ -4,29 +4,29 @@ /* case-values-threshold will be set to 20 by the back-end when jump thunk are requested. */ -int __attribute__((noinline,noclone)) foo1 (void) { return 1; } -int __attribute__((noinline,noclone)) foo2 (void) { return 2; } -int __attribute__((noinline,noclone)) foo3 (void) { return 3; } -int __attribute__((noinline,noclone)) foo4 (void) { return 4; } -int __attribute__((noinline,noclone)) foo5 (void) { return 5; } -int __attribute__((noinline,noclone)) foo6 (void) { return 6; } -int __attribute__((noinline,noclone)) foo7 (void) { return 7; } -int __attribute__((noinline,noclone)) foo8 (void) { return 8; } -int __attribute__((noinline,noclone)) foo9 (void) { return 9; } -int __attribute__((noinline,noclone)) foo10 (void) { return 10; } -int __attribute__((noinline,noclone)) foo11 (void) { return 11; } -int __attribute__((noinline,noclone)) foo12 (void) { return 12; } -int __attribute__((noinline,noclone)) foo13 (
Re: [PATCH v2] s390: Implement TARGET_NOCE_CONVERSION_PROFITABLE_P [PR109549]
Since the patch works fine so far for mainline, ok to backport to GCC 14? On Fri, May 17, 2024 at 08:59:05AM +0200, Stefan Schulze Frielinghaus wrote: > I've adapted the patch as follows and will push. > > Thanks, > Stefan > > -- > > Consider a NOCE conversion as profitable if there is at least one > conditional move. > > gcc/ChangeLog: > > * config/s390/s390.cc (TARGET_NOCE_CONVERSION_PROFITABLE_P): > Define. > (s390_noce_conversion_profitable_p): Implement. > > gcc/testsuite/ChangeLog: > > * gcc.target/s390/ccor.c: Order of loads are reversed, now, as a > consequence the condition has to be reversed. > --- > gcc/config/s390/s390.cc | 32 > gcc/testsuite/gcc.target/s390/ccor.c | 4 ++-- > 2 files changed, 34 insertions(+), 2 deletions(-) > > diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc > index bf46eab2d63..7f8f1681c2a 100644 > --- a/gcc/config/s390/s390.cc > +++ b/gcc/config/s390/s390.cc > @@ -78,6 +78,7 @@ along with GCC; see the file COPYING3. If not see > #include "tree-pass.h" > #include "context.h" > #include "builtins.h" > +#include "ifcvt.h" > #include "rtl-iter.h" > #include "intl.h" > #include "tm-constrs.h" > @@ -18037,6 +18038,34 @@ s390_vectorize_vec_perm_const (machine_mode vmode, > machine_mode op_mode, >return vectorize_vec_perm_const_1 (d); > } > > +/* Consider a NOCE conversion as profitable if there is at least one > + conditional move. 
*/ > + > +static bool > +s390_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info > *if_info) > +{ > + if (if_info->speed_p) > +{ > + for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn)) > + { > + rtx set = single_set (insn); > + if (set == NULL) > + continue; > + if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE) > + continue; > + rtx src = SET_SRC (set); > + machine_mode mode = GET_MODE (src); > + if (GET_MODE_CLASS (mode) != MODE_INT > + && GET_MODE_CLASS (mode) != MODE_FLOAT) > + continue; > + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) > + continue; > + return true; > + } > +} > + return default_noce_conversion_profitable_p (seq, if_info); > +} > + > /* Initialize GCC target structure. */ > > #undef TARGET_ASM_ALIGNED_HI_OP > @@ -18350,6 +18379,9 @@ s390_vectorize_vec_perm_const (machine_mode vmode, > machine_mode op_mode, > #undef TARGET_VECTORIZE_VEC_PERM_CONST > #define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const > > +#undef TARGET_NOCE_CONVERSION_PROFITABLE_P > +#define TARGET_NOCE_CONVERSION_PROFITABLE_P s390_noce_conversion_profitable_p > + > struct gcc_target targetm = TARGET_INITIALIZER; > > #include "gt-s390.h" > diff --git a/gcc/testsuite/gcc.target/s390/ccor.c > b/gcc/testsuite/gcc.target/s390/ccor.c > index 31f30f60314..36a3c3a999a 100644 > --- a/gcc/testsuite/gcc.target/s390/ccor.c > +++ b/gcc/testsuite/gcc.target/s390/ccor.c > @@ -42,7 +42,7 @@ GENFUN1(2) > > GENFUN1(3) > > -/* { dg-final { scan-assembler {locrno} } } */ > +/* { dg-final { scan-assembler {locro} } } */ > > GENFUN2(0,1) > > @@ -58,7 +58,7 @@ GENFUN2(0,3) > > GENFUN2(1,2) > > -/* { dg-final { scan-assembler {locrnlh} } } */ > +/* { dg-final { scan-assembler {locrlh} } } */ > > GENFUN2(1,3) > > -- > 2.45.0 >
Re: [PATCH] ifcvt: Clarify if_info.original_cost.
On Fri, May 31, 2024 at 10:05:55PM -0600, Jeff Law wrote: > > > On 5/31/24 9:03 AM, Robin Dapp wrote: > > Hi, > > > > before noce_find_if_block processes a block it sets up an if_info > > structure that holds the original costs. At that point the costs of > > the then/else blocks have not been added so we only care about the > > "if" cost. > > > > The code originally used BRANCH_COST for that but was then changed > > to COST_N_INSNS (2) - a compare and a jump. > > This patch computes the jump costs via > >insn_cost (if_info.jump, ...) > > which is supposed to incorporate the branch costs and, in case of a CC > > comparison, > >pattern_cost (if_info.cond, ...) > > which is supposed to account for the CC creation. > > > > For compare_and_jump patterns insn_cost should have already computed > > the right cost. > > > > Does this "split" make sense, generally? > > > > Bootstrapped and regtested on x86, aarch64 and power10. Regtested > > on riscv. > > > > Regards > > Robin > > > > gcc/ChangeLog: > > > > * ifcvt.cc (noce_process_if_block): Subtract condition pattern > > cost if applicable. > > (noce_find_if_block): Use insn_cost and pattern_cost for > > original cost. > OK. Obviously we'll need to be on the lookout for regressions. My bet is > on s390 since you already tested the x86, aarch64 & p10 targets :-) I just gave it a try on s390 where bootstrap and regtest were successful. Cheers, Stefan > > > jeff >
[PATCH] Hard register asm constraint
This implements hard register constraints for inline asm. A hard register constraint is of the form {regname} where regname is any valid register. This basically renders register asm superfluous. For example, the snippet int test (int x, int y) { register int r4 asm ("r4") = x; register int r5 asm ("r5") = y; unsigned int copy = y; asm ("foo %0,%1,%2" : "+d" (r4) : "d" (r5), "d" (copy)); return r4; } could be rewritten into int test (int x, int y) { asm ("foo %0,%1,%2" : "+{r4}" (x) : "{r5}" (y), "d" (y)); return x; } As a side-effect this also solves the problem of call-clobbered registers. That being said, I was wondering whether we could utilize this feature in order to get rid of local register asm automatically? For example, converting // Result will be in r2 on s390 extern int bar (void); void test (void) { register int x asm ("r2") = 42; bar (); asm ("foo %0\n" :: "r" (x)); } into void test (void) { int x = 42; bar (); asm ("foo %0\n" :: "{r2}" (x)); } in order to get rid of the limitation of call-clobbered registers which may lead to subtle bugs---especially if you think of non-obvious calls e.g. introduced by sanitizer/tracer/whatever. Since such a transformation has the potential to break existing code do you see any edge cases where this might be problematic or even show stoppers? Currently, even int test (void) { register int x asm ("r2") = 42; register int y asm ("r2") = 24; asm ("foo %0,%1\n" :: "r" (x), "r" (y)); } is allowed which seems error prone to me. Thus, if 100% backwards compatibility would be required, then automatically converting every register asm to the new mechanism isn't viable. Still quite a lot could be transformed. Any thoughts? Currently I allow multiple alternatives as demonstrated by gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c. However, since a hard register constraint is pretty specific I could also think of erroring out in case of alternatives. 
Are there any real use cases out there for multiple alternatives where one would like to use hard register constraints? With the current implementation we have a "user visible change" in the sense that for void test (void) { register int x asm ("r2") = 42; register int y asm ("r2") = 24; asm ("foo %0,%1\n" : "=r" (x), "=r" (y)); } we do not get the error "invalid hard register usage between output operands" anymore but rather "multiple outputs to hard register: %r2" This is due to the error handling in gimplify_asm_expr (). Speaking of errors, I also error out earlier than before, which means that e.g. in pr87600-2.c only the first error is reported and processing stops afterwards, so the subsequent tests fail. I've been skimming through all targets and it looks to me as if none is using curly brackets for their constraints. Of course, I may have missed something. Cheers, Stefan PS: Current state for Clang: https://reviews.llvm.org/D105142 --- gcc/cfgexpand.cc | 42 --- gcc/genpreds.cc | 4 +- gcc/gimplify.cc | 115 +- gcc/lra-constraints.cc| 17 +++ gcc/recog.cc | 14 ++- gcc/stmt.cc | 102 +++- gcc/stmt.h| 10 +- .../gcc.target/s390/asm-hard-reg-1.c | 103 .../gcc.target/s390/asm-hard-reg-2.c | 29 + .../gcc.target/s390/asm-hard-reg-3.c | 24 gcc/testsuite/lib/scanasm.exp | 4 + 11 files changed, 407 insertions(+), 57 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-1.c create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-2.c create mode 100644 gcc/testsuite/gcc.target/s390/asm-hard-reg-3.c diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc index 557cb28733b..47f71a2e803 100644 --- a/gcc/cfgexpand.cc +++ b/gcc/cfgexpand.cc @@ -2955,44 +2955,6 @@ expand_asm_loc (tree string, int vol, location_t locus) emit_insn (body); } -/* Return the number of times character C occurs in string S. 
*/ -static int -n_occurrences (int c, const char *s) -{ - int n = 0; - while (*s) -n += (*s++ == c); - return n; -} - -/* A subroutine of expand_asm_operands. Check that all operands have - the same number of alternatives. Return true if so. */ - -static bool -check_operand_nalternatives (const vec &constraints) -{ - unsigned len = constraints.length(); - if (len > 0) -{ - int nalternatives = n_occurrences (',', constraints[0]); - - if (nal
[PATCH v2] s390: Implement TARGET_NOCE_CONVERSION_PROFITABLE_P [PR109549]
I've adapted the patch as follows and will push. Thanks, Stefan -- Consider a NOCE conversion as profitable if there is at least one conditional move. gcc/ChangeLog: * config/s390/s390.cc (TARGET_NOCE_CONVERSION_PROFITABLE_P): Define. (s390_noce_conversion_profitable_p): Implement. gcc/testsuite/ChangeLog: * gcc.target/s390/ccor.c: Order of loads are reversed, now, as a consequence the condition has to be reversed. --- gcc/config/s390/s390.cc | 32 gcc/testsuite/gcc.target/s390/ccor.c | 4 ++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index bf46eab2d63..7f8f1681c2a 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -78,6 +78,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-pass.h" #include "context.h" #include "builtins.h" +#include "ifcvt.h" #include "rtl-iter.h" #include "intl.h" #include "tm-constrs.h" @@ -18037,6 +18038,34 @@ s390_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return vectorize_vec_perm_const_1 (d); } +/* Consider a NOCE conversion as profitable if there is at least one + conditional move. */ + +static bool +s390_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info) +{ + if (if_info->speed_p) +{ + for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn)) + { + rtx set = single_set (insn); + if (set == NULL) + continue; + if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE) + continue; + rtx src = SET_SRC (set); + machine_mode mode = GET_MODE (src); + if (GET_MODE_CLASS (mode) != MODE_INT + && GET_MODE_CLASS (mode) != MODE_FLOAT) + continue; + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + continue; + return true; + } +} + return default_noce_conversion_profitable_p (seq, if_info); +} + /* Initialize GCC target structure. 
*/ #undef TARGET_ASM_ALIGNED_HI_OP @@ -18350,6 +18379,9 @@ s390_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, #undef TARGET_VECTORIZE_VEC_PERM_CONST #define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const +#undef TARGET_NOCE_CONVERSION_PROFITABLE_P +#define TARGET_NOCE_CONVERSION_PROFITABLE_P s390_noce_conversion_profitable_p + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-s390.h" diff --git a/gcc/testsuite/gcc.target/s390/ccor.c b/gcc/testsuite/gcc.target/s390/ccor.c index 31f30f60314..36a3c3a999a 100644 --- a/gcc/testsuite/gcc.target/s390/ccor.c +++ b/gcc/testsuite/gcc.target/s390/ccor.c @@ -42,7 +42,7 @@ GENFUN1(2) GENFUN1(3) -/* { dg-final { scan-assembler {locrno} } } */ +/* { dg-final { scan-assembler {locro} } } */ GENFUN2(0,1) @@ -58,7 +58,7 @@ GENFUN2(0,3) GENFUN2(1,2) -/* { dg-final { scan-assembler {locrnlh} } } */ +/* { dg-final { scan-assembler {locrlh} } } */ GENFUN2(1,3) -- 2.45.0
[PATCH] s390: Implement TARGET_NOCE_CONVERSION_PROFITABLE_P [PR109549]
Consider a NOCE conversion as profitable if there is at least one conditional move. gcc/ChangeLog: * config/s390/s390.cc (TARGET_NOCE_CONVERSION_PROFITABLE_P): Define. (s390_noce_conversion_profitable_p): Implement. gcc/testsuite/ChangeLog: * gcc.target/s390/ccor.c: Order of loads are reversed, now, as a consequence the condition has to be reversed. --- Bootstrapped and regtested on s390. Ok for mainline? gcc/config/s390/s390.cc | 32 gcc/testsuite/gcc.target/s390/ccor.c | 4 ++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index bf46eab2d63..23b18b5c506 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -78,6 +78,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-pass.h" #include "context.h" #include "builtins.h" +#include "ifcvt.h" #include "rtl-iter.h" #include "intl.h" #include "tm-constrs.h" @@ -18037,6 +18038,37 @@ s390_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return vectorize_vec_perm_const_1 (d); } +/* Consider a NOCE conversion as profitable if there is at least one + conditional move. */ + +#undef TARGET_NOCE_CONVERSION_PROFITABLE_P +#define TARGET_NOCE_CONVERSION_PROFITABLE_P s390_noce_conversion_profitable_p + +static bool +s390_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info) +{ + if (if_info->speed_p) +{ + for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn)) + { + rtx set = single_set (insn); + if (set == NULL) + continue; + if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE) + continue; + rtx src = SET_SRC (set); + machine_mode mode = GET_MODE (src); + if (GET_MODE_CLASS (mode) != MODE_INT + && GET_MODE_CLASS (mode) != MODE_FLOAT) + continue; + if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (Pmode)) + continue; + return true; + } +} + return default_noce_conversion_profitable_p (seq, if_info); +} + /* Initialize GCC target structure. 
*/ #undef TARGET_ASM_ALIGNED_HI_OP diff --git a/gcc/testsuite/gcc.target/s390/ccor.c b/gcc/testsuite/gcc.target/s390/ccor.c index 31f30f60314..36a3c3a999a 100644 --- a/gcc/testsuite/gcc.target/s390/ccor.c +++ b/gcc/testsuite/gcc.target/s390/ccor.c @@ -42,7 +42,7 @@ GENFUN1(2) GENFUN1(3) -/* { dg-final { scan-assembler {locrno} } } */ +/* { dg-final { scan-assembler {locro} } } */ GENFUN2(0,1) @@ -58,7 +58,7 @@ GENFUN2(0,3) GENFUN2(1,2) -/* { dg-final { scan-assembler {locrnlh} } } */ +/* { dg-final { scan-assembler {locrlh} } } */ GENFUN2(1,3) -- 2.44.0
[PATCH] tree-ssa-loop-prefetch.cc: Honour -fno-unroll-loops
On s390 the following tests fail: FAIL: gcc.dg/vect/pr109011-1.c -flto -ffat-lto-objects scan-tree-dump-times optimized " = .CLZ (vect" 1 FAIL: gcc.dg/vect/pr109011-1.c -flto -ffat-lto-objects scan-tree-dump-times optimized " = .POPCOUNT (vect" 1 FAIL: gcc.dg/vect/pr109011-1.c scan-tree-dump-times optimized " = .CLZ (vect" 1 FAIL: gcc.dg/vect/pr109011-1.c scan-tree-dump-times optimized " = .POPCOUNT (vect" 1 FAIL: gcc.dg/vect/pr109011-2.c -flto -ffat-lto-objects scan-tree-dump-times optimized " = .CTZ (vect" 2 FAIL: gcc.dg/vect/pr109011-2.c -flto -ffat-lto-objects scan-tree-dump-times optimized " = .POPCOUNT (vect" 1 FAIL: gcc.dg/vect/pr109011-2.c scan-tree-dump-times optimized " = .CTZ (vect" 2 FAIL: gcc.dg/vect/pr109011-2.c scan-tree-dump-times optimized " = .POPCOUNT (vect" 1 FAIL: gcc.dg/vect/pr109011-4.c -flto -ffat-lto-objects scan-tree-dump-times optimized " = .CTZ (vect" 2 FAIL: gcc.dg/vect/pr109011-4.c -flto -ffat-lto-objects scan-tree-dump-times optimized " = .POPCOUNT (vect" 1 FAIL: gcc.dg/vect/pr109011-4.c scan-tree-dump-times optimized " = .CTZ (vect" 2 FAIL: gcc.dg/vect/pr109011-4.c scan-tree-dump-times optimized " = .POPCOUNT (vect" 1 because aprefetch unrolls loops even if -fno-unroll-loops is used. Accordingly, the scan patterns match more than once. This could also be fixed by using -fno-prefetch-loop-arrays for the tests, though I would prefer that aprefetch honour -fno-unroll-loops. Any preferences? Bootstrapped and regtested on x86_64 and s390. Ok for mainline? gcc/ChangeLog: * tree-ssa-loop-prefetch.cc (determine_unroll_factor): Honour -fno-unroll-loops. 
--- gcc/tree-ssa-loop-prefetch.cc | 4 1 file changed, 4 insertions(+) diff --git a/gcc/tree-ssa-loop-prefetch.cc b/gcc/tree-ssa-loop-prefetch.cc index 70073cc4fe4..bb5d5dec779 100644 --- a/gcc/tree-ssa-loop-prefetch.cc +++ b/gcc/tree-ssa-loop-prefetch.cc @@ -1401,6 +1401,10 @@ determine_unroll_factor (class loop *loop, struct mem_ref_group *refs, struct mem_ref_group *agp; struct mem_ref *ref; + /* Bail out early in case we must not unroll loops. */ + if (!flag_unroll_loops) +return 1; + /* First check whether the loop is not too large to unroll. We ignore PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us from unrolling them enough to make exactly one cache line covered by each -- 2.44.0
Re: [PATCH] tree-optimization/110490 - bitcount for narrow modes
Ping. Ok for mainline? On Thu, Apr 25, 2024 at 09:26:45AM +0200, Stefan Schulze Frielinghaus wrote: > Bitcount operations popcount, clz, and ctz are emulated for narrow modes > in case an operation is only supported for wider modes. Beside that ctz > may be emulated via clz in expand_ctz. Reflect this in > expression_expensive_p. > > I considered the emulation of ctz via clz as not expensive since this > basically reduces to ctz (x) = c - (clz (x & -x)) where c is the mode > precision minus 1 which should be faster than a loop. > > Bootstrapped and regtested on x86_64 and s390. Though, this is probably > stage1 material? > > gcc/ChangeLog: > > PR tree-optimization/110490 > * tree-scalar-evolution.cc (expression_expensive_p): Also > consider mode widening for popcount, clz, and ctz. > --- > gcc/tree-scalar-evolution.cc | 23 +++ > 1 file changed, 23 insertions(+) > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc > index b0a5e09a77c..622c7246c1b 100644 > --- a/gcc/tree-scalar-evolution.cc > +++ b/gcc/tree-scalar-evolution.cc > @@ -3458,6 +3458,28 @@ bitcount_call: > && (optab_handler (optab, word_mode) > != CODE_FOR_nothing)) > break; > + /* If popcount is available for a wider mode, we emulate the > + operation for a narrow mode by first zero-extending the value > + and then computing popcount in the wider mode. Analogue for > + ctz. For clz we do the same except that we additionally have > + to subtract the difference of the mode precisions from the > + result. */ > + if (is_a (mode, &int_mode)) > + { > + machine_mode wider_mode_iter; > + FOR_EACH_WIDER_MODE (wider_mode_iter, mode) > + if (optab_handler (optab, wider_mode_iter) > + != CODE_FOR_nothing) > + goto check_call_args; > + /* Operation ctz may be emulated via clz in expand_ctz. 
*/ > + if (optab == ctz_optab) > + { > + FOR_EACH_WIDER_MODE_FROM (wider_mode_iter, mode) > + if (optab_handler (clz_optab, wider_mode_iter) > + != CODE_FOR_nothing) > + goto check_call_args; > + } > + } > return true; > } > break; > @@ -3469,6 +3491,7 @@ bitcount_call: > break; > } > > +check_call_args: >FOR_EACH_CALL_EXPR_ARG (arg, iter, expr) > if (expression_expensive_p (arg, cond_overflow_p, cache, op_cost)) > return true; > -- > 2.44.0 >
[PATCH] s390: testsuite: Fix risbg-ll-2.c
Starting with r14-2047-gd0e891406b16dc we see through subregs which means for f10 in risbg-ll-2.c we do not end up with rosbg_si_noshift but rather rosbg_di_noshift which materializes in slightly different start index. This saves us an extend. gcc/testsuite/ChangeLog: * gcc.target/s390/risbg-ll-2.c: Fix start offset for rosbg of f10. --- Ok for mainline? gcc/testsuite/gcc.target/s390/risbg-ll-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/s390/risbg-ll-2.c b/gcc/testsuite/gcc.target/s390/risbg-ll-2.c index 8bf1a0ff88b..ca80602a83f 100644 --- a/gcc/testsuite/gcc.target/s390/risbg-ll-2.c +++ b/gcc/testsuite/gcc.target/s390/risbg-ll-2.c @@ -113,7 +113,7 @@ i32 f9 (i64 v_x, i32 v_y) // ands with incompatible masks. i32 f10 (i64 v_x, i32 v_y) { - /* { dg-final { scan-assembler "f10:\n\tsrlg\t%r2,%r2,48\n\trosbg\t%r2,%r3,32,39,0" { target { lp64 } } } } */ + /* { dg-final { scan-assembler "f10:\n\tsrlg\t%r2,%r2,48\n\trosbg\t%r2,%r3,0,39,0" { target { lp64 } } } } */ /* { dg-final { scan-assembler "f10:\n\tnilf\t%r4,4278190080\n\trosbg\t%r4,%r2,48,63,48" { target { ! lp64 } } } } */ i64 v_shr6 = ((ui64)v_x) >> 48; i32 v_conv = (ui32)v_shr6; -- 2.44.0
[PATCH] s390: testsuite: Fix zero_bits_compound-1.c
Starting with r12-2731-g96146e61cd7aee we do not generate code like _5 = (unsigned int) c_2(D); i_6 = _5 << 8; _7 = _5 << 20; i_8 = i_6 | _7; anymore but instead _5 = (unsigned int) c_2(D); _3 = _5 * 1048832; which leads finally to slightly different assembly code where we previously ended up for z10 or newer with lr %r1,%r2 sll %r1,8 rosbg %r1,%r2,32,43,20 llgfr %r2,%r1 br %r14 and now lr %r1,%r2 sll %r1,12 ar %r2,%r1 risbg %r2,%r2,35,128+55,8 br %r14 The zero-extend materializes via risbg for which the pattern contains an "and" which is why the test fails. Thus, instead of scanning for RTL expressions rather scan for assembler instructions for s390. --- Ok for mainline? gcc/testsuite/gcc.dg/zero_bits_compound-1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/zero_bits_compound-1.c b/gcc/testsuite/gcc.dg/zero_bits_compound-1.c index e71594911b2..f1e267e0fb0 100644 --- a/gcc/testsuite/gcc.dg/zero_bits_compound-1.c +++ b/gcc/testsuite/gcc.dg/zero_bits_compound-1.c @@ -39,4 +39,5 @@ unsigned long bar (unsigned char c) } /* Check that no pattern containing an AND expression was used. */ -/* { dg-final { scan-assembler-not "\\(and:" } } */ +/* { dg-final { scan-assembler-not "\\(and:" { target { ! { s390*-*-* } } } } } */ +/* { dg-final { scan-assembler-not "\\tng?rk?\\t" { target { s390*-*-* } } } } */ -- 2.44.0
[PATCH] tree-optimization/110490 - bitcount for narrow modes
Bitcount operations popcount, clz, and ctz are emulated for narrow modes in case an operation is only supported for wider modes. Beside that ctz may be emulated via clz in expand_ctz. Reflect this in expression_expensive_p. I considered the emulation of ctz via clz as not expensive since this basically reduces to ctz (x) = c - (clz (x & -x)) where c is the mode precision minus 1 which should be faster than a loop. Bootstrapped and regtested on x86_64 and s390. Though, this is probably stage1 material? gcc/ChangeLog: PR tree-optimization/110490 * tree-scalar-evolution.cc (expression_expensive_p): Also consider mode widening for popcount, clz, and ctz. --- gcc/tree-scalar-evolution.cc | 23 +++ 1 file changed, 23 insertions(+) diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc index b0a5e09a77c..622c7246c1b 100644 --- a/gcc/tree-scalar-evolution.cc +++ b/gcc/tree-scalar-evolution.cc @@ -3458,6 +3458,28 @@ bitcount_call: && (optab_handler (optab, word_mode) != CODE_FOR_nothing)) break; + /* If popcount is available for a wider mode, we emulate the +operation for a narrow mode by first zero-extending the value +and then computing popcount in the wider mode. Analogue for +ctz. For clz we do the same except that we additionally have +to subtract the difference of the mode precisions from the +result. */ + if (is_a (mode, &int_mode)) + { + machine_mode wider_mode_iter; + FOR_EACH_WIDER_MODE (wider_mode_iter, mode) + if (optab_handler (optab, wider_mode_iter) + != CODE_FOR_nothing) + goto check_call_args; + /* Operation ctz may be emulated via clz in expand_ctz. 
*/ + if (optab == ctz_optab) + { + FOR_EACH_WIDER_MODE_FROM (wider_mode_iter, mode) + if (optab_handler (clz_optab, wider_mode_iter) + != CODE_FOR_nothing) + goto check_call_args; + } + } return true; } break; @@ -3469,6 +3491,7 @@ bitcount_call: break; } +check_call_args: FOR_EACH_CALL_EXPR_ARG (arg, iter, expr) if (expression_expensive_p (arg, cond_overflow_p, cache, op_cost)) return true; -- 2.44.0
[PATCH] s390: testsuite: Xfail forwprop-4{0,1}.c
Hi Andreas, Ok then I will proceed with the patch as is. Opened PR114802. Cheers, Stefan -- The tests fail on s390 since can_vec_perm_const_p fails and therefore the bit insert/ref survive which r14-3381-g27de9aa152141e aims for. Strictly speaking, the tests only fail in case the target supports vectors, i.e., for targets prior z13 or in case of -mesa the emulated vector operations are optimized out. Set to xfail and tracked by PR114802. --- gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c | 4 ++-- gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c | 4 ++-- gcc/testsuite/lib/target-supports.exp | 14 ++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c index 7513497f552..0c5233a68f4 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c @@ -10,5 +10,5 @@ vector int g(vector int a) return a; } -/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" } } */ -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" { xfail s390_mvx } } } Xfail: PR114802 */ +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" { xfail s390_mvx } } } Xfail: PR114802 */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c index b1e75797a90..a1f08289dd6 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c @@ -11,6 +11,6 @@ vector int g(vector int a, int c) return a; } -/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" } } */ -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" { xfail s390_mvx } } } Xfail PR114802 */ +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" { xfail s390_mvx } } } Xfail PR114802 */ /* 
{ dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 0 "optimized" } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 3a5713d9869..3a55b2a4159 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -12392,6 +12392,20 @@ proc check_effective_target_profile_update_atomic {} { } "-fprofile-update=atomic -fprofile-generate"] } +# Return 1 if the target has a vector facility. +proc check_effective_target_s390_mvx { } { +if ![istarget s390*-*-*] then { + return 0; +} + +return [check_no_compiler_messages_nocache s390_mvx assembly { + #if !defined __VX__ + #error no vector facility. + #endif + int dummy; +} [current_compiler_flags]] +} + # Return 1 if vector (va - vector add) instructions are understood by # the assembler and can be executed. This also covers checking for # the VX kernel feature. A kernel without that feature does not -- 2.44.0
[PATCH] s390: testsuite: Fix forwprop-4{0,1}.c
The tests fail on s390 since can_vec_perm_const_p fails and therefore the bit insert/ref survive which r14-3381-g27de9aa152141e aims for. Strictly speaking, the tests only fail in case the target supports vectors, i.e., for targets prior z13 or in case of -mesa the emulated vector operations are optimized out. Easiest would be to skip the entire test for s390. Another solution would be to xfail in case of vector support hoping that eventually we end up with an xpass for a future machine generation or if gcc advances. That is implemented by this patch. In order to do so I implemented a new target test s390_mvx which tests whether vector support is available or not. Maybe this is already over-engineered for a simple test? Any thoughts? --- gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c | 4 ++-- gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c | 4 ++-- gcc/testsuite/lib/target-supports.exp | 14 ++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c index 7513497f552..b67e3e93a7f 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c @@ -10,5 +10,5 @@ vector int g(vector int a) return a; } -/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" } } */ -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" { xfail s390_mvx } } } */ +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" { xfail s390_mvx } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c index b1e75797a90..0f119675207 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c @@ -11,6 +11,6 @@ vector int g(vector int a, int c) return a; } -/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" } } */ -/* { dg-final { 
scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" { xfail s390_mvx } } } */ +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" { xfail s390_mvx } } } */ /* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 0 "optimized" } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index edce672c0e2..5a692baa8ef 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -12380,6 +12380,20 @@ proc check_effective_target_profile_update_atomic {} { } "-fprofile-update=atomic -fprofile-generate"] } +# Return 1 if the target has a vector facility. +proc check_effective_target_s390_mvx { } { +if ![istarget s390*-*-*] then { + return 0; +} + +return [check_no_compiler_messages_nocache s390_mvx assembly { + #if !defined __VX__ + #error no vector facility. + #endif + int dummy; +} [current_compiler_flags]] +} + # Return 1 if vector (va - vector add) instructions are understood by # the assembler and can be executed. This also covers checking for # the VX kernel feature. A kernel without that feature does not -- 2.44.0
[PATCH] s390: testsuite: Remove xfail for vpopct{b,h}
Starting with r14-9316-g7890836de20912 patterns for vpopct{b,h} are also detected. Thus, remove xfails. gcc/testsuite/ChangeLog: * gcc.target/s390/vxe/popcount-1.c: Remove xfail. --- Ok for mainline? gcc/testsuite/gcc.target/s390/vxe/popcount-1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c b/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c index 9ea835a1cf0..25ef354f963 100644 --- a/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c +++ b/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c @@ -21,7 +21,7 @@ vpopctb (uv16qi a) return r; } -/* { dg-final { scan-assembler "vpopctb\t%v24,%v24" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler "vpopctb\t%v24,%v24" } } */ uv8hi __attribute__((noinline)) vpopcth (uv8hi a) @@ -34,7 +34,7 @@ vpopcth (uv8hi a) return r; } -/* { dg-final { scan-assembler "vpopcth\t%v24,%v24" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler "vpopcth\t%v24,%v24" } } */ uv4si __attribute__((noinline)) vpopctf (uv4si a) -- 2.44.0
[PATCH] s390: testsuite: Xfail range-sincos.c and vrp-float-abs-1.c
As mentioned in PR114678 those failures will be fixed by https://gcc.gnu.org/pipermail/gcc-patches/2024-March/648303.html For GCC 14 just xfail them which should be reverted once the patch is applied. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/range-sincos.c: Xfail for s390. * gcc.dg/tree-ssa/vrp-float-abs-1.c: Ditto. --- Ok for mainline? gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c| 2 +- gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c b/gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c index 337f9cda02f..35b38c3c914 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/range-sincos.c @@ -40,4 +40,4 @@ stool (double x) link_error (); } -// { dg-final { scan-tree-dump-not "link_error" "evrp" { target { { *-*-linux* } && { glibc } } } } } +// { dg-final { scan-tree-dump-not "link_error" "evrp" { target { { *-*-linux* } && { glibc } } xfail s390*-*-* } } } xfail: PR114678 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c b/gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c index 4b7b75833e0..a814a973963 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp-float-abs-1.c @@ -14,4 +14,4 @@ foo (double x, double y) } } -// { dg-final { scan-tree-dump-not "link_error" "evrp" } } +// { dg-final { scan-tree-dump-not "link_error" "evrp" { xfail s390*-*-* } } } xfail: PR114678 -- 2.43.0
[PATCH] testsuite: Fix loop-interchange-16.c
Yes, that works, too. Will commit. Thanks, Stefan -- Prevent loop unrolling of the innermost loop because otherwise we are left with no loop interchange for targets like s390 which have a more aggressive loop unrolling strategy. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/loop-interchange-16.c: Prevent loop unrolling of the innermost loop. --- gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c index 781555e085d..bbcb14f9c6c 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c @@ -11,6 +11,7 @@ double s231(int iterations) //loop with data dependency for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) { for (int i = 0; i < LEN_2D; ++i) { +#pragma GCC unroll 0 for (int j = 1; j < LEN_2D; j++) { aa[j][i] = aa[j - 1][i] + bb[j][i]; } -- 2.43.0
[PATCH] s390: testsuite: Fix loop-interchange-16.c
Revert parameter max-completely-peel-times to 16, otherwise, the innermost loop is removed and we are left with no loop interchange which this test is all about. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/loop-interchange-16.c: Revert parameter max-completely-peel-times for s390. --- Ok for mainline? gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c index 781555e085d..2530ec84bc0 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-16.c @@ -1,6 +1,7 @@ /* PR/101280 */ /* { dg-do compile } */ /* { dg-options "-O3 -fdump-tree-linterchange-details" } */ +/* { dg-additional-options "--param max-completely-peel-times=16" { target s390*-*-* } } */ void dummy (double *, double *); #define LEN_2D 32 -- 2.43.0
Re: [PATCH] s390x: Optimize vector permute with constant indexes
On Tue, Apr 02, 2024 at 09:56:01AM +0200, Juergen Christ wrote: > Loop vectorizer can generate vector permutes with constant indexes > where all indexes are equal. Optimize this case to use vector > replicate instead of vector permute. > > gcc/ChangeLog: > > * config/s390/s390.cc (expand_perm_as_replicate): Implement. > (vectorize_vec_perm_const_1): Call new function. > * config/s390/vx-builtins.md (vec_splat): Change to... > (@vec_splat): ...this. > > gcc/testsuite/ChangeLog: > > * gcc.target/s390/vector/vec-expand-replicate.c: New test. > > Bootstrapped and regtested on s390x. Ok for trunk? > > Signed-off-by: Juergen Christ > --- > gcc/config/s390/s390.cc | 32 +++ > gcc/config/s390/vx-builtins.md| 2 +- > .../s390/vector/vec-expand-replicate.c| 30 + > 3 files changed, 63 insertions(+), 1 deletion(-) > create mode 100644 > gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > > diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc > index 372a23244032..4b4014ebe444 100644 > --- a/gcc/config/s390/s390.cc > +++ b/gcc/config/s390/s390.cc > @@ -17923,6 +17923,35 @@ expand_perm_as_a_vlbr_vstbr_candidate (const struct > expand_vec_perm_d &d) >return false; > } > > +static bool expand_perm_as_replicate (const struct expand_vec_perm_d &d) ^~~~ Function names start on a new line. > +{ > + unsigned char i; > + unsigned char elem; > + rtx base = d.op0; > + rtx insn; > + /* Needed to silence maybe-uninitialized warning. */ > + gcc_assert(d.nelt > 0); ~~^~~~ Between function name and open bracket whitespace is missing. Curiously enough, the error is about d which is a reference and cannot be null. If you are eager you could reduce this and open a PR. 
s390.cc:17935:8: warning: ‘d’ may be used uninitialized [-Wmaybe-uninitialized] 17935 | elem = d.perm[0]; | ~^~~ > + elem = d.perm[0]; > + for (i = 1; i < d.nelt; ++i) > +if (d.perm[i] != elem) > + return false; > + if (!d.testing_p) > +{ > + if (elem >= d.nelt) > + { > + base = d.op1; > + elem -= d.nelt; > + } > + insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem)); > + if (insn == NULL_RTX) > + return false; > + emit_insn (insn); > + return true; > +} > + else > +return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing; > +} > + > /* Try to find the best sequence for the vector permute operation > described by D. Return true if the operation could be > expanded. */ > @@ -17941,6 +17970,9 @@ vectorize_vec_perm_const_1 (const struct > expand_vec_perm_d &d) >if (expand_perm_as_a_vlbr_vstbr_candidate (d)) > return true; > > + if (expand_perm_as_replicate(d)) ^~~ Between function name and open bracket whitespace is missing. > +return true; > + >return false; > } > > diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md > index 432d81a719fc..93c0d408a43e 100644 > --- a/gcc/config/s390/vx-builtins.md > +++ b/gcc/config/s390/vx-builtins.md > @@ -424,7 +424,7 @@ > > > ; Replicate from vector element > -(define_expand "vec_splat" > +(define_expand "@vec_splat" >[(set (match_operand:V_HW 0 "register_operand" "") > (vec_duplicate:V_HW (vec_select: >(match_operand:V_HW 1 "register_operand" "") > diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > new file mode 100644 > index ..27563a00f22b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > @@ -0,0 +1,30 @@ > +/* Check that the vectorize_vec_perm_const expander correctly deals with > + replication. Extracted from spec "nab". 
*/ > + > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */ > + > + > +#define REAL_T double > +typedef REAL_T MATRIX_T[ 4 ][ 4 ]; > + > +int concat_mat_i, concat_mat_j; > +static void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3); > +MATRIX_T *rot4p() { > + MATRIX_T mat3, mat4; > + static MATRIX_T mat5; > + concat_mat(mat4, mat3, mat5); > +} > +void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3) { > + int k; > + for (;; concat_mat_i++) { > +concat_mat_j = 0; > +for (; 4; concat_mat_j++) { > + k = 0; > + for (; k < 4; k++) > +m3[concat_mat_i][concat_mat_j] += m1[concat_mat_i][k]; > +} Just nitpicking, if we could come up with a test case which does not involve integer overflows due to non-terminating loops, I would prefer that. Cheers, Stefan > + } > +} > + > +/* { dg-final { scan-assembler-not "vperm" } } */ > -- > 2.39.3 >
[PATCH] testsuite: Fix copy-headers-8.c
This fixes the test on s390x. I'm also seeing test failures for riscv64-suse-linux-gnu, m68k-unknown-linux-gnu, pru-unknown-elf, and powerpc64le-unknown-linux-gnu. However, I didn't check them so this might or might not fix those, too. OK for mainline? gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/copy-headers-8.c: Set LOGICAL_OP_NON_SHORT_CIRCUIT to true. --- gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c b/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c index 8b4b5e7ea81..28b4d15d87f 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-8.c @@ -1,5 +1,8 @@ +/* For targets where LOGICAL_OP_NON_SHORT_CIRCUIT evaluates to false, two + conditional jumps are emitted instead of a combined conditional which this + test is all about. Thus, set it to true. */ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-ch2-details" } */ +/* { dg-options "-O2 -fdump-tree-ch2-details --param logical-op-non-short-circuit=1" } */ int is_sorted(int *a, int n, int m, int k) { -- 2.43.0
[PATCH] s390: testsuite: Fix backprop-6.c
gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/backprop-6.c: On s390 we also have a copysign optab for long double. Thus, scan 3 instead of 2 times for it. --- OK for mainline? gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c index 4087ba93018..dbde681e383 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c @@ -27,8 +27,9 @@ TEST_FUNCTION (float, f) TEST_FUNCTION (double, ) TEST_FUNCTION (long double, l) -/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 4 "backprop" { target ifn_copysign } } } */ -/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 2 "backprop" { target ifn_copysign } } } */ -/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 1 "backprop" { target ifn_copysign } } } */ +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 4 "backprop" { target { ifn_copysign && { ! { s390*-*-* } } } } } } */ +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 2 "backprop" { target { ifn_copysign && { ! { s390*-*-* } } } } } } */ +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 1 "backprop" { target { ifn_copysign && { ! { s390*-*-* } } } } } } */ +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 3 "backprop" { target { ifn_copysign && s390*-*-* } } } } */ /* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 6 "backprop" { target { ! ifn_copysign } } } } */ /* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 3 "backprop" { target { ! ifn_copysign } } } } */ -- 2.43.0
[PATCH] s390: testsuite: Fix abs-4.c
gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/abs-4.c: On s390 we also have a copysign optab for long double. Thus, scan 3 instead of 2 times for it. --- Ok for mainline? gcc/testsuite/gcc.dg/tree-ssa/abs-4.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c index 80fa448df12..4144d1cd954 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c @@ -10,8 +10,9 @@ long double abs_ld(long double x) { return __builtin_signbit(x) ? x : -x; } /* __builtin_signbit(x) ? x : -x. Should be convert into - ABS_EXP */ /* { dg-final { scan-tree-dump-not "signbit" "optimized"} } */ -/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "optimized" { target ifn_copysign } } } */ -/* { dg-final { scan-tree-dump-times "= -" 1 "optimized" { target ifn_copysign } } } */ -/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 2 "optimized" { target ifn_copysign } } } */ +/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "optimized" { target { ifn_copysign && { ! { s390*-*-* } } } } } } */ +/* { dg-final { scan-tree-dump-times "= -" 1 "optimized" { target { ifn_copysign && { ! { s390*-*-* } } } } } } */ +/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 2 "optimized" { target { ifn_copysign && { ! { s390*-*-* } } } } } } */ +/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 3 "optimized" { target { ifn_copysign && s390*-*-* } } } } */ /* { dg-final { scan-tree-dump-times "= ABS_EXPR" 3 "optimized" { target { ! ifn_copysign } } } } */ /* { dg-final { scan-tree-dump-times "= -" 3 "optimized" { target { ! ifn_copysign } } } } */ -- 2.43.0
Re: [PATCH] analyzer: Bail out on function pointer for -Wanalyzer-allocation-size
On Tue, Mar 19, 2024 at 12:38:34PM -0400, David Malcolm wrote: > On Tue, 2024-03-19 at 16:10 +0100, Stefan Schulze Frielinghaus wrote: > > On s390 pr94688.c is failing due to excess error > > > > pr94688.c:6:5: warning: allocated buffer size is not a multiple of > > the pointee's size [CWE-131] [-Wanalyzer-allocation-size] > > > > This is because on s390 functions are by default aligned to an 8-byte > > boundary and during function type construction size is set to > > function > > boundary. Thus, for the assignment > > > > a.0_1 = (void (*) ()) &a; > > > > we have that the right-hand side is pointing to a 4-byte memory > > region > > whereas the size of the function pointer is 8 byte and a warning is > > emitted. > > FWIW the test case in question is a regression test for an ICE seen in > the GCC 10 implementation of the analyzer, which was fixed by the big > rewrite in r11-2694-g808f4dfeb3a95f. > > So the code in the test doesn't make a great deal of sense. > > > > > I could follow and skip this test as done in PR112705, or we could > > bail > > out early in the analyzer for function pointers. My intuition so far > > is that -Wanalyzer-allocation-size shouldn't care about function > > pointer. Therefore, I went for bailing out early. If you believe > > this > > is wrong I can still go by skipping this test on s390. Any thoughts? > > I tried imagining a situation where we're analyzing a function > generated at run-time, but it strikes me that the buffer allocated for > such a function can be of arbitrary size. So -Wanalyzer-allocation- > size is meaningless for functions. > > There's probably a case for checking for mismatches between pointers to > code vs pointers to data (e.g. alignments, Harvard architecture > machines, etc), but -Wanalyzer-allocation-size doesn't do that. > > So I think your patch is correct. > > OK to push it if it passes bootstrap&regression testing. Bootstrapped and regtested on x64 and s390x. 
Thanks, Stefan > > Thanks > Dave > > > --- > > gcc/analyzer/region-model.cc | 4 > > 1 file changed, 4 insertions(+) > > > > diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region- > > model.cc > > index f079d1fb37e..1b43443d168 100644 > > --- a/gcc/analyzer/region-model.cc > > +++ b/gcc/analyzer/region-model.cc > > @@ -3514,6 +3514,10 @@ region_model::check_region_size (const region > > *lhs_reg, const svalue *rhs_sval, > > || TYPE_SIZE_UNIT (pointee_type) == NULL_TREE) > > return; > > > > + /* Bail out early on function pointers. */ > > + if (TREE_CODE (pointee_type) == FUNCTION_TYPE) > > + return; > > + > > /* Bail out early on pointers to structs where we can > > not deduce whether the buffer size is compatible. */ > > bool is_struct = RECORD_OR_UNION_TYPE_P (pointee_type); >
[PATCH] analyzer: Bail out on function pointer for -Wanalyzer-allocation-size
On s390 pr94688.c is failing due to excess error pr94688.c:6:5: warning: allocated buffer size is not a multiple of the pointee's size [CWE-131] [-Wanalyzer-allocation-size] This is because on s390 functions are by default aligned to an 8-byte boundary and during function type construction size is set to function boundary. Thus, for the assignment a.0_1 = (void (*) ()) &a; we have that the right-hand side is pointing to a 4-byte memory region whereas the size of the function pointer is 8 byte and a warning is emitted. I could follow and skip this test as done in PR112705, or we could bail out early in the analyzer for function pointers. My intuition so far is that -Wanalyzer-allocation-size shouldn't care about function pointer. Therefore, I went for bailing out early. If you believe this is wrong I can still go by skipping this test on s390. Any thoughts? --- gcc/analyzer/region-model.cc | 4 1 file changed, 4 insertions(+) diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc index f079d1fb37e..1b43443d168 100644 --- a/gcc/analyzer/region-model.cc +++ b/gcc/analyzer/region-model.cc @@ -3514,6 +3514,10 @@ region_model::check_region_size (const region *lhs_reg, const svalue *rhs_sval, || TYPE_SIZE_UNIT (pointee_type) == NULL_TREE) return; + /* Bail out early on function pointers. */ + if (TREE_CODE (pointee_type) == FUNCTION_TYPE) +return; + /* Bail out early on pointers to structs where we can not deduce whether the buffer size is compatible. */ bool is_struct = RECORD_OR_UNION_TYPE_P (pointee_type); -- 2.43.0
Re: RFC: New mechanism for hard reg operands to inline asm
On Fri, Jun 04, 2021 at 06:02:27PM +, Andreas Krebbel via Gcc wrote: > Hi, > > I wonder if we could replace the register asm construct for > inline assemblies with something a bit nicer and more obvious. > E.g. turning this (real world example from IBM Z kernel code): > > int diag8_response(int cmdlen, char *response, int *rlen) > { > register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf; > register unsigned long reg3 asm ("3") = (addr_t) response; > register unsigned long reg4 asm ("4") = cmdlen | 0x4000L; > register unsigned long reg5 asm ("5") = *rlen; /* <-- */ > asm volatile( > " diag%2,%0,0x8\n" > " brc 8,1f\n" > " agr %1,%4\n" > "1:\n" > : "+d" (reg4), "+d" (reg5) > : "d" (reg2), "d" (reg3), "d" (*rlen): "cc"); > *rlen = reg5; > return reg4; > } > > into this: > > int diag8_response(int cmdlen, char *response, int *rlen) > { > unsigned long len = cmdlen | 0x4000L; > > asm volatile( > " diag%2,%0,0x8\n" > " brc 8,1f\n" > " agr %1,%4\n" > "1:\n" > : "+{r4}" (len), "+{r5}" (*rlen) > : "{r2}" ((addr_t)cpcmd_buf), "{r3}" ((addr_t)response), "d" > (*rlen): "cc"); > return len; > } > > Apart from being much easier to read because the hard regs become part > of the inline assembly it solves also a couple of other issues: > > - function calls might clobber register asm variables see BZ100908 > - the constraints for the register asm operands are superfluous > - one register asm variable cannot be used for 2 different inline > assemblies if the value is expected in different hard regs > > I've started with a hackish implementation for IBM Z using the > TARGET_MD_ASM_ADJUST hook and let all the places parsing constraints > skip over the {} parts. But perhaps it would be useful to make this a > generic mechanism for all targets?! > > Andrea Hi all, I would like to resurrect this topic https://gcc.gnu.org/pipermail/gcc/2021-June/236269.html and have been coming up with a first implementation in order to discuss this further. 
Basically, I see two ways to implement this. First is by letting LRA assign the registers and the second one by introducing extra moves just before/after asm statements. Currently I went for the latter and emit extra moves during expand into hard regs as specified by the input/output constraints. Before going forward I would like to get some feedback whether this approach makes sense to you at all or whether you see some show stoppers. I was wondering whether my current approach is robust enough in the sense that no other pass could potentially remove the extra moves I introduced before. In particular I was first worried about code motion. Initially I thought I have to make use not only of hard regs but hard regs which are flagged as register-asms in order to prevent optimizations from fiddling around with those moves. However, after some more investigation I tend to conclude that this is not necessary. Any thoughts about this approach? With the current approach I can at least handle cases like: int __attribute__ ((noipa)) foo (int x) { return x; } int test (int x) { asm ("foo %0,%1\n" :: "{r3}" (foo (x + 1)), "{r2}" (x)); return x; } Note, this is written with the s390 ABI in mind where the first int argument and return value are passed in register r2. The point here is that r2 needs to be altered and restored multiple times until we reach } of function test(). Luckily, during expand we get all this basically for free. This brings me to the general question what should be allowed and what not? Evaluation order of input expressions is probably unspecified similar to function arguments. However, what about this one: int test (int x) { register int y asm ("r5") = x + 1; asm ("foo %0,%1\n" : "={r4}" (y) : "{r1}" (y)); return y; } IMHO the input is just fine but the output constraint is misleading and it is not obvious in which register variable y resides after the asm statement. With my current implementation, where I don't bail out, it is register r4 contrary to the decl. 
Interestingly, the other way around where one register is "aliased" by multiple variables is accepted by vanilla GCC: int foo (int x, int y) { register int a asm ("r1") = x; register int b asm ("r1") = y; return a + b; } Though, probably not intentionally. Cheers, Stefan
Re: [PATCH] s390: Fix test vector/long-double-to-i64.c
On Mon, Mar 11, 2024 at 11:14:04AM +0100, Andreas Krebbel wrote: > On 2/29/24 13:15, Stefan Schulze Frielinghaus wrote: > > Starting with r14-8319-g86de9b66480b71 fwprop improved so that vpdi is > > no longer required. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/s390/vector/long-double-to-i64.c: Fix scan > > assembler directive. > > Should we perhaps rather turn the scan-assembler directives into something > which checks for the > absence of vpdi then? In order to get notified once this really useful > optimization breaks? I thought about checking for the most optimal code which would be just two loads and a convert instruction. Thus if this fails, then we have a regression. Speaking of regressions, the old behaviour was restored by r14-9412-g3e3e4156a5f93e which means we are back using vpdi. Thus, I will leave this patch on hold and have a second look. Cheers, Stefan > > Andreas > > > --- > > .../gcc.target/s390/vector/long-double-to-i64.c | 13 + > > 1 file changed, 9 insertions(+), 4 deletions(-) > > > > diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c > > b/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c > > index 2dbbb5d1c03..ed89878e6ee 100644 > > --- a/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c > > +++ b/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c > > @@ -1,19 +1,24 @@ > > /* { dg-do compile } */ > > /* { dg-options "-O3 -march=z14 -mzarch --save-temps" } */ > > /* { dg-do run { target { s390_z14_hw } } } */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } } } */ > > + > > #include > > #include > > > > +/* > > +** long_double_to_i64: > > +** ld %f0,0\(%r2\) > > +** ld %f2,8\(%r2\) > > +** cgxbr %r2,5,%f0 > > +** br %r14 > > +*/ > > __attribute__ ((noipa)) static int64_t > > long_double_to_i64 (long double x) > > { > >return x; > > } > > > > -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,1\n} 1 } > > } */ > > -/* { dg-final { scan-assembler-times 
{\n\tvpdi\t%v\d+,%v\d+,%v\d+,5\n} 1 } > > } */ > > -/* { dg-final { scan-assembler-times {\n\tcgxbr\t} 1 } } */ > > - > > int > > main (void) > > { >
Re: [PATCH v3] RISC-V: Introduce gcc attribute riscv_rvv_vector_bits for RVV
On Tue, Mar 12, 2024, at 2:15 AM, pan2...@intel.com wrote: > From: Pan Li > > Update in v3: > * Add pre-defined __riscv_v_fixed_vlen when zvl. > > Update in v2: > * Cleanup some unused code. > * Fix some typo of commit log. > > Original log: > > This patch would like to introduce one new gcc attribute for RVV. > This attribute is used to define fixed-length variants of one > existing sizeless RVV types. > > This attribute is valid if and only if the mrvv-vector-bits=zvl, the only > one args should be the integer constant and its' value is terminated > by the LMUL and the vector register bits in zvl*b. For example: > > typedef vint32m2_t fixed_vint32m2_t > __attribute__((riscv_rvv_vector_bits(128))); > > The above type define is valid when -march=rv64gc_zve64d_zvl64b > (aka 2(m2) * 64 = 128 for vin32m2_t), and will report error when > -march=rv64gcv_zvl128b similar to below. > > "error: invalid RVV vector size '128', expected size is '256' based on > LMUL of type and '-mrvv-vector-bits=zvl'" > > Meanwhile, a pre-define macro __riscv_v_fixed_vlen is introduced to > represent the fixed vlen in a RVV vector register. Shouldn't a major user-facing change like this be discussed in a PR against https://github.com/riscv-non-isa/riscv-c-api-doc/ or https://github.com/riscv-non-isa/rvv-intrinsic-doc before or concurrent with compiler implementation? -s > For the vint*m*_t below operations are allowed. > * The sizeof. > * The global variable(s). > * The element of union and struct. > * The cast to other equalities. > * CMP: >, <, ==, !=, <=, >= > * ALU: +, -, *, /, %, &, |, ^, >>, <<, ~, - > > For the vfloat*m*_t below operations are allowed. > * The sizeof. > * The global variable(s). > * The element of union and struct. > * The cast to other equalities. > * CMP: >, <, ==, !=, <=, >= > * ALU: +, -, *, /, - > > For the vbool*_t types only below operations are allowed except > the CMP and ALU. The CMP and ALU operations on vbool*_t is not > well defined currently. 
> * The sizeof. > * The global variable(s). > * The element of union and struct. > * The cast to other equalities. > > For the vint*x*m*_t tuple types are not suppored in this patch > which is compatible with clang. > > This patch passed the below testsuites. > * The riscv fully regression tests. > > gcc/ChangeLog: > > * config/riscv/riscv-c.cc (riscv_cpu_cpp_builtins): Add pre-define > macro __riscv_v_fixed_vlen when zvl. > * config/riscv/riscv.cc (riscv_handle_rvv_vector_bits_attribute): > New static func to take care of the RVV types decorated by > the attributes. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-1.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-10.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-11.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-12.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-13.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-14.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-15.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-16.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-17.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-2.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-3.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-4.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-5.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-6.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-7.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-8.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits-9.c: New test. > * gcc.target/riscv/rvv/base/riscv_rvv_vector_bits.h: New test. 
> > Signed-off-by: Pan Li > --- > gcc/config/riscv/riscv-c.cc | 3 + > gcc/config/riscv/riscv.cc | 87 +- > .../riscv/rvv/base/riscv_rvv_vector_bits-1.c | 6 + > .../riscv/rvv/base/riscv_rvv_vector_bits-10.c | 53 + > .../riscv/rvv/base/riscv_rvv_vector_bits-11.c | 76 > .../riscv/rvv/base/riscv_rvv_vector_bits-12.c | 14 +++ > .../riscv/rvv/base/riscv_rvv_vector_bits-13.c | 10 ++ > .../riscv/rvv/base/riscv_rvv_vector_bits-14.c | 10 ++ > .../riscv/rvv/base/riscv_rvv_vector_bits-15.c | 10 ++ > .../riscv/rvv/base/riscv_rvv_vector_bits-16.c | 11 ++ > .../riscv/rvv/base/riscv_rvv_vector_bits-17.c | 10 ++ > .../riscv/rvv/base/riscv_rvv_vector_bits-2.c | 6 + > .../riscv/rvv/base/riscv_rvv_vector_bits-3.c | 6 + > .../riscv/rvv/base/riscv_rvv_vector_bits-4.c | 6 + > .../riscv/rvv/base/riscv_rvv_vector_bits-5.c | 6 + > .../riscv/rvv/base/riscv_rvv_vector_bits-6.c | 6 + > .../riscv/rvv/base/riscv_rvv_vector_bits-7.c | 76
Re: [PATCH] s390: Streamline NNPA builtins with POP mnemonics
Since there is no straightforward way to introduce an overload with different return types where we would expand differently depending on an immediate operand, let's drop this patch. On Fri, Mar 01, 2024 at 04:18:31PM +0100, Stefan Schulze Frielinghaus wrote: > At the moment there are no extended mnemonics for vclfn(h,l) and vcrnf > defined in the Principles of Operation. Thus, remove the suffix "s" > from the builtins and expanders and introduce a further operand for the > data type. > > gcc/ChangeLog: > > * config/s390/s390-builtin-types.def: Update to reflect latest > changes. > * config/s390/s390-builtins.def: Remove suffix s from > s390_vclfn(h,l)s and s390_vcrnfs. > * config/s390/s390.md: Similar, remove suffix s from unspec > definitions. > * config/s390/vecintrin.h (vec_extend_to_fp32_hi): Redefine. > (vec_extend_to_fp32_lo): Redefine. > (vec_round_from_fp32): Redefine. > * config/s390/vx-builtins.md (vclfnhs_v8hi): Remove suffix s. > (vclfnh_v8hi): Add with extra operand. > (vclfnls_v8hi): Remove suffix s. > (vclfnl_v8hi): Add with extra operand. > (vcrnfs_v8hi): Remove suffix s. > (vcrnf_v8hi): Add with extra operand. > --- > OK for mainline? 
> > gcc/config/s390/s390-builtin-types.def | 4 ++-- > gcc/config/s390/s390-builtins.def | 6 +++--- > gcc/config/s390/s390.md| 6 +++--- > gcc/config/s390/vecintrin.h| 6 +++--- > gcc/config/s390/vx-builtins.md | 27 ++ > 5 files changed, 26 insertions(+), 23 deletions(-) > > diff --git a/gcc/config/s390/s390-builtin-types.def > b/gcc/config/s390/s390-builtin-types.def > index ce51ae8cd3f..c3d09b42835 100644 > --- a/gcc/config/s390/s390-builtin-types.def > +++ b/gcc/config/s390/s390-builtin-types.def > @@ -273,7 +273,6 @@ DEF_FN_TYPE_2 (BT_FN_V2DI_V2DF_V2DF, BT_V2DI, BT_V2DF, > BT_V2DF) > DEF_FN_TYPE_2 (BT_FN_V2DI_V2DI_V2DI, BT_V2DI, BT_V2DI, BT_V2DI) > DEF_FN_TYPE_2 (BT_FN_V2DI_V4SI_V4SI, BT_V2DI, BT_V4SI, BT_V4SI) > DEF_FN_TYPE_2 (BT_FN_V4SF_FLT_INT, BT_V4SF, BT_FLT, BT_INT) > -DEF_FN_TYPE_2 (BT_FN_V4SF_UV8HI_UINT, BT_V4SF, BT_UV8HI, BT_UINT) > DEF_FN_TYPE_2 (BT_FN_V4SF_V4SF_UCHAR, BT_V4SF, BT_V4SF, BT_UCHAR) > DEF_FN_TYPE_2 (BT_FN_V4SF_V4SF_V4SF, BT_V4SF, BT_V4SF, BT_V4SF) > DEF_FN_TYPE_2 (BT_FN_V4SI_BV4SI_V4SI, BT_V4SI, BT_BV4SI, BT_V4SI) > @@ -324,7 +323,6 @@ DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_USHORT_INT, BT_UV8HI, > BT_UV8HI, BT_USHORT, BT_I > DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_INT, BT_UV8HI, BT_UV8HI, BT_UV8HI, > BT_INT) > DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_INTPTR, BT_UV8HI, BT_UV8HI, BT_UV8HI, > BT_INTPTR) > DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI, > BT_UV8HI) > -DEF_FN_TYPE_3 (BT_FN_UV8HI_V4SF_V4SF_UINT, BT_UV8HI, BT_V4SF, BT_V4SF, > BT_UINT) > DEF_FN_TYPE_3 (BT_FN_V16QI_UV16QI_UV16QI_INTPTR, BT_V16QI, BT_UV16QI, > BT_UV16QI, BT_INTPTR) > DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_INTPTR, BT_V16QI, BT_V16QI, BT_V16QI, > BT_INTPTR) > DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_V16QI, BT_V16QI, BT_V16QI, BT_V16QI, > BT_V16QI) > @@ -340,6 +338,7 @@ DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_INT_INTPTR, BT_V2DI, > BT_V2DF, BT_INT, BT_INTPTR) > DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_V2DF_INTPTR, BT_V2DI, BT_V2DF, BT_V2DF, > BT_INTPTR) > DEF_FN_TYPE_3 
(BT_FN_V2DI_V2DI_V2DI_INTPTR, BT_V2DI, BT_V2DI, BT_V2DI, > BT_INTPTR) > DEF_FN_TYPE_3 (BT_FN_V2DI_V4SI_V4SI_V2DI, BT_V2DI, BT_V4SI, BT_V4SI, BT_V2DI) > +DEF_FN_TYPE_3 (BT_FN_V4SF_UV8HI_UINT_UINT, BT_V4SF, BT_UV8HI, BT_UINT, > BT_UINT) > DEF_FN_TYPE_3 (BT_FN_V4SF_V2DF_INT_INT, BT_V4SF, BT_V2DF, BT_INT, BT_INT) > DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_FLT_INT, BT_V4SF, BT_V4SF, BT_FLT, BT_INT) > DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_UCHAR_UCHAR, BT_V4SF, BT_V4SF, BT_UCHAR, > BT_UCHAR) > @@ -377,6 +376,7 @@ DEF_FN_TYPE_4 > (BT_FN_UV4SI_UV4SI_UV4SI_UINTCONSTPTR_UCHAR, BT_UV4SI, BT_UV4SI, B > DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI_INT, BT_UV4SI, BT_UV4SI, > BT_UV4SI, BT_UV4SI, BT_INT) > DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_INT_INTPTR, BT_UV8HI, BT_UV8HI, > BT_UV8HI, BT_INT, BT_INTPTR) > DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI_INT, BT_UV8HI, BT_UV8HI, > BT_UV8HI, BT_UV8HI, BT_INT) > +DEF_FN_TYPE_4 (BT_FN_UV8HI_V4SF_V4SF_UINT_UINT, BT_UV8HI, BT_V4SF, BT_V4SF, > BT_UINT, BT_UINT) > DEF_FN_TYPE_4 (BT_FN_VOID_UV2DI_UV2DI_ULONGLONGPTR_ULONGLONG, BT_VOID, > BT_UV2DI, BT_UV2DI, BT_ULONGLONGPTR, BT_ULONGLONG) > DEF_FN_TYPE_4 (BT_FN_VOID_UV4SI_UV4SI_UINTPTR_ULONGLONG, BT_VOID, BT_UV4SI, > BT_UV4SI, BT_UINTPTR, BT_ULONGLONG) > DEF_FN_TYPE_4 (BT_FN_VOID_V4SI_V4SI_INTPTR_ULONGLONG, BT_VOID, BT_V4SI, > BT_V4SI, BT_I
[PATCH] s390: Deprecate some vector builtins
According to IBM Open XL C/C++ for z/OS version 1.1 builtins - vec_permi - vec_ctd - vec_ctsl - vec_ctul - vec_ld2f - vec_st2f are deprecated. Also deprecate helper builtins vec_ctd_s64 and vec_ctd_u64. Furthermore, the overloads of vec_insert which make use of a bool vector are deprecated, too. gcc/ChangeLog: * config/s390/s390-builtins.def (vec_permi): Deprecate. (vec_ctd): Deprecate. (vec_ctd_s64): Deprecate. (vec_ctd_u64): Deprecate. (vec_ctsl): Deprecate. (vec_ctul): Deprecate. (vec_ld2f): Deprecate. (vec_st2f): Deprecate. (vec_insert): Deprecate overloads with bool vectors. --- Ok for mainline? gcc/config/s390/s390-builtins.def | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/gcc/config/s390/s390-builtins.def b/gcc/config/s390/s390-builtins.def index 680a038fa4b..54f400ceb5a 100644 --- a/gcc/config/s390/s390-builtins.def +++ b/gcc/config/s390/s390-builtins.def @@ -416,16 +416,16 @@ B_DEF (s390_vec_splat_s64, vec_splatsv2di, 0, OB_DEF (s390_vec_insert,s390_vec_insert_s8, s390_vec_insert_dbl,B_VX, BT_FN_OV4SI_INT_OV4SI_INT) OB_DEF_VAR (s390_vec_insert_s8, s390_vlvgb, 0, O3_ELEM,BT_OV_V16QI_SCHAR_V16QI_INT) OB_DEF_VAR (s390_vec_insert_u8, s390_vlvgb, 0, O3_ELEM,BT_OV_UV16QI_UCHAR_UV16QI_INT) -OB_DEF_VAR (s390_vec_insert_b8, s390_vlvgb, 0, O3_ELEM,BT_OV_UV16QI_UCHAR_BV16QI_INT) +OB_DEF_VAR (s390_vec_insert_b8, s390_vlvgb, B_DEP, O3_ELEM,BT_OV_UV16QI_UCHAR_BV16QI_INT) OB_DEF_VAR (s390_vec_insert_s16,s390_vlvgh, 0, O3_ELEM,BT_OV_V8HI_SHORT_V8HI_INT) OB_DEF_VAR (s390_vec_insert_u16,s390_vlvgh, 0, O3_ELEM,BT_OV_UV8HI_USHORT_UV8HI_INT) -OB_DEF_VAR (s390_vec_insert_b16,s390_vlvgh, 0, O3_ELEM,BT_OV_UV8HI_USHORT_BV8HI_INT) +OB_DEF_VAR (s390_vec_insert_b16,s390_vlvgh, B_DEP, O3_ELEM,BT_OV_UV8HI_USHORT_BV8HI_INT) OB_DEF_VAR (s390_vec_insert_s32,s390_vlvgf, 0, O3_ELEM,BT_OV_V4SI_INT_V4SI_INT) OB_DEF_VAR (s390_vec_insert_u32,s390_vlvgf, 0, O3_ELEM,BT_OV_UV4SI_UINT_UV4SI_INT) -OB_DEF_VAR (s390_vec_insert_b32,s390_vlvgf, 0, 
O3_ELEM,BT_OV_UV4SI_UINT_BV4SI_INT) +OB_DEF_VAR (s390_vec_insert_b32,s390_vlvgf, B_DEP, O3_ELEM,BT_OV_UV4SI_UINT_BV4SI_INT) OB_DEF_VAR (s390_vec_insert_s64,s390_vlvgg, 0, O3_ELEM,BT_OV_V2DI_LONGLONG_V2DI_INT) OB_DEF_VAR (s390_vec_insert_u64,s390_vlvgg, 0, O3_ELEM,BT_OV_UV2DI_ULONGLONG_UV2DI_INT) -OB_DEF_VAR (s390_vec_insert_b64,s390_vlvgg, 0, O3_ELEM,BT_OV_UV2DI_ULONGLONG_BV2DI_INT) +OB_DEF_VAR (s390_vec_insert_b64,s390_vlvgg, B_DEP, O3_ELEM,BT_OV_UV2DI_ULONGLONG_BV2DI_INT) OB_DEF_VAR (s390_vec_insert_flt,s390_vlvgf_flt, B_VXE, O3_ELEM,BT_OV_V4SF_FLT_V4SF_INT) /* vlvgf */ OB_DEF_VAR (s390_vec_insert_dbl,s390_vlvgg_dbl, 0, O3_ELEM,BT_OV_V2DF_DBL_V2DF_INT) /* vlvgg */ @@ -658,7 +658,7 @@ OB_DEF_VAR (s390_vec_perm_dbl, s390_vperm, 0, B_DEF (s390_vperm, vec_permv16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI) -OB_DEF (s390_vec_permi, s390_vec_permi_s64, s390_vec_permi_dbl, B_VX, BT_FN_OV4SI_OV4SI_OV4SI_INT) +OB_DEF (s390_vec_permi, s390_vec_permi_s64, s390_vec_permi_dbl, B_DEP | B_VX, BT_FN_OV4SI_OV4SI_OV4SI_INT) OB_DEF_VAR (s390_vec_permi_s64, s390_vpdi, 0, O3_U2, BT_OV_V2DI_V2DI_V2DI_INT) OB_DEF_VAR (s390_vec_permi_b64, s390_vpdi, 0, O3_U2, BT_OV_BV2DI_BV2DI_BV2DI_INT) OB_DEF_VAR (s390_vec_permi_u64, s390_vpdi, 0, O3_U2, BT_OV_UV2DI_UV2DI_UV2DI_INT) @@ -2806,7 +2806,7 @@ OB_DEF (s390_vec_any_ngt, s390_vec_any_ngt_flt,s390_vec_any_ngt_db OB_DEF_VAR (s390_vec_any_ngt_flt, vec_any_unlev4sf, B_VXE, 0, BT_OV_INT_V4SF_V4SF) OB_DEF_VAR (s390_vec_any_ngt_dbl, vec_any_unlev2df, 0, 0, BT_OV_INT_V2DF_V2DF) -OB_DEF (s390_vec_ctd, s390_vec_ctd_s64, s390_vec_ctd_u64, B_VX, BT_FN
[PATCH] s390: Streamline NNPA builtins with POP mnemonics
At the moment there are no extended mnemonics for vclfn(h,l) and vcrnf defined in the Principles of Operation. Thus, remove the suffix "s" from the builtins and expanders and introduce a further operand for the data type. gcc/ChangeLog: * config/s390/s390-builtin-types.def: Update to reflect latest changes. * config/s390/s390-builtins.def: Remove suffix s from s390_vclfn(h,l)s and s390_vcrnfs. * config/s390/s390.md: Similar, remove suffix s from unspec definitions. * config/s390/vecintrin.h (vec_extend_to_fp32_hi): Redefine. (vec_extend_to_fp32_lo): Redefine. (vec_round_from_fp32): Redefine. * config/s390/vx-builtins.md (vclfnhs_v8hi): Remove suffix s. (vclfnh_v8hi): Add with extra operand. (vclfnls_v8hi): Remove suffix s. (vclfnl_v8hi): Add with extra operand. (vcrnfs_v8hi): Remove suffix s. (vcrnf_v8hi): Add with extra operand. --- OK for mainline? gcc/config/s390/s390-builtin-types.def | 4 ++-- gcc/config/s390/s390-builtins.def | 6 +++--- gcc/config/s390/s390.md| 6 +++--- gcc/config/s390/vecintrin.h| 6 +++--- gcc/config/s390/vx-builtins.md | 27 ++ 5 files changed, 26 insertions(+), 23 deletions(-) diff --git a/gcc/config/s390/s390-builtin-types.def b/gcc/config/s390/s390-builtin-types.def index ce51ae8cd3f..c3d09b42835 100644 --- a/gcc/config/s390/s390-builtin-types.def +++ b/gcc/config/s390/s390-builtin-types.def @@ -273,7 +273,6 @@ DEF_FN_TYPE_2 (BT_FN_V2DI_V2DF_V2DF, BT_V2DI, BT_V2DF, BT_V2DF) DEF_FN_TYPE_2 (BT_FN_V2DI_V2DI_V2DI, BT_V2DI, BT_V2DI, BT_V2DI) DEF_FN_TYPE_2 (BT_FN_V2DI_V4SI_V4SI, BT_V2DI, BT_V4SI, BT_V4SI) DEF_FN_TYPE_2 (BT_FN_V4SF_FLT_INT, BT_V4SF, BT_FLT, BT_INT) -DEF_FN_TYPE_2 (BT_FN_V4SF_UV8HI_UINT, BT_V4SF, BT_UV8HI, BT_UINT) DEF_FN_TYPE_2 (BT_FN_V4SF_V4SF_UCHAR, BT_V4SF, BT_V4SF, BT_UCHAR) DEF_FN_TYPE_2 (BT_FN_V4SF_V4SF_V4SF, BT_V4SF, BT_V4SF, BT_V4SF) DEF_FN_TYPE_2 (BT_FN_V4SI_BV4SI_V4SI, BT_V4SI, BT_BV4SI, BT_V4SI) @@ -324,7 +323,6 @@ DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_USHORT_INT, BT_UV8HI, BT_UV8HI, BT_USHORT, BT_I DEF_FN_TYPE_3 
(BT_FN_UV8HI_UV8HI_UV8HI_INT, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT) DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_INTPTR, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI) -DEF_FN_TYPE_3 (BT_FN_UV8HI_V4SF_V4SF_UINT, BT_UV8HI, BT_V4SF, BT_V4SF, BT_UINT) DEF_FN_TYPE_3 (BT_FN_V16QI_UV16QI_UV16QI_INTPTR, BT_V16QI, BT_UV16QI, BT_UV16QI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_INTPTR, BT_V16QI, BT_V16QI, BT_V16QI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_V16QI, BT_V16QI, BT_V16QI, BT_V16QI, BT_V16QI) @@ -340,6 +338,7 @@ DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_INT_INTPTR, BT_V2DI, BT_V2DF, BT_INT, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_V2DF_INTPTR, BT_V2DI, BT_V2DF, BT_V2DF, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_V2DI_V2DI_V2DI_INTPTR, BT_V2DI, BT_V2DI, BT_V2DI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_V2DI_V4SI_V4SI_V2DI, BT_V2DI, BT_V4SI, BT_V4SI, BT_V2DI) +DEF_FN_TYPE_3 (BT_FN_V4SF_UV8HI_UINT_UINT, BT_V4SF, BT_UV8HI, BT_UINT, BT_UINT) DEF_FN_TYPE_3 (BT_FN_V4SF_V2DF_INT_INT, BT_V4SF, BT_V2DF, BT_INT, BT_INT) DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_FLT_INT, BT_V4SF, BT_V4SF, BT_FLT, BT_INT) DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_UCHAR_UCHAR, BT_V4SF, BT_V4SF, BT_UCHAR, BT_UCHAR) @@ -377,6 +376,7 @@ DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_UINTCONSTPTR_UCHAR, BT_UV4SI, BT_UV4SI, B DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI_INT, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INT) DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_INT_INTPTR, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT, BT_INTPTR) DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI_INT, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT) +DEF_FN_TYPE_4 (BT_FN_UV8HI_V4SF_V4SF_UINT_UINT, BT_UV8HI, BT_V4SF, BT_V4SF, BT_UINT, BT_UINT) DEF_FN_TYPE_4 (BT_FN_VOID_UV2DI_UV2DI_ULONGLONGPTR_ULONGLONG, BT_VOID, BT_UV2DI, BT_UV2DI, BT_ULONGLONGPTR, BT_ULONGLONG) DEF_FN_TYPE_4 (BT_FN_VOID_UV4SI_UV4SI_UINTPTR_ULONGLONG, BT_VOID, BT_UV4SI, BT_UV4SI, BT_UINTPTR, BT_ULONGLONG) DEF_FN_TYPE_4 
(BT_FN_VOID_V4SI_V4SI_INTPTR_ULONGLONG, BT_VOID, BT_V4SI, BT_V4SI, BT_INTPTR, BT_ULONGLONG) diff --git a/gcc/config/s390/s390-builtins.def b/gcc/config/s390/s390-builtins.def index 02ff516c677..0d4e20ea425 100644 --- a/gcc/config/s390/s390-builtins.def +++ b/gcc/config/s390/s390-builtins.def @@ -3025,10 +3025,10 @@ B_DEF (s390_vstrszf,vstrszv4si, 0, /* arch 14 builtins */ -B_DEF (s390_vclfnhs,vclfnhs_v8hi, 0, B_NNPA, O2_U4, BT_FN_V4SF_UV8HI_UINT) -B_DEF (s390_vclfnls,vclfnls_v8hi, 0, B_NNPA, O2_U4, BT_FN_V4SF_UV8HI_UINT) +B_DEF (s390_vclfnh, vclfnh_v8hi, 0, B
[PATCH] s390: Streamline vector builtins with LLVM
Similar as to s390_lcbb, s390_vll, s390_vstl, et al. make use of a signed vector type for vlbb. Furthermore, a const void pointer seems more common and an integer for the mask. For s390_vfi(s,d)b make use of integers for masks, too. Use unsigned integers for all s390_vlbr/vstbr variants. Make use of type UV16QI for the length operand of s390_vstrs(,z)(h,f). Following the Principles of Operation, change from signed to unsigned type for s390_va(c,cc,ccc)q and s390_vs(,c,bc)biq and s390_vmslg. Make use of scalar type UINT128 instead of UV16QI for s390_vgfm(,a)g, and s390_vsumq(f,g). Ok for mainline? gcc/ChangeLog: * config/s390/s390-builtin-types.def: Update to reflect latest changes. * config/s390/s390-builtins.def: Streamline vector builtins with LLVM. --- gcc/config/s390/s390-builtin-types.def | 29 +++- gcc/config/s390/s390-builtins.def | 48 +- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/gcc/config/s390/s390-builtin-types.def b/gcc/config/s390/s390-builtin-types.def index 556104e0e23..ce51ae8cd3f 100644 --- a/gcc/config/s390/s390-builtin-types.def +++ b/gcc/config/s390/s390-builtin-types.def @@ -58,6 +58,7 @@ DEF_TYPE (BT_FLT, float_type_node, 0) DEF_TYPE (BT_FLTCONST, float_type_node, 1) DEF_TYPE (BT_INT, integer_type_node, 0) DEF_TYPE (BT_INT128, intTI_type_node, 0) +DEF_TYPE (BT_INT128CONST, intTI_type_node, 1) DEF_TYPE (BT_INTCONST, integer_type_node, 1) DEF_TYPE (BT_LONG, long_integer_type_node, 0) DEF_TYPE (BT_LONGLONG, long_long_integer_type_node, 0) @@ -69,6 +70,8 @@ DEF_TYPE (BT_SHORTCONST, short_integer_type_node, 1) DEF_TYPE (BT_UCHAR, unsigned_char_type_node, 0) DEF_TYPE (BT_UCHARCONST, unsigned_char_type_node, 1) DEF_TYPE (BT_UINT, unsigned_type_node, 0) +DEF_TYPE (BT_UINT128, unsigned_intTI_type_node, 0) +DEF_TYPE (BT_UINT128CONST, unsigned_intTI_type_node, 1) DEF_TYPE (BT_UINT64, c_uint64_type_node, 0) DEF_TYPE (BT_UINTCONST, unsigned_type_node, 1) DEF_TYPE (BT_ULONG, long_unsigned_type_node, 0) @@ -79,6 +82,7 @@ DEF_TYPE 
(BT_USHORTCONST, short_unsigned_type_node, 1) DEF_TYPE (BT_VOID, void_type_node, 0) DEF_TYPE (BT_VOIDCONST, void_type_node, 1) DEF_VECTOR_TYPE (BT_UV16QI, BT_UCHAR, 16) +DEF_VECTOR_TYPE (BT_UV1TI, BT_UINT128, 1) DEF_VECTOR_TYPE (BT_UV2DI, BT_ULONGLONG, 2) DEF_VECTOR_TYPE (BT_UV4SI, BT_UINT, 4) DEF_VECTOR_TYPE (BT_UV8HI, BT_USHORT, 8) @@ -93,6 +97,8 @@ DEF_POINTER_TYPE (BT_DBLCONSTPTR, BT_DBLCONST) DEF_POINTER_TYPE (BT_DBLPTR, BT_DBL) DEF_POINTER_TYPE (BT_FLTCONSTPTR, BT_FLTCONST) DEF_POINTER_TYPE (BT_FLTPTR, BT_FLT) +DEF_POINTER_TYPE (BT_INT128CONSTPTR, BT_INT128CONST) +DEF_POINTER_TYPE (BT_INT128PTR, BT_INT128) DEF_POINTER_TYPE (BT_INTCONSTPTR, BT_INTCONST) DEF_POINTER_TYPE (BT_INTPTR, BT_INT) DEF_POINTER_TYPE (BT_LONGLONGCONSTPTR, BT_LONGLONGCONST) @@ -103,6 +109,8 @@ DEF_POINTER_TYPE (BT_SHORTCONSTPTR, BT_SHORTCONST) DEF_POINTER_TYPE (BT_SHORTPTR, BT_SHORT) DEF_POINTER_TYPE (BT_UCHARCONSTPTR, BT_UCHARCONST) DEF_POINTER_TYPE (BT_UCHARPTR, BT_UCHAR) +DEF_POINTER_TYPE (BT_UINT128CONSTPTR, BT_UINT128CONST) +DEF_POINTER_TYPE (BT_UINT128PTR, BT_UINT128) DEF_POINTER_TYPE (BT_UINT64PTR, BT_UINT64) DEF_POINTER_TYPE (BT_UINTCONSTPTR, BT_UINTCONST) DEF_POINTER_TYPE (BT_UINTPTR, BT_UINT) @@ -114,9 +122,11 @@ DEF_POINTER_TYPE (BT_VOIDCONSTPTR, BT_VOIDCONST) DEF_POINTER_TYPE (BT_VOIDPTR, BT_VOID) DEF_DISTINCT_TYPE (BT_BCHAR, BT_UCHAR) DEF_DISTINCT_TYPE (BT_BINT, BT_UINT) +DEF_DISTINCT_TYPE (BT_BINT128, BT_UINT128) DEF_DISTINCT_TYPE (BT_BLONGLONG, BT_ULONGLONG) DEF_DISTINCT_TYPE (BT_BSHORT, BT_USHORT) DEF_OPAQUE_VECTOR_TYPE (BT_BV16QI, BT_BCHAR, 16) +DEF_OPAQUE_VECTOR_TYPE (BT_BV1TI, BT_BINT128, 1) DEF_OPAQUE_VECTOR_TYPE (BT_BV2DI, BT_BLONGLONG, 2) DEF_OPAQUE_VECTOR_TYPE (BT_BV4SI, BT_BINT, 4) DEF_OPAQUE_VECTOR_TYPE (BT_BV8HI, BT_BSHORT, 8) @@ -131,6 +141,7 @@ DEF_FN_TYPE_1 (BT_FN_INT_VOIDPTR, BT_INT, BT_VOIDPTR) DEF_FN_TYPE_1 (BT_FN_OV4SI_INT, BT_OV4SI, BT_INT) DEF_FN_TYPE_1 (BT_FN_OV4SI_INTCONSTPTR, BT_OV4SI, BT_INTCONSTPTR) DEF_FN_TYPE_1 (BT_FN_OV4SI_OV4SI, BT_OV4SI, 
BT_OV4SI) +DEF_FN_TYPE_1 (BT_FN_UINT128_UINT128, BT_UINT128, BT_UINT128) DEF_FN_TYPE_1 (BT_FN_UV16QI_UCHAR, BT_UV16QI, BT_UCHAR) DEF_FN_TYPE_1 (BT_FN_UV16QI_UCHARCONSTPTR, BT_UV16QI, BT_UCHARCONSTPTR) DEF_FN_TYPE_1 (BT_FN_UV16QI_USHORT, BT_UV16QI, BT_USHORT) @@ -154,7 +165,6 @@ DEF_FN_TYPE_1 (BT_FN_UV8HI_UV8HI, BT_UV8HI, BT_UV8HI) DEF_FN_TYPE_1 (BT_FN_V16QI_SCHAR, BT_V16QI, BT_SCHAR) DEF_FN_TYPE_1 (BT_FN_V16QI_UCHAR, BT_V16QI, BT_UCHAR) DEF_FN_TYPE_1 (BT_FN_V16QI_V16QI, BT_V16QI, BT_V16QI) -DEF_FN_TYPE_1 (BT_FN_V1TI_V1TI, BT_V1TI, BT_V1TI) DEF_FN_TYPE_1 (BT_FN_V2DF_DBL, BT_V2DF, BT_DBL) DEF_FN_TYPE_1 (BT_FN_V2DF_DBLCONSTPTR, BT_V2DF, BT_DBLCONSTPTR) DEF_FN_TYPE_1 (BT_FN_V2DF_FLTCONSTPTR, BT_V2DF, BT_FLTCONSTPTR) @@ -207,18 +217,18 @@ DEF_FN_TYPE_2 (BT_FN_OV4SI_OV4SI_OV4SI, BT_OV4SI, BT_OV4SI, BT_OV4SI) DEF_FN_TYPE_2 (BT_FN_OV4SI_OV4SI_UCHAR, BT_OV4SI, BT_OV4SI, B
Re: [PATCH] s390: Fix TARGET_SECONDARY_RELOAD for non-SYMBOL_REFs
On Thu, Feb 29, 2024 at 01:26:54PM +0100, Andreas Schwab wrote: > On Feb 29 2024, Stefan Schulze Frielinghaus wrote: > > > RTX X must not necessarily be a SYMBOL_REF and may e.g. be an > > False friend: s/must not/need not/ Argh I always fall for this ;-) Thanks for pointing this out. Changed for the final commit. Cheers, Stefan
[PATCH] s390: Fix test vector/long-double-to-i64.c
Starting with r14-8319-g86de9b66480b71 fwprop improved so that vpdi is no longer required. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/long-double-to-i64.c: Fix scan assembler directive. --- .../gcc.target/s390/vector/long-double-to-i64.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c b/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c index 2dbbb5d1c03..ed89878e6ee 100644 --- a/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c +++ b/gcc/testsuite/gcc.target/s390/vector/long-double-to-i64.c @@ -1,19 +1,24 @@ /* { dg-do compile } */ /* { dg-options "-O3 -march=z14 -mzarch --save-temps" } */ /* { dg-do run { target { s390_z14_hw } } } */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } } } */ + #include #include +/* +** long_double_to_i64: +** ld %f0,0\(%r2\) +** ld %f2,8\(%r2\) +** cgxbr %r2,5,%f0 +** br %r14 +*/ __attribute__ ((noipa)) static int64_t long_double_to_i64 (long double x) { return x; } -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,1\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,5\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tcgxbr\t} 1 } } */ - int main (void) { -- 2.43.0
[PATCH] s390: Fix tests rosbg_si_srl and rxsbg_si_srl
Starting with r14-2047-gd0e891406b16dc two SI mode tests are optimized into DI mode. Thus, the scan-assembler directives fail. For example RTL expression (ior:SI (subreg:SI (lshiftrt:DI (reg:DI 69) (const_int 2 [0x2])) 4) (subreg:SI (reg:DI 68) 4)) is optimized into (ior:DI (lshiftrt:DI (reg:DI 69) (const_int 2 [0x2])) (reg:DI 68)) Fixed by moving operands into memory in order to enforce SI mode computation. Furthermore, in r9-6056-g290dfd9bc7bea2 the starting bit position of the scan-assembler directive for rosbg was incorrectly set to 32 which actually should be 32+SHIFT_AMOUNT, i.e., in this particular case 34. gcc/testsuite/ChangeLog: * gcc.target/s390/md/rXsbg_mode_sXl.c: Fix tests rosbg_si_srl and rxsbg_si_srl. --- .../gcc.target/s390/md/rXsbg_mode_sXl.c| 18 ++ 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c b/gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c index ede813818ff..cf454d2783c 100644 --- a/gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c +++ b/gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c @@ -22,6 +22,8 @@ { dg-skip-if "" { *-*-* } { "*" } { "-march=*" } } */ +unsigned int a, b; + __attribute__ ((noinline)) unsigned int si_sll (unsigned int x) { @@ -42,11 +44,11 @@ rosbg_si_sll (unsigned int a, unsigned int b) /* { dg-final { scan-assembler-times "rosbg\t%r.,%r.,32,62,1" 1 } } */ __attribute__ ((noinline)) unsigned int -rosbg_si_srl (unsigned int a, unsigned int b) +rosbg_si_srl (void) { return a | (b >> 2); } -/* { dg-final { scan-assembler-times "rosbg\t%r.,%r.,32,63,62" 1 } } */ +/* { dg-final { scan-assembler-times "rosbg\t%r.,%r.,34,63,62" 1 } } */ __attribute__ ((noinline)) unsigned int rxsbg_si_sll (unsigned int a, unsigned int b) @@ -56,11 +58,11 @@ rxsbg_si_sll (unsigned int a, unsigned int b) /* { dg-final { scan-assembler-times "rxsbg\t%r.,%r.,32,62,1" 1 } } */ __attribute__ ((noinline)) unsigned int -rxsbg_si_srl (unsigned int a, unsigned int b) +rxsbg_si_srl (void) 
{ return a ^ (b >> 2); } -/* { dg-final { scan-assembler-times "rxsbg\t%r.,%r.,32,63,62" 1 } } */ +/* { dg-final { scan-assembler-times "rxsbg\t%r.,%r.,34,63,62" 1 } } */ __attribute__ ((noinline)) unsigned long long di_sll (unsigned long long x) @@ -108,21 +110,21 @@ main (void) /* SIMode */ { unsigned int r; -unsigned int a = 0x12488421u; -unsigned int b = 0xu; +a = 0x12488421u; +b = 0xu; unsigned int csll = si_sll (b); unsigned int csrl = si_srl (b); r = rosbg_si_sll (a, b); if (r != (a | csll)) __builtin_abort (); -r = rosbg_si_srl (a, b); +r = rosbg_si_srl (); if (r != (a | csrl)) __builtin_abort (); r = rxsbg_si_sll (a, b); if (r != (a ^ csll)) __builtin_abort (); -r = rxsbg_si_srl (a, b); +r = rxsbg_si_srl (); if (r != (a ^ csrl)) __builtin_abort (); } -- 2.43.0
[PATCH] s390: Fix TARGET_SECONDARY_RELOAD for non-SYMBOL_REFs
RTX X must not necessarily be a SYMBOL_REF and may e.g. be an UNSPEC_GOTENT for which SYMBOL_FLAG_NOTALIGN2_P fails. gcc/ChangeLog: * config/s390/s390.cc (s390_secondary_reload): Guard SYMBOL_FLAG_NOTALIGN2_P. --- gcc/config/s390/s390.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 943fc9bfd72..12430d77786 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -4778,7 +4778,7 @@ s390_secondary_reload (bool in_p, rtx x, reg_class_t rclass_i, if (in_p && s390_loadrelative_operand_p (x, &symref, &offset) && mode == Pmode - && !SYMBOL_FLAG_NOTALIGN2_P (symref) + && (!SYMBOL_REF_P (symref) || !SYMBOL_FLAG_NOTALIGN2_P (symref)) && (offset & 1) == 1) sri->icode = ((mode == DImode) ? CODE_FOR_reloaddi_larl_odd_addend_z10 : CODE_FOR_reloadsi_larl_odd_addend_z10); -- 2.43.0