Re: [PATCH 01/13] [APX EGPR] middle-end: Add insn argument to base_reg_class
Vladimir Makarov via Gcc-patches 于2023年9月9日周六 01:04写道: > > > On 8/31/23 04:20, Hongyu Wang wrote: > > @@ -2542,6 +2542,8 @@ the code of the immediately enclosing expression > > (@code{MEM} for the top level > > of an address, @code{ADDRESS} for something that occurs in an > > @code{address_operand}). @var{index_code} is the code of the > > corresponding > > index expression if @var{outer_code} is @code{PLUS}; @code{SCRATCH} > > otherwise. > > +@code{insn} indicates insn specific base register class should be subset > > +of the original base register class. > > @end defmac > > I'd prefer more general description of 'insn' argument for the macros. > Something like that: > > @code{insn} can be used to define an insn-specific base register class. > Sure, will adjust in the V2 patch. Also, currently we reuse the old macro MODE_CODE_BASE_REG_CLASS, do you think we need a new macro like INSN_BASE_REG_CLASS as other parameters are actually unused? Then we don't need to change other targets like avr/gcn.
[PATCH] RISC-V: Avoid unnecessary slideup in compress pattern of vec_perm
If all elements of a const vector are the same, the slide up is unnecessary. gcc/ChangeLog: * config/riscv/riscv-v.cc (shuffle_compress_patterns): Avoid unnecessary slideup. --- gcc/config/riscv/riscv-v.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index bee60de1d26..7ef884907b8 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -2697,7 +2697,7 @@ shuffle_compress_patterns (struct expand_vec_perm_d *d) rtx mask = force_reg (mask_mode, builder.build ()); rtx merge = d->op1; - if (need_slideup_p) + if (need_slideup_p && !const_vec_duplicate_p (d->op1)) { int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1; rtx ops[] = {d->target, d->op1, gen_int_mode (slideup_cnt, Pmode)}; -- 2.36.3
[PATCH] RISC-V: Expand fixed-vlmax/vls vector permutation in targethook
When debugging FAIL: gcc.dg/pr92301.c execution test. Realize a vls vector permutation situation failed to vectorize since early return false: - /* For constant size indices, we dont't need to handle it here. - Just leave it to vec_perm. */ - if (d->perm.length ().is_constant ()) -return false; To avoid more potential failed vectorization case. Now expand it in targethook. gcc/ChangeLog: * config/riscv/riscv-v.cc (shuffle_generic_patterns): Expand fixed-vlmax/vls vector permutation. --- gcc/config/riscv/riscv-v.cc | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index dc8c10f6ed2..bee60de1d26 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -2792,14 +2792,9 @@ shuffle_generic_patterns (struct expand_vec_perm_d *d) if (!pow2p_hwi (d->perm.encoding().npatterns ())) return false; - /* For constant size indices, we dont't need to handle it here. - Just leave it to vec_perm. */ - if (d->perm.length ().is_constant ()) -return false; - /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv. Otherwise, it could overflow the index range. */ - if (GET_MODE_INNER (d->vmode) == QImode + if (!nunits.is_constant () && GET_MODE_INNER (d->vmode) == QImode && !get_vector_mode (HImode, nunits).exists (&sel_mode)) return false; @@ -2808,7 +2803,12 @@ shuffle_generic_patterns (struct expand_vec_perm_d *d) return true; rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); - expand_vec_perm (d->target, d->op0, d->op1, force_reg (sel_mode, sel)); + /* 'mov' generte interleave vector. */ + if (!nunits.is_constant ()) +sel = force_reg (sel_mode, sel); + /* Some FIXED-VLMAX/VLS vector permutation situations call targethook + instead of expand vec_perm, we handle it directly. */ + expand_vec_perm (d->target, d->op0, d->op1, sel); return true; } -- 2.36.3
RE: [PATCH] RISC-V: Fix dump FILE of VSETVL PASS[PR111311]
Committed, thanks Kito. Pan -Original Message- From: Gcc-patches On Behalf Of Kito Cheng via Gcc-patches Sent: Sunday, September 10, 2023 9:22 AM To: Juzhe-Zhong Cc: GCC Patches ; Kito Cheng Subject: Re: [PATCH] RISC-V: Fix dump FILE of VSETVL PASS[PR111311] LGTM Juzhe-Zhong 於 2023年9月10日 週日 07:58 寫道: > To make the dump FILE not too big, add TDF_DETAILS. > > This patch fix these following FAILs in > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111311 > > FAIL: gcc.c-torture/unsorted/dump-noaddr.c.*r.vsetvl, -O3 > -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer > -finline-functions comparison > FAIL: gcc.c-torture/unsorted/dump-noaddr.c.*r.vsetvl, -O3 -g comparison > > gcc/ChangeLog: > > * config/riscv/riscv-vsetvl.cc (pass_vsetvl::vsetvl_fusion): Add > TDF_DETAILS. > (pass_vsetvl::pre_vsetvl): Ditto. > (pass_vsetvl::init): Ditto. > (pass_vsetvl::lazy_vsetvl): Ditto. > > --- > gcc/config/riscv/riscv-vsetvl.cc | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/gcc/config/riscv/riscv-vsetvl.cc > b/gcc/config/riscv/riscv-vsetvl.cc > index ae362a3f6a8..134b97737ae 100644 > --- a/gcc/config/riscv/riscv-vsetvl.cc > +++ b/gcc/config/riscv/riscv-vsetvl.cc > @@ -3438,7 +3438,7 @@ pass_vsetvl::vsetvl_fusion (void) > m_vector_manager->vector_kill, > m_vector_manager->vector_earliest); >changed_p |= earliest_fusion (); > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > { > fprintf (dump_file, "\nEARLIEST fusion %d\n", fusion_no); > m_vector_manager->dump (dump_file); > @@ -3720,7 +3720,7 @@ pass_vsetvl::pre_vsetvl (void) > >/* We should dump the information before CFG is changed. Otherwise it > will > produce ICE (internal compiler error). 
*/ > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > m_vector_manager->dump (dump_file); > >refine_vsetvls (); > @@ -4250,7 +4250,7 @@ pass_vsetvl::init (void) >m_vector_manager = new vector_infos_manager (); >compute_probabilities (); > > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > { >fprintf (dump_file, "\nPrologue: Initialize vector infos\n"); >m_vector_manager->dump (dump_file); > @@ -4334,7 +4334,7 @@ pass_vsetvl::lazy_vsetvl (void) > fprintf (dump_file, "\nPhase 1: Compute local backward vector > infos\n"); >for (const bb_info *bb : crtl->ssa->bbs ()) > compute_local_backward_infos (bb); > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > m_vector_manager->dump (dump_file); > >/* Phase 2 - Emit vsetvl instructions within each basic block according > to > @@ -4344,7 +4344,7 @@ pass_vsetvl::lazy_vsetvl (void) > "\nPhase 2: Emit vsetvl instruction within each block\n"); >for (const bb_info *bb : crtl->ssa->bbs ()) > emit_local_forward_vsetvls (bb); > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > m_vector_manager->dump (dump_file); > >/* Phase 3 - Propagate demanded info across blocks. */ > -- > 2.36.3 > >
Re: [PATCH] RISC-V: Fix dump FILE of VSETVL PASS[PR111311]
LGTM Juzhe-Zhong 於 2023年9月10日 週日 07:58 寫道: > To make the dump FILE not too big, add TDF_DETAILS. > > This patch fix these following FAILs in > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111311 > > FAIL: gcc.c-torture/unsorted/dump-noaddr.c.*r.vsetvl, -O3 > -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer > -finline-functions comparison > FAIL: gcc.c-torture/unsorted/dump-noaddr.c.*r.vsetvl, -O3 -g comparison > > gcc/ChangeLog: > > * config/riscv/riscv-vsetvl.cc (pass_vsetvl::vsetvl_fusion): Add > TDF_DETAILS. > (pass_vsetvl::pre_vsetvl): Ditto. > (pass_vsetvl::init): Ditto. > (pass_vsetvl::lazy_vsetvl): Ditto. > > --- > gcc/config/riscv/riscv-vsetvl.cc | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/gcc/config/riscv/riscv-vsetvl.cc > b/gcc/config/riscv/riscv-vsetvl.cc > index ae362a3f6a8..134b97737ae 100644 > --- a/gcc/config/riscv/riscv-vsetvl.cc > +++ b/gcc/config/riscv/riscv-vsetvl.cc > @@ -3438,7 +3438,7 @@ pass_vsetvl::vsetvl_fusion (void) > m_vector_manager->vector_kill, > m_vector_manager->vector_earliest); >changed_p |= earliest_fusion (); > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > { > fprintf (dump_file, "\nEARLIEST fusion %d\n", fusion_no); > m_vector_manager->dump (dump_file); > @@ -3720,7 +3720,7 @@ pass_vsetvl::pre_vsetvl (void) > >/* We should dump the information before CFG is changed. Otherwise it > will > produce ICE (internal compiler error). 
*/ > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > m_vector_manager->dump (dump_file); > >refine_vsetvls (); > @@ -4250,7 +4250,7 @@ pass_vsetvl::init (void) >m_vector_manager = new vector_infos_manager (); >compute_probabilities (); > > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > { >fprintf (dump_file, "\nPrologue: Initialize vector infos\n"); >m_vector_manager->dump (dump_file); > @@ -4334,7 +4334,7 @@ pass_vsetvl::lazy_vsetvl (void) > fprintf (dump_file, "\nPhase 1: Compute local backward vector > infos\n"); >for (const bb_info *bb : crtl->ssa->bbs ()) > compute_local_backward_infos (bb); > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > m_vector_manager->dump (dump_file); > >/* Phase 2 - Emit vsetvl instructions within each basic block according > to > @@ -4344,7 +4344,7 @@ pass_vsetvl::lazy_vsetvl (void) > "\nPhase 2: Emit vsetvl instruction within each block\n"); >for (const bb_info *bb : crtl->ssa->bbs ()) > emit_local_forward_vsetvls (bb); > - if (dump_file) > + if (dump_file && (dump_flags & TDF_DETAILS)) > m_vector_manager->dump (dump_file); > >/* Phase 3 - Propagate demanded info across blocks. */ > -- > 2.36.3 > >
[PATCH] RISC-V: Fix dump FILE of VSETVL PASS[PR111311]
To make the dump FILE not too big, add TDF_DETAILS. This patch fix these following FAILs in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111311 FAIL: gcc.c-torture/unsorted/dump-noaddr.c.*r.vsetvl, -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions comparison FAIL: gcc.c-torture/unsorted/dump-noaddr.c.*r.vsetvl, -O3 -g comparison gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pass_vsetvl::vsetvl_fusion): Add TDF_DETAILS. (pass_vsetvl::pre_vsetvl): Ditto. (pass_vsetvl::init): Ditto. (pass_vsetvl::lazy_vsetvl): Ditto. --- gcc/config/riscv/riscv-vsetvl.cc | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index ae362a3f6a8..134b97737ae 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -3438,7 +3438,7 @@ pass_vsetvl::vsetvl_fusion (void) m_vector_manager->vector_kill, m_vector_manager->vector_earliest); changed_p |= earliest_fusion (); - if (dump_file) + if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, "\nEARLIEST fusion %d\n", fusion_no); m_vector_manager->dump (dump_file); @@ -3720,7 +3720,7 @@ pass_vsetvl::pre_vsetvl (void) /* We should dump the information before CFG is changed. Otherwise it will produce ICE (internal compiler error). 
*/ - if (dump_file) + if (dump_file && (dump_flags & TDF_DETAILS)) m_vector_manager->dump (dump_file); refine_vsetvls (); @@ -4250,7 +4250,7 @@ pass_vsetvl::init (void) m_vector_manager = new vector_infos_manager (); compute_probabilities (); - if (dump_file) + if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, "\nPrologue: Initialize vector infos\n"); m_vector_manager->dump (dump_file); @@ -4334,7 +4334,7 @@ pass_vsetvl::lazy_vsetvl (void) fprintf (dump_file, "\nPhase 1: Compute local backward vector infos\n"); for (const bb_info *bb : crtl->ssa->bbs ()) compute_local_backward_infos (bb); - if (dump_file) + if (dump_file && (dump_flags & TDF_DETAILS)) m_vector_manager->dump (dump_file); /* Phase 2 - Emit vsetvl instructions within each basic block according to @@ -4344,7 +4344,7 @@ pass_vsetvl::lazy_vsetvl (void) "\nPhase 2: Emit vsetvl instruction within each block\n"); for (const bb_info *bb : crtl->ssa->bbs ()) emit_local_forward_vsetvls (bb); - if (dump_file) + if (dump_file && (dump_flags & TDF_DETAILS)) m_vector_manager->dump (dump_file); /* Phase 3 - Propagate demanded info across blocks. */ -- 2.36.3
Fix false positive for -Walloc-size-larger-than, part II [PR79132] (was: [PATCH] Fix false positive for -Walloc-size-larger-than (PR bootstrap/79132))
Hi! On 2017-01-23T15:10:44-0700, Jeff Law wrote: > On 01/19/2017 04:46 AM, Martin Liška wrote: >> Following patch fixes asan bootstrap, as mentioned in the PR. >> >> Ready to be installed? >> * tree-ssa-reassoc.c (rewrite_expr_tree_parallel): Insert assert >> that would prevent us to call alloca with -1 as argument. > Maybe one day we'll be able to use the array index into STMTS to derive > a range for stmt_num. But we don't right now. > > Otherwise I don't see any way to derive a range for stmt_num that if > rewrite_expr_tree_parallel is not inlined into its caller. > > > OK for the trunk. Just a few years later, I've run into this one again -- with only a slight twist; probably latent all those years... OK to push the attached "Fix false positive for -Walloc-size-larger-than, part II [PR79132]"? Or, do we (incrementally?) want to formulate that "assume"-like trait yet differently? That is, should we gain some 'gcc_assume ([...])' in 'gcc/system.h'? CCing Jakub as author of commit 08b51baddc53d64aa4c5e7a81ef3c4bf320293be "c++, c: Implement C++23 P1774R8 - Portable assumptions [PR106654]": The following patch implements C++23 P1774R8 - Portable assumptions paper, by introducing support for [[assume (cond)]]; attribute for C++. In addition to that the patch adds [[gnu::assume (cond)]]; and __attribute__((assume (cond))); support to both C and C++. As described in C++23, the attribute argument is conditional-expression rather than the usual assignment-expression for attribute arguments, the condition is contextually converted to bool (for C truthvalue conversion is done on it) and is never evaluated at runtime. For C++ constant expression evaluation, I only check the simplest conditions for undefined behavior, because otherwise I'd need to undo changes to *ctx->global which happened during the evaluation (but I believe the spec allows that and we can further improve later). The patch uses a new internal function, .ASSUME, to hold the condition in the FEs. 
At gimplification time, if the condition is simple/without side-effects, it is gimplified as if (cond) ; else __builtin_unreachable (); and otherwise for now dropped on the floor. The intent is to incrementally outline the conditions into separate artificial functions and use .ASSUME further to tell the ranger and perhaps other optimization passes about the assumptions, as detailed in the PR. When implementing it, I found that assume entry hasn't been added to https://eel.is/c++draft/cpp.cond#6 Jonathan said he'll file a NB comment about it, this patch assumes it has been added into the table as 202207L when the paper has been voted in. With the attributes for both C/C++, I'd say we don't need to add __builtin_assume with similar purpose, especially when __builtin_assume in LLVM is just weird. It is strange for side-effects in function call's argument not to be evaluated, and LLVM in that case (annoyingly) warns and ignores the side-effects (but doesn't do then anything with it), if there are no side-effects, it will work like our if (!cond) __builtin_unreachable (); (Followed by further commits re "intent is to incrementally [...]".) The current 'gcc/doc/extend.texi', "Statement Attributes" for reference: @cindex @code{assume} statement attribute @item assume The @code{assume} attribute with a null statement serves as portable assumption. It should have a single argument, a conditional expression, which is not evaluated. If the argument would evaluate to true at the point where it appears, it has no effect, otherwise there is undefined behavior. This is a GNU variant of the ISO C++23 standard @code{assume} attribute, but it can be used in any version of both C and C++. @smallexample int foo (int x, int y) @{ __attribute__((assume(x == 42))); __attribute__((assume(++y == 43))); return x + y; @} @end smallexample @code{y} is not actually incremented and the compiler can but does not have to optimize it to just @code{return 42 + 42;}. 
(I've not actually verified that'd do the job here, but I'd be very surprised if not.) Grüße Thomas - Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955 >From 7dc7de834989d85cb1dbaf7b5a0917ba07319cfb Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Sat, 9 Sep 2023 16:49:16 +0200 Subject: [PATCH] Fix false positive for -Walloc-size-larger-than, part II [PR79132] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a GCC bootstrap, I was running into: In file included from [...]/gcc/system.h:729, from [...]/gcc/
Re: [PATCH] fortran: Remove redundant tree walk to delete element
Le 08/09/2023 à 23:22, Harald Anlauf via Fortran a écrit : Am 08.09.23 um 12:04 schrieb Mikael Morin via Gcc-patches: Hello, this avoids some redundant work in the symbol deletion code, which is used a lot by the parser to cancel statements that fail to match in the end. I haven't tried to measure the performance effect, if any, on a pathological example; just passed the fortran testsuite on x86_64-pc-linux-gnu. OK for master? This is OK. Thanks. I had forgotten function comments. This is what I have pushed. From 1ea7130315a14ba4f66c2de76d034b33181812c5 Mon Sep 17 00:00:00 2001 From: Mikael Morin Date: Sat, 9 Sep 2023 11:45:11 +0200 Subject: [PATCH] fortran: Remove redundant tree walk to delete element Remove preliminary walk of the symbol tree when we are about to remove an element. This preliminary walk was necessary because the deletion function updated the tree without reporting back to the caller the element it had removed. But knowing that element is necessary to free its memory, so one had to first get that element before it was removed from the tree. This change updates the main deletion function delete_treap and its public wrapper gfc_delete_bbt so that the removed element can be known by the caller. This makes the preliminary walk in gfc_delete_symtree redundant, permitting its removal. gcc/fortran/ChangeLog: * bbt.cc (delete_treap): Add argument REMOVED, set it to the removed element from the tree. Change NULL to nullptr. (gfc_delete_bbt): Return the removed element from the tree. * gfortran.h (gfc_delete_symtree): Remove prototype. (gfc_delete_bbt): Set return type to pointer. * symbol.cc (gfc_delete_symtree): Make static. Get the element to be freed from the result of gfc_delete_bbt. Remove the preliminary walk to get it. 
--- gcc/fortran/bbt.cc | 41 + gcc/fortran/gfortran.h | 3 +-- gcc/fortran/symbol.cc | 6 ++ 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/gcc/fortran/bbt.cc b/gcc/fortran/bbt.cc index 851e5e92c7b..7f1f4067fbd 100644 --- a/gcc/fortran/bbt.cc +++ b/gcc/fortran/bbt.cc @@ -162,37 +162,54 @@ delete_root (gfc_bbt *t) } -/* Delete an element from a tree. The 'old' value does not - necessarily have to point to the element to be deleted, it must - just point to a treap structure with the key to be deleted. - Returns the new root node of the tree. */ +/* Delete an element from a tree, returning the new root node of the tree. + The OLD value does not necessarily have to point to the element to be + deleted, it must just point to a treap structure with the key to be deleted. + The REMOVED argument, if non-null, is set to the removed element from the + tree upon return. */ static gfc_bbt * -delete_treap (gfc_bbt *old, gfc_bbt *t, compare_fn compare) +delete_treap (gfc_bbt *old, gfc_bbt *t, compare_fn compare, gfc_bbt **removed) { int c; - if (t == NULL) -return NULL; + if (t == nullptr) +{ + if (removed) + *removed = nullptr; + return nullptr; +} c = (*compare) (old, t); if (c < 0) -t->left = delete_treap (old, t->left, compare); +t->left = delete_treap (old, t->left, compare, removed); if (c > 0) -t->right = delete_treap (old, t->right, compare); +t->right = delete_treap (old, t->right, compare, removed); if (c == 0) -t = delete_root (t); +{ + if (removed) + *removed = t; + t = delete_root (t); +} return t; } -void +/* Delete the element from the tree at *ROOT that matches the OLD element + according to the COMPARE_FN function. This updates the *ROOT pointer to + point to the new tree root (if different from the original) and returns the + deleted element. 
*/ + +void * gfc_delete_bbt (void *root, void *old, compare_fn compare) { gfc_bbt **t; + gfc_bbt *removed; t = (gfc_bbt **) root; - *t = delete_treap ((gfc_bbt *) old, *t, compare); + *t = delete_treap ((gfc_bbt *) old, *t, compare, &removed); + + return (void *) removed; } diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h index b37c6bb9ad4..371f8743312 100644 --- a/gcc/fortran/gfortran.h +++ b/gcc/fortran/gfortran.h @@ -3510,7 +3510,6 @@ bool gfc_reference_st_label (gfc_st_label *, gfc_sl_type); gfc_namespace *gfc_get_namespace (gfc_namespace *, int); gfc_symtree *gfc_new_symtree (gfc_symtree **, const char *); gfc_symtree *gfc_find_symtree (gfc_symtree *, const char *); -void gfc_delete_symtree (gfc_symtree **, const char *); gfc_symtree *gfc_get_unique_symtree (gfc_namespace *); gfc_user_op *gfc_get_uop (const char *); gfc_user_op *gfc_find_uop (const char *, gfc_namespace *); @@ -3911,7 +3910,7 @@ bool gfc_inline_intrinsic_function_p (gfc_expr *); /* bbt.cc */ typedef int (*compare_fn) (void *, void *); void gfc_insert_bbt (void *, void *, compare_fn); -void gfc_delete_bbt (void *, void *, compare_fn); +void * gfc_delete_bbt (void *, void *, compare_fn); /* dump-parse-tr
Re: [PATCH] LoongArch: Fix up memcpy-vec-3.c test case
On Sat, 2023-09-09 at 16:21 +0800, chenglulu wrote: > LGTM! Pushed r14-3821. > 在 2023/9/9 下午4:20, Xi Ruoyao 写道: > > The generic code will split 16-byte copy into two 8-byte copies, so the > > vector code wouldn't be used even if -mno-strict-align. This > > contradicted with the purpose of this test case. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/loongarch/memcpy-vec-3.c: Increase the amount of > > copied bytes to 32. -- Xi Ruoyao School of Aerospace Science and Technology, Xidian University
Re: [PATCH v5] Implement new RTL optimizations pass: fold-mem-offsets.
This new version fixes the issues discussed in v4 and also fixes an issue that is described in the newly introduced compute_validity_closure. Bootstrapped on x86-64 and AArch64. I also ran the GCC testsuite on x86-64, AArch64 and RISCV64. There are no regressions except for gcc.target/i386/pr52146.c which I have already mentioned and I believe it shouldn't be fixed in f-m-o. I have also measured the number of eliminated instructions for SPEC intrate on these three architectures, which are as follows: RISCV64: 500: 112 502: 443 505: 0 520: 808 523: 20 525: 384 531: 41 541: 97 548: 101 557: 9 AArch64: 500: 71 502: 318 505: 0 520: 23 523: 205 525: 73 531: 7 541: 56 548: 0 557: 2 x86-64: 500: 8 502: 16 505: 0 520: 4 523: 5 525: 2 531: 0 541: 0 548: 0 557: 0 Thanks, Manolis On Sat, Sep 9, 2023 at 11:47 AM Manolis Tsamis wrote: > > This is a new RTL pass that tries to optimize memory offset calculations > by moving them from add immediate instructions to the memory loads/stores. > For example it can transform this: > > addi t4,sp,16 > add t2,a6,t4 > shl t3,t2,1 > ld a2,0(t3) > addi a2,1 > sd a2,8(t2) > > into the following (one instruction less): > > add t2,a6,sp > shl t3,t2,1 > ld a2,32(t3) > addi a2,1 > sd a2,24(t2) > > Although there are places where this is done already, this pass is more > powerful and can handle the more difficult cases that are currently not > optimized. Also, it runs late enough and can optimize away unnecessary > stack pointer calculations. > > gcc/ChangeLog: > > * Makefile.in: Add fold-mem-offsets.o. > * passes.def: Schedule a new pass. > * tree-pass.h (make_pass_fold_mem_offsets): Declare. > * common.opt: New options. > * doc/invoke.texi: Document new option. > * fold-mem-offsets.cc: New file. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/fold-mem-offsets-1.c: New test. > * gcc.target/riscv/fold-mem-offsets-2.c: New test. > * gcc.target/riscv/fold-mem-offsets-3.c: New test. 
> > Signed-off-by: Manolis Tsamis > --- > > Changes in v5: > - Introduce new helper function fold_offsets_1. > - Fix bug because constants could be partially propagated > through instructions that weren't understood. > - Introduce helper class fold_mem_info that stores f-m-o > info for an instruction. > - Calculate fold_offsets only once with do_fold_info_calculation. > - Fix correctness issue by introducing compute_validity_closure. > - Propagate in more cases for PLUS/MINUS with constant. > > Changes in v4: > - Add DF_EQ_NOTES flag to avoid incorrect state in notes. > - Remove fold_mem_offsets_driver and enum fold_mem_phase. > - Call recog when patching offsets in do_commit_offset. > - Restore INSN_CODE after modifying insn in do_check_validity. > > Changes in v3: > - Added propagation for more codes: > sub, neg, mul. > - Added folding / elimination for sub and > const int moves. > - For the validity check of the generated addresses > also test memory_address_addr_space_p. > - Replaced GEN_INT with gen_int_mode. > - Replaced some bitmap_head with auto_bitmap. > - Refactor each phase into own function for readability. > - Add dump details. > - Replace rtx iteration with reg_mentioned_p. > - Return early for codes that we can't propagate through. > > Changes in v2: > - Made the pass target-independant instead of RISCV specific. > - Fixed a number of bugs. > - Add code to handle more ADD patterns as found > in other targets (x86, aarch64). > - Improved naming and comments. > - Fixed bitmap memory leak. 
> > gcc/Makefile.in | 1 + > gcc/common.opt| 4 + > gcc/doc/invoke.texi | 8 + > gcc/fold-mem-offsets.cc | 891 ++ > gcc/passes.def| 1 + > .../gcc.target/riscv/fold-mem-offsets-1.c | 16 + > .../gcc.target/riscv/fold-mem-offsets-2.c | 24 + > .../gcc.target/riscv/fold-mem-offsets-3.c | 17 + > gcc/tree-pass.h | 1 + > 9 files changed, 963 insertions(+) > create mode 100644 gcc/fold-mem-offsets.cc > create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c > create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c > create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c > > diff --git a/gcc/Makefile.in b/gcc/Makefile.in > index 6d608db4dd2..d18bef1be4b 100644 > --- a/gcc/Makefile.in > +++ b/gcc/Makefile.in > @@ -1435,6 +1435,7 @@ OBJS = \ > fixed-value.o \ > fold-const.o \ > fold-const-call.o \ > + fold-mem-
[PATCH v5] Implement new RTL optimizations pass: fold-mem-offsets.
This is a new RTL pass that tries to optimize memory offset calculations by moving them from add immediate instructions to the memory loads/stores. For example it can transform this: addi t4,sp,16 add t2,a6,t4 shl t3,t2,1 ld a2,0(t3) addi a2,1 sd a2,8(t2) into the following (one instruction less): add t2,a6,sp shl t3,t2,1 ld a2,32(t3) addi a2,1 sd a2,24(t2) Although there are places where this is done already, this pass is more powerful and can handle the more difficult cases that are currently not optimized. Also, it runs late enough and can optimize away unnecessary stack pointer calculations. gcc/ChangeLog: * Makefile.in: Add fold-mem-offsets.o. * passes.def: Schedule a new pass. * tree-pass.h (make_pass_fold_mem_offsets): Declare. * common.opt: New options. * doc/invoke.texi: Document new option. * fold-mem-offsets.cc: New file. gcc/testsuite/ChangeLog: * gcc.target/riscv/fold-mem-offsets-1.c: New test. * gcc.target/riscv/fold-mem-offsets-2.c: New test. * gcc.target/riscv/fold-mem-offsets-3.c: New test. Signed-off-by: Manolis Tsamis --- Changes in v5: - Introduce new helper function fold_offsets_1. - Fix bug because constants could be partially propagated through instructions that weren't understood. - Introduce helper class fold_mem_info that stores f-m-o info for an instruction. - Calculate fold_offsets only once with do_fold_info_calculation. - Fix correctness issue by introducing compute_validity_closure. - Propagate in more cases for PLUS/MINUS with constant. Changes in v4: - Add DF_EQ_NOTES flag to avoid incorrect state in notes. - Remove fold_mem_offsets_driver and enum fold_mem_phase. - Call recog when patching offsets in do_commit_offset. - Restore INSN_CODE after modifying insn in do_check_validity. Changes in v3: - Added propagation for more codes: sub, neg, mul. - Added folding / elimination for sub and const int moves. - For the validity check of the generated addresses also test memory_address_addr_space_p. - Replaced GEN_INT with gen_int_mode. 
- Replaced some bitmap_head with auto_bitmap. - Refactor each phase into own function for readability. - Add dump details. - Replace rtx iteration with reg_mentioned_p. - Return early for codes that we can't propagate through. Changes in v2: - Made the pass target-independant instead of RISCV specific. - Fixed a number of bugs. - Add code to handle more ADD patterns as found in other targets (x86, aarch64). - Improved naming and comments. - Fixed bitmap memory leak. gcc/Makefile.in | 1 + gcc/common.opt| 4 + gcc/doc/invoke.texi | 8 + gcc/fold-mem-offsets.cc | 891 ++ gcc/passes.def| 1 + .../gcc.target/riscv/fold-mem-offsets-1.c | 16 + .../gcc.target/riscv/fold-mem-offsets-2.c | 24 + .../gcc.target/riscv/fold-mem-offsets-3.c | 17 + gcc/tree-pass.h | 1 + 9 files changed, 963 insertions(+) create mode 100644 gcc/fold-mem-offsets.cc create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 6d608db4dd2..d18bef1be4b 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1435,6 +1435,7 @@ OBJS = \ fixed-value.o \ fold-const.o \ fold-const-call.o \ + fold-mem-offsets.o \ function.o \ function-abi.o \ function-tests.o \ diff --git a/gcc/common.opt b/gcc/common.opt index f137a1f81ac..b103b8d28ed 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1252,6 +1252,10 @@ fcprop-registers Common Var(flag_cprop_registers) Optimization Perform a register copy-propagation optimization pass. +ffold-mem-offsets +Target Bool Var(flag_fold_mem_offsets) Init(1) +Fold instructions calculating memory offsets to the memory access instruction if possible. + fcrossjumping Common Var(flag_crossjumping) Optimization Perform cross-jumping optimization. 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 33befee7d6b..ce5a83a2d9c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -543,6 +543,7 @@ Objective-C and Objective-C++ Dialects}. -fauto-inc-dec -fbranch-probabilities -fcaller-saves -fcombine-stack-adjustments -fconserve-stack +-ffold-mem-offsets -fcompare-elim -fcprop-registers -fcrossjumping -fcse-follow-jumps -fcse-skip-blocks -fcx-fortran-rules -fcx-limited-range @@ -14355,6 +14356,13 @@
Re: [PATCH] LoongArch: Fix up memcpy-vec-3.c test case
LGTM! 在 2023/9/9 下午4:20, Xi Ruoyao 写道: The generic code will split 16-byte copy into two 8-byte copies, so the vector code wouldn't be used even if -mno-strict-align. This contradicted with the purpose of this test case. gcc/testsuite/ChangeLog: * gcc.target/loongarch/memcpy-vec-3.c: Increase the amount of copied bytes to 32. --- gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c index 233ed215078..db2ea510b09 100644 --- a/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c +++ b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c @@ -3,4 +3,4 @@ /* { dg-final { scan-assembler-not "vst" } } */ extern char a[], b[]; -void test() { __builtin_memcpy(a, b, 16); } +void test() { __builtin_memcpy(a, b, 32); }
[PATCH] LoongArch: Fix up memcpy-vec-3.c test case
The generic code will split 16-byte copy into two 8-byte copies, so the vector code wouldn't be used even if -mno-strict-align. This contradicted the purpose of this test case. gcc/testsuite/ChangeLog: * gcc.target/loongarch/memcpy-vec-3.c: Increase the amount of copied bytes to 32. --- gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c index 233ed215078..db2ea510b09 100644 --- a/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c +++ b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c @@ -3,4 +3,4 @@ /* { dg-final { scan-assembler-not "vst" } } */ extern char a[], b[]; -void test() { __builtin_memcpy(a, b, 16); } +void test() { __builtin_memcpy(a, b, 32); } -- 2.42.0
Re: [PATCH v1] LoongArch: Fix bug of 'di3_fake'.
On Sat, 2023-09-09 at 15:42 +0800, Lulu Cheng wrote: > PR 111334 > > gcc/ChangeLog: > > * config/loongarch/loongarch.md: Fix bug of di3_fake. > > gcc/testsuite/ChangeLog: > > * gcc.target/loongarch/pr111334.c: New test. Ok. Despite I still think we should use unspec inside any_div, this should be enough to prevent the compiler from matching di3_fake. > --- > gcc/config/loongarch/loongarch.md | 14 +-- > gcc/testsuite/gcc.target/loongarch/pr111334.c | 39 +++ > 2 files changed, 49 insertions(+), 4 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/loongarch/pr111334.c > > diff --git a/gcc/config/loongarch/loongarch.md > b/gcc/config/loongarch/loongarch.md > index 1dc6b524416..3fa32562aa6 100644 > --- a/gcc/config/loongarch/loongarch.md > +++ b/gcc/config/loongarch/loongarch.md > @@ -72,6 +72,9 @@ (define_c_enum "unspec" [ > UNSPEC_LUI_H_HI12 > UNSPEC_TLS_LOW > > + ;; Fake div.w[u] mod.w[u] > + UNSPEC_FAKE_ANY_DIV > + > UNSPEC_SIBCALL_VALUE_MULTIPLE_INTERNAL_1 > UNSPEC_CALL_VALUE_MULTIPLE_INTERNAL_1 > ]) > @@ -900,7 +903,7 @@ (define_expand "3" > (match_operand:GPR 2 "register_operand")))] > "" > { > - if (GET_MODE (operands[0]) == SImode) > + if (GET_MODE (operands[0]) == SImode && TARGET_64BIT) > { > rtx reg1 = gen_reg_rtx (DImode); > rtx reg2 = gen_reg_rtx (DImode); > @@ -938,9 +941,12 @@ (define_insn "*3" > (define_insn "di3_fake" > [(set (match_operand:DI 0 "register_operand" "=r,&r,&r") > (sign_extend:DI > - (any_div:SI (match_operand:DI 1 "register_operand" "r,r,0") > - (match_operand:DI 2 "register_operand" "r,r,r"] > - "" > + (unspec:SI > + [(subreg:SI > + (any_div:DI (match_operand:DI 1 "register_operand" "r,r,0") > + (match_operand:DI 2 "register_operand" "r,r,r")) 0)] > + UNSPEC_FAKE_ANY_DIV)))] > + "TARGET_64BIT" > { > return loongarch_output_division (".w\t%0,%1,%2", operands); > } > diff --git a/gcc/testsuite/gcc.target/loongarch/pr111334.c > b/gcc/testsuite/gcc.target/loongarch/pr111334.c > new file mode 100644 > index 000..47366afcb74 > --- 
/dev/null > +++ b/gcc/testsuite/gcc.target/loongarch/pr111334.c > @@ -0,0 +1,39 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > + > +unsigned > +util_next_power_of_two (unsigned x) > +{ > + return (1 << __builtin_clz (x - 1)); > +} > + > +extern int create_vec_from_array (void); > + > +struct ac_shader_args { > + struct { > + unsigned char offset; > + unsigned char size; > + } args[384]; > +}; > + > +struct isel_context { > + const struct ac_shader_args* args; > + int arg_temps[384]; > +}; > + > + > +void > +add_startpgm (struct isel_context* ctx, unsigned short arg_count) > +{ > + > + for (unsigned i = 0, arg = 0; i < arg_count; i++) > + { > + unsigned size = ctx->args->args[i].size; > + unsigned reg = ctx->args->args[i].offset; > + > + if (reg % ( 4 < util_next_power_of_two (size) > + ? 4 : util_next_power_of_two (size))) > + ctx->arg_temps[i] = create_vec_from_array (); > + } > +} > + -- Xi Ruoyao School of Aerospace Science and Technology, Xidian University
[PATCH v1] LoongArch: Fix bug of 'di3_fake'.
PR 111334 gcc/ChangeLog: * config/loongarch/loongarch.md: Fix bug of di3_fake. gcc/testsuite/ChangeLog: * gcc.target/loongarch/pr111334.c: New test. --- gcc/config/loongarch/loongarch.md | 14 +-- gcc/testsuite/gcc.target/loongarch/pr111334.c | 39 +++ 2 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/loongarch/pr111334.c diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md index 1dc6b524416..3fa32562aa6 100644 --- a/gcc/config/loongarch/loongarch.md +++ b/gcc/config/loongarch/loongarch.md @@ -72,6 +72,9 @@ (define_c_enum "unspec" [ UNSPEC_LUI_H_HI12 UNSPEC_TLS_LOW + ;; Fake div.w[u] mod.w[u] + UNSPEC_FAKE_ANY_DIV + UNSPEC_SIBCALL_VALUE_MULTIPLE_INTERNAL_1 UNSPEC_CALL_VALUE_MULTIPLE_INTERNAL_1 ]) @@ -900,7 +903,7 @@ (define_expand "3" (match_operand:GPR 2 "register_operand")))] "" { - if (GET_MODE (operands[0]) == SImode) + if (GET_MODE (operands[0]) == SImode && TARGET_64BIT) { rtx reg1 = gen_reg_rtx (DImode); rtx reg2 = gen_reg_rtx (DImode); @@ -938,9 +941,12 @@ (define_insn "*3" (define_insn "di3_fake" [(set (match_operand:DI 0 "register_operand" "=r,&r,&r") (sign_extend:DI - (any_div:SI (match_operand:DI 1 "register_operand" "r,r,0") - (match_operand:DI 2 "register_operand" "r,r,r"] - "" + (unspec:SI + [(subreg:SI +(any_div:DI (match_operand:DI 1 "register_operand" "r,r,0") +(match_operand:DI 2 "register_operand" "r,r,r")) 0)] + UNSPEC_FAKE_ANY_DIV)))] + "TARGET_64BIT" { return loongarch_output_division (".w\t%0,%1,%2", operands); } diff --git a/gcc/testsuite/gcc.target/loongarch/pr111334.c b/gcc/testsuite/gcc.target/loongarch/pr111334.c new file mode 100644 index 000..47366afcb74 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/pr111334.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +unsigned +util_next_power_of_two (unsigned x) +{ + return (1 << __builtin_clz (x - 1)); +} + +extern int create_vec_from_array (void); + +struct ac_shader_args { +struct { + 
unsigned char offset; + unsigned char size; +} args[384]; +}; + +struct isel_context { +const struct ac_shader_args* args; +int arg_temps[384]; +}; + + +void +add_startpgm (struct isel_context* ctx, unsigned short arg_count) +{ + + for (unsigned i = 0, arg = 0; i < arg_count; i++) +{ + unsigned size = ctx->args->args[i].size; + unsigned reg = ctx->args->args[i].offset; + + if (reg % ( 4 < util_next_power_of_two (size) +? 4 : util_next_power_of_two (size))) + ctx->arg_temps[i] = create_vec_from_array (); +} +} + -- 2.31.1
Re: [PATCH] LoongArch: Use LSX and LASX for block move
On Sat, 2023-09-09 at 15:14 +0800, chenglulu wrote: > > 在 2023/9/9 下午3:06, Xi Ruoyao 写道: > > On Sat, 2023-09-09 at 15:04 +0800, chenglulu wrote: > > > Hi,RuoYao: > > > > > > I think the test example memcpy-vec-3.c submitted in r14-3818 is > > > implemented incorrectly. > > > > > > The 16-byte length in this test example will cause can_move_by_pieces to > > > return true when with '-mstrict-align', so no vector load instructions > > > will be generated. > > Yes, in this case we cannot use vst because we don't know if b is > > aligned. Thus a { scan-assembler-not "vst" } guarantees that. > > > > Or am I understanding something wrongly here? > > > Well, what I mean is that even if '-mno-strict-align' is used here, > vst/vld will not be used, > > so this test example cannot test what we want to test. Let me revise it... -- Xi Ruoyao School of Aerospace Science and Technology, Xidian University
Re: [PATCH] LoongArch: Use LSX and LASX for block move
在 2023/9/9 下午3:06, Xi Ruoyao 写道: On Sat, 2023-09-09 at 15:04 +0800, chenglulu wrote: Hi, RuoYao: I think the test example memcpy-vec-3.c submitted in r14-3818 is implemented incorrectly. The 16-byte length in this test example will cause can_move_by_pieces to return true with '-mstrict-align', so no vector load instructions will be generated. Yes, in this case we cannot use vst because we don't know if b is aligned. Thus a { scan-assembler-not "vst" } guarantees that. Or am I misunderstanding something here? Well, what I mean is that even if '-mno-strict-align' is used here, vst/vld will not be used, so this test example cannot test what we want to test.
Re: [PATCH] LoongArch: Use LSX and LASX for block move
On Sat, 2023-09-09 at 15:04 +0800, chenglulu wrote: > Hi, RuoYao: > > I think the test example memcpy-vec-3.c submitted in r14-3818 is > implemented incorrectly. > > The 16-byte length in this test example will cause can_move_by_pieces to > return true with '-mstrict-align', so no vector load instructions > will be generated. Yes, in this case we cannot use vst because we don't know if b is aligned. Thus a { scan-assembler-not "vst" } guarantees that. Or am I misunderstanding something here? -- Xi Ruoyao School of Aerospace Science and Technology, Xidian University
Re: [PATCH] LoongArch: Use LSX and LASX for block move
Hi,RuoYao: I think the test example memcpy-vec-3.c submitted in r14-3818 is implemented incorrectly. The 16-byte length in this test example will cause can_move_by_pieces to return true when with '-mstrict-align', so no vector load instructions will be generated. 在 2023/9/8 上午12:14, Xi Ruoyao 写道: gcc/ChangeLog: * config/loongarch/loongarch.h (LARCH_MAX_MOVE_PER_INSN): Define to the maximum amount of bytes able to be loaded or stored with one machine instruction. * config/loongarch/loongarch.cc (loongarch_mode_for_move_size): New static function. (loongarch_block_move_straight): Call loongarch_mode_for_move_size for machine_mode to be moved. (loongarch_expand_block_move): Use LARCH_MAX_MOVE_PER_INSN instead of UNITS_PER_WORD. --- Bootstrapped and regtested on loongarch64-linux-gnu, with PR110939 patch applied, the "lib_build_self_spec = %<..." line in t-linux commented out (because it's silently making -mlasx in BOOT_CFLAGS ineffective, Yujie is working on a proper fix), and BOOT_CFLAGS="-O3 -mlasx". Ok for trunk? gcc/config/loongarch/loongarch.cc | 22 ++ gcc/config/loongarch/loongarch.h | 3 +++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index 6698414281e..509ef2b97f1 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -5191,6 +5191,20 @@ loongarch_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, return true; } +static machine_mode +loongarch_mode_for_move_size (HOST_WIDE_INT size) +{ + switch (size) +{ +case 32: + return V32QImode; +case 16: + return V16QImode; +} + + return int_mode_for_size (size * BITS_PER_UNIT, 0).require (); +} + /* Emit straight-line code to move LENGTH bytes from SRC to DEST. Assume that the areas do not overlap. 
*/ @@ -5220,7 +5234,7 @@ loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length, for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2) { - mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require (); + mode = loongarch_mode_for_move_size (delta_cur); for (; offs + delta_cur <= length; offs += delta_cur, i++) { @@ -5231,7 +5245,7 @@ loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length, for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2) { - mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require (); + mode = loongarch_mode_for_move_size (delta_cur); for (; offs + delta_cur <= length; offs += delta_cur, i++) loongarch_emit_move (adjust_address (dest, mode, offs), regs[i]); @@ -5326,8 +5340,8 @@ loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx r_align) HOST_WIDE_INT align = INTVAL (r_align); - if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD) -align = UNITS_PER_WORD; + if (!TARGET_STRICT_ALIGN || align > LARCH_MAX_MOVE_PER_INSN) +align = LARCH_MAX_MOVE_PER_INSN; if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT) { diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h index 3fc9dc43ab1..7e391205583 100644 --- a/gcc/config/loongarch/loongarch.h +++ b/gcc/config/loongarch/loongarch.h @@ -1181,6 +1181,9 @@ typedef struct { least twice. */ #define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2) +#define LARCH_MAX_MOVE_PER_INSN \ + (ISA_HAS_LASX ? 32 : (ISA_HAS_LSX ? 16 : UNITS_PER_WORD)) + /* The base cost of a memcpy call, for MOVE_RATIO and friends. These values were determined experimentally by benchmarking with CSiBE. */
Pushed: [PATCH] LoongArch: Slightly simplify loongarch_block_move_straight
Pushed r14-3819. On Sat, 2023-09-09 at 14:16 +0800, chenglulu wrote: > > 在 2023/9/8 上午12:33, Xi Ruoyao 写道: > > gcc/ChangeLog: > > > > * config/loongarch/loongarch.cc > > (loongarch_block_move_straight): > > Check precondition (delta must be a power of 2) and use > > popcount_hwi instead of a homebrew loop. > > --- > > > > I've not run a full bootstrap with this, but it should be obvious. > > Ok for trunk? > > LGTM! > > Thanks! > > > > > gcc/config/loongarch/loongarch.cc | 5 ++--- > > 1 file changed, 2 insertions(+), 3 deletions(-) > > > > diff --git a/gcc/config/loongarch/loongarch.cc > > b/gcc/config/loongarch/loongarch.cc > > index 509ef2b97f1..845fad5a8e8 100644 > > --- a/gcc/config/loongarch/loongarch.cc > > +++ b/gcc/config/loongarch/loongarch.cc > > @@ -5225,9 +5225,8 @@ loongarch_block_move_straight (rtx dest, rtx > > src, HOST_WIDE_INT length, > > emit two ld.d/st.d pairs, one ld.w/st.w pair, and one > > ld.b/st.b > > pair. For each load/store pair we use a dedicated register > > to keep > > the pipeline as populated as possible. */ > > - HOST_WIDE_INT num_reg = length / delta; > > - for (delta_cur = delta / 2; delta_cur != 0; delta_cur /= 2) > > - num_reg += !!(length & delta_cur); > > + gcc_assert (pow2p_hwi (delta)); > > + HOST_WIDE_INT num_reg = length / delta + popcount_hwi (length % > > delta); > > > > /* Allocate a buffer for the temporary registers. */ > > regs = XALLOCAVEC (rtx, num_reg); > -- Xi Ruoyao School of Aerospace Science and Technology, Xidian University
Pushed: [PATCH v2] LoongArch: Use LSX and LASX for block move
Pushed r14-3818 with test cases added. The pushed patch is attached. On Sat, 2023-09-09 at 14:10 +0800, chenglulu wrote: > > 在 2023/9/8 上午12:14, Xi Ruoyao 写道: > > gcc/ChangeLog: > > > > * config/loongarch/loongarch.h (LARCH_MAX_MOVE_PER_INSN): > > Define to the maximum amount of bytes able to be loaded or > > stored with one machine instruction. > > * config/loongarch/loongarch.cc (loongarch_mode_for_move_size): > > New static function. > > (loongarch_block_move_straight): Call > > loongarch_mode_for_move_size for machine_mode to be moved. > > (loongarch_expand_block_move): Use LARCH_MAX_MOVE_PER_INSN > > instead of UNITS_PER_WORD. > > --- > > > > Bootstrapped and regtested on loongarch64-linux-gnu, with PR110939 patch > > applied, the "lib_build_self_spec = %<..." line in t-linux commented out > > (because it's silently making -mlasx in BOOT_CFLAGS ineffective, Yujie > > is working on a proper fix), and BOOT_CFLAGS="-O3 -mlasx". Ok for trunk? > > I think test cases need to be added here. > > Otherwise OK, thanks! /* snip */ -- Xi Ruoyao School of Aerospace Science and Technology, Xidian University From 35adc54b55aa199f17e2c84e382792e424b6171e Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Tue, 5 Sep 2023 21:02:38 +0800 Subject: [PATCH v2] LoongArch: Use LSX and LASX for block move gcc/ChangeLog: * config/loongarch/loongarch.h (LARCH_MAX_MOVE_PER_INSN): Define to the maximum amount of bytes able to be loaded or stored with one machine instruction. * config/loongarch/loongarch.cc (loongarch_mode_for_move_size): New static function. (loongarch_block_move_straight): Call loongarch_mode_for_move_size for machine_mode to be moved. (loongarch_expand_block_move): Use LARCH_MAX_MOVE_PER_INSN instead of UNITS_PER_WORD. gcc/testsuite/ChangeLog: * gcc.target/loongarch/memcpy-vec-1.c: New test. * gcc.target/loongarch/memcpy-vec-2.c: New test. * gcc.target/loongarch/memcpy-vec-3.c: New test. 
--- gcc/config/loongarch/loongarch.cc | 22 +++ gcc/config/loongarch/loongarch.h | 3 +++ .../gcc.target/loongarch/memcpy-vec-1.c | 11 ++ .../gcc.target/loongarch/memcpy-vec-2.c | 12 ++ .../gcc.target/loongarch/memcpy-vec-3.c | 6 + 5 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/loongarch/memcpy-vec-1.c create mode 100644 gcc/testsuite/gcc.target/loongarch/memcpy-vec-2.c create mode 100644 gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index 6698414281e..509ef2b97f1 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -5191,6 +5191,20 @@ loongarch_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, return true; } +static machine_mode +loongarch_mode_for_move_size (HOST_WIDE_INT size) +{ + switch (size) +{ +case 32: + return V32QImode; +case 16: + return V16QImode; +} + + return int_mode_for_size (size * BITS_PER_UNIT, 0).require (); +} + /* Emit straight-line code to move LENGTH bytes from SRC to DEST. Assume that the areas do not overlap. 
*/ @@ -5220,7 +5234,7 @@ loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length, for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2) { - mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require (); + mode = loongarch_mode_for_move_size (delta_cur); for (; offs + delta_cur <= length; offs += delta_cur, i++) { @@ -5231,7 +5245,7 @@ loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length, for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2) { - mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require (); + mode = loongarch_mode_for_move_size (delta_cur); for (; offs + delta_cur <= length; offs += delta_cur, i++) loongarch_emit_move (adjust_address (dest, mode, offs), regs[i]); @@ -5326,8 +5340,8 @@ loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx r_align) HOST_WIDE_INT align = INTVAL (r_align); - if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD) -align = UNITS_PER_WORD; + if (!TARGET_STRICT_ALIGN || align > LARCH_MAX_MOVE_PER_INSN) +align = LARCH_MAX_MOVE_PER_INSN; if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT) { diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h index 3fc9dc43ab1..7e391205583 100644 --- a/gcc/config/loongarch/loongarch.h +++ b/gcc/config/loongarch/loongarch.h @@ -1181,6 +1181,9 @@ typedef struct { least twice. */ #define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2) +#define LARCH_MAX_MOVE_PER_INSN \ + (ISA_HAS_LASX ? 32 : (ISA_HAS_LSX ? 16 : UNITS_PER_WORD)) + /* The base cost of a memcpy call, for MOVE_RATIO and friends. These v