[PATCH] c: Avoid ICE with _BitInt(N) : 0 bitfield [PR113740]

2024-02-04 Thread Jakub Jelinek
Hi!

finish_struct already made sure not to call build_bitint_type for
signed _BitInt(2) : 1;
or
signed _BitInt(2) : 0;
bitfields (but instead build a zero precision integral type,
we remove it later), this patch makes sure we do it also for
unsigned _BitInt(1) : 0;
because of the build_bitint_type assertion that precision is
>= (unsigned ? 1 : 2).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-02-05  Jakub Jelinek  

PR c/113740
* c-decl.cc (finish_struct): Only use build_bitint_type if
bit-field has width larger or equal to minimum _BitInt
precision.

* gcc.dg/bitint-85.c: New test.

--- gcc/c/c-decl.cc.jj  2024-02-01 09:14:16.474551596 +0100
+++ gcc/c/c-decl.cc 2024-02-03 13:03:35.272479105 +0100
@@ -9555,7 +9555,7 @@ finish_struct (location_t loc, tree t, t
  if (width != TYPE_PRECISION (type))
{
  if (TREE_CODE (type) == BITINT_TYPE
- && (width > 1 || TYPE_UNSIGNED (type)))
+ && width >= (TYPE_UNSIGNED (type) ? 1 : 2))
TREE_TYPE (field)
  = build_bitint_type (width, TYPE_UNSIGNED (type));
  else
--- gcc/testsuite/gcc.dg/bitint-85.c.jj 2024-02-03 13:05:49.162639344 +0100
+++ gcc/testsuite/gcc.dg/bitint-85.c2024-02-03 13:05:39.489772259 +0100
@@ -0,0 +1,5 @@
+/* PR c/113740 */
+/* { dg-do compile { target bitint } } */
+/* { dg-options "-std=c23" } */
+
+struct S { unsigned _BitInt(32) : 0; };

Jakub



[PATCH] lower-bitint: Remove single label _BitInt switches [PR113737]

2024-02-04 Thread Jakub Jelinek
Hi!

The following testcase ICEs, because group_case_labels_stmt optimizes
  switch (a.0_7)  [50.00%], case 0:  [50.00%], case 2:  
[50.00%]>
where L7 block starts with __builtin_unreachable (); to
  switch (a.0_7)  [50.00%]>
and single label GIMPLE_SWITCH is something the switch expansion refuses to
lower:
  if (gimple_switch_num_labels (m_switch) == 1
  || range_check_type (index_type) == NULL_TREE)
return false;
(range_check_type never returns NULL for BITINT_TYPE), but the gimple
lowering pass relies on all large/huge _BitInt switches to be lowered
by that pass.

The following patch just removes those after making the single successor
edge EDGE_FALLTHRU.  I've done it even if !optimize just in case
we'd end up with single case label from earlier passes.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-02-05  Jakub Jelinek  

PR tree-optimization/113737
* gimple-lower-bitint.cc (gimple_lower_bitint): If GIMPLE_SWITCH
has just a single label, remove it and make single successor edge
EDGE_FALLTHRU.

* gcc.dg/bitint-84.c: New test.

--- gcc/gimple-lower-bitint.cc.jj   2024-02-02 11:30:05.801776658 +0100
+++ gcc/gimple-lower-bitint.cc  2024-02-03 12:49:52.99574 +0100
@@ -5832,7 +5832,14 @@ gimple_lower_bitint (void)
 
  if (optimize)
group_case_labels_stmt (swtch);
- switch_statements.safe_push (swtch);
+ if (gimple_switch_num_labels (swtch) == 1)
+   {
+ single_succ_edge (bb)->flags |= EDGE_FALLTHRU;
+ gimple_stmt_iterator gsi = gsi_for_stmt (swtch);
+ gsi_remove (&gsi, true);
+   }
+ else
+   switch_statements.safe_push (swtch);
}
 }
 
--- gcc/testsuite/gcc.dg/bitint-84.c.jj 2024-02-03 12:56:08.153622744 +0100
+++ gcc/testsuite/gcc.dg/bitint-84.c2024-02-03 12:57:05.425835789 +0100
@@ -0,0 +1,32 @@
+/* PR tree-optimization/113737 */
+/* { dg-do compile { target bitint } } */
+/* { dg-options "-O2 -std=c23" } */
+
+#if __BITINT_MAXWIDTH__ >= 129
+_BitInt(129) a;
+#else
+_BitInt(63) a;
+#endif
+
+int b[1], c;
+
+int
+foo (void)
+{
+  switch (a)
+  case 0:
+  case 2:
+return 1;
+  return 0;
+}
+
+void
+bar (int i)
+{
+  for (;; ++i)
+{
+  c = b[i];
+  if (!foo ())
+   __asm__ ("");
+}
+}

Jakub



Re: [PATCH 0/4] Add DF_LIVE_SUBREG data and apply to IRA and LRA

2024-02-04 Thread Lehua Ding

For SPEC INT 2017, when using upstream GCC (whitout these patches), I get a
coredump when training the peak case, so no data yet. The cause of the core
dump still needs to be investigated.


Typo, SPEC INT 2017 -> SPEC FP 2017
Also, there is bad news: the score of SPEC INT 2017 (with these patches)
has dropped, which is a bit strange, and I need to locate the cause.


--
Best,
Lehua (RiVAI)



Re: Re: [PATCH] RISC-V: Expand VLMAX scalar move in reduction

2024-02-04 Thread juzhe.zh...@rivai.ai
I think it just trigger a latent bug that we didn't encounter.

Hi, Robin. Would you mind giving me a preprocessed file to reproduce the issue?

I suspect it triggers latent bug in VSETVL PASS.



juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2024-02-05 12:36
To: Juzhe-Zhong; gcc-patches
CC: kito.cheng; kito.cheng; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Expand VLMAX scalar move in reduction
 
 
On 2/4/24 20:26, Jeff Law wrote:
> 
> 
> On 2/1/24 18:56, Juzhe-Zhong wrote:
>> This patch fixes the following:
>>
>>  vsetvli a5,a1,e32,m1,tu,ma
>>  sllia4,a5,2
>>  sub a1,a1,a5
>>  vle32.v v2,0(a0)
>>  add a0,a0,a4
>>  vadd.vv v1,v2,v1
>>  bne a1,zero,.L3
>>  vsetivlizero,1,e32,m1,ta,ma
>>  vmv.s.x v2,zero
>>  vsetvli a5,zero,e32,m1,ta,ma  ---> Redundant vsetvl.
>>  vredsum.vs  v1,v1,v2
>>  vmv.x.s a0,v1
>>  ret
>>
>> VSETVL PASS is able to fuse avl = 1 of scalar move and VLMAX avl of 
>> reduction.
>>
>> However, this following RTL blocks the fusion in dependence analysis 
>> in VSETVL PASS:
>>
>> (insn 49 24 50 5 (set (reg:RVVM1SI 98 v2 [148])
>>  (if_then_else:RVVM1SI (unspec:RVVMF32BI [
>>  (const_vector:RVVMF32BI [
>>  (const_int 1 [0x1])
>>  repeat [
>>  (const_int 0 [0])
>>  ]
>>  ])
>>  (const_int 1 [0x1])
>>  (const_int 2 [0x2]) repeated x2
>>  (const_int 0 [0])
>>  (reg:SI 66 vl)
>>  (reg:SI 67 vtype)
>>  ] UNSPEC_VPREDICATE)
>>  (const_vector:RVVM1SI repeat [
>>  (const_int 0 [0])
>>  ])
>>  (unspec:RVVM1SI [
>>  (reg:DI 0 zero)
>>  ] UNSPEC_VUNDEF))) 3813 {*pred_broadcastrvvm1si_zero}
>>   (nil))
>> (insn 50 49 51 5 (set (reg:DI 15 a5 [151])  
>> >  It set a5, blocks the following VLMAX into the scalar move above.
>>  (unspec:DI [
>>  (const_int 32 [0x20])
>>  ] UNSPEC_VLMAX)) 2566 {vlmax_avldi}
>>   (expr_list:REG_EQUIV (unspec:DI [
>>  (const_int 32 [0x20])
>>  ] UNSPEC_VLMAX)
>>  (nil)))
>> (insn 51 50 52 5 (set (reg:RVVM1SI 97 v1 [150])
>>  (unspec:RVVM1SI [
>>  (unspec:RVVMF32BI [
>>  (const_vector:RVVMF32BI repeat [
>>  (const_int 1 [0x1])
>>  ])
>>  (reg:DI 15 a5 [151])
>>  (const_int 2 [0x2])
>>  (const_int 1 [0x1])
>>  (reg:SI 66 vl)
>>  (reg:SI 67 vtype)
>>  ] UNSPEC_VPREDICATE)
>>  (unspec:RVVM1SI [
>>  (reg:RVVM1SI 97 v1 [orig:134 vect_result_14.6 
>> ] [134])
>>  (reg:RVVM1SI 98 v2 [148])
>>  ] UNSPEC_REDUC_SUM)
>>  (unspec:RVVM1SI [
>>  (reg:DI 0 zero)
>>  ] UNSPEC_VUNDEF)
>>  ] UNSPEC_REDUC)) 17541 {pred_redsumrvvm1si}
>>   (expr_list:REG_DEAD (reg:RVVM1SI 98 v2 [148])
>>  (expr_list:REG_DEAD (reg:SI 66 vl)
>>  (expr_list:REG_DEAD (reg:DI 15 a5 [151])
>>  (expr_list:REG_DEAD (reg:DI 0 zero)
>>  (nil))
>>
>> Such situation can only happen on auto-vectorization, never happen on 
>> intrinsic codes.
>> Since the reduction is passed VLMAX AVL, it should be more natural to 
>> pass VLMAX to the scalar move which initial the value of the reduction.
>>
>> After this patch:
>>
>> vsetvlia5,a1,e32,m1,tu,ma
>> sllia4,a5,2
>> suba1,a1,a5
>> vle32.vv2,0(a0)
>> adda0,a0,a4
>> vadd.vvv1,v2,v1
>> bnea1,zero,.L3
>> vsetvlia5,zero,e32,m1,ta,ma
>> vmv.s.xv2,zero
>> vredsum.vsv1,v1,v2
>> vmv.x.sa0,v1
>>  ret
>>
>> Tested on both RV32/RV64 no regression.
>>
>> PR target/113697
>>
>> gcc/ChangeLog:
>>
>> * config/riscv/riscv-v.cc (expand_reduction): Pass VLMAX avl to 
>> scalar move.
>>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.target/riscv/rvv/autovec/pr113697.c: New test.
> I suspect this broke 502.gcc in spec2017.  Basically it's hanging during 
> the build phase.  I'm not sure if I'm going to have time this week to 
> dive into it.
> 
> 
> Optimization options used:
> 
>> GCC Flags:  -Ofast -flto -fsched-pressure -fno-strict-aliasing 
>> -fgnu89-inline -fcommon -fno-finite-math-only 
>> -fno-unsafe-math-optimizations
> 
> 
> 
> Given this appears to be a minor optimization issue, I wouldn't lose any 
> sleep if it was reverted

Re: [PATCH] RISC-V: Fix macro fusion for auipc+add, when identifying UNSPEC_AUIPC. [PR113742]

2024-02-04 Thread Monk Chiang
Yes, this test needs  "--enable-checking=rtl" build.

On Mon, Feb 5, 2024 at 11:28 AM Jeff Law  wrote:

>
>
> On 2/4/24 20:20, Monk Chiang wrote:
> > gcc/ChangeLog:
> >
> >   PR target/113742
> >   * config/riscv/riscv.cc (riscv_macro_fusion_pair_p): Fix
> >   recognizes UNSPEC_AUIPC for RISCV_FUSE_LUI_ADDI.
> >
> > gcc/testsuite/ChangeLog:
> >
> >   * gcc.target/riscv/pr113742.c: New test.
> OK.  Presumably this faulted during an --enable-checking=rtl build or
> something similar?
>
> Jeff
>


Re: [PATCH] combine: Don't optimize SIGN_EXTEND of MEM on WORD_REGISTER_OPERATIONS targets [PR113010]

2024-02-04 Thread Jeff Law




On 2/2/24 15:48, Greg McGary wrote:

On 2/1/24 10:24 PM, Jeff Law wrote:


On 2/1/24 18:24, Greg McGary wrote:

However, for a machine where (WORD_REGISTER_OPERATIONS && 
load_extend_op (inner_mode) == SIGN_EXTEND), the high part of a PSoM 
is  only known at runtime as 0s or 1s. That's the downstream bug. The 
fix for such machines is either (A) forbid static evaluation of the 
high part of a PSoM, or (B) forbid transforming (SIGN_EXTEND (MEM 
...) ) into a PSoM. My patch does B. Perhaps you prefer A? The 
trouble with A is that in the zero-extend case, it is valid to 
statically evaluate as 0. It is only the sign-extend case that isn't 
known until runtime. By the time we have a PSoM, its upstream 
ancestor as sign- or zero-extend is already lost.


Does that give you the understanding you desire, or are there deeper 
mysteries to probe?
It's a good start and I can see what you're trying to do -- and it may 
in fact be correct -- the quick discussion with Palmer Tuesday and 
your follow-up have helped a lot).


But just to be sure, what's the incoming rtl at function entry? just 
"debug_rtx (x)" should be sufficient.


input: (sign_extend:DI (mem/c:SI (symbol_ref:DI ("minus_1") [flags 0x86] 
) [1 minus_1+0 S4 A32]))


result: (subreg:DI (mem/c:SI (symbol_ref:DI ("minus_1") [flags 0x86] 
) [1 minus_1+0 S4 A32]) 0)


Later, the high part of the PSoM statically evaluates to 0, the code to 
load and test is elided, and the incorrect alternative is emitted 
unconditionally.
So I think we need to know where that high part gets statically turned 
into a zero.


I'm not happy with the sign_extend->subreg transformation as we 
generally want to avoid (subreg (mem)) for various reasons.  So we'll 
probably want to do something like your patch as well.   But let's chase 
down the static evaluation of the high part to zero first -- that's 
clearly wrong given the defined semantics of (subreg (mem)) in the 
presence of LOAD_EXTEND_OP.


jeff


Re: [PATCH] RISC-V: Expand VLMAX scalar move in reduction

2024-02-04 Thread Jeff Law




On 2/4/24 20:26, Jeff Law wrote:



On 2/1/24 18:56, Juzhe-Zhong wrote:

This patch fixes the following:

 vsetvli a5,a1,e32,m1,tu,ma
 slli    a4,a5,2
 sub a1,a1,a5
 vle32.v v2,0(a0)
 add a0,a0,a4
 vadd.vv v1,v2,v1
 bne a1,zero,.L3
 vsetivli    zero,1,e32,m1,ta,ma
 vmv.s.x v2,zero
 vsetvli a5,zero,e32,m1,ta,ma  ---> Redundant vsetvl.
 vredsum.vs  v1,v1,v2
 vmv.x.s a0,v1
 ret

VSETVL PASS is able to fuse avl = 1 of scalar move and VLMAX avl of 
reduction.


However, this following RTL blocks the fusion in dependence analysis 
in VSETVL PASS:


(insn 49 24 50 5 (set (reg:RVVM1SI 98 v2 [148])
 (if_then_else:RVVM1SI (unspec:RVVMF32BI [
 (const_vector:RVVMF32BI [
 (const_int 1 [0x1])
 repeat [
 (const_int 0 [0])
 ]
 ])
 (const_int 1 [0x1])
 (const_int 2 [0x2]) repeated x2
 (const_int 0 [0])
 (reg:SI 66 vl)
 (reg:SI 67 vtype)
 ] UNSPEC_VPREDICATE)
 (const_vector:RVVM1SI repeat [
 (const_int 0 [0])
 ])
 (unspec:RVVM1SI [
 (reg:DI 0 zero)
 ] UNSPEC_VUNDEF))) 3813 {*pred_broadcastrvvm1si_zero}
  (nil))
(insn 50 49 51 5 (set (reg:DI 15 a5 [151])  
>  It set a5, blocks the following VLMAX into the scalar move above.

 (unspec:DI [
 (const_int 32 [0x20])
 ] UNSPEC_VLMAX)) 2566 {vlmax_avldi}
  (expr_list:REG_EQUIV (unspec:DI [
 (const_int 32 [0x20])
 ] UNSPEC_VLMAX)
 (nil)))
(insn 51 50 52 5 (set (reg:RVVM1SI 97 v1 [150])
 (unspec:RVVM1SI [
 (unspec:RVVMF32BI [
 (const_vector:RVVMF32BI repeat [
 (const_int 1 [0x1])
 ])
 (reg:DI 15 a5 [151])
 (const_int 2 [0x2])
 (const_int 1 [0x1])
 (reg:SI 66 vl)
 (reg:SI 67 vtype)
 ] UNSPEC_VPREDICATE)
 (unspec:RVVM1SI [
 (reg:RVVM1SI 97 v1 [orig:134 vect_result_14.6 
] [134])

 (reg:RVVM1SI 98 v2 [148])
 ] UNSPEC_REDUC_SUM)
 (unspec:RVVM1SI [
 (reg:DI 0 zero)
 ] UNSPEC_VUNDEF)
 ] UNSPEC_REDUC)) 17541 {pred_redsumrvvm1si}
  (expr_list:REG_DEAD (reg:RVVM1SI 98 v2 [148])
 (expr_list:REG_DEAD (reg:SI 66 vl)
 (expr_list:REG_DEAD (reg:DI 15 a5 [151])
 (expr_list:REG_DEAD (reg:DI 0 zero)
 (nil))

Such situation can only happen on auto-vectorization, never happen on 
intrinsic codes.
Since the reduction is passed VLMAX AVL, it should be more natural to 
pass VLMAX to the scalar move which initial the value of the reduction.


After this patch:

vsetvli    a5,a1,e32,m1,tu,ma
slli    a4,a5,2
sub    a1,a1,a5
vle32.v    v2,0(a0)
add    a0,a0,a4
vadd.vv    v1,v2,v1
bne    a1,zero,.L3
vsetvli    a5,zero,e32,m1,ta,ma
vmv.s.x    v2,zero
vredsum.vs    v1,v1,v2
vmv.x.s    a0,v1
 ret

Tested on both RV32/RV64 no regression.

PR target/113697

gcc/ChangeLog:

* config/riscv/riscv-v.cc (expand_reduction): Pass VLMAX avl to 
scalar move.


gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr113697.c: New test.
I suspect this broke 502.gcc in spec2017.  Basically it's hanging during 
the build phase.  I'm not sure if I'm going to have time this week to 
dive into it.



Optimization options used:

GCC Flags:  -Ofast -flto -fsched-pressure -fno-strict-aliasing 
-fgnu89-inline -fcommon -fno-finite-math-only 
-fno-unsafe-math-optimizations




Given this appears to be a minor optimization issue, I wouldn't lose any 
sleep if it was reverted and deferred to gcc-15.


Anyway, good luck.  Sorry I can't do more on the debugging/reduction front.
Actually, I'm starting to wonder if this is just the trigger and if the 
real issue is something else that went in over the last week or so.  I 
reverted the patch above which allows 502.gcc to build. But then I get a 
hang on xalancbmk.


Makes me wonder if the vsetvl bits are the culprit given the size of 
that change.


jeff


Re: Repost [PATCH 6/6] PowerPC: Add support for 1,024 bit DMR registers.

2024-02-04 Thread Kewen.Lin
Hi Mike,

on 2024/1/6 07:42, Michael Meissner wrote:
> This patch is a preliminary patch to add the full 1,024 bit dense math 
> register> (DMRs) for -mcpu=future.  The MMA 512-bit accumulators map onto the 
> top of the
> DMR register.
> 
> This patch only adds the new 1,024 bit register support.  It does not add
> support for any instructions that need 1,024 bit registers instead of 512 bit
> registers.
> 
> I used the new mode 'TDOmode' to be the opaque mode used for 1,204 bit

typo: 1,204

> registers.  The 'wD' constraint added in previous patches is used for these
> registers.  I added support to do load and store of DMRs via the VSX 
> registers,
> since there are no load/store dense math instructions.  I added the new 
> keyword
> '__dmr' to create 1,024 bit types that can be loaded into DMRs.  At present, I
> don't have aliases for __dmr512 and __dmr1024 that we've discussed internally.
> 
> The patches have been tested on both little and big endian systems.  Can I 
> check
> it into the master branch?
> 
> 2024-01-05   Michael Meissner  
> 
> gcc/
> 
>   * config/rs6000/mma.md (UNSPEC_DM_INSERT512_UPPER): New unspec.
>   (UNSPEC_DM_INSERT512_LOWER): Likewise.
>   (UNSPEC_DM_EXTRACT512): Likewise.
>   (UNSPEC_DMR_RELOAD_FROM_MEMORY): Likewise.
>   (UNSPEC_DMR_RELOAD_TO_MEMORY): Likewise.
>   (movtdo): New define_expand and define_insn_and_split to implement 1,024
>   bit DMR registers.
>   (movtdo_insert512_upper): New insn.
>   (movtdo_insert512_lower): Likewise.
>   (movtdo_extract512): Likewise.
>   (reload_dmr_from_memory): Likewise.
>   (reload_dmr_to_memory): Likewise.
>   * config/rs6000/rs6000-builtin.cc (rs6000_type_string): Add DMR
>   support.
>   (rs6000_init_builtins): Add support for __dmr keyword.
>   * config/rs6000/rs6000-call.cc (rs6000_return_in_memory): Add support
>   for TDOmode.
>   (rs6000_function_arg): Likewise.
>   * config/rs6000/rs6000-modes.def (TDOmode): New mode.
>   * config/rs6000/rs6000.cc (rs6000_hard_regno_nregs_internal): Add
>   support for TDOmode.
>   (rs6000_hard_regno_mode_ok_uncached): Likewise.
>   (rs6000_hard_regno_mode_ok): Likewise.
>   (rs6000_modes_tieable_p): Likewise.
>   (rs6000_debug_reg_global): Likewise.
>   (rs6000_setup_reg_addr_masks): Likewise.
>   (rs6000_init_hard_regno_mode_ok): Add support for TDOmode.  Setup reload
>   hooks for DMR mode.
>   (reg_offset_addressing_ok_p): Add support for TDOmode.
>   (rs6000_emit_move): Likewise.
>   (rs6000_secondary_reload_simple_move): Likewise.
>   (rs6000_secondary_reload_class): Likewise.
>   (rs6000_mangle_type): Add mangling for __dmr type.
>   (rs6000_dmr_register_move_cost): Add support for TDOmode.
>   (rs6000_split_multireg_move): Likewise.
>   (rs6000_invalid_conversion): Likewise.
>   * config/rs6000/rs6000.h (VECTOR_ALIGNMENT_P): Add TDOmode.
>   (enum rs6000_builtin_type_index): Add DMR type nodes.
>   (dmr_type_node): Likewise.
>   (ptr_dmr_type_node): Likewise.
> 
> gcc/testsuite/
> 
>   * gcc.target/powerpc/dm-1024bit.c: New test.
> ---
>  gcc/config/rs6000/mma.md  | 152 ++
>  gcc/config/rs6000/rs6000-builtin.cc   |  13 ++
>  gcc/config/rs6000/rs6000-call.cc  |  13 +-
>  gcc/config/rs6000/rs6000-modes.def|   4 +
>  gcc/config/rs6000/rs6000.cc   | 135 
>  gcc/config/rs6000/rs6000.h|   7 +-
>  gcc/testsuite/gcc.target/powerpc/dm-1024bit.c |  63 
>  7 files changed, 351 insertions(+), 36 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/dm-1024bit.c
> 
> diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
> index f06e6bbb184..37de9030903 100644
> --- a/gcc/config/rs6000/mma.md
> +++ b/gcc/config/rs6000/mma.md
> @@ -92,6 +92,11 @@ (define_c_enum "unspec"
> UNSPEC_MMA_XXMFACC
> UNSPEC_MMA_XXMTACC
> UNSPEC_DM_ASSEMBLE_ACC
> +   UNSPEC_DM_INSERT512_UPPER
> +   UNSPEC_DM_INSERT512_LOWER
> +   UNSPEC_DM_EXTRACT512
> +   UNSPEC_DMR_RELOAD_FROM_MEMORY
> +   UNSPEC_DMR_RELOAD_TO_MEMORY
>])
>  
>  (define_c_enum "unspecv"
> @@ -879,3 +884,150 @@ (define_insn "mma_"
>[(set_attr "type" "mma")
> (set_attr "prefixed" "yes")
> (set_attr "isa" "dm,not_dm,not_dm")])
> +
> +
> +;; TDOmode (i.e. __dmr).
> +(define_expand "movtdo"
> +  [(set (match_operand:TDO 0 "nonimmediate_operand")
> + (match_operand:TDO 1 "input_operand"))]
> +  "TARGET_DENSE_MATH"
> +{
> +  rs6000_emit_move (operands[0], operands[1], TDOmode);
> +  DONE;
> +})
> +
> +(define_insn_and_split "*movtdo"
> +  [(set (match_operand:TDO 0 "nonimmediate_operand" "=wa,m,wa,wD,wD,wa")
> + (match_operand:TDO 1 "input_operand" "m,wa,wa,wa,wD,wD"))]
> +  "TARGET_DENSE_MATH
> +   && (gpc_reg_operand (operands[0], TDOmode)
> +   || gpc_reg_operand (operands[1], TDOmode))"
> +  "@
> +   #
>

Re: [PATCH] RISC-V: Fix macro fusion for auipc+add, when identifying UNSPEC_AUIPC. [PR113742]

2024-02-04 Thread Jeff Law




On 2/4/24 20:20, Monk Chiang wrote:

gcc/ChangeLog:

PR target/113742
* config/riscv/riscv.cc (riscv_macro_fusion_pair_p): Fix
recognizes UNSPEC_AUIPC for RISCV_FUSE_LUI_ADDI.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/pr113742.c: New test.
OK.  Presumably this faulted during an --enable-checking=rtl build or 
something similar?


Jeff


Re: [PATCH] RISC-V: Expand VLMAX scalar move in reduction

2024-02-04 Thread Jeff Law




On 2/1/24 18:56, Juzhe-Zhong wrote:

This patch fixes the following:

 vsetvli a5,a1,e32,m1,tu,ma
 sllia4,a5,2
 sub a1,a1,a5
 vle32.v v2,0(a0)
 add a0,a0,a4
 vadd.vv v1,v2,v1
 bne a1,zero,.L3
 vsetivlizero,1,e32,m1,ta,ma
 vmv.s.x v2,zero
 vsetvli a5,zero,e32,m1,ta,ma  ---> Redundant vsetvl.
 vredsum.vs  v1,v1,v2
 vmv.x.s a0,v1
 ret

VSETVL PASS is able to fuse avl = 1 of scalar move and VLMAX avl of reduction.

However, this following RTL blocks the fusion in dependence analysis in VSETVL 
PASS:

(insn 49 24 50 5 (set (reg:RVVM1SI 98 v2 [148])
 (if_then_else:RVVM1SI (unspec:RVVMF32BI [
 (const_vector:RVVMF32BI [
 (const_int 1 [0x1])
 repeat [
 (const_int 0 [0])
 ]
 ])
 (const_int 1 [0x1])
 (const_int 2 [0x2]) repeated x2
 (const_int 0 [0])
 (reg:SI 66 vl)
 (reg:SI 67 vtype)
 ] UNSPEC_VPREDICATE)
 (const_vector:RVVM1SI repeat [
 (const_int 0 [0])
 ])
 (unspec:RVVM1SI [
 (reg:DI 0 zero)
 ] UNSPEC_VUNDEF))) 3813 {*pred_broadcastrvvm1si_zero}
  (nil))
(insn 50 49 51 5 (set (reg:DI 15 a5 [151])  >  It 
set a5, blocks the following VLMAX into the scalar move above.
 (unspec:DI [
 (const_int 32 [0x20])
 ] UNSPEC_VLMAX)) 2566 {vlmax_avldi}
  (expr_list:REG_EQUIV (unspec:DI [
 (const_int 32 [0x20])
 ] UNSPEC_VLMAX)
 (nil)))
(insn 51 50 52 5 (set (reg:RVVM1SI 97 v1 [150])
 (unspec:RVVM1SI [
 (unspec:RVVMF32BI [
 (const_vector:RVVMF32BI repeat [
 (const_int 1 [0x1])
 ])
 (reg:DI 15 a5 [151])
 (const_int 2 [0x2])
 (const_int 1 [0x1])
 (reg:SI 66 vl)
 (reg:SI 67 vtype)
 ] UNSPEC_VPREDICATE)
 (unspec:RVVM1SI [
 (reg:RVVM1SI 97 v1 [orig:134 vect_result_14.6 ] [134])
 (reg:RVVM1SI 98 v2 [148])
 ] UNSPEC_REDUC_SUM)
 (unspec:RVVM1SI [
 (reg:DI 0 zero)
 ] UNSPEC_VUNDEF)
 ] UNSPEC_REDUC)) 17541 {pred_redsumrvvm1si}
  (expr_list:REG_DEAD (reg:RVVM1SI 98 v2 [148])
 (expr_list:REG_DEAD (reg:SI 66 vl)
 (expr_list:REG_DEAD (reg:DI 15 a5 [151])
 (expr_list:REG_DEAD (reg:DI 0 zero)
 (nil))

Such situation can only happen on auto-vectorization, never happen on intrinsic 
codes.
Since the reduction is passed VLMAX AVL, it should be more natural to pass 
VLMAX to the scalar move which initializes the value of the reduction.

After this patch:

vsetvli a5,a1,e32,m1,tu,ma
sllia4,a5,2
sub a1,a1,a5
vle32.v v2,0(a0)
add a0,a0,a4
vadd.vv v1,v2,v1
bne a1,zero,.L3
vsetvli a5,zero,e32,m1,ta,ma
vmv.s.x v2,zero
vredsum.vs  v1,v1,v2
vmv.x.s a0,v1
 ret

Tested on both RV32/RV64 no regression.

PR target/113697

gcc/ChangeLog:

* config/riscv/riscv-v.cc (expand_reduction): Pass VLMAX avl to scalar 
move.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr113697.c: New test.
I suspect this broke 502.gcc in spec2017.  Basically it's hanging during 
the build phase.  I'm not sure if I'm going to have time this week to 
dive into it.



Optimization options used:


GCC Flags:  -Ofast -flto -fsched-pressure -fno-strict-aliasing -fgnu89-inline 
-fcommon -fno-finite-math-only -fno-unsafe-math-optimizations




Given this appears to be a minor optimization issue, I wouldn't lose any 
sleep if it was reverted and deferred to gcc-15.


Anyway, good luck.  Sorry I can't do more on the debugging/reduction front.

Jeff


[PATCH] RISC-V: Fix macro fusion for auipc+add, when identifying UNSPEC_AUIPC. [PR113742]

2024-02-04 Thread Monk Chiang
gcc/ChangeLog:

PR target/113742
* config/riscv/riscv.cc (riscv_macro_fusion_pair_p): Fix
recognizes UNSPEC_AUIPC for RISCV_FUSE_LUI_ADDI.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/pr113742.c: New test.
---
 gcc/config/riscv/riscv.cc | 2 +-
 gcc/testsuite/gcc.target/riscv/pr113742.c | 4 
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr113742.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 799d7919a4a..4100abc9dd1 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -8434,7 +8434,7 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
(lo_sum:DI (reg:DI rD) (const_int IMM12))) */
 
   if (GET_CODE (SET_SRC (prev_set)) == UNSPEC
- && XINT (prev_set, 1) == UNSPEC_AUIPC
+ && XINT (SET_SRC (prev_set), 1) == UNSPEC_AUIPC
  && (GET_CODE (SET_SRC (curr_set)) == LO_SUM
  || (GET_CODE (SET_SRC (curr_set)) == PLUS
  && SMALL_OPERAND (INTVAL (XEXP (SET_SRC (curr_set), 1))
diff --git a/gcc/testsuite/gcc.target/riscv/pr113742.c 
b/gcc/testsuite/gcc.target/riscv/pr113742.c
new file mode 100644
index 000..ab8934c2a8a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr113742.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -finstrument-functions -mabi=lp64d -mcpu=sifive-p670" } */
+
+void foo(void) {}
-- 
2.40.1



Re: [PATCH 2/2] RISC-V: Add sifive-p450, sifive-p67 to -mcpu

2024-02-04 Thread Kito Cheng
pushed, thanks :)

On Fri, Feb 2, 2024 at 11:59 AM Monk Chiang  wrote:
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-cores.def: Add sifive-p450, sifive-p670.
> * doc/invoke.texi (RISC-V Options): Add sifive-p450,
> sifive-p670.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/mcpu-sifive-p450.c: New test.
> * gcc.target/riscv/mcpu-sifive-p670.c: New test.
> ---
>  gcc/config/riscv/riscv-cores.def  |  9 +
>  gcc/doc/invoke.texi   |  3 +-
>  .../gcc.target/riscv/mcpu-sifive-p450.c   | 34 
>  .../gcc.target/riscv/mcpu-sifive-p670.c   | 40 +++
>  4 files changed, 85 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mcpu-sifive-p450.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mcpu-sifive-p670.c
>
> diff --git a/gcc/config/riscv/riscv-cores.def 
> b/gcc/config/riscv/riscv-cores.def
> index 0785e8f3fbd..57928bccdc8 100644
> --- a/gcc/config/riscv/riscv-cores.def
> +++ b/gcc/config/riscv/riscv-cores.def
> @@ -76,6 +76,15 @@ RISCV_CORE("sifive-s76",  "rv64imafdc", 
> "sifive-7-series")
>  RISCV_CORE("sifive-u54",  "rv64imafdc", "sifive-5-series")
>  RISCV_CORE("sifive-u74",  "rv64imafdc", "sifive-7-series")
>  RISCV_CORE("sifive-x280", "rv64imafdcv_zfh_zba_zbb_zvfh_zvl512b", 
> "sifive-7-series")
> +RISCV_CORE("sifive-p450", 
> "rv64imafdc_za64rs_zic64b_zicbom_zicbop_zicboz_"
> + 
> "ziccamoa_ziccif_zicclsm_ziccrse_zicsr_zifencei_"
> + 
> "zihintntl_zihintpause_zihpm_zfhmin_zba_zbb_zbs",
> + "sifive-p400-series")
> +RISCV_CORE("sifive-p670", 
> "rv64imafdcv_za64rs_zic64b_zicbom_zicbop_zicboz_"
> + 
> "ziccamoa_ziccif_zicclsm_ziccrse_zicsr_zifencei_"
> + 
> "zihintntl_zihintpause_zihpm_zfhmin_zba_zbb_zbs_"
> + "zvl128b_zvbb_zvknc_zvkng_zvksc_zvksg",
> + "sifive-p600-series")
>
>  RISCV_CORE("thead-c906",  
> "rv64imafdc_xtheadba_xtheadbb_xtheadbs_xtheadcmo_"
>   "xtheadcondmov_xtheadfmemidx_xtheadmac_"
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index f8645822ca4..71339b8b30f 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -30707,7 +30707,8 @@ by particular CPU name.
>  Permissible values for this option are: @samp{sifive-e20}, @samp{sifive-e21},
>  @samp{sifive-e24}, @samp{sifive-e31}, @samp{sifive-e34}, @samp{sifive-e76},
>  @samp{sifive-s21}, @samp{sifive-s51}, @samp{sifive-s54}, @samp{sifive-s76},
> -@samp{sifive-u54}, @samp{sifive-u74}, and @samp{sifive-x280}.
> +@samp{sifive-u54}, @samp{sifive-u74}, @samp{sifive-x280}, 
> @samp{sifive-xp450},
> +@samp{sifive-x670}.
>
>  @opindex mtune
>  @item -mtune=@var{processor-string}
> diff --git a/gcc/testsuite/gcc.target/riscv/mcpu-sifive-p450.c 
> b/gcc/testsuite/gcc.target/riscv/mcpu-sifive-p450.c
> new file mode 100644
> index 000..563041821e7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/mcpu-sifive-p450.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "-march given" { *-*-* } { "-march=*" } } */
> +/* { dg-options "-mcpu=sifive-p450 -mabi=lp64d" } */
> +/* SiFive p450 => 
> rv64imafdc_za64rs_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_zicsr_zifencei_zihintntl_zihintpause_zihpm_zfhmin_zba_zbb_zbs
>  */
> +
> +#if !((__riscv_xlen == 64) \
> +  && !defined(__riscv_32e) \
> +  && (__riscv_flen == 64)  \
> +  && defined(__riscv_c)\
> +  && defined(__riscv_za64rs)   \
> +  && defined(__riscv_zic64b)   \
> +  && defined(__riscv_zicbom)   \
> +  && defined(__riscv_zicbop)   \
> +  && defined(__riscv_zicboz)   \
> +  && defined(__riscv_ziccamoa) \
> +  && defined(__riscv_ziccif)   \
> +  && defined(__riscv_zicclsm)  \
> +  && defined(__riscv_ziccrse)  \
> +  && defined(__riscv_zicsr)\
> +  && defined(__riscv_zifencei) \
> +  && defined(__riscv_zihintntl)\
> +  && defined(__riscv_zihintpause)  \
> +  && defined(__riscv_zihpm)\
> +  && defined(__riscv_zfhmin)   \
> +  && defined(__riscv_zba)  \
> +  && defined(__riscv_zbb)  \
> +  && defined(__riscv_zbs))
> +#error "unexpected arch"
> +#endif
> +
> +int main()
> +{
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/mcpu-sifive-p670.c 
> b/gcc/testsuite/gcc.target/riscv/mcpu-sifive-p670.c
> new file mode 100644
> index 000..8dfd490f440
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/mcpu-sifive-p670.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "-march given" { *-*-* } { "-march=*" } } */
> +/* { dg-options "-mcpu=sifive-p670 -mabi=lp64d" } */
> +/* SiFive p670 => 
> 

Re: [PATCH 1/2] RISC-V: Support scheduling for sifive p400 series

2024-02-04 Thread Kito Cheng
pushed, thanks :)

On Fri, Feb 2, 2024 at 11:59 AM Monk Chiang  wrote:
>
> Add sifive p400 series scheduler module. For more information
> see https://www.sifive.com/cores/performance-p450-470.
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.md: Include sifive-p400.md.
> * config/riscv/sifive-p400.md: New file.
> * config/riscv/riscv-cores.def (RISCV_TUNE): Add parameter.
> * config/riscv/riscv-opts.h (enum riscv_microarchitecture_type):
> Add sifive_p400.
> * config/riscv/riscv.cc (sifive_p400_tune_info): New.
> * config/riscv/riscv.h (TARGET_SFB_ALU): Update.
> * doc/invoke.texi (RISC-V Options): Add sifive-p400-series
> ---
>  gcc/config/riscv/riscv-cores.def |   1 +
>  gcc/config/riscv/riscv-opts.h|   1 +
>  gcc/config/riscv/riscv.cc|  17 +++
>  gcc/config/riscv/riscv.h |   1 +
>  gcc/config/riscv/riscv.md|   3 +-
>  gcc/config/riscv/sifive-p400.md  | 174 +++
>  gcc/doc/invoke.texi  |   4 +-
>  7 files changed, 198 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/config/riscv/sifive-p400.md
>
> diff --git a/gcc/config/riscv/riscv-cores.def 
> b/gcc/config/riscv/riscv-cores.def
> index a07a79e2cb7..0785e8f3fbd 100644
> --- a/gcc/config/riscv/riscv-cores.def
> +++ b/gcc/config/riscv/riscv-cores.def
> @@ -37,6 +37,7 @@ RISCV_TUNE("rocket", generic, rocket_tune_info)
>  RISCV_TUNE("sifive-3-series", generic, rocket_tune_info)
>  RISCV_TUNE("sifive-5-series", generic, rocket_tune_info)
>  RISCV_TUNE("sifive-7-series", sifive_7, sifive_7_tune_info)
> +RISCV_TUNE("sifive-p400-series", sifive_p400, sifive_p400_tune_info)
>  RISCV_TUNE("sifive-p600-series", sifive_p600, sifive_p600_tune_info)
>  RISCV_TUNE("thead-c906", generic, thead_c906_tune_info)
>  RISCV_TUNE("generic-ooo", generic_ooo, generic_ooo_tune_info)
> diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
> index 25951665b13..4edddbadc37 100644
> --- a/gcc/config/riscv/riscv-opts.h
> +++ b/gcc/config/riscv/riscv-opts.h
> @@ -55,6 +55,7 @@ extern enum riscv_isa_spec_class riscv_isa_spec;
>  enum riscv_microarchitecture_type {
>generic,
>sifive_7,
> +  sifive_p400,
>sifive_p600,
>generic_ooo
>  };
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index cead76fe1a2..4b24e4b9a0a 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -447,6 +447,23 @@ static const struct riscv_tune_param sifive_7_tune_info 
> = {
>NULL,/* vector cost */
>  };
>
> +/* Costs to use when optimizing for Sifive p400 Series.  */
> +static const struct riscv_tune_param sifive_p400_tune_info = {
> +  {COSTS_N_INSNS (4), COSTS_N_INSNS (4)},  /* fp_add */
> +  {COSTS_N_INSNS (4), COSTS_N_INSNS (4)},  /* fp_mul */
> +  {COSTS_N_INSNS (20), COSTS_N_INSNS (20)},/* fp_div */
> +  {COSTS_N_INSNS (4), COSTS_N_INSNS (4)},  /* int_mul */
> +  {COSTS_N_INSNS (6), COSTS_N_INSNS (6)},  /* int_div */
> +  3,   /* issue_rate */
> +  4,   /* branch_cost */
> +  3,   /* memory_cost */
> +  4,   /* fmv_cost */
> +  true,/* 
> slow_unaligned_access */
> +  false,   /* use_divmod_expansion */
> +  RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI,  /* fusible_ops */
> +  &generic_vector_cost,/* vector cost */
> +};
> +
>  /* Costs to use when optimizing for Sifive p600 Series.  */
>  static const struct riscv_tune_param sifive_p600_tune_info = {
>{COSTS_N_INSNS (4), COSTS_N_INSNS (4)},  /* fp_add */
> diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
> index e0cb3ba08d4..669308cc96d 100644
> --- a/gcc/config/riscv/riscv.h
> +++ b/gcc/config/riscv/riscv.h
> @@ -898,6 +898,7 @@ extern enum riscv_cc get_riscv_cc (const rtx use);
>
>  #define TARGET_SFB_ALU \
>   ((riscv_microarchitecture == sifive_7) \
> +  || (riscv_microarchitecture == sifive_p400) \
>|| (riscv_microarchitecture == sifive_p600))
>
>  #define LOGICAL_OP_NON_SHORT_CIRCUIT 0
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 2a164a03dbd..39b29795cd6 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -687,7 +687,7 @@
>  ;; Microarchitectures we know how to tune for.
>  ;; Keep this in sync with enum riscv_microarchitecture.
>  (define_attr "tune"
> -  "generic,sifive_7,sifive_p600,generic_ooo"
> +  "generic,sifive_7,sifive_p400,sifive_p600,generic_ooo"
>(const (symbol_ref "((enum attr_tune) riscv_microarchitecture)")))
>
>  ;; Describe a user's asm statement.
> @@ -3850,6 +3850,7 @@
>  (include "pic.md")
>  (include "generic.md")
>  (include "sifive-7.md")
> +(include "sifive-p400.md")
>  (include "si

Re: [PATCH] MIPS: Fix wrong MSA FP vector negation

2024-02-04 Thread YunQiang Su
Xi Ruoyao  于2024年2月5日周一 02:01写道:
>
> We expanded (neg x) to (minus const0 x) for MSA FP vectors, this is
> wrong because -0.0 is not 0 - 0.0.  This causes some Python tests to
> fail when Python is built with MSA enabled.
>
> Use the bnegi.df instructions to simply reverse the sign bit instead.
>
> gcc/ChangeLog:
>
> * config/mips/mips-msa.md (elmsgnbit): New define_mode_attr.
> (neg2): Change the mode iterator from MSA to IMSA because
> in FP arithmetic we cannot use (0 - x) for -x.
> (neg2): New define_insn to implement FP vector negation,
> using a bnegi instruction to negate the sign bit.
> ---
>
> Bootstrapped and regtested on mips64el-linux-gnuabi64.  Ok for trunk
> and/or release branches?
>
>  gcc/config/mips/mips-msa.md | 18 +++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>

LGTM, while I guess that we also need a test case.

> diff --git a/gcc/config/mips/mips-msa.md b/gcc/config/mips/mips-msa.md
> index 83d9a08e360..920161ed1d8 100644
> --- a/gcc/config/mips/mips-msa.md
> +++ b/gcc/config/mips/mips-msa.md
> @@ -231,6 +231,10 @@ (define_mode_attr bitimm
> (V4SI  "uimm5")
> (V2DI  "uimm6")])
>
> +;; The index of sign bit in FP vector elements.
> +(define_mode_attr elmsgnbit [(V2DF "63") (V4DF "63")
> +(V4SF "31") (V8SF "31")])
> +
>  (define_expand "vec_init"
>[(match_operand:MSA 0 "register_operand")
> (match_operand:MSA 1 "")]
> @@ -597,9 +601,9 @@ (define_expand "abs2"
>  })
>
>  (define_expand "neg2"
> -  [(set (match_operand:MSA 0 "register_operand")
> -   (minus:MSA (match_dup 2)
> -  (match_operand:MSA 1 "register_operand")))]
> +  [(set (match_operand:IMSA 0 "register_operand")
> +   (minus:IMSA (match_dup 2)
> +  (match_operand:IMSA 1 "register_operand")))]
>"ISA_HAS_MSA"
>  {
>rtx reg = gen_reg_rtx (mode);
> @@ -607,6 +611,14 @@ (define_expand "neg2"
>operands[2] = reg;
>  })
>
> +(define_insn "neg2"
> +  [(set (match_operand:FMSA 0 "register_operand" "=f")
> +   (neg (match_operand:FMSA 1 "register_operand" "f")))]
> +  "ISA_HAS_MSA"
> +  "bnegi.\t%w0,%w1,"
> +  [(set_attr "type" "simd_bit")
> +   (set_attr "mode" "")])
> +
>  (define_expand "msa_ldi"
>[(match_operand:IMSA 0 "register_operand")
> (match_operand 1 "const_imm10_operand")]
> --
> 2.43.0
>


[x86_64 PATCH] PR target/113690: Fix-up MULT REG_EQUAL notes in STV.

2024-02-04 Thread Roger Sayle

This patch fixes PR target/113690, an ICE-on-valid regression on x86_64
that exhibits with a specific combination of command line options.  The
cause is that x86's scalar-to-vector pass converts a chain of instructions
from TImode to V1TImode, but fails to appropriately update the attached
REG_EQUAL note.  Given that multiplication isn't supported in V1TImode,
the REG_NOTE handling code wasn't expecting to see a MULT.  Easily solved
with additional handling for other binary operators that may potentially
(in future) have an immediate constant as the second operand that needs
handling.  For convenience, this code (re)factors the logic to convert
a TImode constant into a V1TImode constant vector into a subroutine and
reuses it.

For the record, STV is actually doing something useful in this strange
testcase; GCC with -O2 -fno-dce -fno-forward-propagate
-fno-split-wide-types
-funroll-loops generates:

foo:movl$v, %eax
pxor%xmm0, %xmm0
movaps  %xmm0, 48(%rax)
movaps  %xmm0, (%rax)
movaps  %xmm0, 16(%rax)
movaps  %xmm0, 32(%rax)
ret

With the addition of -mno-stv (to disable the patched code) it gives:

foo:movl$v, %eax
movq$0, 48(%rax)
movq$0, 56(%rax)
movq$0, (%rax)
movq$0, 8(%rax)
movq$0, 16(%rax)
movq$0, 24(%rax)
movq$0, 32(%rax)
movq$0, 40(%rax)
ret


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2024-02-05  Roger Sayle  

gcc/ChangeLog
PR target/113690
* config/i386/i386-features.cc (timode_convert_cst): New helper
function to convert a TImode CONST_SCALAR_INT_P to a V1TImode
CONST_VECTOR.
(timode_scalar_chain::convert_op): Use timode_convert_cst.
(timode_scalar_chain::convert_insn): If a REG_EQUAL note contains
a binary operator where the second operand is an immediate integer
constant, convert it to V1TImode using timode_convert_cst.
Use timode_convert_cst.

gcc/testsuite/ChangeLog
PR target/113690
* gcc.target/i386/pr113690.c: New test case.


Thanks in advance,
Roger
--

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 4020b27..90ada7d 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1749,6 +1749,19 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg)
 }
 }
 
+/* Helper function to convert immediate constant X to V1TImode.  */
+static rtx
+timode_convert_cst (rtx x)
+{
+  /* Prefer all ones vector in case of -1.  */
+  if (constm1_operand (x, TImode))
+return CONSTM1_RTX (V1TImode);
+
+  rtx *v = XALLOCAVEC (rtx, 1);
+  v[0] = x;
+  return gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec_v (1, v));
+}
+
 /* Convert operand OP in INSN from TImode to V1TImode.  */
 
 void
@@ -1775,18 +1788,8 @@ timode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 }
   else if (CONST_SCALAR_INT_P (*op))
 {
-  rtx vec_cst;
   rtx tmp = gen_reg_rtx (V1TImode);
-
-  /* Prefer all ones vector in case of -1.  */
-  if (constm1_operand (*op, TImode))
-   vec_cst = CONSTM1_RTX (V1TImode);
-  else
-   {
- rtx *v = XALLOCAVEC (rtx, 1);
- v[0] = *op;
- vec_cst = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec_v (1, v));
-   }
+  rtx vec_cst = timode_convert_cst (*op);
 
   if (!standard_sse_constant_p (vec_cst, V1TImode))
{
@@ -1830,12 +1833,28 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
  tmp = find_reg_equal_equiv_note (insn);
  if (tmp)
{
- if (GET_MODE (XEXP (tmp, 0)) == TImode)
-   PUT_MODE (XEXP (tmp, 0), V1TImode);
- else if (CONST_SCALAR_INT_P (XEXP (tmp, 0)))
-   XEXP (tmp, 0)
- = gen_rtx_CONST_VECTOR (V1TImode,
- gen_rtvec (1, XEXP (tmp, 0)));
+ rtx expr = XEXP (tmp, 0);
+ if (GET_MODE (expr) == TImode)
+   {
+ PUT_MODE (expr, V1TImode);
+ switch (GET_CODE (expr))
+   {
+   case PLUS:
+   case MINUS:
+   case MULT:
+   case AND:
+   case IOR:
+   case XOR:
+ if (CONST_SCALAR_INT_P (XEXP (expr, 1)))
+   XEXP (expr, 1) = timode_convert_cst (XEXP (expr, 1));
+ break;
+
+   default:
+ break;
+   }
+   }
+ else if (CONST_SCALAR_INT_P (expr))
+   XEXP (tmp, 0) = timode_convert_cst (expr);
}
}
   break;
@@ -1876,7 +1895,7 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
}
  e

Re: [PATCH 2/2] xtensa: Fix missing mode warning in "*eqne_zero_masked_bits"

2024-02-04 Thread Max Filippov
On Sat, Feb 3, 2024 at 6:19 AM Takayuki 'January June' Suwa
 wrote:
>
> gcc/ChangeLog:
>
> * config/xtensa/xtensa.md (*eqne_zero_masked_bits):
> Add missing ":SI" to the match_operator.
> ---
>  gcc/config/xtensa/xtensa.md | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Regtested for target=xtensa-linux-uclibc, no new regressions.
Committed to master.

-- 
Thanks.
-- Max


Re: [PATCH 1/2 v2] xtensa: Recover constant synthesis for HImode after LRA transition

2024-02-04 Thread Max Filippov
On Sun, Feb 4, 2024 at 2:20 AM Takayuki 'January June' Suwa
 wrote:
>
> After LRA transition, HImode constants that don't fit into signed 12 bits
> are no longer subject to constant synthesis:
>
> /* example */
> void test(void) {
>   short foo = 32767;
>   __asm__ ("" :: "r"(foo));
> }
>
> ;; before
> .literal_position
> .literal .LC0, 32767
> test:
> l32ra9, .LC0
> ret.n
>
> This patch fixes that:
>
> ;; after
> test:
> movi.n  a9, -1
> extui   a9, a9, 17, 15
> ret.n
>
> gcc/ChangeLog:
>
> * config/xtensa/xtensa.md (SHI): New mode iterator.
> (2 split patterns related to constsynth):
> Change to also accept HImode operands.
> ---
>  gcc/config/xtensa/xtensa.md | 22 ++
>  1 file changed, 14 insertions(+), 8 deletions(-)

Regtested for target=xtensa-linux-uclibc, no new regressions.
Committed to master.

-- 
Thanks.
-- Max


[committed] Reasonably handle SUBREGs in risc-v cost modeling

2024-02-04 Thread Jeff Law
This patch adjusts the costs so that we treat REG and SUBREG expressions 
the same for costing.


This was motivated by bt_skip_func and bt_find_func in xz and results in 
nearly a 5% improvement in the dynamic instruction count for input #2 
and smaller, but definitely visible improvements pretty much across the 
board.  Exceptions would be perlbench input #1 and exchange2 which 
showed small regressions.




In the bt_find_func and bt_skip_func cases we have  something like this:


(insn 10 7 11 2 (set (reg/v:DI 136 [ x ])
(zero_extend:DI (subreg/s/u:SI (reg/v:DI 137 [ a ]) 0))) "zz.c":6:21 
387 {*zero_extendsidi2_bitmanip}
 (nil))
(insn 11 10 12 2 (set (reg:DI 142 [ _1 ])
(plus:DI (reg/v:DI 136 [ x ])
(reg/v:DI 139 [ b ]))) "zz.c":7:23 5 {adddi3}
 (nil))


[ ... ]

(insn 13 12 14 2 (set (reg:DI 143 [ _2 ])
(plus:DI (reg/v:DI 136 [ x ])
(reg/v:DI 141 [ c ]))) "zz.c":8:23 5 {adddi3}
 (nil))



Note the two uses of (reg 136). The best way to handle that in combine 
might be a 3->2 split.  But there's a much better approach if we look at 
fwprop...




(set (reg:DI 142 [ _1 ])
(plus:DI (zero_extend:DI (subreg/s/u:SI (reg/v:DI 137 [ a ]) 0))
(reg/v:DI 139 [ b ])))
change not profitable (cost 4 -> cost 8)


So that should be the same cost as a regular DImode addition when the 
ZBA extension is enabled.  But it ends up costing more because the 
clause to cost this variant isn't prepared to handle a SUBREG.  That 
results in the RTL above having too high a cost and fwprop gives up.


One approach would be to replace the REG_P  with REG_P || SUBREG_P in 
the costing code.  I ultimately decided against that and instead check 
if the operand in question passes register_operand.


By far the most important case to handle is the DImode PLUS.  But for 
the sake of consistency, I changed the other instances in 
riscv_rtx_costs as well.  For those other cases we're talking about 
improvements in the .01% range.


While we are into stage4, this just hits cost modeling which we've 
generally agreed is still appropriate for the RISC-V backend (though we 
were mostly talking about vector).  So I'm going to extend that general 
agreement ever so slightly and include scalar cost modeling :-)


Built and regression tested on rv64gc.  Pushing to the trunk.

Shout out to Jivan who took the original somewhat vague report about 
bt_skip_func and boiled it down to a very simple testcase along with 
info on a couple attempted fixes that didn't work out.



Jeffcommit 777df37a12e55ecbc135efbed2749a8a8a756d4d
Author: Jeff Law 
Date:   Sun Feb 4 13:01:50 2024 -0700

[committed] Reasonably handle SUBREGs in risc-v cost modeling

This patch adjusts the costs so that we treat REG and SUBREG expressions the
same for costing.

This was motivated by bt_skip_func and bt_find_func in xz and results in 
nearly
a 5% improvement in the dynamic instruction count for input #2 and smaller, 
but
definitely visible improvements pretty much across the board.  Exceptions 
would
be perlbench input #1 and exchange2 which showed very small regressions.

In the bt_find_func and bt_skip_func cases we have  something like this:

> (insn 10 7 11 2 (set (reg/v:DI 136 [ x ])
> (zero_extend:DI (subreg/s/u:SI (reg/v:DI 137 [ a ]) 0))) 
"zz.c":6:21 387 {*zero_extendsidi2_bitmanip}
>  (nil))
> (insn 11 10 12 2 (set (reg:DI 142 [ _1 ])
> (plus:DI (reg/v:DI 136 [ x ])
> (reg/v:DI 139 [ b ]))) "zz.c":7:23 5 {adddi3}
>  (nil))

[ ... ]> (insn 13 12 14 2 (set (reg:DI 143 [ _2 ])
> (plus:DI (reg/v:DI 136 [ x ])
> (reg/v:DI 141 [ c ]))) "zz.c":8:23 5 {adddi3}
>  (nil))

Note the two uses of (reg 136). The best way to handle that in combine 
might be
a 3->2 split.  But there's a much better approach if we look at fwprop...

(set (reg:DI 142 [ _1 ])
(plus:DI (zero_extend:DI (subreg/s/u:SI (reg/v:DI 137 [ a ]) 0))
(reg/v:DI 139 [ b ])))
change not profitable (cost 4 -> cost 8)

So that should be the same cost as a regular DImode addition when the ZBA
extension is enabled.  But it ends up costing more because the clause to 
cost
this variant isn't prepared to handle a SUBREG.  That results in the RTL 
above
having too high a cost and fwprop gives up.

One approach would be to replace the REG_P  with REG_P || SUBREG_P in the
costing code.  I ultimately decided against that and instead check if the
operand in question passes register_operand.

By far the most important case to handle is the DImode PLUS.  But for the 
sake
of consistency, I changed the other instances in riscv_rtx_costs as well.  
For
those other cases we're talking about improvements in the .01% range.

While we are into stage4, this just hits cost modeling which we've generally agreed is still appropriate for the RISC-V backend (though we were mostly talking about vector).

[PATCH] MIPS: Fix wrong MSA FP vector negation

2024-02-04 Thread Xi Ruoyao
We expanded (neg x) to (minus const0 x) for MSA FP vectors, this is
wrong because -0.0 is not 0 - 0.0.  This causes some Python tests to
fail when Python is built with MSA enabled.

Use the bnegi.df instructions to simply reverse the sign bit instead.

gcc/ChangeLog:

* config/mips/mips-msa.md (elmsgnbit): New define_mode_attr.
(neg2): Change the mode iterator from MSA to IMSA because
in FP arithmetic we cannot use (0 - x) for -x.
(neg2): New define_insn to implement FP vector negation,
using a bnegi instruction to negate the sign bit.
---

Bootstrapped and regtested on mips64el-linux-gnuabi64.  Ok for trunk
and/or release branches?

 gcc/config/mips/mips-msa.md | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/gcc/config/mips/mips-msa.md b/gcc/config/mips/mips-msa.md
index 83d9a08e360..920161ed1d8 100644
--- a/gcc/config/mips/mips-msa.md
+++ b/gcc/config/mips/mips-msa.md
@@ -231,6 +231,10 @@ (define_mode_attr bitimm
(V4SI  "uimm5")
(V2DI  "uimm6")])
 
+;; The index of sign bit in FP vector elements.
+(define_mode_attr elmsgnbit [(V2DF "63") (V4DF "63")
+(V4SF "31") (V8SF "31")])
+
 (define_expand "vec_init"
   [(match_operand:MSA 0 "register_operand")
(match_operand:MSA 1 "")]
@@ -597,9 +601,9 @@ (define_expand "abs2"
 })
 
 (define_expand "neg2"
-  [(set (match_operand:MSA 0 "register_operand")
-   (minus:MSA (match_dup 2)
-  (match_operand:MSA 1 "register_operand")))]
+  [(set (match_operand:IMSA 0 "register_operand")
+   (minus:IMSA (match_dup 2)
+  (match_operand:IMSA 1 "register_operand")))]
   "ISA_HAS_MSA"
 {
   rtx reg = gen_reg_rtx (mode);
@@ -607,6 +611,14 @@ (define_expand "neg2"
   operands[2] = reg;
 })
 
+(define_insn "neg2"
+  [(set (match_operand:FMSA 0 "register_operand" "=f")
+   (neg (match_operand:FMSA 1 "register_operand" "f")))]
+  "ISA_HAS_MSA"
+  "bnegi.\t%w0,%w1,"
+  [(set_attr "type" "simd_bit")
+   (set_attr "mode" "")])
+
 (define_expand "msa_ldi"
   [(match_operand:IMSA 0 "register_operand")
(match_operand 1 "const_imm10_operand")]
-- 
2.43.0



Pushed: [PATCH] LoongArch: Avoid out-of-bounds access in loongarch_symbol_insns

2024-02-04 Thread Xi Ruoyao
On Sun, 2024-02-04 at 11:19 +0800, chenglulu wrote:
> 
> 在 2024/2/2 下午5:55, Xi Ruoyao 写道:
> > We call loongarch_symbol_insns with mode = MAX_MACHINE_MODE sometimes.
> > But in loongarch_symbol_insns:
> > 
> >  if (LSX_SUPPORTED_MODE_P (mode) || LASX_SUPPORTED_MODE_P (mode))
> >    return 0;
> > 
> > And LSX_SUPPORTED_MODE_P is defined as:
> > 
> >  #define LSX_SUPPORTED_MODE_P(MODE) \
> >    (ISA_HAS_LSX \
> >     && GET_MODE_SIZE (MODE) == UNITS_PER_LSX_REG ... ...
> > 
> > GET_MODE_SIZE is expanded to a call to mode_to_bytes, which is defined:
> > 
> >  ALWAYS_INLINE poly_uint16
> >  mode_to_bytes (machine_mode mode)
> >  {
> >  #if GCC_VERSION >= 4001
> >    return (__builtin_constant_p (mode)
> >   ? mode_size_inline (mode) : mode_size[mode]);
> >  #else
> >    return mode_size[mode];
> >  #endif
> >  }
> > 
> > There is an assertion in mode_size_inline:
> > 
> >  gcc_assert (mode >= 0 && mode < NUM_MACHINE_MODES);
> > 
> > Note that NUM_MACHINE_MODES = MAX_MACHINE_MODE (emitted by genmodes.cc),
> > thus if __builtin_constant_p (mode) is evaluated true (it happens when
> > GCC is bootstrapped with LTO+PGO), the assertion will be triggered and
> > cause an ICE.  OTOH if __builtin_constant_p (mode) is evaluated false,
> > mode_size[mode] is still an out-of-bound array access (the length of the
> > mode_size array is NUM_MACHINE_MODES).
> > 
> > So we shouldn't call LSX_SUPPORTED_MODE_P or LASX_SUPPORTED_MODE_P with
> > MAX_MACHINE_MODE in loongarch_symbol_insns.  This is very similar to a
> > MIPS bug PR98491 fixed by me about 3 years ago.
> > 
> > gcc/ChangeLog:
> > 
> > * config/loongarch/loongarch.cc (loongarch_symbol_insns): Do not
> > use LSX_SUPPORTED_MODE_P or LASX_SUPPORTED_MODE_P if mode is
> > MAX_MACHINE_MODE.
> > ---
> > 
> > Bootstrapped and regtested on loongarch64-linux-gnu.  Ok for trunk?
> 
> LGTM!

Pushed r14-8785.

> I have a question. I see that you often add compilation options in 
> BOOT_CFLAGS.
> 
> I also want to test it. Do you have a recommended set of compilation 
> options?

When I build a compiler for my system I use
{BOOT_{C,CXX,LD}FLAGS,{C,CXX,LD}FLAGS_FOR_TARGET}="-O3 -march=la664 -
mtune=la664 -pipe -fgraphite-identity -floop-nest-optimize -fipa-pta -
fdevirtualize-at-ltrans -fno-semantic-interposition -Wl,-O1 -Wl,--as-
needed"

and enable PGO (make profiledbootstrap) and LTO (--with-build-
config=bootstrap-lto).

All of them but GRAPHITE (-fgraphite-identity -floop-nest-optimize)
seems "pretty safe" on the architectures I have a hardware of.  GRAPHITE
is causing bootstrap failure on AArch64 with GCC 13 (PR109929) if
combined with PGO and the real cause is still not found yet.

But when I do a test build I normally only enable the flags which may
help to catch some issues, for example when a change only affects LTO I
add --with-build-config=bootstrap-lto, when changing something related
to LASX I use -O3 -mlasx (or -O3 -march=la664) as BOOT_CFLAGS.


-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


Pushed: [PATCH] LoongArch: Fix wrong LSX FP vector negation

2024-02-04 Thread Xi Ruoyao
On Sun, 2024-02-04 at 11:20 +0800, chenglulu wrote:
> 
> 在 2024/2/3 下午4:58, Xi Ruoyao 写道:
> > We expanded (neg x) to (minus const0 x) for LSX FP vectors, this is
> > wrong because -0.0 is not 0 - 0.0.  This causes some Python tests to
> > fail when Python is built with LSX enabled.
> > 
> > Use the vbitrevi.{d/w} instructions to simply reverse the sign bit
> > instead.  We are already doing this for LASX and now we can unify them
> > into simd.md.
> > 
> > gcc/ChangeLog:
> > 
> > * config/loongarch/lsx.md (neg2): Remove the
> > incorrect expand.
> > * config/loongarch/simd.md (simdfmt_as_i): New define_mode_attr.
> > (elmsgnbit): Likewise.
> > (neg2): New define_insn.
> > * config/loongarch/lasx.md (negv4df2, negv8sf2): Remove as they
> > are now instantiated in simd.md.
> > ---
> > 
> > Bootstrapped and regtested on loongarch64-linux-gnu.  Ok for trunk?
> 
> LGTM!
> 
> Thanks!

Pushed r14-8785.


-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


[PATCH 1/2 v2] xtensa: Recover constant synthesis for HImode after LRA transition

2024-02-04 Thread Takayuki 'January June' Suwa
After LRA transition, HImode constants that don't fit into signed 12 bits
are no longer subject to constant synthesis:

/* example */
void test(void) {
  short foo = 32767;
  __asm__ ("" :: "r"(foo));
}

;; before
.literal_position
.literal .LC0, 32767
test:
l32ra9, .LC0
ret.n

This patch fixes that:

;; after
test:
movi.n  a9, -1
extui   a9, a9, 17, 15
ret.n

gcc/ChangeLog:

* config/xtensa/xtensa.md (SHI): New mode iterator.
(2 split patterns related to constsynth):
Change to also accept HImode operands.
---
 gcc/config/xtensa/xtensa.md | 22 ++
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 13b8b57f1fc..1a2249b059a 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -87,6 +87,10 @@
 ;; the same template.
 (define_mode_iterator HQI [HI QI])
 
+;; This mode iterator allows the SI and HI patterns to be defined from
+;; the same template.
+(define_mode_iterator SHI [SI HI])
+
 
 ;; Attributes.
 
@@ -1291,28 +1295,30 @@
(set_attr "length"  "2,2,2,2,2,2,3,3,3,3,6,3,3,3,3,3")])
 
 (define_split
-  [(set (match_operand:SI 0 "register_operand")
-   (match_operand:SI 1 "const_int_operand"))]
+  [(set (match_operand:SHI 0 "register_operand")
+   (match_operand:SHI 1 "const_int_operand"))]
   "!TARGET_CONST16 && !TARGET_AUTO_LITPOOLS
&& ! xtensa_split1_finished_p ()
&& ! xtensa_simm12b (INTVAL (operands[1]))"
   [(set (match_dup 0)
(match_dup 1))]
 {
-  operands[1] = force_const_mem (SImode, operands[1]);
+  operands[1] = force_const_mem (mode, operands[1]);
 })
 
 (define_split
-  [(set (match_operand:SI 0 "register_operand")
-   (match_operand:SI 1 "constantpool_operand"))]
+  [(set (match_operand:SHI 0 "register_operand")
+   (match_operand:SHI 1 "constantpool_operand"))]
   "! optimize_debug && reload_completed"
   [(const_int 0)]
 {
-  rtx x = avoid_constant_pool_reference (operands[1]);
+  rtx x = avoid_constant_pool_reference (operands[1]), dst = operands[0];
   if (! CONST_INT_P (x))
 FAIL;
-  if (! xtensa_constantsynth (operands[0], INTVAL (x)))
-emit_move_insn (operands[0], x);
+  if (mode == HImode)
+dst = gen_rtx_REG (SImode, REGNO (dst));
+  if (! xtensa_constantsynth (dst, INTVAL (x)))
+emit_move_insn (dst, x);
   DONE;
 })
 
-- 
2.30.2


Re:[pushed] [PATCH v1] LoongArch: testsuite: Fix gcc.dg/vect/vect-reduc-mul_{1,2}.c FAIL.

2024-02-04 Thread chenglulu

Pushed to r14-8784.

在 2024/2/2 上午9:42, Li Wei 写道:

This FAIL was introduced in r14-6908. The reason is that when merging
constant vector permutation implementations, the 128-bit matching situation
was not fully considered. In fact, the expansion of 128-bit vectors after
merging only supports value-based 4 elements set shuffle, so this time is a
complete implementation of the entire 128-bit vector constant permutation,
and some structural adjustments have also been made to the code.

gcc/ChangeLog:

* config/loongarch/loongarch.cc (loongarch_expand_vselect): Adjust.
(loongarch_expand_vselect_vconcat): Ditto.
(loongarch_try_expand_lsx_vshuf_const): New, use vshuf to implement
all 128-bit constant permutation situations.
(loongarch_expand_lsx_shuffle): Adjust and rename function name.
(loongarch_is_imm_set_shuffle): Renamed function name.
(loongarch_expand_vec_perm_even_odd): Function forward declaration.
(loongarch_expand_vec_perm_even_odd_1): Add implement for 128-bit
extract-even and extract-odd permutations.
(loongarch_is_odd_extraction): Delete.
(loongarch_is_even_extraction): Ditto.
(loongarch_expand_vec_perm_const): Adjust.
---
  gcc/config/loongarch/loongarch.cc | 218 ++
  1 file changed, 163 insertions(+), 55 deletions(-)

diff --git a/gcc/config/loongarch/loongarch.cc 
b/gcc/config/loongarch/loongarch.cc
index 8bc18448753..61723844756 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -8029,7 +8029,8 @@ struct expand_vec_perm_d
  
  static bool

  loongarch_expand_vselect (rtx target, rtx op0,
- const unsigned char *perm, unsigned nelt)
+ const unsigned char *perm, unsigned nelt,
+ bool testing_p)
  {
rtx rperm[MAX_VECT_LEN], x;
rtx_insn *insn;
@@ -8048,6 +8049,9 @@ loongarch_expand_vselect (rtx target, rtx op0,
remove_insn (insn);
return false;
  }
+
+  if (testing_p)
+  remove_insn (insn);
return true;
  }
  
@@ -8055,7 +8059,8 @@ loongarch_expand_vselect (rtx target, rtx op0,
  
  static bool

  loongarch_expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
- const unsigned char *perm, unsigned nelt)
+ const unsigned char *perm, unsigned nelt,
+ bool testing_p)
  {
machine_mode v2mode;
rtx x;
@@ -8063,7 +8068,7 @@ loongarch_expand_vselect_vconcat (rtx target, rtx op0, 
rtx op1,
if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
  return false;
x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
-  return loongarch_expand_vselect (target, x, perm, nelt);
+  return loongarch_expand_vselect (target, x, perm, nelt, testing_p);
  }
  
  static tree

@@ -8317,11 +8322,87 @@ loongarch_set_handled_components (sbitmap components)
  #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
  #undef TARGET_ASM_ALIGNED_DI_OP
  #define TARGET_ASM_ALIGNED_DI_OP "\t.dword\t"
+
+/* Use the vshuf instruction to implement all 128-bit constant vector
+   permutation.  */
+
+static bool
+loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d)
+{
+  int i;
+  rtx target, op0, op1, sel, tmp;
+  rtx rperm[MAX_VECT_LEN];
+
+  if (GET_MODE_SIZE (d->vmode) == 16)
+{
+  target = d->target;
+  op0 = d->op0;
+  op1 = d->one_vector_p ? d->op0 : d->op1;
+
+  if (GET_MODE (op0) != GET_MODE (op1)
+ || GET_MODE (op0) != GET_MODE (target))
+   return false;
+
+  if (d->testing_p)
+   return true;
+
+  for (i = 0; i < d->nelt; i += 1)
+ rperm[i] = GEN_INT (d->perm[i]);
+
+  if (d->vmode == E_V2DFmode)
+   {
+ sel = gen_rtx_CONST_VECTOR (E_V2DImode, gen_rtvec_v (d->nelt, rperm));
+ tmp = simplify_gen_subreg (E_V2DImode, d->target, d->vmode, 0);
+ emit_move_insn (tmp, sel);
+   }
+  else if (d->vmode == E_V4SFmode)
+   {
+ sel = gen_rtx_CONST_VECTOR (E_V4SImode, gen_rtvec_v (d->nelt, rperm));
+ tmp = simplify_gen_subreg (E_V4SImode, d->target, d->vmode, 0);
+ emit_move_insn (tmp, sel);
+   }
+  else
+   {
+ sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm));
+ emit_move_insn (d->target, sel);
+   }
+
+  switch (d->vmode)
+   {
+   case E_V2DFmode:
+ emit_insn (gen_lsx_vshuf_d_f (target, target, op1, op0));
+ break;
+   case E_V2DImode:
+ emit_insn (gen_lsx_vshuf_d (target, target, op1, op0));
+ break;
+   case E_V4SFmode:
+ emit_insn (gen_lsx_vshuf_w_f (target, target, op1, op0));
+ break;
+   case E_V4SImode:
+ emit_insn (gen_lsx_vshuf_w (target, target, op1, op0));
+ break;
+   case E_V8HImode:
+ emit_insn (gen_lsx_vshuf_h (target, target, op1, op0));
+ break;
+   case E