Re: Re: cpymem for RISCV with v extension

2023-08-04 Thread 钟居哲
>> Umm, this patch has been queued up for at least a couple weeks now.

Oh, I am sorry. I didn't see this patch since it didn't CC me.
I am not subscribed to gcc-patches, so I may miss patches that don't explicitly
CC me.

I just happened to see your reply email today and replied.



juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2023-08-05 07:17
To: 钟居哲; gcc-patches
CC: kito.cheng; kito.cheng; rdapp.gcc; Joern Rennecke
Subject: Re: cpymem for RISCV with v extension
 
 
On 8/4/23 17:10, 钟居哲 wrote:
> Could you add testcases for this patch?
Testing what specifically?  Are you asking for correctness tests, 
performance/code quality tests?
 
 
> 
> +;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
> +;; optimizers from changing cpymem_loop_* into this.
> +(define_insn "@cpymem_straight"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "r,r")))
> + (use (and (match_dup 1) (const_int 127)))
> +   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
> +   (clobber (match_scratch:V_WHOLE 3 "=,"))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +  "@vsetvli zero,%2,e,m8,ta,ma\;vle.v %3,(%1)\;vse.v %3,(%0)
> +   vsetivli zero,%2,e,m8,ta,ma\;vle.v %3,(%1)\;vse.v %3,(%0)"
> +)
> +
> +(define_insn "@cpymem_loop"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "="))
> +   (clobber (match_scratch:P 4 "="))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e,m8,ta,ma\;"
> +"vle.v %3,(%1)\;"
> +"sub %2,%2,%4", operands);
> +  if ( != 8)
> +{
> +  rtx xop[2];
> +  xop[0] = operands[4];
> +  xop[1] = GEN_INT (exact_log2 (/8));
> +  output_asm_insn ("slli %0,%0,%1", xop);
> +}
> +  output_asm_insn ("add %1,%1,%4\;"
> +"vse.v %3,(%0)\;"
> +"add %0,%0,%4\;"
> +"bnez %2,0b", operands);
> +  return "";
> +})
> +
> +;; This pattern (at bltu) assumes pointers can be treated as unsigned,
> +;; i.e.  objects can't straddle 0x / 0x .
> +(define_insn "@cpymem_loop_fast"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "="))
> +   (clobber (match_scratch:P 4 "="))
> +   (clobber (match_scratch:P 5 "="))
> +   (clobber (match_scratch:P 6 "="))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{
> +  output_asm_insn ("vsetvli %4,%2,e,m8,ta,ma\;"
> +"beq %4,%2,1f\;"
> +"add %5,%0,%2\;"
> +"sub %6,%5,%4", operands);
> +  if ( != 8)
> +{
> +  rtx xop[2];
> +  xop[0] = operands[4];
> +  xop[1] = GEN_INT (exact_log2 (/8));
> +  output_asm_insn ("slli %0,%0,%1", xop);
> +}
> +  output_asm_insn ("\n0:\t" "vle.v %3,(%1)\;"
> +"add %1,%1,%4\;"
> +"vse.v %3,(%0)\;"
> +"add %0,%0,%4\;"
>>>"bltu %0,%6,0b\;"
>>>"sub %5,%5,%0", operands);
>>>   if ( != 8)
>>> {
>>>   rtx xop[2];
>>>   xop[0] = operands[4];
>>>   xop[1] = GEN_INT (exact_log2 (/8));
>>>   output_asm_insn ("srli %0,%0,%1", xop);
>>>  }
>>>   output_asm_insn ("vsetvli %4,%5,e,m8,ta,ma\n"
>>> "1:\t" "vle.v %3,(%1)\;"
>>>"vse.v %3,(%0)", operands);
>>>   return "";
>>> })
> 
> I don't think they are necessary.
What specifically do you think is not necessary?
 
 
> 
>>> Just post the update for archival purposes and consider
>>> it pre-approved for the trunk.
> 
> I am so sorry, but I disagree; this patch was approved too fast.
Umm, this patch has been queued up for at least a couple weeks now.
 
> 
> It should be well tested.
If you refer to Joern's message he indicated how it was tested.  Joern 
is a long time GCC developer and is well aware of how to test code.
 
 
It was tested on this set of multilibs without regressions:
 
>riscv-sim
> 
> riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
> 
> riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
> 
> riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
> 
> riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
> 
> 

Re: Re: cpymem for RISCV with v extension

2023-08-04 Thread 钟居哲
>> Testing what specifically?  Are you asking for correctness tests,
>> performance/code quality tests?

Add a memcpy test using RVV instructions, just like the testcases we are adding
for auto-vectorization support.

For example:

#include 
#include 
#include 

void foo (int32_t * a, int32_t * b, int num)
{
  memcpy (a, b, num);
}


In my downstream LLVM/GCC codegen:
foo:
.L2:
vsetvli a5,a2,e8,m8,ta,ma
vle8.v  v24,(a1)
sub a2,a2,a5
vse8.v  v24,(a0)
add a1,a1,a5
add a0,a0,a5
bne a2,zero,.L2
ret

Another example:
void foo (int32_t * a, int32_t * b, int num)
{
  memcpy (a, b, 4);
}


My downstream LLVM/GCC assembly:

foo:
vsetvli zero,16,e8,m1,ta,ma
vle8.v v24,(a1)
vse8.v v24,(a0)
ret
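For illustration only, here is a hedged sketch of what such a testcase could look like in dejagnu form for the first example above; the dg- options, the chosen -march/-mabi strings, and the scanned mnemonics are assumptions for illustration, not taken from any committed test.

/* { dg-do compile } */
/* { dg-options "-O2 -march=rv64gcv -mabi=lp64d" } */

#include <stdint.h>
#include <string.h>

void foo (int32_t *a, int32_t *b, int num)
{
  memcpy (a, b, num);
}

/* Expect the whole-block copy loop shown in the codegen above.  */
/* { dg-final { scan-assembler {vsetvli} } } */
/* { dg-final { scan-assembler {vle8\.v} } } */
/* { dg-final { scan-assembler {vse8\.v} } } */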

>> What specifically do you think is not necessary?
> +(define_insn "@cpymem_loop"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "="))
> +   (clobber (match_scratch:P 4 "="))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e,m8,ta,ma\;"
> +"vle.v %3,(%1)\;"
> +"sub %2,%2,%4", operands);
> +  if ( != 8)
> +{
> +  rtx xop[2];
> +  xop[0] = operands[4];
> +  xop[1] = GEN_INT (exact_log2 (/8));
> +  output_asm_insn ("slli %0,%0,%1", xop);
> +}
> +  output_asm_insn ("add %1,%1,%4\;"
> +"vse.v %3,(%0)\;"
> +"add %0,%0,%4\;"
> +"bnez %2,0b", operands);
> +  return "";
> +})

For example, for this pattern, we could simply emit insns with:

emit_label ...
emit_insn (gen_add...)
emit_insn (gen_pred_store...)
emit_insn (gen_add...)
emit_branch()

I don't see why it is necessary to use such an explicit pattern with
explicit multi-instruction assembly output.
For more details, you can see "rvv-next" (a little bit different from my downstream,
but the general idea is the same).
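For illustration only, a minimal sketch of how that loop could be emitted as separate insns from the expander, along the lines of the pseudocode above. This is not the committed patch; the gen_hypothetical_* names are placeholders for whatever named patterns the port would provide, not real GCC interfaces.

/* Sketch only: assumes dst, src and count are Pmode pseudos and vmode is
   the chosen vector mode.  */
static void
emit_cpymem_loop_sketch (rtx dst, rtx src, rtx count, machine_mode vmode)
{
  rtx_code_label *loop = gen_label_rtx ();
  rtx vreg = gen_reg_rtx (vmode);
  rtx vl = gen_reg_rtx (Pmode);

  emit_label (loop);
  emit_insn (gen_hypothetical_vsetvl (vl, count));   /* vl = min (count, VLMAX) */
  emit_insn (gen_hypothetical_vle (vreg, src, vl));  /* vector load  */
  emit_insn (gen_hypothetical_vse (vreg, dst, vl));  /* vector store */
  emit_insn (gen_add3_insn (src, src, vl));          /* bump source pointer */
  emit_insn (gen_add3_insn (dst, dst, vl));          /* bump destination pointer */
  emit_insn (gen_sub3_insn (count, count, vl));      /* decrement remaining bytes */
  emit_cmp_and_jump_insns (count, const0_rtx, NE, NULL_RTX, Pmode, 0, loop);
}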



juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2023-08-05 07:17
To: 钟居哲; gcc-patches
CC: kito.cheng; kito.cheng; rdapp.gcc; Joern Rennecke
Subject: Re: cpymem for RISCV with v extension
 
 
On 8/4/23 17:10, 钟居哲 wrote:
> Could you add testcases for this patch?
Testing what specifically?  Are you asking for correctness tests, 
performance/code quality tests?
 
 
> 
> +;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
> +;; optimizers from changing cpymem_loop_* into this.
> +(define_insn "@cpymem_straight"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "r,r")))
> + (use (and (match_dup 1) (const_int 127)))
> +   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
> +   (clobber (match_scratch:V_WHOLE 3 "=,"))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +  "@vsetvli zero,%2,e,m8,ta,ma\;vle.v %3,(%1)\;vse.v %3,(%0)
> +   vsetivli zero,%2,e,m8,ta,ma\;vle.v %3,(%1)\;vse.v %3,(%0)"
> +)
> +
> +(define_insn "@cpymem_loop"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "="))
> +   (clobber (match_scratch:P 4 "="))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e,m8,ta,ma\;"
> +"vle.v %3,(%1)\;"
> +"sub %2,%2,%4", operands);
> +  if ( != 8)
> +{
> +  rtx xop[2];
> +  xop[0] = operands[4];
> +  xop[1] = GEN_INT (exact_log2 (/8));
> +  output_asm_insn ("slli %0,%0,%1", xop);
> +}
> +  output_asm_insn ("add %1,%1,%4\;"
> +"vse.v %3,(%0)\;"
> +"add %0,%0,%4\;"
> +"bnez %2,0b", operands);
> +  return "";
> +})
> +
> +;; This pattern (at bltu) assumes pointers can be treated as unsigned,
> +;; i.e.  objects can't straddle 0x / 0x .
> +(define_insn "@cpymem_loop_fast"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "="))
> +   (clobber (match_scratch:P 4 "="))
> +   (clobber (match_scratch:P 5 "="))
> +   (clobber (match_scratch:P 6 "="))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{
> +  output_asm_insn ("vsetvli %4,%2,e,m8,ta,ma\;"
> +"beq %4,%2,1f\;"
> +"add %5,%0,%2\;"
> +"sub %6,%5,%4", operands);
> +  if ( != 8)
> +{
> +  rtx xop[2];
> +

Re: [PATCH-1, combine] Don't widen shift mode when target has rotate/mask instruction on original mode [PR93738]

2023-08-04 Thread Jeff Law via Gcc-patches




On 7/20/23 18:59, HAO CHEN GUI wrote:

Hi Jeff,

在 2023/7/21 5:27, Jeff Law 写道:

Wouldn't it make more sense to just try rotate/mask in the original mode before 
trying a shift in a widened mode?  I'm not sure why we need a target hook here.


There is no chance to try rotate/mask with the original mode when
expensive_optimizations is set. The subst widens the shift mode.

But we can add it before the attempt in the wider mode.



   if (flag_expensive_optimizations)
 {
   /* Pass pc_rtx so no substitutions are done, just
  simplifications.  */
   if (i1)
 {
   subst_low_luid = DF_INSN_LUID (i1);
   i1src = subst (i1src, pc_rtx, pc_rtx, 0, 0, 0);
 }

   subst_low_luid = DF_INSN_LUID (i2);
   i2src = subst (i2src, pc_rtx, pc_rtx, 0, 0, 0);
 }

I don't know if the wider mode is helpful to other targets, so
I added the target hook.
In this scenario we're often better off relying on rtx_costs (even with 
all its warts) rather than adding yet another target hook.


I'd love to hear from Segher here to see if he's got other ideas.

jeff


Re: cpymem for RISCV with v extension

2023-08-04 Thread Jeff Law via Gcc-patches




On 8/4/23 17:10, 钟居哲 wrote:

Could you add testcases for this patch?
Testing what specifically?  Are you asking for correctness tests, 
performance/code quality tests?





+;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
+;; optimizers from changing cpymem_loop_* into this.
+(define_insn "@cpymem_straight"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
+   (mem:BLK (match_operand:P 1 "register_operand" "r,r")))
+   (use (and (match_dup 1) (const_int 127)))
+   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
+   (clobber (match_scratch:V_WHOLE 3 "=,"))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+  "@vsetvli zero,%2,e,m8,ta,ma\;vle.v %3,(%1)\;vse.v %3,(%0)
+   vsetivli zero,%2,e,m8,ta,ma\;vle.v %3,(%1)\;vse.v %3,(%0)"
+)
+
+(define_insn "@cpymem_loop"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+   (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "="))
+   (clobber (match_scratch:P 4 "="))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e,m8,ta,ma\;"
+  "vle.v %3,(%1)\;"
+  "sub %2,%2,%4", operands);
+  if ( != 8)
+{
+  rtx xop[2];
+  xop[0] = operands[4];
+  xop[1] = GEN_INT (exact_log2 (/8));
+  output_asm_insn ("slli %0,%0,%1", xop);
+}
+  output_asm_insn ("add %1,%1,%4\;"
+  "vse.v %3,(%0)\;"
+  "add %0,%0,%4\;"
+  "bnez %2,0b", operands);
+  return "";
+})
+
+;; This pattern (at bltu) assumes pointers can be treated as unsigned,
+;; i.e.  objects can't straddle 0x / 0x .
+(define_insn "@cpymem_loop_fast"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+   (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "="))
+   (clobber (match_scratch:P 4 "="))
+   (clobber (match_scratch:P 5 "="))
+   (clobber (match_scratch:P 6 "="))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{
+  output_asm_insn ("vsetvli %4,%2,e,m8,ta,ma\;"
+  "beq %4,%2,1f\;"
+  "add %5,%0,%2\;"
+  "sub %6,%5,%4", operands);
+  if ( != 8)
+{
+  rtx xop[2];
+  xop[0] = operands[4];
+  xop[1] = GEN_INT (exact_log2 (/8));
+  output_asm_insn ("slli %0,%0,%1", xop);
+}
+  output_asm_insn ("\n0:\t" "vle.v %3,(%1)\;"
+  "add %1,%1,%4\;"
+  "vse.v %3,(%0)\;"
+  "add %0,%0,%4\;"

   "bltu %0,%6,0b\;"
   "sub %5,%5,%0", operands);
  if ( != 8)
{
  rtx xop[2];
  xop[0] = operands[4];
  xop[1] = GEN_INT (exact_log2 (/8));
  output_asm_insn ("srli %0,%0,%1", xop);
 }
  output_asm_insn ("vsetvli %4,%5,e,m8,ta,ma\n"
"1:\t" "vle.v %3,(%1)\;"
   "vse.v %3,(%0)", operands);
  return "";
})


I don't think they are necessary.

What specifically do you think is not necessary?





Just post the update for archival purposes and consider
it pre-approved for the trunk.


I am so sorry, but I disagree; this patch was approved too fast.

Umm, this patch has been queued up for at least a couple weeks now.



It should be well tested.
If you refer to Joern's message he indicated how it was tested.  Joern 
is a long time GCC developer and is well aware of how to test code.



It was tested on this set of multilibs without regressions:


   riscv-sim

riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f

riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32

riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f

riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32

riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d

riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d

riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d








We should at least test these 2 following situations:

1. an unknown number of bytes to memcpy; the codegen should be as follows:

vsetvl a5,a2,e8,m8,ta,ma

vle

vse

bump counter


cpymem for RISCV with v extension

2023-08-04 Thread 钟居哲
Could you add testcases for this patch?

+;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
+;; optimizers from changing cpymem_loop_* into this.
+(define_insn "@cpymem_straight"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
+   (mem:BLK (match_operand:P 1 "register_operand" "r,r")))
+   (use (and (match_dup 1) (const_int 127)))
+   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
+   (clobber (match_scratch:V_WHOLE 3 "=,"))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+  "@vsetvli zero,%2,e,m8,ta,ma\;vle.v %3,(%1)\;vse.v %3,(%0)
+   vsetivli zero,%2,e,m8,ta,ma\;vle.v %3,(%1)\;vse.v %3,(%0)"
+)
+
+(define_insn "@cpymem_loop"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+   (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "="))
+   (clobber (match_scratch:P 4 "="))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e,m8,ta,ma\;"
+  "vle.v %3,(%1)\;"
+  "sub %2,%2,%4", operands);
+  if ( != 8)
+{
+  rtx xop[2];
+  xop[0] = operands[4];
+  xop[1] = GEN_INT (exact_log2 (/8));
+  output_asm_insn ("slli %0,%0,%1", xop);
+}
+  output_asm_insn ("add %1,%1,%4\;"
+  "vse.v %3,(%0)\;"
+  "add %0,%0,%4\;"
+  "bnez %2,0b", operands);
+  return "";
+})
+
+;; This pattern (at bltu) assumes pointers can be treated as unsigned,
+;; i.e.  objects can't straddle 0x / 0x .
+(define_insn "@cpymem_loop_fast"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+   (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "="))
+   (clobber (match_scratch:P 4 "="))
+   (clobber (match_scratch:P 5 "="))
+   (clobber (match_scratch:P 6 "="))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{
+  output_asm_insn ("vsetvli %4,%2,e,m8,ta,ma\;"
+  "beq %4,%2,1f\;"
+  "add %5,%0,%2\;"
+  "sub %6,%5,%4", operands);
+  if ( != 8)
+{
+  rtx xop[2];
+  xop[0] = operands[4];
+  xop[1] = GEN_INT (exact_log2 (/8));
+  output_asm_insn ("slli %0,%0,%1", xop);
+}
+  output_asm_insn ("\n0:\t" "vle.v %3,(%1)\;"
+  "add %1,%1,%4\;"
+  "vse.v %3,(%0)\;"
+  "add %0,%0,%4\;"
>> "bltu %0,%6,0b\;"
>> "sub %5,%5,%0", operands);
>>   if ( != 8)
>> {
>>   rtx xop[2];
>>   xop[0] = operands[4];
>>   xop[1] = GEN_INT (exact_log2 (/8));
>>   output_asm_insn ("srli %0,%0,%1", xop);
>>  }
>>   output_asm_insn ("vsetvli %4,%5,e,m8,ta,ma\n"
>>  "1:\t" "vle.v %3,(%1)\;"
>> "vse.v %3,(%0)", operands);
>>   return "";
>>  })
I don't think they are necessary.

>>  considering that this code is usually memory-constrainted, limit this
>>  to -O3.  ??? It would make sense to differentiate here between in-order
>> and OOO microarchitectures.  */
>> else if (!size_p && optimize >= 3)
>>   emit_insn (gen_cpymem_loop_fast (Pmode, vmode, dst, src, end));
>>  else
>>   emit_insn (gen_cpymem_loop (Pmode, vmode, dst, src, end));
Why not just emit the RVV pattern?
>> Just post the update for archival purposes and consider
>> it pre-approved for the trunk.
I am so sorry, but I disagree; this patch was approved too fast. It should be well tested.
We should at least test these 2 following situations:

1. An unknown number of bytes to memcpy; the codegen should be as follows:
   vsetvl a5,a2,e8,m8,ta,ma
   vle
   vse
   bump counter
   branch

2. A known number of bytes to memcpy, where the number of bytes allows us to find
   a VLS mode to hold it. For example, a memcpy of 16 QImode bytes; then we can use
   V16QImode directly, and the codegen should be:
   vsetvli zero,16,
   vle
   vse
   Simple 3 instructions are enough.

This patch should be well tested with these 2 situations before being approved, since
LLVM does the same thing. We should be able to have the same behavior as LLVM.


juzhe.zh...@rivai.ai


Re: [PATCH 1/5] Middle-end _BitInt support [PR102989]

2023-08-04 Thread Joseph Myers
On Fri, 4 Aug 2023, Richard Biener via Gcc-patches wrote:

> > Sorry, I hoped it wouldn't take me almost 3 months and would be much shorter
> > as well, but clearly I'm not good at estimating stuff...
> 
> Well, it’s definitely feature creep with now the _Decimal and bitfield stuff …

I think feature creep would more be adding new features *outside the scope 
of the standard* (_BitInt bit-fields and conversions to/from DFP are 
within the standard, as are _BitInt atomic operations).  For example, 
features to help support type-generic operations on _BitInt, or 
type-generic versions of existing built-in functions (e.g. popcount) 
suitable for use on _BitInt - it's likely such features will be of use 
eventually, but they aren't needed for C23 (where the corresponding 
type-generic operations only support _BitInt types when they have the same 
width as some other type), so we can certainly get the standard features 
in first and think about additional features beyond that later (just as 
support for wider _BitInt can come later, not being required by the 
standard).

-- 
Joseph S. Myers
jos...@codesourcery.com


[PATCH] match.pd: Implement missed optimization ((x ^ y) & z) | x -> (z & y) | x [PR109938]

2023-08-04 Thread Drew Ross via Gcc-patches
Adds a simplification for ((x ^ y) & z) | x to be folded into
(z & y) | x. Merges this simplification with ((x | y) & z) | x -> (z & y) | x
to prevent a duplicate pattern. Tested successfully on x86_64 and x86 targets.
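As a quick sanity check of the identity (not part of the patch), a brute-force verification over 4-bit operands; the bitwise argument is the same for any width, since bits where x is 1 give 1 on both sides and bits where x is 0 reduce to y & z on both sides.

/* Exhaustively verify ((x ^ y) & z) | x == (z & y) | x for 4-bit values.  */
#include <assert.h>

int main (void)
{
  for (unsigned x = 0; x < 16; x++)
    for (unsigned y = 0; y < 16; y++)
      for (unsigned z = 0; z < 16; z++)
        assert ((((x ^ y) & z) | x) == ((z & y) | x));
  return 0;
}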

PR tree-opt/109938

gcc/ChangeLog:

* match.pd ((x ^ y) & z) | x -> (z & y) | x: New simplification.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/pr109938.c: New test.
* gcc.dg/tree-ssa/pr109938.c: New test.
---
 gcc/match.pd  |  10 +-
 .../gcc.c-torture/execute/pr109938.c  |  33 +
 gcc/testsuite/gcc.dg/tree-ssa/pr109938.c  | 125 ++
 3 files changed, 164 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr109938.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr109938.c

diff --git a/gcc/match.pd b/gcc/match.pd
index ee6cef6b09d..884dc622b25 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1946,10 +1946,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (bitop:c (rbitop:c (bit_not @0) @1) @0)
   (bitop @0 @1)))
 
-/* ((x | y) & z) | x -> (z & y) | x */
-(simplify
-  (bit_ior:c (bit_and:cs (bit_ior:cs @0 @1) @2) @0)
-  (bit_ior (bit_and @2 @1) @0))
+/* ((x |^ y) & z) | x -> (z & y) | x  */
+(for op (bit_ior bit_xor)
+ (simplify
+  (bit_ior:c (nop_convert1? (bit_and:c (nop_convert2? (op:c @0 @1)) @2)) @3)
+  (if (bitwise_equal_p (@0, @3))
+   (convert (bit_ior (bit_and @1 (convert @2)) (convert @0))
 
 /* (x | CST1) & CST2 -> (x & CST2) | (CST1 & CST2) */
 (simplify
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr109938.c 
b/gcc/testsuite/gcc.c-torture/execute/pr109938.c
new file mode 100644
index 000..a65d13b305d
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr109938.c
@@ -0,0 +1,33 @@
+/* PR tree-opt/109938 */
+
+#include "../../gcc.dg/tree-ssa/pr109938.c"
+
+int 
+main ()
+{
+  if (t1 (29789, 29477, 23942) != 30045) __builtin_abort ();
+  if (t2 (-20196, 18743, -32901) != -1729) __builtin_abort ();
+  if (t3 (2136614690L, 1136698390L, 2123767997L) != 2145003318UL) 
__builtin_abort ();
+  if (t4 (-4878, 9977, 23313) != 61171) __builtin_abort ();
+  if (t5 (127, 99, 43) != 127) __builtin_abort ();
+  if (t6 (9176690219839792930LL, 3176690219839721234LL, 5671738468274920831LL)
+  != 9177833729112616754LL) __builtin_abort ();
+  if (t7 (29789, 29477, 23942) != 30045) __builtin_abort ();
+  if (t8 (23489, 99477, 87942) != 90053) __builtin_abort ();
+  if (t9 (10489, 66477, -73313) != 10749) __builtin_abort ();
+  if (t10 (2136614690L, -1136614690L, 4136614690UL) != 4284131106UL)
+__builtin_abort ();
+  if (t11 (29789, 29477, 12345) != 29821) __builtin_abort ();
+  if (t12 (-120, 98, -73) != 170) __builtin_abort ();
+  if (t13 (9176690219839792930ULL, -3176690219839721234LL, 
5671738468274920831ULL)
+  != 9221726284835125102ULL) __builtin_abort ();
+  v4si a1 = {29789, -20196, 23489, 10489};
+  v4si a2 = {29477, 18743, 99477, 66477}; 
+  v4si a3 = {23942, -32901, 87942, -73313};
+  v4si r1 = {30045, 63807, 90053, 10749}; 
+  v4si b1 = t14 (a1, a2, a3);
+  v4si b2 = t15 (a1, a2, a3);
+  if (__builtin_memcmp (,  ,  sizeof (b1) != 0)) __builtin_abort();  
+  if (__builtin_memcmp (,  ,  sizeof (b2) != 0)) __builtin_abort();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr109938.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr109938.c
new file mode 100644
index 000..0cae55886c6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr109938.c
@@ -0,0 +1,125 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dse1 -Wno-psabi" } */
+
+typedef int v4si __attribute__((vector_size(4 * sizeof(int))));
+
+/* Generic */
+__attribute__((noipa)) int
+t1 (int a, int b, int c)
+{
+  return ((a ^ c) & b) | a;
+}
+
+__attribute__((noipa)) unsigned int
+t2 (int a, unsigned int b, int c)
+{
+  return ((a ^ c) & b) | a;
+}
+
+__attribute__((noipa)) unsigned long
+t3 (unsigned long a, long b, unsigned long c)
+{
+  return ((a ^ c) & b) | a;
+}
+
+__attribute__((noipa)) unsigned short
+t4 (short a, unsigned short b, unsigned short c)
+{
+  return (unsigned short) ((a ^ c) & b) | a;
+}
+
+__attribute__((noipa)) unsigned char
+t5 (unsigned char a, signed char b, signed char c)
+{
+  return ((a ^ c) & b) | a;
+}
+
+__attribute__((noipa)) long long
+t6 (long long a, long long b, long long c)
+{
+  return ((a ^ c) & (unsigned long long) b) | a;
+}
+
+/* Gimple */
+__attribute__((noipa)) int
+t7 (int a, int b, int c)
+{
+  int t1 = a ^ c;
+  int t2 = t1 & b;
+  int t3 = t2 | a;
+  return t3;
+}
+
+__attribute__((noipa)) int
+t8 (int a, unsigned int b, unsigned int c)
+{
+  unsigned int t1 = a ^ c;
+  int t2 = t1 & b;
+  int t3 = t2 | a;
+  return t3;
+}
+
+__attribute__((noipa)) unsigned int
+t9 (unsigned int a, unsigned int b, int c)
+{
+  unsigned int t1 = a ^ c;
+  unsigned int t2 = t1 & b;
+  unsigned int t3 = t2 | a;
+  return t3;
+}
+
+__attribute__((noipa)) unsigned long
+t10 (unsigned long a, long b, unsigned long c)
+{
+  

Re: [PATCH v3] [RISC-V] Generate Zicond instruction for select pattern with condition eq or neq to 0

2023-08-04 Thread Jeff Law via Gcc-patches



On 8/1/23 19:38, Xiao Zeng wrote:

This patch recognizes Zicond patterns when the select pattern
with condition eq or neq to 0 (using eq as an example), namely:

1 rd = (rs2 == 0) ? non-imm : 0
2 rd = (rs2 == 0) ? non-imm : non-imm
3 rd = (rs2 == 0) ? reg : non-imm
4 rd = (rs2 == 0) ? reg : reg

gcc/ChangeLog:

 * config/riscv/riscv.cc (riscv_expand_conditional_move): Recognize
 Zicond patterns
 * config/riscv/riscv.md: Recognize Zicond patterns through movcc
So I've made minor adjustments to the remaining three cases.  First we
need to check the code before optimizing the cases where one of the arms
of the conditional move matches op0.
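As a hedged illustration (not taken from the patch or its tests), the "arm matches op0" situation looks like the following, where the condition compares a register against zero and the true arm is that same register:

/* Here op0 is 'a' and the true arm of the conditional move is also 'a'.  */
long
sel (long a, long b)
{
  return a != 0 ? a : b;
}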


I slightly adjusted the case for out-of-range constants.  It's better to
check SMALL_OPERAND rather than testing for specific constants.  And 
when that triggers, we can just force the value into a register and 
continue as-is rather than recursing.


The patch I'm committing fixes one comment typo (whitespace) and a bit 
of accidentally duplicated code I added in a prior commit.


Next up: Raphael's patches to handle nontrivial conditionals by emitting an
scc insn :-)


Jeff

ps.  I'm deferring the testsuite bits until we sort out the costing
problems.  They're definitely not forgotten and I still use them in my
local tree.
commit 4e87c953d16377457b31b65b6c3268d932e462ab
Author: Xiao Zeng 
Date:   Fri Aug 4 17:23:56 2023 -0400

[PATCH v3] [RISC-V] Generate Zicond instruction for select pattern with 
condition eq or neq to 0

This patch recognizes Zicond patterns when the select pattern
with condition eq or neq to 0 (using eq as an example), namely:

1 rd = (rs2 == 0) ? non-imm : 0
2 rd = (rs2 == 0) ? non-imm : non-imm
3 rd = (rs2 == 0) ? reg : non-imm
4 rd = (rs2 == 0) ? reg : reg

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_expand_conditional_move): Recognize
more Zicond patterns.  Fix whitespace typo.
(riscv_rtx_costs): Remove accidental code duplication.

Co-authored-by: Jeff Law 

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 8b725610815..7728cd34569 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -2522,20 +2522,6 @@ riscv_rtx_costs (rtx x, machine_mode mode, int 
outer_code, int opno ATTRIBUTE_UN
  *total = COSTS_N_INSNS (1);
  return true;
}
-  else if (TARGET_ZICOND
-  && outer_code == SET
-  && ((GET_CODE (XEXP (x, 1)) == REG
-   && XEXP (x, 2) == CONST0_RTX (GET_MODE (XEXP (x, 1
-  || (GET_CODE (XEXP (x, 2)) == REG
-  && XEXP (x, 1) == CONST0_RTX (GET_MODE (XEXP (x, 2
-  || (GET_CODE (XEXP (x, 1)) == REG
-  && rtx_equal_p (XEXP (x, 1), XEXP (XEXP (x, 0), 0)))
-  || (GET_CODE (XEXP (x, 1)) == REG
-  && rtx_equal_p (XEXP (x, 2), XEXP (XEXP (x, 0), 0)
-   {
- *total = COSTS_N_INSNS (1);
- return true;
-   }
   else if (LABEL_REF_P (XEXP (x, 1)) && XEXP (x, 2) == pc_rtx)
{
  if (equality_operator (XEXP (x, 0), mode)
@@ -3583,7 +3569,7 @@ riscv_expand_conditional_move (rtx dest, rtx op, rtx 
cons, rtx alt)
   /* The expander is a bit loose in its specification of the true
 arm of the conditional move.  That allows us to support more
 cases for extensions which are more general than SFB.  But
-   does mean we need to force CONS into a register at this point.  */
+does mean we need to force CONS into a register at this point.  */
   cons = force_reg (GET_MODE (dest), cons);
   emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (GET_MODE (dest),
  cond, cons, alt)));
@@ -3628,6 +3614,40 @@ riscv_expand_conditional_move (rtx dest, rtx op, rtx 
cons, rtx alt)
  riscv_emit_binary (PLUS, dest, dest, cons);
  return true;
}
+  /* imm, reg  */
+  else if (CONST_INT_P (cons) && cons != CONST0_RTX (mode) && REG_P (alt))
+   {
+ /* Optimize for register value of 0.  */
+ if (code == NE && rtx_equal_p (op0, alt) && op1 == CONST0_RTX (mode))
+   {
+ rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
+ cons = force_reg (mode, cons);
+ emit_insn (gen_rtx_SET (dest,
+ gen_rtx_IF_THEN_ELSE (mode, cond,
+   cons, alt)));
+ return true;
+   }
+
+ riscv_emit_int_compare (, , , true);
+ rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
+
+ rtx temp1 = gen_reg_rtx (mode);
+ rtx temp2 = gen_int_mode (-1 * INTVAL (cons), mode);
+
+ /* TEMP2 might not fit into a signed 12 bit immediate suitable
+for an addi 

Re: [PATCH V2] RISC-V: Support POPCOUNT auto-vectorization

2023-08-04 Thread Jeff Law via Gcc-patches




On 8/1/23 00:47, Robin Dapp via Gcc-patches wrote:

  I'm not against continuing with the more well-known approach for now
  but we should keep in mind that might still be potential for improvement.


No. I don't think it's faster.


I did a quick check on my x86 laptop and it's roughly 25% faster there.
That's consistent with the literature.  RISC-V qemu only shows 5-10%
improvement, though.


I have no idea. I saw ARM SVE generate:
POP_COUNT
POP_COUNT
VEC_PACK_TRUNC.


I'd strongly suspect this happens because it's converting to int.
If you change dst to uint64_t there won't be any vec_pack_trunc.
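As a hedged illustration of that suggestion (the exact testcase shape is an assumption, not the loop from the patch):

#include <stdint.h>

/* Keeping the destination as wide as the popcount input avoids the
   narrowing (vec_pack_trunc) step mentioned above.  */
void
popcounts (uint64_t *restrict dst, const uint64_t *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = __builtin_popcountll (src[i]);
}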


I am gonna drop this patch since it's meaningless.


But why?  It can still help even if we can improve on the sequence.
IMHO you can go ahead with it and just change int -> uint64_t in the
tests.
It'd also be interesting to see if those popcounts in deepsjeng are 
vectorizable.  We got a major boost in deepsjeng at a prior employer, 
but I can't remember if it was from getting the pcounts vectorized or 
just not doing stupid stuff with them on the scalar side.



jeff


Re: cpymem for RISCV with v extension

2023-08-04 Thread Jeff Law via Gcc-patches




On 7/17/23 22:47, Joern Rennecke wrote:

Subject:
cpymem for RISCV with v extension
From:
Joern Rennecke 
Date:
7/17/23, 22:47

To:
GCC Patches 


As discussed on last week's patch call, this patch uses either a
straight copy or an opaque pattern that emits the loop as assembly to
optimize cpymem for the 'v' extension.
I used Ju-Zhe Zhong's patch - starting in git with:

Author: zhongjuzhe<66454988+zhongju...@users.noreply.github.com>
Date:   Mon Mar 21 14:20:42 2022 +0800

   PR for RVV support using splitted small chunks (#334)

as a starting point, even though not all that much of the original code remains.

Regression tested on x86_64-pc-linux-gnu X
 riscv-sim
 
riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
 
riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
 
riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
 
riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
 
riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
 
riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
 
riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d


cpymem-diff-20230718.txt

2023-07-12  Ju-Zhe Zhong
 Joern Rennecke

* config/riscv/riscv-protos.h (riscv_vector::expand_block_move):
Declare.
* config/riscv/riscv-v.cc (riscv_vector::expand_block_move):
New function.
* config/riscv/riscv.md (cpymemsi): Use riscv_vector::expand_block_move.
* config/riscv/vector.md (@cpymem_straight):
New define_insn patterns.
(@cpymem_loop): Likewise.
(@cpymem_loop_fast): Likewise.




diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index b4884a30872..e61110fa3ad 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -49,6 +49,7 @@
  #include "tm-constrs.h"
  #include "rtx-vector-builder.h"
  #include "targhooks.h"
+#include "predict.h"
Not sure this is needed, but I didn't scan for it explicitly.  If it's 
not needed, then remove it.





+  if (CONST_INT_P (length_in))
+{
+  HOST_WIDE_INT length = INTVAL (length_in);
+
+/* By using LMUL=8, we can copy as many bytes in one go as there
+   are bits in a vector register.  If the entire block thus fits,
+   we don't need a loop.  */
+if (length <= TARGET_MIN_VLEN)
+  {
+   need_loop = false;
+
+   /* If a single scalar load / store pair can do the job, leave it
+  to the scalar code to do that.  */
+
+   if (pow2p_hwi (length) && length <= potential_ew)
+ return false;
+  }
We could probably argue over the threshold for doing the copy on the 
scalar side, but I don't think it's necessary.  Once we start seeing V 
hardware we can revisit.




+
+  /* Find the vector mode to use.  Using the largest possible element
+size is likely to give smaller constants, and thus potentially
+reducing code size.  However, if we need a loop, we need to update
+the pointers, and that is more complicated with a larger element
+size, unless we use an immediate, which prevents us from dynamically
+using the largets transfer size that the hart supports.  And then,
+unless we know the*exact*  vector size of the hart, we'd need
+multiple vsetvli / branch statements, so it's not even a size win.
+If, in the future, we find an RISCV-V implementation that is slower
+for small element widths, we might allow larger element widths for
+loops too.  */

s/largets/largest/

And a space is missing in "the*exact*".

Note that I think the proposed glibc copier does allow larger elements 
widths for this case.



+
+ /* Unless we get an implementation that's slow for small element
+size / non-word-aligned accesses, we assume that the hardware
+handles this well, and we don't want to complicate the code
+with shifting word contents around or handling extra bytes at
+the start and/or end.  So we want the total transfer size and
+alignemnt to fit with the element size.  */

s/alignemnt/alignment/

Yes, let's not try to support every uarch we can envision and instead do 
a good job on the uarches we know about.  If a uarch with slow element 
or non-word aligned accesses comes along, they can propose changes at 
that time.





+
+ // The VNx*?I modes have a factor of riscv_vector_chunks for nunits.
Comment might need updating after the recent work to adjust 

[pushed] analyzer: handle function attribute "alloc_size" [PR110426]

2023-08-04 Thread David Malcolm via Gcc-patches
This patch makes -fanalyzer make use of the function attribute
"alloc_size", allowing -fanalyzer to emit -Wanalyzer-allocation-size,
-Wanalyzer-out-of-bounds, and -Wanalyzer-tainted-allocation-size on
execution paths involving allocations using such functions.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r14-3001-g021077b94741c9.
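For illustration, a hedged sketch of the kind of user allocator the analyzer can now reason about via its alloc_size attribute; the function and variable names are made up for this example and the diagnostics shown are not taken verbatim from the new tests.

#include <stddef.h>

/* A user allocator annotated with alloc_size: argument 1 is the byte count.  */
extern void *my_alloc (size_t nbytes) __attribute__ ((alloc_size (1)));

int *
make_buffer (size_t n)
{
  /* Allocating n bytes but treating the result as n ints is the kind of
     mismatch -fanalyzer can now flag with -Wanalyzer-allocation-size.  */
  int *p = (int *) my_alloc (n);
  return p;
}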

gcc/analyzer/ChangeLog:
PR analyzer/110426
* bounds-checking.cc (region_model::check_region_bounds): Handle
symbolic base regions.
* call-details.cc: Include "stringpool.h" and "attribs.h".
(call_details::lookup_function_attribute): New function.
* call-details.h (call_details::lookup_function_attribute): New
function decl.
* region-model-manager.cc
(region_model_manager::maybe_fold_binop): Add reference to
PR analyzer/110902.
* region-model-reachability.cc (reachable_regions::handle_sval):
Add symbolic regions for pointers that are conjured svalues for
the LHS of a stmt.
* region-model.cc (region_model::canonicalize): Purge dynamic
extents for regions that aren't referenced.
(get_result_size_in_bytes): New function.
(region_model::on_call_pre): Use get_result_size_in_bytes and
potentially set the dynamic extents of the region pointed to by
the return value.
(region_model::deref_rvalue): Add param "add_nonnull_constraint"
and use it to conditionalize adding the constraint.
(pending_diagnostic_subclass::dubious_allocation_size): Add "stmt"
param to both ctors and use it to initialize new "m_stmt" field.
(pending_diagnostic_subclass::operator==): Use m_stmt; don't use
m_lhs or m_rhs.
(pending_diagnostic_subclass::m_stmt): New field.
(region_model::check_region_size): Generalize to any kind of
pointer svalue by using deref_rvalue rather than checking for
region_svalue.  Pass stmt to dubious_allocation_size ctor.
* region-model.h (region_model::deref_rvalue): Add param
"add_nonnull_constraint".
* svalue.cc (conjured_svalue::lhs_value_p): New function.
* svalue.h (conjured_svalue::lhs_value_p): New decl.

gcc/testsuite/ChangeLog:
PR analyzer/110426
* gcc.dg/analyzer/allocation-size-1.c: Update expected message to
reflect consolidation of size and assignment into a single event.
* gcc.dg/analyzer/allocation-size-2.c: Likewise.
* gcc.dg/analyzer/allocation-size-3.c: Likewise.
* gcc.dg/analyzer/allocation-size-4.c: Likewise.
* gcc.dg/analyzer/allocation-size-multiline-1.c: Likewise.
* gcc.dg/analyzer/allocation-size-multiline-2.c: Likewise.
* gcc.dg/analyzer/allocation-size-multiline-3.c: Likewise.
* gcc.dg/analyzer/attr-alloc_size-1.c: New test.
* gcc.dg/analyzer/attr-alloc_size-2.c: New test.
* gcc.dg/analyzer/attr-alloc_size-3.c: New test.
* gcc.dg/analyzer/explode-4.c: New test.
* gcc.dg/analyzer/taint-size-1.c: Add test coverage for
__attribute__ alloc_size.
---
 gcc/analyzer/bounds-checking.cc   |  12 +-
 gcc/analyzer/call-details.cc  |  21 +++
 gcc/analyzer/call-details.h   |   2 +
 gcc/analyzer/region-model-manager.cc  |   2 +
 gcc/analyzer/region-model-reachability.cc |  21 +++
 gcc/analyzer/region-model.cc  | 109 ++--
 gcc/analyzer/region-model.h   |   3 +-
 gcc/analyzer/svalue.cc|  11 ++
 gcc/analyzer/svalue.h |   1 +
 .../gcc.dg/analyzer/allocation-size-1.c   |   3 +-
 .../gcc.dg/analyzer/allocation-size-2.c   |   3 +-
 .../gcc.dg/analyzer/allocation-size-3.c   |   9 +-
 .../gcc.dg/analyzer/allocation-size-4.c   |   6 +-
 .../analyzer/allocation-size-multiline-1.c|  12 +-
 .../analyzer/allocation-size-multiline-2.c|  15 +-
 .../analyzer/allocation-size-multiline-3.c|  10 +-
 .../gcc.dg/analyzer/attr-alloc_size-1.c   |  81 +
 .../gcc.dg/analyzer/attr-alloc_size-2.c   |  19 +++
 .../gcc.dg/analyzer/attr-alloc_size-3.c   |  14 ++
 gcc/testsuite/gcc.dg/analyzer/explode-4.c | 157 ++
 gcc/testsuite/gcc.dg/analyzer/taint-size-1.c  |  10 ++
 21 files changed, 458 insertions(+), 63 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/attr-alloc_size-1.c
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/attr-alloc_size-2.c
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/attr-alloc_size-3.c
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/explode-4.c

diff --git a/gcc/analyzer/bounds-checking.cc b/gcc/analyzer/bounds-checking.cc
index 5e8de9a7aa5..f49cf7cf2af 100644
--- a/gcc/analyzer/bounds-checking.cc
+++ b/gcc/analyzer/bounds-checking.cc
@@ -981,12 +981,6 @@ region_model::check_region_bounds (const region *reg,
   region_offset reg_offset = 

[pushed] analyzer: fix some svalue::dump_to_pp implementations

2023-08-04 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r14-3000-g187b213ddbe7ea.

gcc/analyzer/ChangeLog:
* svalue.cc (region_svalue::dump_to_pp): Support NULL type.
(constant_svalue::dump_to_pp): Likewise.
(initial_svalue::dump_to_pp): Likewise.
(conjured_svalue::dump_to_pp): Likewise.  Fix missing print of the
type.
---
 gcc/analyzer/svalue.cc | 27 ---
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/gcc/analyzer/svalue.cc b/gcc/analyzer/svalue.cc
index 4395018dbc3..5d5c80f88c6 100644
--- a/gcc/analyzer/svalue.cc
+++ b/gcc/analyzer/svalue.cc
@@ -714,8 +714,11 @@ region_svalue::dump_to_pp (pretty_printer *pp, bool 
simple) const
   else
 {
   pp_string (pp, "region_svalue(");
-  print_quoted_type (pp, get_type ());
-  pp_string (pp, ", ");
+  if (get_type ())
+   {
+ print_quoted_type (pp, get_type ());
+ pp_string (pp, ", ");
+   }
   m_reg->dump_to_pp (pp, simple);
   pp_string (pp, ")");
 }
@@ -811,8 +814,11 @@ constant_svalue::dump_to_pp (pretty_printer *pp, bool 
simple) const
   else
 {
   pp_string (pp, "constant_svalue(");
-  print_quoted_type (pp, get_type ());
-  pp_string (pp, ", ");
+  if (get_type ())
+   {
+ print_quoted_type (pp, get_type ());
+ pp_string (pp, ", ");
+   }
   dump_tree (pp, m_cst_expr);
   pp_string (pp, ")");
 }
@@ -1029,8 +1035,11 @@ initial_svalue::dump_to_pp (pretty_printer *pp, bool 
simple) const
   else
 {
   pp_string (pp, "initial_svalue(");
-  print_quoted_type (pp, get_type ());
-  pp_string (pp, ", ");
+  if (get_type ())
+   {
+ print_quoted_type (pp, get_type ());
+ pp_string (pp, ", ");
+   }
   m_reg->dump_to_pp (pp, simple);
   pp_string (pp, ")");
 }
@@ -1910,7 +1919,11 @@ conjured_svalue::dump_to_pp (pretty_printer *pp, bool 
simple) const
   else
 {
   pp_string (pp, "conjured_svalue (");
-  pp_string (pp, ", ");
+  if (get_type ())
+   {
+ print_quoted_type (pp, get_type ());
+ pp_string (pp, ", ");
+   }
   pp_gimple_stmt_1 (pp, m_stmt, 0, (dump_flags_t)0);
   pp_string (pp, ", ");
   m_id_reg->dump_to_pp (pp, simple);
-- 
2.26.3



Re: [committed][RISC-V] Fix 20010221-1.c with zicond

2023-08-04 Thread Jeff Law via Gcc-patches




On 8/4/23 03:29, Xiao Zeng wrote:

On Thu, Aug 03, 2023 at 01:20:00 AM  Jeff Law  wrote:




In the two incorrect optimization cases, I only considered the
case of satisfying the ELSE branch, but in fact, as in the two correct
optimization cases, I should have considered the case of satisfying
both the THEN and ELSE branches.
It happens -- we all make mistakes.  FWIW I didn't spot it during the 
review either.





By the way, I was assigned other tasks during the week and
didn't have time to reply to emails, sorry.
No worries.  I'm trying to keep this moving because we have multiple 
submissions from different authors in this space as well as bits 
internal to Ventana.  That's a recipe for a messy integration phase if 
it's not well managed.


It's also something I kept meaning to resolve and your submission just 
gave me the proper motivation to move zicond forward.  The target 
specific bits you did lined up perfectly with the community feedback on 
the original VRULL implementation as well as the direction Ventana had 
taken on our internal tree.


Jeff


[V2][PATCH 3/3] Use the counted_by attribute information in bound sanitizer[PR108896]

2023-08-04 Thread Qing Zhao via Gcc-patches
gcc/c-family/ChangeLog:

PR C/108896
* c-ubsan.cc (ubsan_instrument_bounds): Use counted_by attribute
information.

gcc/testsuite/ChangeLog:

PR C/108896
* gcc.dg/ubsan/flex-array-counted-by-bounds.c: New test.
* gcc.dg/ubsan/flex-array-counted-by-bounds-2.c: New test.
---
 gcc/c-family/c-ubsan.cc   | 16 +++
 .../ubsan/flex-array-counted-by-bounds-2.c| 27 +++
 .../ubsan/flex-array-counted-by-bounds.c  | 46 +++
 3 files changed, 89 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds-2.c
 create mode 100644 gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds.c

diff --git a/gcc/c-family/c-ubsan.cc b/gcc/c-family/c-ubsan.cc
index 51aa83a378d2..a99e8433069f 100644
--- a/gcc/c-family/c-ubsan.cc
+++ b/gcc/c-family/c-ubsan.cc
@@ -362,6 +362,10 @@ ubsan_instrument_bounds (location_t loc, tree array, tree 
*index,
 {
   tree type = TREE_TYPE (array);
   tree domain = TYPE_DOMAIN (type);
+  /* whether the array ref is a flexible array member with valid counted_by
+ attribute.  */
+  bool fam_has_count_attr = false;
+  tree counted_by = NULL_TREE;
 
   if (domain == NULL_TREE)
 return NULL_TREE;
@@ -375,6 +379,17 @@ ubsan_instrument_bounds (location_t loc, tree array, tree 
*index,
  && COMPLETE_TYPE_P (type)
  && integer_zerop (TYPE_SIZE (type)))
bound = build_int_cst (TREE_TYPE (TYPE_MIN_VALUE (domain)), -1);
+  /* If the array ref is to flexible array member field which has
+counted_by attribute.  We can use the information from the
+attribute as the bound to instrument the reference.  */
+  else if ((counted_by = component_ref_get_counted_by (array))
+   != NULL_TREE)
+   {
+ fam_has_count_attr = true;
+ bound = fold_build2 (MINUS_EXPR, TREE_TYPE (counted_by),
+  counted_by,
+  build_int_cst (TREE_TYPE (counted_by), 1));
+   }
   else
return NULL_TREE;
 }
@@ -387,6 +402,7 @@ ubsan_instrument_bounds (location_t loc, tree array, tree 
*index,
  -fsanitize=bounds-strict.  */
   tree base = get_base_address (array);
   if (!sanitize_flags_p (SANITIZE_BOUNDS_STRICT)
+  && !fam_has_count_attr
   && TREE_CODE (array) == COMPONENT_REF
   && base && (INDIRECT_REF_P (base) || TREE_CODE (base) == MEM_REF))
 {
diff --git a/gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds-2.c 
b/gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds-2.c
new file mode 100644
index ..77ec333509d0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds-2.c
@@ -0,0 +1,27 @@
+/* test the attribute counted_by and its usage in
+   bounds sanitizer combined with VLA.  */
+/* { dg-do run } */
+/* { dg-options "-fsanitize=bounds" } */
+
+#include 
+
+void __attribute__((__noinline__)) setup_and_test_vla (int n, int m)
+{
+   struct foo {
+   int n;
+   int p[][n] __attribute__((counted_by(n)));
+   } *f;
+
+   f = (struct foo *) malloc (sizeof(struct foo) + m*sizeof(int[n]));
+   f->n = m;
+   f->p[m][n-1]=1;
+   return;
+}
+
+int main(int argc, char *argv[])
+{
+  setup_and_test_vla (10, 11);
+  return 0;
+}
+
+/* { dg-output "17:8: runtime error: index 11 out of bounds for type" } */
diff --git a/gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds.c 
b/gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds.c
new file mode 100644
index ..81eaeb3f2681
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds.c
@@ -0,0 +1,46 @@
+/* test the attribute counted_by and its usage in
+   bounds sanitizer.  */
+/* { dg-do run } */
+/* { dg-options "-fsanitize=bounds" } */
+
+#include 
+
+struct flex {
+  int b;
+  int c[];
+} *array_flex;
+
+struct annotated {
+  int b;
+  int c[] __attribute__ ((counted_by (b)));
+} *array_annotated;
+
+void __attribute__((__noinline__)) setup (int normal_count, int 
annotated_count)
+{
+  array_flex
+= (struct flex *)malloc (sizeof (struct flex)
++ normal_count *  sizeof (int));
+  array_flex->b = normal_count;
+
+  array_annotated
+= (struct annotated *)malloc (sizeof (struct annotated)
+ + annotated_count *  sizeof (int));
+  array_annotated->b = annotated_count;
+
+  return;
+}
+
+void __attribute__((__noinline__)) test (int normal_index, int annotated_index)
+{
+  array_flex->c[normal_index] = 1;
+  array_annotated->c[annotated_index] = 2;
+}
+
+int main(int argc, char *argv[])
+{
+  setup (10, 10);   
+  test (10, 10);
+  return 0;
+}
+
+/* { dg-output "36:21: runtime error: index 10 out of bounds for type" } */
-- 
2.31.1



[V2][PATCH 2/3] Use the counted_by atribute info in builtin object size [PR108896]

2023-08-04 Thread Qing Zhao via Gcc-patches
gcc/ChangeLog:

PR C/108896
* tree-object-size.cc (addr_object_size): Use the counted_by
attribute info.
* tree.cc (component_ref_has_counted_by_p): New function.
(component_ref_get_counted_by): New function.
* tree.h (component_ref_has_counted_by_p): New prototype.
(component_ref_get_counted_by): New prototype.

gcc/testsuite/ChangeLog:

PR C/108896
* gcc.dg/flex-array-counted-by-2.c: New test.
* gcc.dg/flex-array-counted-by-3.c: New test.
---
 .../gcc.dg/flex-array-counted-by-2.c  |  74 +++
 .../gcc.dg/flex-array-counted-by-3.c  | 197 ++
 gcc/tree-object-size.cc   |  37 +++-
 gcc/tree.cc   |  95 -
 gcc/tree.h|  10 +
 5 files changed, 405 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/flex-array-counted-by-2.c
 create mode 100644 gcc/testsuite/gcc.dg/flex-array-counted-by-3.c

diff --git a/gcc/testsuite/gcc.dg/flex-array-counted-by-2.c 
b/gcc/testsuite/gcc.dg/flex-array-counted-by-2.c
new file mode 100644
index ..ec580c1f1f01
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/flex-array-counted-by-2.c
@@ -0,0 +1,74 @@
+/* test the attribute counted_by and its usage in
+ * __builtin_dynamic_object_size.  */ 
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "builtin-object-size-common.h"
+
+#define expect(p, _v) do { \
+size_t v = _v; \
+if (p == v) \
+   __builtin_printf ("ok:  %s == %zd\n", #p, p); \
+else \
+   {  \
+ __builtin_printf ("WAT: %s == %zd (expected %zd)\n", #p, p, v); \
+ FAIL (); \
+   } \
+} while (0);
+
+struct flex {
+  int b;
+  int c[];
+} *array_flex;
+
+struct annotated {
+  int b;
+  int c[] __attribute__ ((counted_by (b)));
+} *array_annotated;
+
+struct nested_annotated {
+  struct {
+union {
+  int b;
+  float f; 
+};
+int n;
+  };
+  int c[] __attribute__ ((counted_by (b)));
+} *array_nested_annotated;
+
+void __attribute__((__noinline__)) setup (int normal_count, int attr_count)
+{
+  array_flex
+= (struct flex *)malloc (sizeof (struct flex)
++ normal_count *  sizeof (int));
+  array_flex->b = normal_count;
+
+  array_annotated
+= (struct annotated *)malloc (sizeof (struct annotated)
+ + attr_count *  sizeof (int));
+  array_annotated->b = attr_count;
+
+  array_nested_annotated
+= (struct nested_annotated *)malloc (sizeof (struct nested_annotated)
++ attr_count *  sizeof (int));
+  array_nested_annotated->b = attr_count;
+
+  return;
+}
+
+void __attribute__((__noinline__)) test ()
+{
+expect(__builtin_dynamic_object_size(array_flex->c, 1), -1);
+expect(__builtin_dynamic_object_size(array_annotated->c, 1),
+  array_annotated->b * sizeof (int));
+expect(__builtin_dynamic_object_size(array_nested_annotated->c, 1),
+  array_nested_annotated->b * sizeof (int));
+}
+
+int main(int argc, char *argv[])
+{
+  setup (10,10);   
+  test ();
+  DONE ();
+}
diff --git a/gcc/testsuite/gcc.dg/flex-array-counted-by-3.c 
b/gcc/testsuite/gcc.dg/flex-array-counted-by-3.c
new file mode 100644
index ..22ef2af31c20
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/flex-array-counted-by-3.c
@@ -0,0 +1,197 @@
+/* test the attribute counted_by and its usage in
+__builtin_dynamic_object_size: what's the correct behavior when the allocaiton
+size mismatched with the value of counted_by attribute?  */
+/* { dg-do run } */
+/* { dg-options "-O -fstrict-flex-arrays=3" } */
+
+#include "builtin-object-size-common.h"
+
+struct annotated {
+  size_t foo;
+  int array[] __attribute__((counted_by (foo)));
+};
+
+#define expect(p, _v) do { \
+size_t v = _v; \
+if (p == v) \
+__builtin_printf ("ok:  %s == %zd\n", #p, p); \
+else \
+{  \
+  __builtin_printf ("WAT: %s == %zd (expected %zd)\n", #p, p, v); \
+ FAIL (); \
+} \
+} while (0);
+
+#define noinline __attribute__((__noinline__))
+#define SIZE_BUMP 5
+
+/* In general, Due to type casting, the type for the pointee of a pointer
+   does not say anything about the object it points to,
+   So, __builtin_object_size can not directly use the type of the pointee
+   to decide the size of the object the pointer points to.
+
+   there are only two reliable ways:
+   A. observed allocations  (call to the allocation functions in the routine)
+   B. observed accesses (read or write access to the location of the
+ pointer points to)
+
+   that provide information about the type/existence of an object at
+   the corresponding address.
+
+   for A, we use the "alloc_size" attribute for the corresponding allocation
+   functions to determine the object size;
+
+   For B, we use the SIZE info of the TYPE attached to the corresponding 
access.
+   

[V2][PATCH 1/3] Provide counted_by attribute to flexible array member field (PR108896)

2023-08-04 Thread Qing Zhao via Gcc-patches
'counted_by (COUNT)'
 The 'counted_by' attribute may be attached to the flexible array
 member of a structure.  It indicates that the number of the
 elements of the array is given by the field named "COUNT" in the
 same structure as the flexible array member.  GCC uses this
 information to improve the results of the array bound sanitizer and
 the '__builtin_dynamic_object_size'.

 For instance, the following code:

  struct P {
size_t count;
int array[] __attribute__ ((counted_by (count)));
  } *p;

 specifies that the 'array' is a flexible array member whose number
 of elements is given by the field 'count' in the same structure.

 The field that represents the number of the elements should have an
 integer type.  An explicit 'counted_by' annotation defines a
 relationship between two objects, 'p->array' and 'p->count', that
 'p->array' has _at least_ 'p->count' number of elements available.
 This relationship must hold even after any of these related objects
 are updated.  It's the user's responsibility to make sure this
 relationship to be kept all the time.  Otherwise the results of the
 array bound sanitizer and the '__builtin_dynamic_object_size' might
 be incorrect.

 For instance, in the following example, the allocated array has
 less elements than what's specified by the 'sbuf->count', this is
 an user error.  As a result, out-of-bounds access to the array
 might not be detected.

  #define SIZE_BUMP 10
  struct P *sbuf;
  void alloc_buf (size_t nelems)
  {
sbuf = (int *) malloc (sizeof (struct P) + sizeof (int) * nelems);
sbuf->count = nelems + SIZE_BUMP;
/* This is invalid when the sbuf->array has less than sbuf->count
   elements.  */
  }

 In the following example, the 2nd update to the field 'sbuf->count'
 of the above structure will permit out-of-bounds access to the
 array 'sbuf->array' as well.

  #define SIZE_BUMP 10
  struct P *sbuf;
  void alloc_buf (size_t nelems)
  {
sbuf = (int *) malloc (sizeof (struct P)
 + sizeof (int) * (nelems + SIZE_BUMP));
sbuf->count = nelems;
/* This is valid when the sbuf->array has at least sbuf->count
   elements.  */
  }
  void use_buf (int index)
  {
sbuf->count = sbuf->count + SIZE_BUMP + 1;
/* Now the value of sbuf->count is larger than the number
   of elements of sbuf->array.  */
sbuf->array[index] = 0;
/* then the out-of-bound access to this array
   might not be detected.  */
  }

gcc/c-family/ChangeLog:

PR C/108896
* c-attribs.cc (handle_counted_by_attribute): New function.
(attribute_takes_identifier_p): Add counted_by attribute to the list.
* c-common.cc (c_flexible_array_member_type_p): ...To this.
* c-common.h (c_flexible_array_member_type_p): New prototype.

gcc/c/ChangeLog:

PR C/108896
* c-decl.cc (flexible_array_member_type_p): Renamed and moved to...
(add_flexible_array_elts_to_size): Use renamed function.
(is_flexible_array_member_p): Use renamed function.
(verify_counted_by_attribute): New function.
(finish_struct): Use renamed function and verify counted_by
attribute.

gcc/ChangeLog:

PR C/108896
* doc/extend.texi: Document attribute counted_by.
* tree.cc (get_named_field): New function.
* tree.h (get_named_field): New prototype.

gcc/testsuite/ChangeLog:

PR C/108896
* gcc.dg/flex-array-counted-by.c: New test.
---
 gcc/c-family/c-attribs.cc| 54 -
 gcc/c-family/c-common.cc | 13 
 gcc/c-family/c-common.h  |  1 +
 gcc/c/c-decl.cc  | 79 +++-
 gcc/doc/extend.texi  | 73 ++
 gcc/testsuite/gcc.dg/flex-array-counted-by.c | 40 ++
 gcc/tree.cc  | 40 ++
 gcc/tree.h   |  5 ++
 8 files changed, 287 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/flex-array-counted-by.c

diff --git a/gcc/c-family/c-attribs.cc b/gcc/c-family/c-attribs.cc
index e2792ca6898b..65e4f6639109 100644
--- a/gcc/c-family/c-attribs.cc
+++ b/gcc/c-family/c-attribs.cc
@@ -103,6 +103,8 @@ static tree handle_warn_if_not_aligned_attribute (tree *, 
tree, tree,
  int, bool *);
 static tree handle_strict_flex_array_attribute (tree *, tree, tree,
 int, bool *);
+static tree handle_counted_by_attribute (tree *, tree, tree,
+  

[V2][PATCH 0/3] New attribute "counted_by" to annotate bounds for C99 FAM(PR108896)

2023-08-04 Thread Qing Zhao via Gcc-patches
Hi,

This is the 2nd version of the patch, per our discussion based on the
review comments for the 1st version, the major changes in this version
are:

1. change the name "element_count" to "counted_by";
2. change the parameter for the attribute from a STRING to an
Identifier;
3. Add logic and testing cases to handle anonymous structure/unions;
4. Clarify the documentation to permit the situation when the allocation
size is larger than what's specified by "counted_by"; at the same time,
it's a user error if the allocation size is smaller than what's specified
by "counted_by";
5. Add a complete testing case for using the counted_by attribute in
__builtin_dynamic_object_size when there is a mismatch between the
allocation size and the value of "counted_by", with the expected behavior
for each case and an explanation of why in the comments.

As discussed, I plan to add two more separate patch sets after this initial
patch set is approved and committed.

set 1. A new warning option and a new sanitizer option for the user error
   when the allocation size is smaller than the value of "counted_by".
set 2. An improvement to __builtin_dynamic_object_size  for the following
   case:

struct A
{
size_t foo;
int array[] __attribute__((counted_by (foo)));
};

extern struct A * alloc_buf ();

int main ()
{
struct A *p = alloc_buf ();
__builtin_object_size(p->array, 0) == sizeof(struct A) + p->foo * sizeof(int);
  /* with the current algorithm, it’s UNKNOWN */ 
__builtin_object_size(p->array, 2) == sizeof(struct A) + p->foo * sizeof(int);
  /* with the current algorithm, it’s UNKNOWN */
}

Bootstrapped and regression tested on both aarch64 and X86, no issue.

Please see more details on the description of this work on:

https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619708.html

Okay for committing?

thanks.

Qing

Qing Zhao (3):
  Provide counted_by attribute to flexible array member field (PR108896)
  Use the counted_by attribute info in builtin object size [PR108896]
  Use the counted_by attribute information in bound sanitizer[PR108896]

 gcc/c-family/c-attribs.cc |  54 -
 gcc/c-family/c-common.cc  |  13 ++
 gcc/c-family/c-common.h   |   1 +
 gcc/c-family/c-ubsan.cc   |  16 ++
 gcc/c/c-decl.cc   |  79 +--
 gcc/doc/extend.texi   |  73 +++
 .../gcc.dg/flex-array-counted-by-2.c  |  74 +++
 .../gcc.dg/flex-array-counted-by-3.c  | 197 ++
 gcc/testsuite/gcc.dg/flex-array-counted-by.c  |  40 
 .../ubsan/flex-array-counted-by-bounds-2.c|  27 +++
 .../ubsan/flex-array-counted-by-bounds.c  |  46 
 gcc/tree-object-size.cc   |  37 +++-
 gcc/tree.cc   | 133 
 gcc/tree.h|  15 ++
 14 files changed, 780 insertions(+), 25 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/flex-array-counted-by-2.c
 create mode 100644 gcc/testsuite/gcc.dg/flex-array-counted-by-3.c
 create mode 100644 gcc/testsuite/gcc.dg/flex-array-counted-by.c
 create mode 100644 gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds-2.c
 create mode 100644 gcc/testsuite/gcc.dg/ubsan/flex-array-counted-by-bounds.c

-- 
2.31.1



Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Qing Zhao via Gcc-patches


> On Aug 4, 2023, at 3:09 PM, Siddhesh Poyarekar  wrote:
> 
> On 2023-08-04 15:06, Qing Zhao wrote:
>>> Yes, that's what I'm thinking.
>>> 
> so `q` must be pointing to a single element.  So you could deduce:
> 
> 1. the minimum size of the whole object that q points to.
 You mean that the TYPE will determine the minimum size of the whole 
 object?  (Does this include the size of the flexible array member, or only 
 the other part of the structure except the flexible array member?)
>>> 
>>> Only the constant sized part of the structure.
>> Okay. I see.
>> But if the “counted_by” info is available, then from p->array, we can deduce 
>> the minimum size too, as sizeof(struct A) + q->foo * sizeof(int), right?
> 
> Yes.
> 
>>> 
> Actually for minimum size we'd also need a guarantee that 
> `alloc_buf_more` returns a valid allocated object.
 Why? Please explain a little bit here.
>>> 
>>> So `alloc_buf_more` could return NULL, a valid pointer or an invalid 
>>> pointer.  So, we could end up returning a non-zero minimum size for an 
>>> invalid or NULL pointer, which is incorrect, we don't know that.
>> I see what you mean now.
>> However, if we already see p->array, then the p is guaranteed a valid 
>> pointer and not a NULL, right?  (We are discussing on 
>> __builtin_dynamic_object_size (q->array, 2), we see q->array already)
> 
> Yes, you could argue that for p->array, I agree, but not for p.

Agreed. Yes, for p->array, observed access. -:)

Looks like we can improve __builtin_dynamic_object_size  for the following case:
struct A
{
 size_t foo;
 int array[] __attribute__((counted_by (foo)));
};

extern struct A * alloc_buf ();

int main ()
{
 struct A *p = alloc_buf ();
 __builtin_object_size(p->array, 0) == sizeof(struct A) + p->foo * sizeof(int); 
  /* with the current algorithm, it’s UNKNOWN */ 
 __builtin_object_size(p->array, 2) == sizeof(struct A) + p->foo * sizeof(int); 
  /* with the current algorithm, it’s UNKNOWN */
}

I will add this improvement to __builtin_dynamic_object_size for FAM with 
“counted_by” attribute in a later patch after the initial patch is committed.

Thanks a lot for the help.

Qing
> 
>>> 
>>> We won't need the object validity guarantee for (2) beyond, e.g. guarding 
>>> against a new NULL pointer dereference because it's a *maximum* estimate; 
>>> an invalid or NULL pointer would have 0 size.  So for such cases, __bos(q, 
>>> 0) could return
>>> 
>>> sizeof(*q) + (q ? q->foo:0)
>>> 
>>> and __bos(q->array, 0) could be
>>> 
>>> sizeof(*q) + q->foo - offsetof(q, array)
>>> 
>>> There's no need to guard against a dereference in the second case because 
>>> the q->array dereference already assumes that q is valid.
>> q->array should also guarantee that q is a valid pointer for minimum size, 
>> right? Or do I miss anything here?
> 
> Yes.
> 
> Thanks,
> Sid



Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Siddhesh Poyarekar

On 2023-08-04 15:06, Qing Zhao wrote:

Yes, that's what I'm thinking.


so `q` must be pointing to a single element.  So you could deduce:

1. the minimum size of the whole object that q points to.

You mean that the TYPE will determine the minimum size of the whole object?  
(Does this include the size of the flexible array member, or only the other 
part of the structure except the flexible array member?)


Only the constant sized part of the structure.

Okay. I see.
But if the “counted_by” info is available, then from p->array, we can deduce the 
minimum size too, as sizeof(struct A) + q->foo * sizeof(int), right?


Yes.




Actually for minimum size we'd also need a guarantee that `alloc_buf_more` 
returns a valid allocated object.

Why? Please explain a little bit here.


So `alloc_buf_more` could return NULL, a valid pointer or an invalid pointer.  
So, we could end up returning a non-zero minimum size for an invalid or NULL 
pointer, which is incorrect, we don't know that.


I see what you mean now.

However, if we already see p->array, then the p is guaranteed a valid pointer and not 
a NULL, right?  (We are discussing on __builtin_dynamic_object_size (q->array, 2), we 
see q->array already)


Yes, you could argue that for p->array, I agree, but not for p.



We won't need the object validity guarantee for (2) beyond, e.g. guarding 
against a new NULL pointer dereference because it's a *maximum* estimate; an 
invalid or NULL pointer would have 0 size.  So for such cases, __bos(q, 0) 
could return

sizeof(*q) + (q ? q->foo:0)

and __bos(q->array, 0) could be

sizeof(*q) + q->foo - offsetof(q, array)

There's no need to guard against a dereference in the second case because the 
q->array dereference already assumes that q is valid.
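
One way to read the two estimates above, written out in bytes (a rough
sketch assuming the flexible-array struct A discussed earlier in the
thread):

  #include <stddef.h>

  struct A { size_t foo; int array[]; };

  /* Maximum estimate for the whole object q points to.  */
  static size_t whole_max (struct A *q)
  { return sizeof (*q) + (q ? q->foo * sizeof (int) : 0); }

  /* Maximum estimate for q->array itself; no NULL guard needed since
     dereferencing q->array already assumes q is valid.  */
  static size_t array_max (struct A *q)
  { return sizeof (*q) + q->foo * sizeof (int) - offsetof (struct A, array); }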


q->array should also guarantee that q is a valid pointer for minimum size, 
right? Or do I miss anything here?


Yes.

Thanks,
Sid


Re: [committed][RISC-V] Remove errant hunk of code

2023-08-04 Thread Andrew Pinski via Gcc-patches
On Thu, Aug 3, 2023 at 10:31 PM Jeff Law via Gcc-patches
 wrote:
>
>
>
> On 8/3/23 17:38, Vineet Gupta wrote:
>
> >> ;-)  Actually if you wanted to poke at zicond, the most interesting
> >> unexplored area I've come across is the COND_EXPR handling in gimple.
> >> When we expand a COND_EXPR into RTL the first approach we take is to
> >> try movcc in RTL.
> >>
> >> Unfortunately we don't create COND_EXPRs all that often in gimple.
> >> Some simple match.pd patterns would likely really help here.
> >>
> >> The problem is RTL expansion when movcc FAILs is usually poor at
> >> best.  So if we're going to add those match.pd patterns, we probably
> >> need to beef up the RTL expansion code to do a better job when the
> >> target doesn't have a movcc RTL pattern.
> >
> > Ok, I'll add that to my todo list.
> You might want to reach out to Andrew Pinski if you do poke at this.  I
> made a reference to this issue in a BZ he recently commented on.  It was
> an x86 issue with cmov generation, but the same core issue applies --
> we're not generating COND_EXPRs very aggressively in gimple.

Yes, I have some ideas about producing COND_EXPRs more aggressively in
either isel or in the last phiopt.
There is also a canonicalization-form issue dealing with `bool * b`
representing `bool ? b : 0`, where isel could select between the
COND_EXPR and the multiply too.
This is the issue Jeff is talking about too.
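
For reference, a minimal illustration of the two equivalent forms in
question (illustrative only):

  /* `bool * b` form.  */
  int as_multiply (_Bool b, int x) { return b * x; }

  /* `bool ? b : 0` (COND_EXPR) form.  */
  int as_cond (_Bool b, int x) { return b ? x : 0; }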

Thanks,
Andrew

>
> jeff


Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Qing Zhao via Gcc-patches


> On Aug 4, 2023, at 12:36 PM, Siddhesh Poyarekar  wrote:
> 
> On 2023-08-04 11:27, Qing Zhao wrote:
>>> On Aug 4, 2023, at 10:40 AM, Siddhesh Poyarekar  wrote:
>>> 
>>> On 2023-08-03 13:34, Qing Zhao wrote:
 One thing I need to point out first is, currently, even for regular fixed 
 size array in the structure,
 We have this same issue, for example:
 #define LENGTH 10
 struct fix {
   size_t foo;
   int array[LENGTH];
 };
 …
 int main ()
 {
   struct fix *p;
   p = alloc_buf_more ();
   expect(__builtin_object_size(p->array, 1), LENGTH * sizeof(int));
   expect(__builtin_object_size(p->array, 0), -1);
 }
 Currently, for __builtin_object_size(p->array, 0),  GCC return UNKNOWN for 
 it.
 This is not a special issue for flexible array member.
>>> 
>>> That's fine for fixed arrays at the end of a struct because the "whole 
>>> object" size could be anything; `p` could be pointing to the beginning of 
>>> an array for all we know.  If however `array` is strictly a flex array, 
>>> i.e.:
>>> 
>>> ```
>>> struct A
>>> {
>>>  size_t foo;
>>>  int array[];
>>> };
>>> ```
>>> 
>>> then there's no way in valid C to have an array of `struct fix`,
>> Yes!!   this is exactly the place that makes difference between structures 
>> with fixed arrays and the ones with flexible arrays.
>> With such difference, I guess that using the type of the structure with 
>> flexible array member for p->array to get the size of the whole object p 
>> point to might be reasonable?
> 
> Yes, that's what I'm thinking.
> 
>>> so `q` must be pointing to a single element.  So you could deduce:
>>> 
>>> 1. the minimum size of the whole object that q points to.
>> You mean that the TYPE will determine the minimum size of the whole object?  
>> (Does this include the size of the flexible array member, or only the other 
>> part of the structure except the flexible array member?)
> 
> Only the constant sized part of the structure.
Okay. I see.
But if the “counted_by” info is available, then from p->array, we can deduce 
the minimum size too, as sizeof(struct A) + q->foo * sizeof(int), right?
> 
>>> Actually for minimum size we'd also need a guarantee that `alloc_buf_more` 
>>> returns a valid allocated object.
>> Why? Please explain a little bit here.
> 
> So `alloc_buf_more` could return NULL, a valid pointer or an invalid pointer. 
>  So, we could end up returning a non-zero minimum size for an invalid or NULL 
> pointer, which is incorrect, we don't know that.

I see what you mean now.

However, if we already see p->array, then the p is guaranteed a valid pointer 
and not a NULL, right?  (We are discussing on __builtin_dynamic_object_size 
(q->array, 2), we see q->array already)

> 
> We won't need the object validity guarantee for (2) beyond, e.g. guarding 
> against a new NULL pointer dereference because it's a *maximum* estimate; an 
> invalid or NULL pointer would have 0 size.  So for such cases, __bos(q, 0) 
> could return
> 
> sizeof(*q) + (q ? q->foo:0)
> 
> and __bos(q->array, 0) could be
> 
> sizeof(*q) + q->foo - offsetof(q, array)
> 
> There's no need to guard against a dereference in the second case because the 
> q->array dereference already assumes that q is valid.

q->array should also guarantee that q is a valid pointer for minimum size, 
right? Or do I miss anything here?

thanks.

Qing
> 
>>> 
>>> and
>>> 
>>> 2. if you're able to determine the size of the flex array (through 
>>> __element_count__(foo) for example), you could even determine the maximum 
>>> size of the whole object.
>>> 
>>> For (2) though, you'd break applications that overallocate and then expect 
>>> to be able to use that overallocation despite the space not being reflected 
>>> in the __element_count__.  I think it's a bug in the application and I 
>>> can't see a way for an application to be able to do this in a valid way so 
>>> I'm inclined towards breaking it.
>> Currently, we allow the situation when the allocation size for the whole 
>> object is larger than the value reflected in the “counted_by” attribute (the 
>> old name is __element_count__).  But we don’t allow the other way around (i.e.,
>> when the allocation size for the whole object is smaller than the value
>> reflected in the “counted_by” attribute).
> 
> Right, that's going to be the "break".  For underallocation __bos will only 
> end up overestimating the space available, which is not ideal, but won't end 
> up breaking compatibility.
> 
>>> 
>>> Of course, the fact that gcc allows flex arrays to be in the middle of 
>>> structs breaks the base assumption but that's something we need to get rid 
>>> of anyway since there's no way for valid C programs to use that safely.
>> Since GCC14, we started to deprecate this extension (allow flex array to be 
>> in the middle of structs).
>> https://gcc.gnu.org/pipermail/gcc-cvs/2023-June/385730.html
> 
> Yes, that's what I'm banking on.
> 
> Thanks,
> Sid



[PATCH] _Decimal* to _BitInt conversion support [PR102989]

2023-08-04 Thread Jakub Jelinek via Gcc-patches
Hi!

Repost because the patch was too large.

On Fri, Jul 28, 2023 at 06:03:33PM +, Joseph Myers wrote:
> Note that representations with too-large significand are defined to be
> noncanonical representations of zero, so you need to take care of that in
> decoding BID.

Done.

> You could e.g. have a table up to 10^(N-1) for some N, and 10^N, 10^2N
> etc. up to 10^6144 (or rather up to 10^6111, which can then be multiplied
> by a 34-digit integer significand), so that only one multiplication is
> needed to get the power of 10 and then a second multiplication by the
> significand.  (Or split into three parts at the cost of an extra
> multiplication, or multiply the significand by 1, 10, 100, 1000 or 1
> as a multiplication within 128 bits and so only need to compute 10^k for k
> a multiple of 5, or any number of variations on those themes.)

So, I've used N 256 and applied a little further space optimization,
omitting least significant whole limbs full of just zeros (for 32-bit limbs
actually pairs of those limbs, otherwise it would be a nightmare).
With that I got down to 32KiB or so (32128 bytes the limb array and
560 bytes the offset array), tables generated such that they can be used
with both 32-bit and 64-bit limbs and both little and big endian ordering of
them.

The following patch implements for now just the _Decimal -> _BitInt
conversions and uses the soft-fp infrastructure to raise exceptions (not
heavily tight to that, if the bitint.h header from soft-fp is copied/tweaked
and a few typedefs are provided it could be also in libbid if it grows
usable exception support).

I'll work on _BitInt -> _Decimal next and hope to use the __bid_pow10bitint
function in there as well (guess some safe lower power of 10 divisor,
use __divmodbitint4 to divide by that power of 10 including computing
remainder, analyze that remainder (check if it is 0, exact half of the
power of 10, or something smaller or larger than that) and if guessed too
low, divide the usually much smaller quotient again to get exact answer
(+ again check remainder)).

The bitintpow10.c patch is included compressed, as the single source file is
408KiB.

2023-08-04  Jakub Jelinek  

PR c/102989
gcc/
* gimple-lower-bitint.cc (bitint_large_huge::lower_float_conv_stmt):
Handle _Decimal* to _BitInt conversions.
* internal-fn.cc (expand_FLOATTOBITINT): Likewise.
gcc/testsuite/
* gcc.dg/dfp/bitint-1.c: New test.
* gcc.dg/dfp/bitint-2.c: New test.
* gcc.dg/dfp/bitint-3.c: New test.
libgcc/
* config/t-softfp (softfp_bid_list, softfp_bid_file_list): New
variables.
(LIB2ADD_ST): Add $(softfp_bid_file_list).
* soft-fp/fixsdbitint.c: New file.
* soft-fp/fixddbitint.c: New file.
* soft-fp/fixtdbitint.c: New file.
* soft-fp/bitint.h (bitint_negate): New static inline function.
(__mulbitint3, __divmodbitint4, __bid_pow10bitint): Declare.
* soft-fp/bitintpow10.c: New file.

--- gcc/gimple-lower-bitint.cc.jj   2023-08-02 17:36:15.439915237 +0200
+++ gcc/gimple-lower-bitint.cc  2023-08-04 09:40:14.271005211 +0200
@@ -3363,8 +3363,7 @@ bitint_large_huge::lower_float_conv_stmt
   tree rhs1 = gimple_assign_rhs1 (stmt);
   tree lhs = gimple_assign_lhs (stmt);
   tree_code rhs_code = gimple_assign_rhs_code (stmt);
-  if (DECIMAL_FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (rhs1)))
-  || DECIMAL_FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (lhs
+  if (DECIMAL_FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (lhs
 {
   sorry_at (gimple_location (stmt),
"unsupported conversion between %<_BitInt(%d)%> and %qT",
--- gcc/internal-fn.cc.jj   2023-07-26 10:06:29.233849044 +0200
+++ gcc/internal-fn.cc  2023-08-04 09:47:58.368480546 +0200
@@ -4846,11 +4846,25 @@ expand_FLOATTOBITINT (internal_fn, gcall
   const char *mname = GET_MODE_NAME (mode);
   unsigned mname_len = strlen (mname);
   int len = 12 + mname_len;
+  if (DECIMAL_FLOAT_MODE_P (mode))
+len += 4;
   char *libfunc_name = XALLOCAVEC (char, len);
   char *p = libfunc_name;
   const char *q;
-  memcpy (p, "__fix", 5);
-  p += 5;
+  if (DECIMAL_FLOAT_MODE_P (mode))
+{
+#if ENABLE_DECIMAL_BID_FORMAT
+  memcpy (p, "__bid_fix", 9);
+#else
+  memcpy (p, "__dpd_fix", 9);
+#endif
+  p += 9;
+}
+  else
+{
+  memcpy (p, "__fix", 5);
+  p += 5;
+}
   for (q = mname; *q; q++)
 *p++ = TOLOWER (*q);
   memcpy (p, "bitint", 7);
--- gcc/testsuite/gcc.dg/dfp/bitint-1.c.jj  2023-08-04 14:30:24.615100334 +0200
+++ gcc/testsuite/gcc.dg/dfp/bitint-1.c 2023-08-04 19:37:26.834790279 +0200
@@ -0,0 +1,98 @@
+/* PR c/102989 */
+/* { dg-do run { target bitint } } */
+/* { dg-options "-O2 -std=c2x -pedantic-errors" } */
+
+#if __BITINT_MAXWIDTH__ >= 192
+__attribute__((noipa)) _BitInt(192)
+tests192 (_Decimal64 d)
+{
+  return d;
+}
+
+__attribute__((noipa)) unsigned _BitInt(192)
+testu192 (_Decimal64 d)
+{
+  return d;
+}
+#endif
+
+#if 

[PATCH] Add -Wdisabled-optimization warning for not optimizing sibling calls

2023-08-04 Thread Bradley Lucier via Gcc-patches
The patch at the end adds a warning when a tail/sibling call cannot be 
optimized for various reasons.


I built and tested GCC with and without the patch with configuration

Configured with: ../../gcc-mainline/configure --enable-languages=c 
--disable-multilib --prefix=/pkgs/gcc-mainline --disable-werror


There were some changes in the test results, but I can't say that they 
look substantive:


diff -C 2 summary.log ../gcc-mainline
*** summary.log Thu Aug  3 22:56:13 2023
--- ../gcc-mainline/summary.log Thu Aug  3 19:42:33 2023
***
*** 14,22 
=== g++ Summary ===

! # of expected passes  239234
  # of unexpected failures  5
  # of expected failures2087
! # of unsupported tests10566
! /home/lucier/programs/gcc/objdirs/gcc-mainline-new/gcc/xg++  version 
14.0.0 20230802 (experimental) (GCC)


=== gcc tests ===
--- 14,22 
=== g++ Summary ===

! # of expected passes  239262
  # of unexpected failures  5
  # of expected failures2087
! # of unsupported tests10562
! /home/lucier/programs/gcc/objdirs/gcc-mainline/gcc/xg++  version 
14.0.0 20230802 (experimental) (GCC)


=== gcc tests ===
***
*** 155,164 
=== gcc Summary ===

! # of expected passes  192553
  # of unexpected failures  109
  # of unexpected successes 19
  # of expected failures1506
! # of unsupported tests2623
! /home/lucier/programs/gcc/objdirs/gcc-mainline-new/gcc/xgcc  version 
14.0.0 20230802 (experimental) (GCC)


=== libatomic tests ===
--- 155,164 
=== gcc Summary ===

! # of expected passes  192563
  # of unexpected failures  109
  # of unexpected successes 19
  # of expected failures1506
! # of unsupported tests2619
! /home/lucier/programs/gcc/objdirs/gcc-mainline/gcc/xgcc  version 
14.0.0 20230802 (experimental) (GCC)


=== libatomic tests ===

I then configured and built GCC with

 ../../gcc-mainline/configure CXX="/pkgs/gcc-mainline-new/bin/g++ 
-Wdisabled-optimization" --enable-languages=c --disable-multilib 
--prefix=/pkgs/gcc-mainline-test --disable-werror --disable-bootstrap


to test the new warning.  The warnings are of the form, e.g.,

../../../gcc-mainline/gcc/tree-vect-stmts.cc:11990:44: warning: cannot 
apply sibling-call optimization: callee required more stack slots than 
the caller [-Wdisabled-optimization]


These are the numbers of times this warning was triggered while building stage1:

grep warning: build.log | grep sibling | sed 's/^.*://' | sort | uniq -c
259  callee required more stack slots than the caller 
[-Wdisabled-optimization]

 43  callee returns a structure [-Wdisabled-optimization]
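
For reference, a small example of the kind of call that can trigger the
second message when compiled with -O2 -Wdisabled-optimization
(illustrative only; whether the optimization is actually refused depends
on the target ABI):

  struct big { int a[32]; };
  struct big make_big (void);

  struct big
  forward (void)
  {
    /* Candidate sibling call; may be refused because the callee
       returns a structure in memory.  */
    return make_big ();
  }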

If this patch is OK, someone else will need to commit it for me.

Brad

gcc/ChangeLog:

* calls.cc (maybe_complain_about_tail_call): Add warning when
a tail or sibling call cannot be optimized.

diff --git a/gcc/calls.cc b/gcc/calls.cc
index 1f3a6d5c450..b95c876fda8 100644
--- a/gcc/calls.cc
+++ b/gcc/calls.cc
@@ -1242,10 +1242,12 @@ void
 maybe_complain_about_tail_call (tree call_expr, const char *reason)
 {
   gcc_assert (TREE_CODE (call_expr) == CALL_EXPR);
-  if (!CALL_EXPR_MUST_TAIL_CALL (call_expr))
-return;
-
-  error_at (EXPR_LOCATION (call_expr), "cannot tail-call: %s", reason);
+  if (CALL_EXPR_MUST_TAIL_CALL (call_expr))
+error_at (EXPR_LOCATION (call_expr), "cannot tail-call: %s", reason);
+  else if (flag_optimize_sibling_calls)
+warning (OPT_Wdisabled_optimization,
+ "cannot apply sibling-call optimization: %s", reason);
+  return;
 }

 /* Fill in ARGS_SIZE and ARGS array based on the parameters found in




Re: [PATCH] ipa-sra: Don't consider CLOBBERS as writes preventing splitting

2023-08-04 Thread Richard Biener via Gcc-patches



> On 04.08.2023 at 18:26, Martin Jambor wrote:
> 
> Hello,
> 
>> On Wed, Aug 02 2023, Richard Biener wrote:
>>> On Mon, Jul 31, 2023 at 7:05 PM Martin Jambor  wrote:
>>> 
>>> Hi,
>>> 
>>> when IPA-SRA detects whether a parameter passed by reference is
>>> written to, it does not special case CLOBBERs which means it often
>>> bails out unnecessarily, especially when dealing with C++ destructors.
>>> Fixed by the obvious continue in the two relevant loops.
>>> 
>>> The (slightly) more complex testcases in the PR need surprisingly more
>>> effort but the simple one can be fixed now easily by this patch and I'll
>>> work on the others incrementally.
>>> 
>>> Bootstrapped and currently undergoing testsuite run on x86_64-linux.  OK
>>> if it passes too?
>> 
>> LGTM, btw - how are the clobbers handled during transform?
> 
> it turns out your question is spot on.  I assumed that the mini-DCE that
> I implemented into the IPA-SRA transform would delete them, but I had a
> closer look and it is not invoked on split parameters, only on removed ones.
> What was actually happening is that the parameter got remapped to a
> default definition of a replacement VAR_DECL and we were thus
> gimple-clobbering a pointer pointing to nowhere.  The clobber then got
> DSEd and so I originally did not notice looking at the optimized dump.
> 
> Still that is of course not ideal and so I added a simple function
> removing clobbers when splitting.  I was considering adding that
> functionality to ipa_param_body_adjustments::mark_dead_statements but
> that would make the function harder to read without much gain.
> 
> So thanks again for the remark.  The following passes bootstrap and
> testing on x86_64-linux.  I am running LTO bootstrap now.  OK if it
> passes?

Ok

Richard 

> Martin
> 
> 
> 
> When IPA-SRA detects whether a parameter passed by reference is
> written to, it does not special case CLOBBERs which means it often
> bails out unnecessarily, especially when dealing with C++ destructors.
> Fixed by the obvious continue in the two relevant loops and by adding
> a simple function that marks the clobbers in the transformation code
> as statements to be removed.
> 
> gcc/ChangeLog:
> 
> 2023-08-04  Martin Jambor  
> 
>PR ipa/110378
>* ipa-param-manipulation.h (class ipa_param_body_adjustments): New
>members get_ddef_if_exists_and_is_used and mark_clobbers_dead.
>* ipa-sra.cc (isra_track_scalar_value_uses): Ignore clobbers.
>(ptr_parm_has_nonarg_uses): Likewise.
>* ipa-param-manipulation.cc
>(ipa_param_body_adjustments::get_ddef_if_exists_and_is_used): New.
>(ipa_param_body_adjustments::mark_dead_statements): Move initial
>checks to get_ddef_if_exists_and_is_used.
>(ipa_param_body_adjustments::mark_clobbers_dead): New.
>(ipa_param_body_adjustments::common_initialization): Call
>mark_clobbers_dead when splitting.
> 
> gcc/testsuite/ChangeLog:
> 
> 2023-07-31  Martin Jambor  
> 
>PR ipa/110378
>* g++.dg/ipa/pr110378-1.C: New test.
> ---
> gcc/ipa-param-manipulation.cc | 44 +---
> gcc/ipa-param-manipulation.h  |  2 ++
> gcc/ipa-sra.cc|  6 ++--
> gcc/testsuite/g++.dg/ipa/pr110378-1.C | 48 +++
> 4 files changed, 94 insertions(+), 6 deletions(-)
> create mode 100644 gcc/testsuite/g++.dg/ipa/pr110378-1.C
> 
> diff --git a/gcc/ipa-param-manipulation.cc b/gcc/ipa-param-manipulation.cc
> index a286af7f5d9..4a185ddbdf4 100644
> --- a/gcc/ipa-param-manipulation.cc
> +++ b/gcc/ipa-param-manipulation.cc
> @@ -1072,6 +1072,20 @@ ipa_param_body_adjustments::carry_over_param (tree t)
>   return new_parm;
> }
> 
> +/* If DECL is a gimple register that has a default definition SSA name and that
> +   has some uses, return the default definition, otherwise return NULL_TREE.  */
> +
> +tree
> +ipa_param_body_adjustments::get_ddef_if_exists_and_is_used (tree decl)
> +{
> + if (!is_gimple_reg (decl))
> +return NULL_TREE;
> +  tree ddef = ssa_default_def (m_id->src_cfun, decl);
> +  if (!ddef || has_zero_uses (ddef))
> +return NULL_TREE;
> +  return ddef;
> +}
> +
> /* Populate m_dead_stmts given that DEAD_PARAM is going to be removed without
>any replacement or splitting.  REPL is the replacement VAR_SECL to base any
>remaining uses of a removed parameter on.  Push all removed SSA names that
> @@ -1084,10 +1098,8 @@ ipa_param_body_adjustments::mark_dead_statements (tree dead_param,
>   /* Current IPA analyses which remove unused parameters never remove a
>  non-gimple register ones which have any use except as parameters in other
>  calls, so we can safely leve them as they are.  */
> -  if (!is_gimple_reg (dead_param))
> -return;
> -  tree parm_ddef = ssa_default_def (m_id->src_cfun, dead_param);
> -  if (!parm_ddef || has_zero_uses (parm_ddef))
> +  tree parm_ddef = get_ddef_if_exists_and_is_used (dead_param);
> +  if (!parm_ddef)
> return;
> 
>   auto_vec stack;
> @@ -1169,6 

Re: [C PATCH] _Generic should not warn in non-active branches [PR68193,PR97100]

2023-08-04 Thread Joseph Myers
On Fri, 4 Aug 2023, Martin Uecker via Gcc-patches wrote:

> Here is a patch to reduce false positives in _Generic.
> 
> Bootstrapped and regression tested on x86_64-linux.
> 
> Martin
> 
> c: _Generic should not warn in non-active branches [PR68193,PR97100]
> 
> To avoid false diagnostics, use c_inhibit_evaluation_warnings when
> a generic association is known to match during parsing.  We may still
> generate false positives if the default branch comes earler than
> a specific association that matches.
> 
> PR c/68193
> PR c/97100
> 
> gcc/c/:
> * c-parser.cc (c_parser_generic_selection): Inhibit evaluation
> warnings in branches that are known not to be taken during parsing.
> 
> gcc/testsuite/ChangeLog:
> * gcc.dg/pr68193.c: New test.

OK.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Siddhesh Poyarekar

On 2023-08-04 11:27, Qing Zhao wrote:




On Aug 4, 2023, at 10:40 AM, Siddhesh Poyarekar  wrote:

On 2023-08-03 13:34, Qing Zhao wrote:

One thing I need to point out first is, currently, even for regular fixed size 
array in the structure,
We have this same issue, for example:
#define LENGTH 10
struct fix {
   size_t foo;
   int array[LENGTH];
};
…
int main ()
{
   struct fix *p;
   p = alloc_buf_more ();
   expect(__builtin_object_size(p->array, 1), LENGTH * sizeof(int));
   expect(__builtin_object_size(p->array, 0), -1);
}
Currently, for __builtin_object_size(p->array, 0),  GCC return UNKNOWN for it.
This is not a special issue for flexible array member.


That's fine for fixed arrays at the end of a struct because the "whole object" 
size could be anything; `p` could be pointing to the beginning of an array for all we 
know.  If however `array` is strictly a flex array, i.e.:

```
struct A
{
  size_t foo;
  int array[];
};
```

then there's no way in valid C to have an array of `struct fix`,


Yes!!   this is exactly the place that makes difference between structures with 
fixed arrays and the ones with flexible arrays.

With such difference, I guess that using the type of the structure with flexible 
array member for p->array to get the size of the whole object p point to might 
be reasonable?


Yes, that's what I'm thinking.


so `q` must be pointing to a single element.  So you could deduce:

1. the minimum size of the whole object that q points to.


You mean that the TYPE will determine the minimum size of the whole object?  
(Does this include the size of the flexible array member, or only the other 
part of the structure except the flexible array member?)


Only the constant sized part of the structure.


Actually for minimum size we'd also need a guarantee that `alloc_buf_more` 
returns a valid allocated object.


Why? Please explain a little bit here.


So `alloc_buf_more` could return NULL, a valid pointer or an invalid 
pointer.  So, we could end up returning a non-zero minimum size for an 
invalid or NULL pointer, which is incorrect, we don't know that.


We won't need the object validity guarantee for (2) beyond, e.g. 
guarding against a new NULL pointer dereference because it's a *maximum* 
estimate; an invalid or NULL pointer would have 0 size.  So for such 
cases, __bos(q, 0) could return


sizeof(*q) + (q ? q->foo:0)

and __bos(q->array, 0) could be

sizeof(*q) + q->foo - offsetof(q, array)

There's no need to guard against a dereference in the second case 
because the q->array dereference already assumes that q is valid.




and

2. if you're able to determine the size of the flex array (through 
__element_count__(foo) for example), you could even determine the maximum size 
of the whole object.

For (2) though, you'd break applications that overallocate and then expect to 
be able to use that overallocation despite the space not being reflected in the 
__element_count__.  I think it's a bug in the application and I can't see a way 
for an application to be able to do this in a valid way so I'm inclined towards 
breaking it.


Currently, we allow the situation when the allocation size for the whole object 
is larger than the value reflected in the “counted_by” attribute (the old name 
is __element_count__).  But we don’t allow the other way around (i.e., when the
allocation size for the whole object is smaller than the value reflected in the
“counted_by” attribute).


Right, that's going to be the "break".  For underallocation __bos will 
only end up overestimating the space available, which is not ideal, but 
won't end up breaking compatibility.




Of course, the fact that gcc allows flex arrays to be in the middle of structs 
breaks the base assumption but that's something we need to get rid of anyway 
since there's no way for valid C programs to use that safely.


Since GCC14, we started to deprecate this extension (allow flex array to be in 
the middle of structs).
https://gcc.gnu.org/pipermail/gcc-cvs/2023-June/385730.html


Yes, that's what I'm banking on.

Thanks,
Sid


Re: [PATCH] ipa-sra: Don't consider CLOBBERS as writes preventing splitting

2023-08-04 Thread Martin Jambor
Hello,

On Wed, Aug 02 2023, Richard Biener wrote:
> On Mon, Jul 31, 2023 at 7:05 PM Martin Jambor  wrote:
>>
>> Hi,
>>
>> when IPA-SRA detects whether a parameter passed by reference is
>> written to, it does not special case CLOBBERs which means it often
>> bails out unnecessarily, especially when dealing with C++ destructors.
>> Fixed by the obvious continue in the two relevant loops.
>>
>> The (slightly) more complex testcases in the PR need surprisingly more
>> effort but the simple one can be fixed now easily by this patch and I'll
>> work on the others incrementally.
>>
>> Bootstrapped and currently undergoing testsuite run on x86_64-linux.  OK
>> if it passes too?
>
> LGTM, btw - how are the clobbers handled during transform?

it turns out your question is spot on.  I assumed that the mini-DCE that
I implemented into the IPA-SRA transform would delete them, but I had a
closer look and it is not invoked on split parameters, only on removed ones.
What was actually happening is that the parameter got remapped to a
default definition of a replacement VAR_DECL and we were thus
gimple-clobbering a pointer pointing to nowhere.  The clobber then got
DSEd and so I originally did not notice looking at the optimized dump.

Still that is of course not ideal and so I added a simple function
removing clobbers when splitting.  I was considering adding that
functionality to ipa_param_body_adjustments::mark_dead_statements but
that would make the function harder to read without much gain.

So thanks again for the remark.  The following passes bootstrap and
testing on x86_64-linux.  I am running LTO bootstrap now.  OK if it
passes?

Martin



When IPA-SRA detects whether a parameter passed by reference is
written to, it does not special case CLOBBERs which means it often
bails out unnecessarily, especially when dealing with C++ destructors.
Fixed by the obvious continue in the two relevant loops and by adding
a simple function that marks the clobbers in the transformation code
as statements to be removed.

gcc/ChangeLog:

2023-08-04  Martin Jambor  

PR ipa/110378
* ipa-param-manipulation.h (class ipa_param_body_adjustments): New
members get_ddef_if_exists_and_is_used and mark_clobbers_dead.
* ipa-sra.cc (isra_track_scalar_value_uses): Ignore clobbers.
(ptr_parm_has_nonarg_uses): Likewise.
* ipa-param-manipulation.cc
(ipa_param_body_adjustments::get_ddef_if_exists_and_is_used): New.
(ipa_param_body_adjustments::mark_dead_statements): Move initial
checks to get_ddef_if_exists_and_is_used.
(ipa_param_body_adjustments::mark_clobbers_dead): New.
(ipa_param_body_adjustments::common_initialization): Call
mark_clobbers_dead when splitting.

gcc/testsuite/ChangeLog:

2023-07-31  Martin Jambor  

PR ipa/110378
* g++.dg/ipa/pr110378-1.C: New test.
---
 gcc/ipa-param-manipulation.cc | 44 +---
 gcc/ipa-param-manipulation.h  |  2 ++
 gcc/ipa-sra.cc|  6 ++--
 gcc/testsuite/g++.dg/ipa/pr110378-1.C | 48 +++
 4 files changed, 94 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/ipa/pr110378-1.C

diff --git a/gcc/ipa-param-manipulation.cc b/gcc/ipa-param-manipulation.cc
index a286af7f5d9..4a185ddbdf4 100644
--- a/gcc/ipa-param-manipulation.cc
+++ b/gcc/ipa-param-manipulation.cc
@@ -1072,6 +1072,20 @@ ipa_param_body_adjustments::carry_over_param (tree t)
   return new_parm;
 }
 
+/* If DECL is a gimple register that has a default definition SSA name and that
+   has some uses, return the default definition, otherwise return NULL_TREE.  */
+
+tree
+ipa_param_body_adjustments::get_ddef_if_exists_and_is_used (tree decl)
+{
+ if (!is_gimple_reg (decl))
+return NULL_TREE;
+  tree ddef = ssa_default_def (m_id->src_cfun, decl);
+  if (!ddef || has_zero_uses (ddef))
+return NULL_TREE;
+  return ddef;
+}
+
 /* Populate m_dead_stmts given that DEAD_PARAM is going to be removed without
any replacement or splitting.  REPL is the replacement VAR_SECL to base any
remaining uses of a removed parameter on.  Push all removed SSA names that
@@ -1084,10 +1098,8 @@ ipa_param_body_adjustments::mark_dead_statements (tree dead_param,
   /* Current IPA analyses which remove unused parameters never remove a
  non-gimple register ones which have any use except as parameters in other
  calls, so we can safely leve them as they are.  */
-  if (!is_gimple_reg (dead_param))
-return;
-  tree parm_ddef = ssa_default_def (m_id->src_cfun, dead_param);
-  if (!parm_ddef || has_zero_uses (parm_ddef))
+  tree parm_ddef = get_ddef_if_exists_and_is_used (dead_param);
+  if (!parm_ddef)
 return;
 
   auto_vec stack;
@@ -1169,6 +1181,28 @@ ipa_param_body_adjustments::mark_dead_statements (tree dead_param,
   m_dead_ssa_debug_equiv.put (parm_ddef, dp_ddecl);
 }
 
+/* Put all clobbers of dereference of default definition of PARAM into
+  

Re: RISC-V: Folding memory for FP + constant case

2023-08-04 Thread Jeff Law via Gcc-patches




On 8/4/23 03:52, Manolis Tsamis wrote:

Hi all,

It is true that regcprop currently does not propagate sp and hence
leela is not optimized, but from what I see this should be something
we can address.

The reason that the propagation fails is this check that I have added
when I introduced maybe_copy_reg_attrs:

else if (REG_POINTER (new_reg) != REG_POINTER (old_reg))
   {
 /* Only a single instance of STACK_POINTER_RTX must exist and we cannot
modify it. Allow propagation if REG_POINTER for OLD_REG matches and
don't touch ORIGINAL_REGNO and REG_ATTRS. */
 return NULL_RTX;
   }

To be honest I did add this back then just to be on the safe side of
whether a mismatch in REG_POINTER after propagation would be an issue
(since the original regcprop had caused enough problems).
No worries.  Obviously not propagating is the safe thing to do and if we 
find notable missed cases we can always refine the existing code like 
we're considering now.




I see two ways to solve this and make fmo able to optimize leela as well:
  1) Remove the REG_POINTER check in regcprop if that is safe. My
understanding is that REG_POINTER is used as a hint and there would be
no correctness issues.
REG_POINTER is meant to be conservatively correct.  If REG_POINTER is 
set, then the register must be a pointer to a valid memory location.  If 
REG_POINTER is not set, then it may or may not point to a valid memory 
location.  sp, fp, ap are by definition pointers and thus have 
REG_POINTER set.


With that definition we can safely copy propagate away an sp->gpr copy 
from a REG_POINTER standpoint.












  2) Mark the corresponding registers with REG_POINTER. I'm not sure
where that is supposed to happen.

Since the instructions look like this:
   (insn 113 11 16 2 (set (reg:DI 15 a5 [226])
   (reg/f:DI 2 sp)) 179 {*movdi_64bit}
(nil))

I assume that we'd want to mark a5 as REG_POINTER anyway (which is
not), and in that case propagation would work.
I'd strongly recommend against that.  The problem is the destination 
register might have been used in another context as an index register. 
We've fixed a few bugs in this space through the years; I suspect there
may be others lurking.  You can't directly set that up in C code, but 
once you get down to RTL it can happen.




On the other hand if there's no correctness issue w.r.t. REG_POINTER
and regcprop then removing the additional check would increase
propagation opportunities in general which is also good.

I think you can safely remove this limitation.

jeff


Re: [PATCH 1/5] Middle-end _BitInt support [PR102989]

2023-08-04 Thread Jakub Jelinek via Gcc-patches
On Fri, Aug 04, 2023 at 01:25:07PM +, Richard Biener wrote:
> > @@ -144,6 +144,9 @@ DEFTREECODE (BOOLEAN_TYPE, "boolean_type
> > and TYPE_PRECISION (number of bits used by this type).  */
> >  DEFTREECODE (INTEGER_TYPE, "integer_type", tcc_type, 0)

Thanks.

> > +/* Bit-precise integer type.  */
> > +DEFTREECODE (BITINT_TYPE, "bitint_type", tcc_type, 0)
> > +
> 
> So what was the main reason to not make BITINT_TYPE equal to INTEGER_TYPE?

The fact that they do or can have different calling conventions from normal
integers; e.g. they don't promote to integers, so IFN_VA_ARG handling is
affected (lowered only during the stdarg pass after IPA), and the calling
conventions are target dependent (with a single finalized target it is
premature to hardcode how it will behave for all the others; and while on
x86_64 the up to 128-bit _BitInts pass/return mostly the same as integers,
e.g. _BitInt(128) has the alignof of long long, while __int128 has twice
as large an alignment).

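A quick way to see the x86_64 difference mentioned above (illustrative
only; requires a compiler with C23 _BitInt support):

  /* On x86_64, per the description above.  */
  _Static_assert (_Alignof (_BitInt (128)) == _Alignof (long long),
                  "_BitInt(128) is aligned like long long");
  _Static_assert (_Alignof (__int128) == 2 * _Alignof (long long),
                  "__int128 has twice that alignment");
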
So, the above was the main reason to make BITINT_TYPE <-> non-BITINT_TYPE
conversions non-useless such that calls have the right type of arguments.

I'll try to adjust the comments and mention it in generic.texi.

> Maybe note that in the comment as
> 
> "While bit-precise integer types share the same properties as
> INTEGER_TYPE ..."
> 
> ?
> 
> Note INTEGER_TYPE is documeted in generic.texi but unless I missed
> it the changelog above doesn't mention documentation for BITINT_TYPE
> added there.

> > +  if (bitint_type_cache == NULL)
> > +vec_safe_grow_cleared (bitint_type_cache, 2 * MAX_INT_CACHED_PREC + 2);
> > +
> > +  if (precision <= MAX_INT_CACHED_PREC)
> > +{
> > +  itype = (*bitint_type_cache)[precision + unsignedp];
> > +  if (itype)
> > +   return itype;
> 
> I think we added this kind of cache for standard INTEGER_TYPE because
> the middle-end builds those all over the place and going through
> the type_hash is expensive.  Is that true for _BitInt as well?  If
> not it doesn't seem worth the extra caching.

As even the very large _BitInts are used in the pre-IPA passes, IPA passes
and a few post-IPA passes similarly to other integral types, I think the
caching is very useful.  But if you want, I could gather some statistics
on those.  Most importantly, no price (almost) is paid if one doesn't use
those types in the source.

> In fact, I wonder whether the middle-end does/should treat
> _BitInt and an INTEGER_TYPE with precision N any different?

See above.

> Aka, should we build an INTEGER_TYPE whenever N is say less than
> the number of bits in word_mode?
> 
> > +  if (TREE_CODE (pval) == INTEGER_CST
> > + && TREE_CODE (TREE_TYPE (pval)) == BITINT_TYPE)
> > +   {
> > + unsigned int prec = TYPE_PRECISION (TREE_TYPE (pval));
> > + struct bitint_info info;
> > + gcc_assert (targetm.c.bitint_type_info (prec, &info));
> > + scalar_int_mode limb_mode = as_a <scalar_int_mode> (info.limb_mode);
> > + unsigned int limb_prec = GET_MODE_PRECISION (limb_mode);
> > + if (prec > limb_prec)
> > +   {
> > + scalar_int_mode arith_mode
> > +   = (targetm.scalar_mode_supported_p (TImode)
> > +  ? TImode : DImode);
> > + if (prec > GET_MODE_PRECISION (arith_mode))
> > +   pval = tree_output_constant_def (pval);
> > +   }
> 
> A comment would be helpful to understand what we are doing here.

Ok, will add that.  Note, this particular spot is an area for future
improvement, I've spent half of day on it but then gave up for now.
In the lowering pass I'm trying to optimize the common case where a lot
of constants don't need all the limbs and can be represented as one limb
or several limbs in memory with all the higher limbs then filled with 0s
or -1s.  For the argument passing, it would be even useful to have smaller
_BitInt constants passed by not having them in memory at all and just
pushing a couple of constants (i.e. store_by_pieces way).  But trying to
do that in emit_push_insn wasn't really easy...

> > --- gcc/config/i386/i386.cc.jj  2023-07-19 10:01:17.380467993 +0200
> > +++ gcc/config/i386/i386.cc 2023-07-27 15:03:24.230234508 +0200
> > @@ -2121,7 +2121,8 @@ classify_argument (machine_mode mode, co
> > return 0;
> >  }
> 
> splitting out target support to a separate patch might be helpful

Ok.

> > --- gcc/doc/tm.texi.jj  2023-05-30 17:52:34.474857301 +0200
> > +++ gcc/doc/tm.texi 2023-07-27 15:03:24.284233753 +0200
> > @@ -1020,6 +1020,11 @@ Return a value, with the same meaning as
> >  @code{FLT_EVAL_METHOD} that describes which excess precision should be
> >  applied.
> >  
> > +@deftypefn {Target Hook} bool TARGET_C_BITINT_TYPE_INFO (int @var{n}, 
> > struct bitint_info *@var{info})
> > +This target hook returns true if _BitInt(N) is supported and provides some
> > +details on it.
> > +@end deftypefn
> > +
> 
> document the "details" here please?

Will do.

> > @@ -20523,6 +20546,22 @@ rtl_for_decl_init (tree init, tree type)
> > return NULL;
> >   }
> >  
> > +  /* 

Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Qing Zhao via Gcc-patches


> On Aug 4, 2023, at 10:42 AM, Siddhesh Poyarekar  wrote:
> 
> On 2023-08-04 10:40, Siddhesh Poyarekar wrote:
>> On 2023-08-03 13:34, Qing Zhao wrote:
>>> One thing I need to point out first is, currently, even for regular fixed 
>>> size array in the structure,
>>> We have this same issue, for example:
>>> 
>>> #define LENGTH 10
>>> 
>>> struct fix {
>>>size_t foo;
>>>int array[LENGTH];
>>> };
>>> 
>>> …
>>> int main ()
>>> {
>>>struct fix *p;
>>>p = alloc_buf_more ();
>>> 
>>>expect(__builtin_object_size(p->array, 1), LENGTH * sizeof(int));
>>>expect(__builtin_object_size(p->array, 0), -1);
>>> }
>>> 
>>> Currently, for __builtin_object_size(p->array, 0),  GCC return UNKNOWN for 
>>> it.
>>> This is not a special issue for flexible array member.
>> That's fine for fixed arrays at the end of a struct because the "whole 
>> object" size could be anything; `p` could be pointing to the beginning of an 
>> array for all we know.  If however `array` is strictly a flex array, i.e.:
>> ```
>> struct A
>> {
>>   size_t foo;
>>   int array[];
>> };
>> ```
>> then there's no way in valid C to have an array of `struct fix`, so `q` must 
>> be pointing to a single element.  So you could deduce:
>> 1. the minimum size of the whole object that q points to.
> 
> Actually for minimum size we'd also need a guarantee that `alloc_buf_more` 
> returns a valid allocated object.

Why? Please explain a little bit here.

thanks.

Qing
> 
> Sid



Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Qing Zhao via Gcc-patches


> On Aug 4, 2023, at 10:40 AM, Siddhesh Poyarekar  wrote:
> 
> On 2023-08-03 13:34, Qing Zhao wrote:
>> One thing I need to point out first is, currently, even for regular fixed 
>> size array in the structure,
>> We have this same issue, for example:
>> #define LENGTH 10
>> struct fix {
>>   size_t foo;
>>   int array[LENGTH];
>> };
>> …
>> int main ()
>> {
>>   struct fix *p;
>>   p = alloc_buf_more ();
>>   expect(__builtin_object_size(p->array, 1), LENGTH * sizeof(int));
>>   expect(__builtin_object_size(p->array, 0), -1);
>> }
>> Currently, for __builtin_object_size(p->array, 0),  GCC return UNKNOWN for 
>> it.
>> This is not a special issue for flexible array member.
> 
> That's fine for fixed arrays at the end of a struct because the "whole 
> object" size could be anything; `p` could be pointing to the beginning of an 
> array for all we know.  If however `array` is strictly a flex array, i.e.:
> 
> ```
> struct A
> {
>  size_t foo;
>  int array[];
> };
> ```
> 
> then there's no way in valid C to have an array of `struct fix`,

Yes!!   this is exactly the place that makes difference between structures with 
fixed arrays and the ones with flexible arrays. 

With such difference, I guess that using the type of the structure with 
flexible array member for p->array to get the size of the whole object p point 
to might be reasonable? 

> so `q` must be pointing to a single element.  So you could deduce:
> 
> 1. the minimum size of the whole object that q points to.

You mean that the TYPE will determine the minimum size of the whole object?  
(Does this include the size of the flexible array member, or only the other 
part of the structure except the flexible array member?)

> 
> and
> 
> 2. if you're able to determine the size of the flex array (through 
> __element_count__(foo) for example), you could even determine the maximum 
> size of the whole object.
> 
> For (2) though, you'd break applications that overallocate and then expect to 
> be able to use that overallocation despite the space not being reflected in 
> the __element_count__.  I think it's a bug in the application and I can't see 
> a way for an application to be able to do this in a valid way so I'm inclined 
> towards breaking it.

Currently, we allow the situation when the allocation size for the whole object 
is larger than the value reflected in the “counted_by” attribute (the old name 
is __element_count__).  But we don’t allow the other way around (i.e., when the
allocation size for the whole object is smaller than the value reflected in the
“counted_by” attribute).
> 
> Of course, the fact that gcc allows flex arrays to be in the middle of 
> structs breaks the base assumption but that's something we need to get rid of 
> anyway since there's no way for valid C programs to use that safely.

Since GCC14, we started to deprecate this extension (allow flex array to be in 
the middle of structs).
https://gcc.gnu.org/pipermail/gcc-cvs/2023-June/385730.html

Thanks.

Qing


> 
> Thanks,
> Sid



Re: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-08-04 Thread Wilco Dijkstra via Gcc-patches
ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 
[PR110061] 
 

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics and is what
most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not 
supported.
This results in an implicit store which is invisible to software as long as the 
given
address is writeable (which will be true when using atomics in actual code).

A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.

Passes regress, OK for commit?

libatomic/
    PR target/110061
    config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
    config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
    State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@
    .  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat_<op>_16 entry points are ARMv8.0.
+   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
+
+
 .arch   armv8-a+lse
 
 #define ENTRY(name) \
@@ -37,6 +52,10 @@ name:    \
 .cfi_endproc;   \
 .size name, .-name;
 
+#define ALIAS(alias,name)  \
+   .global alias;  \
+   .set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@ name:    \
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+   mov x5, x0
+   cbnz    w1, 2f
+
+   /* RELAXED.  */
+1: ldxp    res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 1b
+   ret
+
+   /* ACQUIRE/CONSUME/SEQ_CST.  */
+2: ldaxp   res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 2b
+   ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
 cbnz    w1, 1f
 
@@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+   cbnz    w4, 2f
+
+   /* RELAXED.  */
+1: ldxp    xzr, tmp0, [x0]
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   ret
+
+   /* RELEASE/SEQ_CST.  */
+2: ldxp    xzr, tmp0, [x0]
+   stlxp   w4, in0, in1, [x0]
+   cbnz    w4, 2b
+   ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
 cbnz    w4, 1f
 
@@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
 ret
 
 /* RELEASE/SEQ_CST.  */
-1: ldaxp   xzr, tmp0, [x0]
+1: ldxp    xzr, tmp0, [x0]
 stlxp   w4, in0, in1, [x0]
 cbnz    w4, 1b
 ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
 mov x5, x0
 cbnz    w4, 2f
 
@@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
 stxp    w4, in0, in1, [x5]
 cbnz    w4, 3b
 ret
-4:
-   cmp w4, RELEASE
-   b.ne    6f
 
-   /* RELEASE.  */
-5: ldxp    res0, res1, [x5]
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   res0, res1, [x5]
 stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 5b
+   cbnz    w4, 4b
 ret
+END (libat_exchange_16)
 
-   /* ACQ_REL/SEQ_CST.  */
-6: ldaxp   res0, res1, [x5]
-   stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+   ldp exp0, exp1, [x1]
+   cbz w4, 3f
+   cmp w4, RELEASE
+   b.hs    4f
+
+   /* ACQUIRE/CONSUME.  */
+1: ldaxp   tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2f
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   mov x0, 1
 ret
-END (libat_exchange_16_i1)
+
+2: stp tmp0, tmp1, [x1]
+   mov x0, 0
+   ret
+
+   /* RELAXED.  */
+3: ldxp    tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2b
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 3b
+   mov x0, 1
+   ret
+
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   tmp0, tmp1, 

Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-04 Thread Richard Sandiford via Gcc-patches
Full review this time, sorry for skipping the tests earlier.

Prathamesh Kulkarni  writes:
> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> index 7e5494dfd39..680d0e54fd4 100644
> --- a/gcc/fold-const.cc
> +++ b/gcc/fold-const.cc
> @@ -85,6 +85,10 @@ along with GCC; see the file COPYING3.  If not see
>  #include "vec-perm-indices.h"
>  #include "asan.h"
>  #include "gimple-range.h"
> +#include 

This should be included by defining INCLUDE_ALGORITHM instead.

> +#include "tree-pretty-print.h"
> +#include "gimple-pretty-print.h"
> +#include "print-tree.h"

Are these still needed, or were they for debugging?

>  
>  /* Nonzero if we are folding constants inside an initializer or a C++
> manifestly-constant-evaluated context; zero otherwise.
> @@ -10494,15 +10498,9 @@ fold_mult_zconjz (location_t loc, tree type, tree 
> expr)
>  static bool
>  vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts)
>  {
> -  unsigned HOST_WIDE_INT i, nunits;
> +  unsigned HOST_WIDE_INT i;
>  
> -  if (TREE_CODE (arg) == VECTOR_CST
> -  && VECTOR_CST_NELTS (arg).is_constant ())
> -{
> -  for (i = 0; i < nunits; ++i)
> - elts[i] = VECTOR_CST_ELT (arg, i);
> -}
> -  else if (TREE_CODE (arg) == CONSTRUCTOR)
> +  if (TREE_CODE (arg) == CONSTRUCTOR)
>  {
>constructor_elt *elt;
>  
> @@ -10520,6 +10518,192 @@ vec_cst_ctor_to_array (tree arg, unsigned int 
> nelts, tree *elts)
>return true;
>  }
>  
> +/* Helper routine for fold_vec_perm_cst to check if SEL is a suitable
> +   mask for VLA vec_perm folding.
> +   REASON if specified, will contain the reason why SEL is not suitable.
> +   Used only for debugging and unit-testing.
> +   VERBOSE if enabled is used for debugging output.  */
> +
> +static bool
> +valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree arg1,
> + const vec_perm_indices &sel,
> + const char **reason = NULL,
> + ATTRIBUTE_UNUSED bool verbose = false)

Since verbose is no longer needed (good!), I think we should just remove it.

> +{
> +  unsigned sel_npatterns = sel.encoding ().npatterns ();
> +  unsigned sel_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
> +
> +  if (!(pow2p_hwi (sel_npatterns)
> + && pow2p_hwi (VECTOR_CST_NPATTERNS (arg0))
> + && pow2p_hwi (VECTOR_CST_NPATTERNS (arg1
> +{
> +  if (reason)
> + *reason = "npatterns is not power of 2";
> +  return false;
> +}
> +
> +  /* We want to avoid cases where sel.length is not a multiple of npatterns.
> + For eg: sel.length = 2 + 2x, and sel npatterns = 4.  */
> +  poly_uint64 esel;
> +  if (!multiple_p (sel.length (), sel_npatterns, &esel))
> +{
> +  if (reason)
> + *reason = "sel.length is not multiple of sel_npatterns";
> +  return false;
> +}
> +
> +  if (sel_nelts_per_pattern < 3)
> +return true;
> +
> +  for (unsigned pattern = 0; pattern < sel_npatterns; pattern++)
> +{
> +  poly_uint64 a1 = sel[pattern + sel_npatterns];
> +  poly_uint64 a2 = sel[pattern + 2 * sel_npatterns];
> +  HOST_WIDE_INT S; 

Trailing whitespace.  The convention is to use lowercase variable
names, so please call this "step".

> +  if (!poly_int64 (a2 - a1).is_constant (&S))
> + {
> +   if (reason)
> + *reason = "step is not constant";
> +   return false;
> + }
> +  // FIXME: Punt on S < 0 for now, revisit later.
> +  if (S < 0)
> + return false;
> +  if (S == 0)
> + continue;
> +
> +  if (!pow2p_hwi (S))
> + {
> +   if (reason)
> + *reason = "step is not power of 2";
> +   return false;
> + }
> +
> +  /* Ensure that stepped sequence of the pattern selects elements
> +  only from the same input vector if it's VLA.  */

s/ if it's VLA//

> +  uint64_t q1, qe;
> +  poly_uint64 r1, re;
> +  poly_uint64 ae = a1 + (esel - 2) * S;
> +  poly_uint64 arg_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +  if (!(can_div_trunc_p (a1, arg_len, &q1, &r1)
> + && can_div_trunc_p (ae, arg_len, &qe, &re)
> + && q1 == qe))
> + {
> +   if (reason)
> + *reason = "crossed input vectors";
> +   return false;
> + }
> +

Probably worth a comment above the following code too:

  /* Ensure that the stepped sequence always selects from the same
 input pattern.  */

> +  unsigned arg_npatterns
> + = ((q1 & 1) == 0) ? VECTOR_CST_NPATTERNS (arg0)
> +   : VECTOR_CST_NPATTERNS (arg1);
> +
> +  if (!multiple_p (S, arg_npatterns))
> + {
> +   if (reason)
> + *reason = "S is not multiple of npatterns";
> +   return false;
> + }
> +}
> +
> +  return true;
> +}
> +
> +/* Try to fold permutation of ARG0 and ARG1 with SEL selector when
> +   the input vectors are VECTOR_CST. Return NULL_TREE otherwise.
> +   REASON and VERBOSE have same purpose as described in
> +   

[PATCH] libatomic: Improve ifunc selection on AArch64

2023-08-04 Thread Wilco Dijkstra via Gcc-patches

Add support for ifunc selection based on CPUID register.  Neoverse N1 supports
atomic 128-bit load/store, so use the FEAT_USCAT ifunc like newer Neoverse
cores.

Passes regress, OK for commit?

libatomic/
* config/linux/aarch64/host-config.h (ifunc1): Use CPUID in ifunc
selection.

---

diff --git a/libatomic/config/linux/aarch64/host-config.h 
b/libatomic/config/linux/aarch64/host-config.h
index 
851c78c01cd643318aaa52929ce4550266238b79..e5dc33c030a4bab927874fa6c69425db463fdc4b
 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -26,7 +26,7 @@
 
 #ifdef HWCAP_USCAT
 # if N == 16
-#  define IFUNC_COND_1 (hwcap & HWCAP_USCAT)
+#  define IFUNC_COND_1 ifunc1 (hwcap)
 # else
 #  define IFUNC_COND_1 (hwcap & HWCAP_ATOMICS)
 # endif
@@ -50,4 +50,28 @@
 #undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
 #define MAYBE_HAVE_ATOMIC_EXCHANGE_16  1
 
+#ifdef HWCAP_USCAT
+
+#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 255)
+#define MIDR_PARTNUM(midr) (((midr) >> 4) & 0xfff)
+
+static inline bool
+ifunc1 (unsigned long hwcap)
+{
+  if (hwcap & HWCAP_USCAT)
+return true;
+  if (!(hwcap & HWCAP_CPUID))
+return false;
+
+  unsigned long midr;
+  asm volatile ("mrs %0, midr_el1" : "=r" (midr));
+
+  /* Neoverse N1 supports atomic 128-bit load/store.  */
+  if (MIDR_IMPLEMENTOR (midr) == 'A' && MIDR_PARTNUM(midr) == 0xd0c)
+return true;
+
+  return false;
+}
+#endif
+
 #include_next <host-config.h>
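
As a quick worked example of the MIDR decoding above (the register value is
assumed for illustration, not taken from the patch): a typical Neoverse N1
reports implementer 0x41 ('A') and part number 0xd0c.

```
/* Standalone sketch of the MIDR_EL1 decoding used by ifunc1 above.
   The example register value is an assumption (a plausible Neoverse N1
   ID), not something stated in the patch.  */
#include <stdio.h>

#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 255)
#define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)

int
main (void)
{
  unsigned long midr = 0x410fd0c1UL;   /* assumed Neoverse N1 value */
  /* Prints: implementer 0x41 part 0xd0c */
  printf ("implementer %#lx part %#lx\n",
          MIDR_IMPLEMENTOR (midr), MIDR_PARTNUM (midr));
  return 0;
}
```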



Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Siddhesh Poyarekar

On 2023-08-04 10:40, Siddhesh Poyarekar wrote:

On 2023-08-03 13:34, Qing Zhao wrote:
One thing I need to point out first is, currently, even for regular 
fixed size array in the structure,

We have this same issue, for example:

#define LENGTH 10

struct fix {
   size_t foo;
   int array[LENGTH];
};

…
int main ()
{
   struct fix *p;
   p = alloc_buf_more ();

   expect(__builtin_object_size(p->array, 1), LENGTH * sizeof(int));
   expect(__builtin_object_size(p->array, 0), -1);
}

Currently, for __builtin_object_size(p->array, 0),  GCC return UNKNOWN 
for it.

This is not a special issue for flexible array member.


That's fine for fixed arrays at the end of a struct because the "whole 
object" size could be anything; `p` could be pointing to the beginning 
of an array for all we know.  If however `array` is strictly a flex 
array, i.e.:


```
struct A
{
   size_t foo;
   int array[];
};
```

then there's no way in valid C to have an array of `struct A`, so `p` 
must be pointing to a single element.  So you could deduce:


1. the minimum size of the whole object that p points to.


Actually for minimum size we'd also need a guarantee that 
`alloc_buf_more` returns a valid allocated object.


Sid


Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Siddhesh Poyarekar

On 2023-08-03 13:34, Qing Zhao wrote:

One thing I need to point out first is, currently, even for regular fixed size 
array in the structure,
We have this same issue, for example:

#define LENGTH 10

struct fix {
   size_t foo;
   int array[LENGTH];
};

…
int main ()
{
   struct fix *p;
   p = alloc_buf_more ();

   expect(__builtin_object_size(p->array, 1), LENGTH * sizeof(int));
   expect(__builtin_object_size(p->array, 0), -1);
}

Currently, for __builtin_object_size(p->array, 0),  GCC return UNKNOWN for it.
This is not a special issue for flexible array member.


That's fine for fixed arrays at the end of a struct because the "whole 
object" size could be anything; `p` could be pointing to the beginning 
of an array for all we know.  If however `array` is strictly a flex 
array, i.e.:


```
struct A
{
  size_t foo;
  int array[];
};
```

then there's no way in valid C to have an array of `struct A`, so `p` 
must be pointing to a single element.  So you could deduce:


1. the minimum size of the whole object that p points to.

and

2. if you're able to determine the size of the flex array (through 
__element_count__(foo) for example), you could even determine the 
maximum size of the whole object.


For (2) though, you'd break applications that overallocate and then 
expect to be able to use that overallocation despite the space not being 
reflected in the __element_count__.  I think it's a bug in the 
application and I can't see a way for an application to be able to do 
this in a valid way so I'm inclined towards breaking it.
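
To make the two deductions above concrete, here is a minimal, self-contained
sketch (mine, not from the thread; the exact __builtin_object_size results
depend on optimization level and on how much of the allocation the compiler
can actually see):

```
#include <stdio.h>
#include <stdlib.h>

struct A
{
  size_t foo;
  int array[];   /* strict flexible array member */
};

/* Stand-in for the thread's alloc_buf_more; noinline keeps the
   allocation size out of the caller's sight.  */
__attribute__ ((noinline)) static struct A *
alloc_buf_more (void)
{
  return malloc (sizeof (struct A) + 8 * sizeof (int));
}

int
main (void)
{
  struct A *p = alloc_buf_more ();
  if (!p)
    return 1;

  /* Mode 0 asks about the whole object, mode 1 about the closest
     surrounding subobject.  With the allocation out of sight both are
     typically "unknown" ((size_t)-1) today; the deduction above would
     let mode 0 at least be bounded below by sizeof (struct A).  */
  printf ("%zu %zu\n",
          __builtin_object_size (p, 0),
          __builtin_object_size (p->array, 1));
  free (p);
  return 0;
}
```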


Of course, the fact that gcc allows flex arrays to be in the middle of 
structs breaks the base assumption but that's something we need to get 
rid of anyway since there's no way for valid C programs to use that safely.


Thanks,
Sid


Re: [PATCH] Add documentation for -Wflex-array-member-not-at-end.

2023-08-04 Thread Qing Zhao via Gcc-patches
Thanks.

I just updated the doc per your suggestion and committed as:

https://gcc.gnu.org/pipermail/gcc-cvs/2023-August/387588.html

Qing
> On Aug 3, 2023, at 1:29 PM, Joseph Myers  wrote:
> 
> On Thu, 3 Aug 2023, Qing Zhao via Gcc-patches wrote:
> 
>> +@opindex Wflex-array-member-not-at-end
>> +@opindex Wno-flex-array-member-not-at-end
>> +@item -Wflex-array-member-not-at-end
> 
> I'd expect this to have @r{(C and C++ only)} to indicate what languages 
> the option applies to.  OK with that change.
> 
> -- 
> Joseph S. Myers
> jos...@codesourcery.com



Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Qing Zhao via Gcc-patches


> On Aug 4, 2023, at 3:38 AM, Kees Cook  wrote:
> 
> On Thu, Aug 03, 2023 at 09:31:24PM +, Qing Zhao wrote:
>> So, the basic question is:
>> 
>> Given the following:
>> 
>> struct fix {
>>  int others;
>>  int array[10];
>> }
>> 
>> extern struct fix * alloc_buf ();
>> 
>> int main ()
>> {
>>  struct fix *p = alloc_buf ();
>>  __builtin_object_size(p->array,0) == ?
>> }
>> 
>> Given p->array, can the compiler determine that p points to an object that 
>> has TYPE struct fix?
>> 
>> If the answer is YES, then the current__builtin_object_size algorithm can be 
>> improved to determine __builtin_object_size(p->array, 0)  with the TYPE of 
>> the struct fix.
> 
> I think it is fine to leave __bos(..., 0) as-is. From the Linux kernel's
> use of __bos, we are almost exclusively interested in mode 1, not
> mode 0. :)

Okay, that’s good to know.

Qing
> 
> -- 
> Kees Cook



Re: [PATCH] tree-optimization/110838 - vectorization of widened right shifts

2023-08-04 Thread Richard Sandiford via Gcc-patches
Richard Biener  writes:
> The following fixes a problem with my last attempt of avoiding
> out-of-bound shift values for vectorized right shifts of widened
> operands.  Instead of truncating the shift amount with a bitwise
> and we actually need to saturate it to the target precision.
>
> The following does that and adds test coverage for the constant
> and invariant but variable case that would previously have failed.
>
> Bootstrap & regtest on x86_64-unknown-linux-gnu in progress, I plan
> to push this soon, just in case you have any comments here.

LGTM FWIW.

Richard

> Richard.
>
>   PR tree-optimization/110838
>   * tree-vect-patterns.cc (vect_recog_over_widening_pattern):
>   Fix right-shift value sanitizing.  Properly emit external
>   def mangling in the preheader rather than in the pattern
>   def sequence where it will fail vectorizing.
>
>   * gcc.dg/vect/pr110838.c: New testcase.
> ---
>  gcc/testsuite/gcc.dg/vect/pr110838.c | 31 
>  gcc/tree-vect-patterns.cc| 22 +++-
>  2 files changed, 48 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/pr110838.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/pr110838.c 
> b/gcc/testsuite/gcc.dg/vect/pr110838.c
> new file mode 100644
> index 000..cf8765be603
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr110838.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +
> +#include "tree-vect.h"
> +
> +short a[32], b[32];
> +
> +void __attribute__((noipa)) foo ()
> +{
> +  for (int i = 0; i < 32; ++i)
> +a[i] = b[i] >> 16;
> +}
> +
> +void __attribute__((noipa)) bar (int n)
> +{
> +  int np = n & 31;
> +  for (int i = 0; i < 32; ++i)
> +a[i] = b[i] >> np;
> +}
> +
> +int main ()
> +{
> +  check_vect ();
> +  b[0] = -8;
> +  foo ();
> +  if (a[0] != -1)
> +abort ();
> +  bar (16);
> +  if (a[0] != -1)
> +abort ();
> +  return 0;
> +}
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index e4ab8c2d65b..2cedf238450 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3109,8 +3109,8 @@ vect_recog_over_widening_pattern (vec_info *vinfo,
>wide_int min_value, max_value;
>if (TREE_CODE (ops[1]) == INTEGER_CST)
>   ops[1] = wide_int_to_tree (op_type,
> -wi::bit_and (wi::to_wide (ops[1]),
> - new_precision - 1));
> +wi::umin (wi::to_wide (ops[1]),
> +  new_precision - 1));
>else if (!vect_get_range_info (ops[1], _value, _value)
>  || wi::ge_p (max_value, new_precision, TYPE_SIGN (op_type)))
>   {
> @@ -3118,11 +3118,23 @@ vect_recog_over_widening_pattern (vec_info *vinfo,
>same argument widened shifts and it un-CSEs same arguments.  */
> tree new_var = vect_recog_temp_ssa_var (op_type, NULL);
> gimple *pattern_stmt
> - = gimple_build_assign (new_var, BIT_AND_EXPR, ops[1],
> + = gimple_build_assign (new_var, MIN_EXPR, ops[1],
>  build_int_cst (op_type, new_precision - 1));
> -   ops[1] = new_var;
> gimple_set_location (pattern_stmt, gimple_location (last_stmt));
> -   append_pattern_def_seq (vinfo, last_stmt_info, pattern_stmt);
> +   if (unprom[1].dt == vect_external_def)
> + {
> +   if (edge e = vect_get_external_def_edge (vinfo, ops[1]))
> + {
> +   basic_block new_bb
> + = gsi_insert_on_edge_immediate (e, pattern_stmt);
> +   gcc_assert (!new_bb);
> + }
> +   else
> + return NULL;
> + }
> +   else
> + append_pattern_def_seq (vinfo, last_stmt_info, pattern_stmt);
> +   ops[1] = new_var;
>   }
>  }


[pushed][LRA] Check input insn pattern hard regs against early clobber hard regs for live info

2023-08-04 Thread Vladimir Makarov via Gcc-patches
The following patch fixes a problem found by the LRA port for the avr target.
The problem description is in the commit message.


The patch was successfully bootstrapped and tested on x86-64 and aarch64.
commit abf953042ace471720c1dc284b5f38e546fc0595
Author: Vladimir N. Makarov 
Date:   Fri Aug 4 08:04:44 2023 -0400

LRA: Check input insn pattern hard regs against early clobber hard regs for live info

For the test case LRA generates wrong code for AVR cpymem_qi insn:

(insn 16 15 17 3 (parallel [
(set (mem:BLK (reg:HI 26 r26) [0  A8])
(mem:BLK (reg:HI 30 r30) [0  A8]))
(unspec [
(const_int 0 [0])
] UNSPEC_CPYMEM)
(use (reg:QI 52))
(clobber (reg:HI 26 r26))
(clobber (reg:HI 30 r30))
(clobber (reg:QI 0 r0))
(clobber (reg:QI 52))
]) "t.c":16:22 132 {cpymem_qi}

The insn gets the same value in r26 and r30.  The culprit is clobbering
r30 and using r30 as input.  For such situation LRA wrongly assumes that
r30 does not live before the insn.  The patch is fixing it.

gcc/ChangeLog:

* lra-lives.cc (process_bb_lives): Check input insn pattern hard regs
against early clobber hard regs.

gcc/testsuite/ChangeLog:

* gcc.target/avr/lra-cpymem_qi.c: New.

diff --git a/gcc/lra-lives.cc b/gcc/lra-lives.cc
index f7a3ba8d76a..f60e564da82 100644
--- a/gcc/lra-lives.cc
+++ b/gcc/lra-lives.cc
@@ -989,7 +989,7 @@ process_bb_lives (basic_block bb, int _point, bool dead_insn_p)
 	/* We can have early clobbered non-operand hard reg and
 	   the same hard reg as an insn input.  Don't make hard
 	   reg dead before the insns.  */
-	for (reg2 = curr_id->regs; reg2 != NULL; reg2 = reg2->next)
+	for (reg2 = curr_static_id->hard_regs; reg2 != NULL; reg2 = reg2->next)
 	  if (reg2->type != OP_OUT && reg2->regno == reg->regno)
 		break;
 	if (reg2 == NULL)
diff --git a/gcc/testsuite/gcc.target/avr/lra-cpymem_qi.c b/gcc/testsuite/gcc.target/avr/lra-cpymem_qi.c
new file mode 100644
index 000..fdffb445b45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/lra-cpymem_qi.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-mmcu=avr51 -Os" } */
+
+#include <stdbool.h>
+
+struct A
+{
+  unsigned int a;
+  unsigned char c1, c2;
+  bool b1 : 1;
+};
+
+void
+foo (const struct A *x, int y)
+{
+  int s = 0, i;
+  for (i = 0; i < y; ++i)
+{
+  const struct A a = x[i];
+  s += a.b1 ? 1 : 0;
+}
+  if (s != 0)
+__builtin_abort ();
+}
+
+/* { dg-final { scan-assembler-not "movw\[^\n\r]*r26,r30" } } */


[PATCH] tree-optimization/110838 - vectorization of widened right shifts

2023-08-04 Thread Richard Biener via Gcc-patches
The following fixes a problem with my last attempt of avoiding
out-of-bound shift values for vectorized right shifts of widened
operands.  Instead of truncating the shift amount with a bitwise
and we actually need to saturate it to the target precision.
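
To see why on a small example (the numbers are taken from the new testcase
below; the reasoning is mine): with a 16-bit element type the over-widening
pattern wants to perform the shift in 16 bits, where the original shift
count of 16 is out of range, and masking and saturating behave differently:

```
short b = -8;

int
masked (void)
{
  /* 16 & (16 - 1) == 0, so this degenerates to b >> 0 == -8: wrong.  */
  return b >> (16 & 15);
}

int
saturated (void)
{
  /* MIN (16, 16 - 1) == 15; an arithmetic shift by 15 still yields the
     sign (-1 here), matching what (int) b >> 16 produces.  */
  return b >> 15;
}
```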

The following does that and adds test coverage for the constant
and invariant but variable case that would previously have failed.

Bootstrap & regtest on x86_64-unknown-linux-gnu in progress, I plan
to push this soon, just in case you have any comments here.

Richard.

PR tree-optimization/110838
* tree-vect-patterns.cc (vect_recog_over_widening_pattern):
Fix right-shift value sanitizing.  Properly emit external
def mangling in the preheader rather than in the pattern
def sequence where it will fail vectorizing.

* gcc.dg/vect/pr110838.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/pr110838.c | 31 
 gcc/tree-vect-patterns.cc| 22 +++-
 2 files changed, 48 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr110838.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr110838.c 
b/gcc/testsuite/gcc.dg/vect/pr110838.c
new file mode 100644
index 000..cf8765be603
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr110838.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+
+#include "tree-vect.h"
+
+short a[32], b[32];
+
+void __attribute__((noipa)) foo ()
+{
+  for (int i = 0; i < 32; ++i)
+a[i] = b[i] >> 16;
+}
+
+void __attribute__((noipa)) bar (int n)
+{
+  int np = n & 31;
+  for (int i = 0; i < 32; ++i)
+a[i] = b[i] >> np;
+}
+
+int main ()
+{
+  check_vect ();
+  b[0] = -8;
+  foo ();
+  if (a[0] != -1)
+abort ();
+  bar (16);
+  if (a[0] != -1)
+abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index e4ab8c2d65b..2cedf238450 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3109,8 +3109,8 @@ vect_recog_over_widening_pattern (vec_info *vinfo,
   wide_int min_value, max_value;
   if (TREE_CODE (ops[1]) == INTEGER_CST)
ops[1] = wide_int_to_tree (op_type,
-  wi::bit_and (wi::to_wide (ops[1]),
-   new_precision - 1));
+  wi::umin (wi::to_wide (ops[1]),
+new_precision - 1));
   else if (!vect_get_range_info (ops[1], _value, _value)
   || wi::ge_p (max_value, new_precision, TYPE_SIGN (op_type)))
{
@@ -3118,11 +3118,23 @@ vect_recog_over_widening_pattern (vec_info *vinfo,
 same argument widened shifts and it un-CSEs same arguments.  */
  tree new_var = vect_recog_temp_ssa_var (op_type, NULL);
  gimple *pattern_stmt
-   = gimple_build_assign (new_var, BIT_AND_EXPR, ops[1],
+   = gimple_build_assign (new_var, MIN_EXPR, ops[1],
   build_int_cst (op_type, new_precision - 1));
- ops[1] = new_var;
  gimple_set_location (pattern_stmt, gimple_location (last_stmt));
- append_pattern_def_seq (vinfo, last_stmt_info, pattern_stmt);
+ if (unprom[1].dt == vect_external_def)
+   {
+ if (edge e = vect_get_external_def_edge (vinfo, ops[1]))
+   {
+ basic_block new_bb
+   = gsi_insert_on_edge_immediate (e, pattern_stmt);
+ gcc_assert (!new_bb);
+   }
+ else
+   return NULL;
+   }
+ else
+   append_pattern_def_seq (vinfo, last_stmt_info, pattern_stmt);
+ ops[1] = new_var;
}
 }
 
-- 
2.35.3


Re: [RFC] Combine zero_extract and sign_extend for TARGET_TRULY_NOOP_TRUNCATION

2023-08-04 Thread YunQiang Su via Gcc-patches
>
> Like I mentioned in the other thread, I think things went wrong when
> we generated the subreg in this sign_extend.  The operation should
> have been a truncate of (reg/v:DI 200) followed by a sign extension
> of the result.
>

Sorry for my misunderstanding.

So you mean that in the RTL, for this operation:
we should have 3 (insn ) RTX?

(zero_extract  )
(truncate_64_to_32)
(sign_extend_32_to_64)

> What piece of code is generating the subreg?
>
> Thanks,
> Richard



-- 
YunQiang Su


Re: [PATCH] mid-end: Use integral time intervals in timevar.cc

2023-08-04 Thread Richard Biener via Gcc-patches
On Fri, 4 Aug 2023, Matthew Malcomson wrote:

> Hopefully last update ...
> 
> > Specifically, please try compiling with
> >-ftime-report -fdiagnostics-format=sarif-file
> > and have a look at the generated .sarif file, e.g. via
> >python -m json.tool foo.c.sarif
> > which will pretty-print the JSON to stdout.
> 
> Rebasing onto the JSON output was quite simple -- I've inlined the only
> change in the patch below (to cast to floating point seconds before
> generating the json report).
> 
> I have manually checked the SARIF output as you suggested and all looks
> good (and in fact better because we no longer save some strange times
> like the below due to avoiding the floating point rounding).
> "wall": -4.49516e-09,
> 
> 
> > 
> > The patch looks OK to me if it passes bootstrap / regtest and the
> > output of -ftime-report doesn't change (too much).
> > 
> > Thanks,
> > Richard.
> 
> Though I don't expect you were asking for this, confirmation below that
> the output doesn't change.  (Figured I may as well include that info
> since the rebase to include the JSON output that David had just added
> required re-sending an email anyway).
> 
> 
> 
> ```
> hw-a20-8:checking-theory [10:07:01] $ ${old_build}/gcc/xgcc 
> -B${old_build}/gcc/-fdiagnostics-plain-output-Os  -w -S test-sum.c -o 
> /dev/null   -ftime-report
> 
> Time variable   usr   sys  
> wall   GGC
>  phase setup:   0.01 ( 14%)   0.00 (  0%)   0.02 ( 
> 18%)  3389k ( 74%)
>  phase parsing  :   0.03 ( 43%)   0.02 ( 67%)   0.06 ( 
> 55%)   982k ( 21%)
>  phase opt and generate :   0.03 ( 43%)   0.01 ( 33%)   0.03 ( 
> 27%)   215k (  5%)
>  callgraph functions expansion  :   0.02 ( 29%)   0.00 (  0%)   0.03 ( 
> 27%)   162k (  4%)
>  callgraph ipa passes   :   0.01 ( 14%)   0.01 ( 33%)  -0.00 ( 
> -0%)38k (  1%)
>  CFG verifier   :   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%) 0  (  0%)
>  preprocessing  :   0.02 ( 29%)   0.02 ( 67%)   0.02 ( 
> 18%)   272k (  6%)
>  lexical analysis   :   0.01 ( 14%)   0.00 (  0%)   0.02 ( 
> 18%) 0  (  0%)
>  parser (global):   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%)   637k ( 14%)
>  parser function body   :   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%)18k (  0%)
>  tree CFG cleanup   :   0.00 (  0%)   0.01 ( 33%)   0.00 (  
> 0%) 0  (  0%)
>  tree STMT verifier :   0.01 ( 14%)   0.00 (  0%)   0.00 (  
> 0%) 0  (  0%)
>  expand :   0.01 ( 14%)   0.00 (  0%)   0.00 (  
> 0%)12k (  0%)
>  loop init  :   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%)12k (  0%)
>  initialize rtl :   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%)15k (  0%)
>  rest of compilation:   0.01 ( 14%)   0.00 (  0%)   0.00 (  
> 0%)  6920  (  0%)
>  TOTAL  :   0.07  0.03  0.11  
>4587k
> Extra diagnostic checks enabled; compiler may run slowly.
> Configure with --enable-checking=release to disable checks.
> hw-a20-8:checking-theory [10:06:44] $ ${new_build}/gcc/xgcc 
> -B${new_build}/gcc/-fdiagnostics-plain-output-Os  -w -S test-sum.c -o 
> /dev/null   -ftime-report
> 
> Time variable   usr   sys  
> wall   GGC
>  phase setup:   0.01 ( 17%)   0.00 (  0%)   0.02 ( 
> 18%)  3389k ( 74%)
>  phase parsing  :   0.02 ( 33%)   0.03 ( 75%)   0.05 ( 
> 45%)   982k ( 21%)
>  phase opt and generate :   0.03 ( 50%)   0.01 ( 25%)   0.04 ( 
> 36%)   215k (  5%)
>  callgraph construction :   0.00 (  0%)   0.01 ( 25%)   0.00 (  
> 0%)  1864  (  0%)
>  callgraph functions expansion  :   0.02 ( 33%)   0.00 (  0%)   0.03 ( 
> 27%)   162k (  4%)
>  callgraph ipa passes   :   0.01 ( 17%)   0.00 (  0%)   0.01 (  
> 9%)38k (  1%)
>  ipa free lang data :   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%) 0  (  0%)
>  CFG verifier   :   0.02 ( 33%)   0.00 (  0%)   0.00 (  
> 0%) 0  (  0%)
>  preprocessing  :   0.01 ( 17%)   0.03 ( 75%)   0.01 (  
> 9%)   272k (  6%)
>  lexical analysis   :   0.01 ( 17%)   0.00 (  0%)   0.02 ( 
> 18%) 0  (  0%)
>  parser (global):   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%)   637k ( 14%)
>  parser inl. func. body :   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%)19k (  0%)
>  tree STMT verifier :   0.01 ( 17%)   0.00 (  0%)   0.00 (  
> 0%) 0  (  0%)
>  expand :   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%)12k (  0%)
>  integrated RA  :   0.00 (  0%)   0.00 (  0%)   0.01 (  
> 9%) 

[PATCH] tree-optimization/110838 - less aggressively fold out-of-bound shifts

2023-08-04 Thread Richard Biener via Gcc-patches
The following adjusts the shift simplification patterns to avoid
touching out-of-bound shift value arithmetic right shifts of
possibly negative values.  While simplifying those to zero isn't
wrong, it violates the principle of least surprise.
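
As an illustration (my example, assuming 32-bit int; not part of the patch):
both functions below shift by an out-of-range count, but after this change
only the unsigned one remains eligible for folding to zero, while the
arithmetic shift of a possibly negative value is left for the target to
decide.

```
int
arith (int x)
{
  /* Possibly negative: no longer folded to 0 (many targets yield -1
     at run time for negative x).  */
  return x >> 40;
}

unsigned
logical (unsigned x)
{
  /* Still covered by the out-of-bounds-shift simplification.  */
  return x >> 40;
}
```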

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR tree-optimization/110838
* match.pd (([rl]shift @0 out-of-bounds) -> zero): Restrict
the arithmetic right-shift case to non-negative operands.
---
 gcc/match.pd | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index 53e622bf28f..a1a82a5f954 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1064,6 +1064,10 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (simplify
   (shift @0 uniform_integer_cst_p@1)
   (if ((GIMPLE || !sanitize_flags_p (SANITIZE_SHIFT_EXPONENT))
+   /* Leave arithmetic right shifts of possibly negative values alone.  */
+   && (TYPE_UNSIGNED (type)
+   || shift == LSHIFT_EXPR
+  || tree_expr_nonnegative_p (@0))
/* Use a signed compare to leave negative shift counts alone.  */
&& wi::ges_p (wi::to_wide (uniform_integer_cst_p (@1)),
 element_precision (type)))
-- 
2.35.3


[PING^1][PATCH V2] rs6000: Don't allow AltiVec address in movoo & movxo pattern [PR110411]

2023-08-04 Thread jeevitha via Gcc-patches
Ping!

please review.

Thanks & Regards
Jeevitha

On 19/07/23 10:16 pm, jeevitha wrote:
> Hi All,
> 
> The following patch has been bootstrapped and regtested on powerpc64le-linux.
> 
> There are no instructions that do traditional AltiVec addresses (i.e.
> with the low four bits of the address masked off) for OOmode and XOmode
> objects. The solution is to modify the constraints used in the movoo and
> movxo pattern to disallow these types of addresses, which assists LRA in
> resolving this issue. Furthermore, the mode size 16 check has been
> removed in vsx_quad_dform_memory_operand to allow OOmode, and
> quad_address_p already handles sizes less than 16.
> 
> 2023-07-19  Jeevitha Palanisamy  
> 
> gcc/
>   PR target/110411
>   * config/rs6000/mma.md (define_insn_and_split movoo): Disallow
>   AltiVec address in movoo and movxo pattern.
>   (define_insn_and_split movxo): Likewise.
>   *config/rs6000/predicates.md (vsx_quad_dform_memory_operand):Remove
>   redundant mode size check.
> 
> gcc/testsuite/
>   PR target/110411
>   * gcc.target/powerpc/pr110411-1.c: New testcase.
>   * gcc.target/powerpc/pr110411-2.c: New testcase.
> 
> diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
> index d36dc13872b..575751d477e 100644
> --- a/gcc/config/rs6000/mma.md
> +++ b/gcc/config/rs6000/mma.md
> @@ -293,8 +293,8 @@
>  })
>  
>  (define_insn_and_split "*movoo"
> -  [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,m,wa")
> - (match_operand:OO 1 "input_operand" "m,wa,wa"))]
> +  [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
> + (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
>"TARGET_MMA
> && (gpc_reg_operand (operands[0], OOmode)
> || gpc_reg_operand (operands[1], OOmode))"
> @@ -340,8 +340,8 @@
>  })
>  
>  (define_insn_and_split "*movxo"
> -  [(set (match_operand:XO 0 "nonimmediate_operand" "=d,m,d")
> - (match_operand:XO 1 "input_operand" "m,d,d"))]
> +  [(set (match_operand:XO 0 "nonimmediate_operand" "=d,ZwO,d")
> + (match_operand:XO 1 "input_operand" "ZwO,d,d"))]
>"TARGET_MMA
> && (gpc_reg_operand (operands[0], XOmode)
> || gpc_reg_operand (operands[1], XOmode))"
> diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
> index 3552d908e9d..925f69cd3fc 100644
> --- a/gcc/config/rs6000/predicates.md
> +++ b/gcc/config/rs6000/predicates.md
> @@ -924,7 +924,7 @@
>  (define_predicate "vsx_quad_dform_memory_operand"
>(match_code "mem")
>  {
> -  if (!TARGET_P9_VECTOR || GET_MODE_SIZE (mode) != 16)
> +  if (!TARGET_P9_VECTOR)
>  return false;
>  
>return quad_address_p (XEXP (op, 0), mode, false);
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr110411-1.c 
> b/gcc/testsuite/gcc.target/powerpc/pr110411-1.c
> new file mode 100644
> index 000..f42e9388d65
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr110411-1.c
> @@ -0,0 +1,22 @@
> +/* PR target/110411 */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power10 -mblock-ops-vector-pair" } */
> +
> +/* Verify we do not ICE on the following.  */
> +
> +#include <string.h>
> +
> +struct s {
> +  long a;
> +  long b;
> +  long c;
> +  long d: 1;
> +};
> +unsigned long ptr;
> +
> +void
> +bug (struct s *dst)
> +{
> +  struct s *src = (struct s *)(ptr & ~0xFUL);
> +  memcpy (dst, src, sizeof(struct s));
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr110411-2.c 
> b/gcc/testsuite/gcc.target/powerpc/pr110411-2.c
> new file mode 100644
> index 000..c2046fb9855
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr110411-2.c
> @@ -0,0 +1,12 @@
> +/* PR target/110411 */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power10" } */
> +
> +/* Verify we do not ICE on the following.  */
> +
> +void
> +bug (__vector_quad *dst)
> +{
> +  dst = (__vector_quad *)((unsigned long)dst & ~0xFUL);
> +  __builtin_mma_xxsetaccz (dst);
> +}
> 
> 
> 


[PING ^1][PATCH] rs6000: Fix issue in specifying PTImode as an attribute [PR106895]

2023-08-04 Thread jeevitha via Gcc-patches
Ping!

please review.

Thanks & Regards
Jeevitha

On 20/07/23 10:05 am, jeevitha wrote:
> Hi All,
> 
> The following patch has been bootstrapped and regtested on powerpc64le-linux.
> 
> When the user specifies PTImode as an attribute, it breaks. Created
> a tree node to handle PTImode types. PTImode attribute helps in generating
> even/odd register pairs on 128 bits.
> 
> 2023-07-20  Jeevitha Palanisamy  
> 
> gcc/
>   PR target/106895
>   * config/rs6000/rs6000.h (enum rs6000_builtin_type_index): Add fields
>   to hold PTImode type.
>   * config/rs6000/rs6000-builtin.cc (rs6000_init_builtins): Add node
>   for PTImode type.
> 
> gcc/testsuite/
>   PR target/106895
>   * gcc.target/powerpc/pr106895.c: New testcase.
> 
> diff --git a/gcc/config/rs6000/rs6000-builtin.cc 
> b/gcc/config/rs6000/rs6000-builtin.cc
> index a8f291c6a72..ca00c3b0d4c 100644
> --- a/gcc/config/rs6000/rs6000-builtin.cc
> +++ b/gcc/config/rs6000/rs6000-builtin.cc
> @@ -756,6 +756,15 @@ rs6000_init_builtins (void)
>else
>  ieee128_float_type_node = NULL_TREE;
>  
> +  /* PTImode to get even/odd register pairs.  */
> +  intPTI_type_internal_node = make_node(INTEGER_TYPE);
> +  TYPE_PRECISION (intPTI_type_internal_node) = GET_MODE_BITSIZE (PTImode);
> +  layout_type (intPTI_type_internal_node);
> +  SET_TYPE_MODE (intPTI_type_internal_node, PTImode);
> +  t = build_qualified_type (intPTI_type_internal_node, TYPE_QUAL_CONST);
> +  lang_hooks.types.register_builtin_type (intPTI_type_internal_node,
> +   "__int128pti");
> +
>/* Vector pair and vector quad support.  */
>vector_pair_type_node = make_node (OPAQUE_TYPE);
>SET_TYPE_MODE (vector_pair_type_node, OOmode);
> diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
> index 3503614efbd..0456bf56d17 100644
> --- a/gcc/config/rs6000/rs6000.h
> +++ b/gcc/config/rs6000/rs6000.h
> @@ -2303,6 +2303,7 @@ enum rs6000_builtin_type_index
>RS6000_BTI_ptr_vector_quad,
>RS6000_BTI_ptr_long_long,
>RS6000_BTI_ptr_long_long_unsigned,
> +  RS6000_BTI_PTI,
>RS6000_BTI_MAX
>  };
>  
> @@ -2347,6 +2348,7 @@ enum rs6000_builtin_type_index
>  #define uintDI_type_internal_node 
> (rs6000_builtin_types[RS6000_BTI_UINTDI])
>  #define intTI_type_internal_node  
> (rs6000_builtin_types[RS6000_BTI_INTTI])
>  #define uintTI_type_internal_node 
> (rs6000_builtin_types[RS6000_BTI_UINTTI])
> +#define intPTI_type_internal_node (rs6000_builtin_types[RS6000_BTI_PTI])
>  #define float_type_internal_node  
> (rs6000_builtin_types[RS6000_BTI_float])
>  #define double_type_internal_node 
> (rs6000_builtin_types[RS6000_BTI_double])
>  #define long_double_type_internal_node
> (rs6000_builtin_types[RS6000_BTI_long_double])
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106895.c 
> b/gcc/testsuite/gcc.target/powerpc/pr106895.c
> new file mode 100644
> index 000..04630fe1df5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr106895.c
> @@ -0,0 +1,15 @@
> +/* PR target/106895 */
> +/* { dg-require-effective-target int128 } */
> +/* { dg-options "-O2" } */
> +
> +/* Verify the following generates even/odd register pairs.  */
> +
> +typedef __int128 pti __attribute__((mode(PTI)));
> +
> +void
> +set128 (pti val, pti *mem)
> +{
> +asm("stq %1,%0" : "=m"(*mem) : "r"(val));
> +}
> +
> +/* { dg-final { scan-assembler "stq 10,0\\(5\\)" } } */
> 
> 


Re: [PATCH] poly_int: Handle more can_div_trunc_p cases

2023-08-04 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 3 Aug 2023 at 18:15, Richard Sandiford
 wrote:
>
> can_div_trunc_p (a, b, , ) tries to compute a Q and r that
> satisfy the usual conditions for truncating division:
>
>  (1) a = b * Q + r
>  (2) |b * Q| <= |a|
>  (3) |r| < |b|
>
> We can compute Q using the constant component (the case when
> all indeterminates are zero).  Since |r| < |b| for the constant
> case, the requirements for indeterminate xi with coefficients
> ai (for a) and bi (for b) are:
>
>  (2') |bi * Q| <= |ai|
>  (3') |ai - bi * Q| <= |bi|
>
> (See the big comment for more details, restrictions, and reasoning).
>
> However, the function works on abstract arithmetic types, and so
> it has to be careful not to introduce new overflow.  The code
> therefore only handled the extreme for (3'), that is:
>
>  |ai - bi * Q| = |bi|
>
> for the case where Q is zero.
>
> Looking at it again, the overflow issue is a bit easier to handle than
> I'd originally thought (or so I hope).  This patch therefore extends the
> code to handle |ai - bi * Q| = |bi| for all Q, with Q = 0 no longer
> being a separate case.
>
> The net effect is to allow the function to succeed for things like:
>
>  (a0 + b1 (Q+1) x) / (b0 + b1 x)
>
> where Q = a0 / b0, with various sign conditions.  E.g. we now handle:
>
>  (7 + 8x) / (4 + 4x)
>
> with Q = 1 and r = 3 + 4x,
>
> Tested on aarch64-linux-gnu.  OK to install?
Hi Richard,
Thanks for the fix! With this patch, I can confirm we correctly select arg1,
when a pattern in sel has len = 4 + 4x, a1 = 5 + 4x and ae = 7 + 8x.

Thanks,
Prathamesh

>
> Richard
>
>
> gcc/
> * poly-int.h (can_div_trunc_p): Succeed for more boundary conditions.
>
> gcc/testsuite/
> * gcc.dg/plugin/poly-int-tests.h (test_can_div_trunc_p_const)
> (test_can_div_trunc_p_const): Add more tests.
> ---
>  gcc/poly-int.h   | 45 ++-
>  gcc/testsuite/gcc.dg/plugin/poly-int-tests.h | 85 +---
>  2 files changed, 98 insertions(+), 32 deletions(-)
>
> diff --git a/gcc/poly-int.h b/gcc/poly-int.h
> index 12571455081..7bff5e5ad26 100644
> --- a/gcc/poly-int.h
> +++ b/gcc/poly-int.h
> @@ -2355,28 +2355,31 @@ can_div_trunc_p (const poly_int_pod ,
> }
>else
> {
> - if (q == 0)
> -   {
> - /* For Q == 0 we simply need: (3') |ai| <= |bi|.  */
> - if (a.coeffs[i] != ICa (0))
> -   {
> - /* Use negative absolute to avoid overflow, i.e.
> --|ai| >= -|bi|.  */
> - C neg_abs_a = (a.coeffs[i] < 0 ? a.coeffs[i] : 
> -a.coeffs[i]);
> - C neg_abs_b = (b.coeffs[i] < 0 ? b.coeffs[i] : 
> -b.coeffs[i]);
> - if (neg_abs_a < neg_abs_b)
> -   return false;
> - rem_p = true;
> -   }
> -   }
> + /* The only unconditional arithmetic that we can do on ai,
> +bi and Q is ai / bi and ai % bi.  (ai == minimum int and
> +bi == -1 would be UB in the caller.)  Anything else runs
> +the risk of overflow.  */
> + auto qi = NCa (a.coeffs[i]) / NCb (b.coeffs[i]);
> + auto ri = NCa (a.coeffs[i]) % NCb (b.coeffs[i]);
> + /* (2') and (3') are satisfied when ai /[trunc] bi == q.
> +So is the stricter condition |ai - bi * Q| < |bi|.  */
> + if (qi == q)
> +   rem_p |= (ri != 0);
> + /* The only other case is when:
> +
> +|bi * Q| + |bi| = |ai| (for (2'))
> +and |ai - bi * Q|   = |bi| (for (3'))
> +
> +The first is equivalent to |bi|(|Q| + 1) == |ai|.
> +The second requires ai == bi * (Q + 1) or ai == bi * (Q - 1).  */
> + else if (ri != 0)
> +   return false;
> + else if (q <= 0 && qi < q && qi + 1 == q)
> +   ;
> + else if (q >= 0 && qi > q && qi - 1 == q)
> +   ;
>   else
> -   {
> - /* Otherwise just check for the case in which ai / bi == Q.  */
> - if (NCa (a.coeffs[i]) / NCb (b.coeffs[i]) != q)
> -   return false;
> - if (NCa (a.coeffs[i]) % NCb (b.coeffs[i]) != 0)
> -   rem_p = true;
> -   }
> +   return false;
> }
>  }
>
> diff --git a/gcc/testsuite/gcc.dg/plugin/poly-int-tests.h 
> b/gcc/testsuite/gcc.dg/plugin/poly-int-tests.h
> index 0b89acd91cd..7af98595a5e 100644
> --- a/gcc/testsuite/gcc.dg/plugin/poly-int-tests.h
> +++ b/gcc/testsuite/gcc.dg/plugin/poly-int-tests.h
> @@ -1899,14 +1899,19 @@ test_can_div_trunc_p_const ()
> ph::make (4, 8, 12),
> _quot));
>ASSERT_EQ (const_quot, C (2));
> -  ASSERT_EQ (can_div_trunc_p (ph::make (15, 25, 40),
> +  ASSERT_TRUE (can_div_trunc_p (ph::make (15, 25, 40),
> +   ph::make (4, 8, 10),
> +   &const_quot));
> +  

Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-04 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 3 Aug 2023 at 18:46, Richard Sandiford
 wrote:
>
> Richard Sandiford  writes:
> > Prathamesh Kulkarni  writes:
> >> On Tue, 25 Jul 2023 at 18:25, Richard Sandiford
> >>  wrote:
> >>>
> >>> Hi,
> >>>
> >>> Thanks for the rework and sorry for the slow review.
> >> Hi Richard,
> >> Thanks for the suggestions!  Please find my responses inline below.
> >>>
> >>> Prathamesh Kulkarni  writes:
> >>> > Hi Richard,
> >>> > This is reworking of patch to extend fold_vec_perm to handle VLA 
> >>> > vectors.
> >>> > The attached patch unifies handling of VLS and VLA vector_csts, while
> >>> > using fallback code
> >>> > for ctors.
> >>> >
> >>> > For VLS vector, the patch ignores underlying encoding, and
> >>> > uses npatterns = nelts, and nelts_per_pattern = 1.
> >>> >
> >>> > For VLA patterns, if sel has a stepped sequence, then it
> >>> > only chooses elements from a particular pattern of a particular
> >>> > input vector.
> >>> >
> >>> > To make things simpler, the patch imposes following constraints:
> >>> > (a) op0_npatterns, op1_npatterns and sel_npatterns are powers of 2.
> >>> > (b) The step size for a stepped sequence is a power of 2, and
> >>> >   multiple of npatterns of chosen input vector.
> >>> > (c) Runtime vector length of sel is a multiple of sel_npatterns.
> >>> >  So, we don't handle sel.length = 2 + 2x and npatterns = 4.
> >>> >
> >>> > Eg:
> >>> > op0, op1: npatterns = 2, nelts_per_pattern = 3
> >>> > op0_len = op1_len = 16 + 16x.
> >>> > sel = { 0, 0, 2, 0, 4, 0, ... }
> >>> > npatterns = 2, nelts_per_pattern = 3.
> >>> >
> >>> > For pattern {0, 2, 4, ...}
> >>> > Let,
> >>> > a1 = 2
> >>> > S = step size = 2
> >>> >
> >>> > Let Esel denote number of elements per pattern in sel at runtime.
> >>> > Esel = (16 + 16x) / npatterns_sel
> >>> > = (16 + 16x) / 2
> >>> > = (8 + 8x)
> >>> >
> >>> > So, last element of pattern:
> >>> > ae = a1 + (Esel - 2) * S
> >>> >  = 2 + (8 + 8x - 2) * 2
> >>> >  = 14 + 16x
> >>> >
> >>> > a1 /trunc arg0_len = 2 / (16 + 16x) = 0
> >>> > ae /trunc arg0_len = (14 + 16x) / (16 + 16x) = 0
> >>> > Since both are equal with quotient = 0, we select elements from op0.
> >>> >
> >>> > Since step size (S) is a multiple of npatterns(op0), we select
> >>> > all elements from same pattern of op0.
> >>> >
> >>> > res_npatterns = max (op0_npatterns, max (op1_npatterns, sel_npatterns))
> >>> >= max (2, max (2, 2)
> >>> >= 2
> >>> >
> >>> > res_nelts_per_pattern = max (op0_nelts_per_pattern,
> >>> > max 
> >>> > (op1_nelts_per_pattern,
> >>> >  
> >>> > sel_nelts_per_pattern))
> >>> > = max (3, max (3, 3))
> >>> > = 3
> >>> >
> >>> > So res has encoding with npatterns = 2, nelts_per_pattern = 3.
> >>> > res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... }
> >>> >
> >>> > Unfortunately, this results in an issue for poly_int_cst index:
> >>> > For example,
> >>> > op0, op1: npatterns = 1, nelts_per_pattern = 3
> >>> > op0_len = op1_len = 4 + 4x
> >>> >
> >>> > sel: { 4 + 4x, 5 + 4x, 6 + 4x, ... } // should choose op1
> >>> >
> >>> > In this case,
> >>> > a1 = 5 + 4x
> >>> > S = (6 + 4x) - (5 + 4x) = 1
> >>> > Esel = 4 + 4x
> >>> >
> >>> > ae = a1 + (esel - 2) * S
> >>> >  = (5 + 4x) + (4 + 4x - 2) * 1
> >>> >  = 7 + 8x
> >>> >
> >>> > IIUC, 7 + 8x will always be index for last element of op1 ?
> >>> > if x = 0, len = 4, 7 + 8x = 7
> >>> > if x = 1, len = 8, 7 + 8x = 15, etc.
> >>> > So the stepped sequence will always choose elements
> >>> > from op1 regardless of vector length for above case ?
> >>> >
> >>> > However,
> >>> > ae /trunc op0_len
> >>> > = (7 + 8x) / (4 + 4x)
> >>> > which is not defined because 7/4 != 8/4
> >>> > and we return NULL_TREE, but I suppose the expected result would be:
> >>> > res: { op1[0], op1[1], op1[2], ... } ?
> >>> >
> >>> > The patch passes bootstrap+test on aarch64-linux-gnu with and without 
> >>> > sve,
> >>> > and on x86_64-unknown-linux-gnu.
> >>> > I would be grateful for suggestions on how to proceed.
> >>> >
> >>> > Thanks,
> >>> > Prathamesh
> >>> >
> >>> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> >>> > index a02ede79fed..8028b3e8e9a 100644
> >>> > --- a/gcc/fold-const.cc
> >>> > +++ b/gcc/fold-const.cc
> >>> > @@ -85,6 +85,10 @@ along with GCC; see the file COPYING3.  If not see
> >>> >  #include "vec-perm-indices.h"
> >>> >  #include "asan.h"
> >>> >  #include "gimple-range.h"
> >>> > +#include <algorithm>
> >>> > +#include "tree-pretty-print.h"
> >>> > +#include "gimple-pretty-print.h"
> >>> > +#include "print-tree.h"
> >>> >
> >>> >  /* Nonzero if we are folding constants inside an initializer or a C++
> >>> > manifestly-constant-evaluated context; zero otherwise.
> >>> > @@ -10493,15 +10497,9 @@ fold_mult_zconjz (location_t loc, tree type, 
> >>> > tree expr)

Re: RISC-V: Folding memory for FP + constant case

2023-08-04 Thread Manolis Tsamis
Hi all,

It is true that regcprop currently does not propagate sp and hence
leela is not optimized, but from what I see this should be something
we can address.

The reason that the propagation fails is this check that I have added
when I introduced maybe_copy_reg_attrs:

else if (REG_POINTER (new_reg) != REG_POINTER (old_reg))
  {
/* Only a single instance of STACK_POINTER_RTX must exist and we cannot
   modify it. Allow propagation if REG_POINTER for OLD_REG matches and
   don't touch ORIGINAL_REGNO and REG_ATTRS. */
return NULL_RTX;
  }

To be honest I did add this back then just to be on the safe side of
whether a mismatch in REG_POINTER after propagation would be an issue
(since the original regcprop had caused enough problems).

I see two ways to solve this and make fmo able to optimize leela as well:
 1) Remove the REG_POINTER check in regcprop if that is safe. My
understanding is that REG_POINTER is used as a hint and there would be
no correctness issues.
 2) Mark the corresponding registers with REG_POINTER. I'm not sure
where that is supposed to happen.

Since the instructions look like this:
  (insn 113 11 16 2 (set (reg:DI 15 a5 [226])
  (reg/f:DI 2 sp)) 179 {*movdi_64bit}
   (nil))

I assume that we'd want to mark a5 as REG_POINTER anyway (which it
currently is not), and in that case propagation would work.
On the other hand if there's no correctness issue w.r.t. REG_POINTER
and regcprop then removing the additional check would increase
propagation opportunities in general which is also good.

Thanks,
Manolis

On Wed, Aug 2, 2023 at 2:52 AM Jeff Law  wrote:
>
>
>
> On 8/1/23 17:38, Vineet Gupta wrote:
> >>
> >> Also note that getting FP out of the shift-add sequences is the other
> >> key goal of Jivan's work.  FP elimination always results in a
> >> spill/reload if we have a shift-add insn where one operand is FP.
> >
> > Hmm, are you saying it should NOT be generating shift-add with SP as
> > src, because currently thats exactly what fold FP offset *is* doing and
> > is the reason it has 5 less insns.
> We should not have shift-add with FP as a source prior to register
> allocation because it will almost always generate spill code.
>
>
> jeff


[PATCH] mid-end: Use integral time intervals in timevar.cc

2023-08-04 Thread Matthew Malcomson via Gcc-patches
Hopefully last update ...

> Specifically, please try compiling with
>-ftime-report -fdiagnostics-format=sarif-file
> and have a look at the generated .sarif file, e.g. via
>python -m json.tool foo.c.sarif
> which will pretty-print the JSON to stdout.

Rebasing onto the JSON output was quite simple -- I've inlined the only
change in the patch below (to cast to floating point seconds before
generating the json report).

I have manually checked the SARIF output as you suggested and all looks
good (and in fact better because we no longer save some strange times
like the below due to avoiding the floating point rounding).
  "wall": -4.49516e-09,


> 
> The patch looks OK to me if it passes bootstrap / regtest and the
> output of -ftime-report doesn't change (too much).
> 
> Thanks,
> Richard.

Though I don't expect you were asking for this, confirmation below that
the output doesn't change.  (Figured I may as well include that info
since the rebase to include the JSON output that David had just added
required re-sending an email anyway).



```
hw-a20-8:checking-theory [10:07:01] $ ${old_build}/gcc/xgcc -B${old_build}/gcc/ -fdiagnostics-plain-output -Os -w -S test-sum.c -o /dev/null -ftime-report

Time variable   usr   sys  wall   GGC
 phase setup:   0.01 ( 14%)   0.00 (  0%)   0.02 ( 18%)  3389k ( 74%)
 phase parsing  :   0.03 ( 43%)   0.02 ( 67%)   0.06 ( 55%)   982k ( 21%)
 phase opt and generate :   0.03 ( 43%)   0.01 ( 33%)   0.03 ( 27%)   215k (  5%)
 callgraph functions expansion  :   0.02 ( 29%)   0.00 (  0%)   0.03 ( 27%)   162k (  4%)
 callgraph ipa passes   :   0.01 ( 14%)   0.01 ( 33%)  -0.00 ( -0%)    38k (  1%)
 CFG verifier   :   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)     0  (  0%)
 preprocessing  :   0.02 ( 29%)   0.02 ( 67%)   0.02 ( 18%)   272k (  6%)
 lexical analysis   :   0.01 ( 14%)   0.00 (  0%)   0.02 ( 18%)     0  (  0%)
 parser (global):   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)   637k ( 14%)
 parser function body   :   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)    18k (  0%)
 tree CFG cleanup   :   0.00 (  0%)   0.01 ( 33%)   0.00 (  0%)     0  (  0%)
 tree STMT verifier :   0.01 ( 14%)   0.00 (  0%)   0.00 (  0%)     0  (  0%)
 expand :   0.01 ( 14%)   0.00 (  0%)   0.00 (  0%)    12k (  0%)
 loop init  :   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)    12k (  0%)
 initialize rtl :   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)    15k (  0%)
 rest of compilation:   0.01 ( 14%)   0.00 (  0%)   0.00 (  0%)  6920  (  0%)
 TOTAL  :   0.07  0.03  0.11   4587k
Extra diagnostic checks enabled; compiler may run slowly.
Configure with --enable-checking=release to disable checks.
hw-a20-8:checking-theory [10:06:44] $ ${new_build}/gcc/xgcc -B${new_build}/gcc/ -fdiagnostics-plain-output -Os -w -S test-sum.c -o /dev/null -ftime-report

Time variable   usr   sys  wall   GGC
 phase setup:   0.01 ( 17%)   0.00 (  0%)   0.02 ( 18%)  3389k ( 74%)
 phase parsing  :   0.02 ( 33%)   0.03 ( 75%)   0.05 ( 45%)   982k ( 21%)
 phase opt and generate :   0.03 ( 50%)   0.01 ( 25%)   0.04 ( 36%)   215k (  5%)
 callgraph construction :   0.00 (  0%)   0.01 ( 25%)   0.00 (  0%)  1864  (  0%)
 callgraph functions expansion  :   0.02 ( 33%)   0.00 (  0%)   0.03 ( 27%)   162k (  4%)
 callgraph ipa passes   :   0.01 ( 17%)   0.00 (  0%)   0.01 (  9%)    38k (  1%)
 ipa free lang data :   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)     0  (  0%)
 CFG verifier   :   0.02 ( 33%)   0.00 (  0%)   0.00 (  0%)     0  (  0%)
 preprocessing  :   0.01 ( 17%)   0.03 ( 75%)   0.01 (  9%)   272k (  6%)
 lexical analysis   :   0.01 ( 17%)   0.00 (  0%)   0.02 ( 18%)     0  (  0%)
 parser (global):   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)   637k ( 14%)
 parser inl. func. body :   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)    19k (  0%)
 tree STMT verifier :   0.01 ( 17%)   0.00 (  0%)   0.00 (  0%)     0  (  0%)
 expand :   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)    12k (  0%)
 integrated RA  :   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)    50k (  1%)
 initialize rtl :   0.00 (  0%)   0.00 (  0%)   0.01 (  9%)    15k (  0%)
 TOTAL  :   0.06  0.04  0.11   4587k
Extra diagnostic checks enabled; compiler may run slowly.
Configure with --enable-checking=release to disable checks.

Re: [RFC] Combine zero_extract and sign_extend for TARGET_TRULY_NOOP_TRUNCATION

2023-08-04 Thread Richard Sandiford via Gcc-patches
YunQiang Su  writes:
> PR #104914
>
> On TRULY_NOOP_TRUNCATION_MODES_P (DImode, SImode)) == true platforms,
> zero_extract (SI, SI) can be sign-extended.  So, if a zero_extract (DI,
> DI) following with an sign_extend(SI, DI) can be merged to a single
> zero_extract (SI, SI).
>
> gcc/ChangeLog:
>   PR: 104914.
>   * combine.cc (try_combine): Combine zero_extract (DI, DI) and
> following sign_extend (DI, SI) for
> TRULY_NOOP_TRUNCATION_MODES_P (DImode, SImode)) == true.
> (subst): Allow replacing reg(DI) with subreg(SI (reg DI))
> if to is SImode and from is DImode for
> TRULY_NOOP_TRUNCATION_MODES_P (DImode, SImode)) == true.
>
> gcc/testsuite/ChangeLog:
>   PR: 104914.
>   * gcc.target/mips/pr104914.c: New testcase.
> ---
>  gcc/combine.cc   | 88 
>  gcc/testsuite/gcc.target/mips/pr104914.c | 17 +
>  2 files changed, 90 insertions(+), 15 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/mips/pr104914.c
>
> diff --git a/gcc/combine.cc b/gcc/combine.cc
> index e46d202d0a7..701b7c33b17 100644
> --- a/gcc/combine.cc
> +++ b/gcc/combine.cc
> @@ -3294,15 +3294,64 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn 
> *i1, rtx_insn *i0,
>n_occurrences = 0; /* `subst' counts here */
>subst_low_luid = DF_INSN_LUID (i2);
>  
> -  /* If I1 feeds into I2 and I1DEST is in I1SRC, we need to make a unique
> -  copy of I2SRC each time we substitute it, in order to avoid creating
> -  self-referential RTL when we will be substituting I1SRC for I1DEST
> -  later.  Likewise if I0 feeds into I2, either directly or indirectly
> -  through I1, and I0DEST is in I0SRC.  */
> -  newpat = subst (PATTERN (i3), i2dest, i2src, false, false,
> -   (i1_feeds_i2_n && i1dest_in_i1src)
> -   || ((i0_feeds_i2_n || (i0_feeds_i1_n && i1_feeds_i2_n))
> -   && i0dest_in_i0src));
> +  /* Try to combine zero_extract (DImode) and sign_extend (SImode to 
> DImode)
> +  for TARGET_TRULY_NOOP_TRUNCATION.  The RTL may look like:
> +
> +  (insn 10 49 11 2 (set (zero_extract:DI (reg/v:DI 200 [ val ])
> + (const_int 8 [0x8])
> + (const_int 0 [0]))
> +  (subreg:DI (reg:QI 202 [ *buf_8(D) ]) 0)) "xx.c":4:29 278 {*insvdi}
> +  (expr_list:REG_DEAD (reg:QI 202 [ *buf_8(D) ]) (nil)))
> +  (insn 11 10 12 2 (set (reg/v:DI 200 [ val ])
> +
> +  (sign_extend:DI (subreg:SI (reg/v:DI 200 [ val ]) 0))) 238 
> {extendsidi2}
> +  (nil))

Like I mentioned in the other thread, I think things went wrong when
we generated the subreg in this sign_extend.  The operation should
have been a truncate of (reg/v:DI 200) followed by a sign extension
of the result.

What piece of code is generating the subreg?

Thanks,
Richard


Re: [committed][RISC-V] Fix 20010221-1.c with zicond

2023-08-04 Thread Xiao Zeng
On Thu, Aug 03, 2023 at 01:20:00 AM  Jeff Law  wrote:
>
>
>
>So we're being a bit too aggressive with the .opt zicond patterns.
>
>
>> (define_insn "*czero.eqz..opt1"
>>   [(set (match_operand:GPR 0 "register_operand"   "=r")
>> (if_then_else:GPR (eq (match_operand:X 1 "register_operand" "r")
>>   (const_int 0))
>>   (match_operand:GPR 2 "register_operand" "1")
>>   (match_operand:GPR 3 "register_operand" "r")))]
>>   "(TARGET_ZICOND || 1) && rtx_equal_p (operands[1], operands[2])"
>>   "czero.eqz\t%0,%3,%1"
>> )
>>
>The RTL semantics here are op0 = (op1 == 0) ? op1 : op2.  That maps
>directly to czero.eqz.  ie, we select op1 when we know it's zero, op2
>otherwise.  So this pattern is fine.
>
>
>
>> (define_insn "*czero.eqz..opt2"
>>   [(set (match_operand:GPR 0 "register_operand"   "=r")
>> (if_then_else:GPR (eq (match_operand:X 1 "register_operand" "r")
>>   (const_int 0))
>>   (match_operand:GPR 2 "register_operand" "r")
>>   (match_operand:GPR 3 "register_operand" "1")))]
>>   "(TARGET_ZICOND || 1) && rtx_equal_p (operands[1],  operands[3])"
>>   "czero.nez\t%0,%2,%1"
>> )
>
>The RTL semantics of this pattern are are: op0 = (op1 == 0) ? op2 : op1;
>
>That's not something that can be expressed by the zicond extension as it
>selects op1 if and only if op1 is not equal to zero.
>
>
>
>> (define_insn "*czero.nez..opt3"
>>   [(set (match_operand:GPR 0 "register_operand"   "=r")
>> (if_then_else:GPR (ne (match_operand:X 1 "register_operand" "r")
>>   (const_int 0))
>>   (match_operand:GPR 2 "register_operand" "r")
>>   (match_operand:GPR 3 "register_operand" "1")))]
>>   "(TARGET_ZICOND || 1) && rtx_equal_p (operands[1], operands[3])"
>>   "czero.eqz\t%0,%2,%1"
>> )
>The RTL semantics of this pattern are op0 = (op1 != 0) ? op2 : op1.
>That maps to czero.nez.  But the output template uses czero.eqz.  Opps.
>
>> (define_insn "*czero.nez..opt4"
>>   [(set (match_operand:GPR 0 "register_operand"   "=r")
>> (if_then_else:GPR (ne (match_operand:X 1 "register_operand" "r")
>>   (const_int 0))
>>   (match_operand:GPR 2 "register_operand" "1")
>>   (match_operand:GPR 3 "register_operand" "r")))]
>>   "(TARGET_ZICOND || 1) && rtx_equal_p (operands[1], operands[2])"
>>   "czero.nez\t%0,%3,%1"
>> )
>The RTL semantics of this pattern are op0 = (op1 != 0) ? op1 : op2 which
>obviously doesn't match to any zicond instruction as op1 is selected
>when it is not zero.
>
>
>So two of the patterns are just totally bogus as they are not
>implementable with zicond.  They are removed.  The asm template for the
>.opt3 pattern is fixed to use czero.nez and its name is changed to .opt2.
>
>This fixes the known issues with the zicond.md bits.  Onward to the rest
>of the expansion work :-)
>
>Committed to the trunk,
>
>jeff
>

Yes, two of these four optimization patterns are wrong.

In the wrong two optimization modes, I only considered the
case of satisfying the ELSE branch, but in fact, like the correct
two optimization modes, I should consider the case of satisfying
both the THEN and ELSE branches.
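
For reference, a small C restatement of the Zicond semantics being discussed
(my sketch, not wording from the spec): each instruction writes zero when its
condition register matches and the other source operand otherwise, so only
patterns whose selected arm is known to be zero can map onto them.

```
/* czero.eqz rd, rs1, rs2  =>  rd = (rs2 == 0) ? 0 : rs1  */
long
czero_eqz (long rs1, long rs2)
{
  return rs2 == 0 ? 0 : rs1;
}

/* czero.nez rd, rs1, rs2  =>  rd = (rs2 != 0) ? 0 : rs1  */
long
czero_nez (long rs1, long rs2)
{
  return rs2 != 0 ? 0 : rs1;
}
```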

By the way, I was assigned other tasks during the week and
didn't have time to reply to emails, sorry.

Although I can't reply in time to the emails received from the
gcc community, I will definitely reply when I am free.

At the same time, I will improve my time management, stay in sync with
the community as much as possible, and work better with everyone.

Thanks
Xiao Zeng

Re: Fix profile upate after vectorizer peeling

2023-08-04 Thread Richard Biener via Gcc-patches
On Fri, Aug 4, 2023 at 10:52 AM Jan Hubicka  wrote:
>
> Hi,
> so I found the problem.  We duplicate multiple paths and end up with:
>
> ;; basic block 6, loop depth 0, count 365072224 (estimated locally, freq 
> 0.3400)
> ;;  prev block 12, next block 7, flags: (NEW, REACHABLE, VISITED)
> ;;  pred:   4 [never (guessed)]  count:0 (estimated locally, freq 0.) 
> (TRUE_VALUE,EXECUTABLE)
> ;;  10 [always]  count:365072224 (estimated locally, freq 0.3400) 
> (FALLTHRU,EXECUTABLE)
> # _18 = PHI <0(4), 0(10)>
> # d_39 = PHI 
> if (_18 == 0)
>   goto ; [97.06%]
> else
>   goto ; [2.94%]
> ;;  succ:   8 [97.1% (guessed)]  count:354334801 (estimated locally, freq 
> 0.3300) (TRUE_VALUE,EXECUTABLE)
> ;;  7 [2.9% (guessed)]  count:10737423 (estimated locally, freq 
> 0.0100) (FALSE_VALUE,EXECUTABLE)
>
> Here goto bb 7 is never taken but profile is wrong.
>
> Before threading we have chain of conditionals:
>
>   __asm__("pushf{l|d}
> pushf{l|d}
> pop{l}  %0
> mov{l}  {%0, %1|%1, %0}
> xor{l}  {%2, %0|%0, %2}
> push{l} %0
> popf{l|d}
> pushf{l|d}
> pop{l}  %0
> popf{l|d}
> " : "=" __eax_19, "=" __ebx_20 : "i" 2097152);
>   _21 = __eax_19 ^ __ebx_20;
>   _22 = _21 & 2097152;
>   if (_22 == 0)
> goto ; [34.00%]
>   else
> goto ; [66.00%]
>
>[local count: 708669602 freq: 0.66]:
>   __asm__ __volatile__("cpuid
> " : "=a" __eax_24, "=b" __ebx_25, "=c" __ecx_26, "=d" __edx_27 : "0" 
> 0);
>
>[local count: 1073741826 freq: 1.00]:
>   # _33 = PHI <0(2), __eax_24(3)>
>   _16 = _33 == 0;
>   if (_33 == 0)
> goto ; [34.00%]
>   else
> goto ; [66.00%]
>
>[local count: 708669600 freq: 0.66]:
>   __asm__ __volatile__("cpuid
> " : "=a" a_44, "=b" b_45, "=c" c_46, "=d" d_47 : "0" 1, "2" 0);
>
>[local count: 1073741824 freq: 1.00]:
>   # _18 = PHI <0(4), 1(5)>
>   # d_39 = PHI 
>   if (_18 == 0)
> goto ; [33.00%]
>   else
> goto ; [67.00%]
>
>
> If first _22 == 0 then also _33 == 0 and _18 == 0 but the last case has
> probability 33% while the first 34%, so the profile guess is not
> consistent with the threaded path.  So threading is right to end up with
> profile inconsistency, but it should print reason for doing it.
>
> One option is to disable optimization for the check.  Other option is to
> get the first conditional predicted right.
> Would this be OK?

Yeah, that looks OK.

> gcc/ChangeLog:
>
> * config/i386/cpuid.h: Mark CPUs not supporting cpuid as unlikely.
>
> diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
> index 03fd6fc9478..9c768ac0b6d 100644
> --- a/gcc/config/i386/cpuid.h
> +++ b/gcc/config/i386/cpuid.h
> @@ -295,7 +295,7 @@ __get_cpuid_max (unsigned int __ext, unsigned int *__sig)
>: "i" (0x0020));
>  #endif
>
> -  if (!((__eax ^ __ebx) & 0x0020))
> +  if (__builtin_expect (!((__eax ^ __ebx) & 0x0020), 0))
>  return 0;
>  #endif
>


Re: [PATCH 3/3 v3] genmatch: Log line numbers indirectly

2023-08-04 Thread Richard Biener via Gcc-patches
On Thu, Aug 3, 2023 at 4:24 PM Andrzej Turko via Gcc-patches
 wrote:
>
> Currently fprintf calls logging to a dump file take line numbers
> in the match.pd file directly as arguments.
> When match.pd is edited, the line numbers of the referenced code change,
> which causes changes to many fprintf calls and, thus, to many
> (usually all) .cc files generated by genmatch. This forces make
> to (unnecessarily) rebuild many .o files.
>
> This change replaces those logging fprintf calls with calls to
> a dedicated logging function. Because it reads the line numbers
> from the lookup table, it is enough to pass a corresponding index.
> Thanks to this, when match.pd changes, it is enough to rebuild
> the file containing the lookup table and, of course, those
> actually affected by the change.
>
> Signed-off-by: Andrzej Turko 
>
> gcc/ChangeLog:
>
> * genmatch.cc: Log line numbers indirectly.
> ---
>  gcc/genmatch.cc | 89 -
>  1 file changed, 74 insertions(+), 15 deletions(-)
>
> diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
> index 1deca505603..63d6ba6dab0 100644
> --- a/gcc/genmatch.cc
> +++ b/gcc/genmatch.cc
> @@ -217,9 +217,57 @@ fp_decl_done (FILE *f, const char *trailer)
>  fprintf (header_file, "%s;", trailer);
>  }
>
> +/* Line numbers for use by indirect line directives.  */
> +static vec dbg_line_numbers;
> +
> +static void
> +write_header_declarations (bool gimple, FILE *f)
> +{
> +  fprintf (f, "\nextern void\n%s_dump_logs (const char *file1, int line1_id, 
> "
> + "const char *file2, int line2, bool simplify);\n",
> + gimple ? "gimple" : "generic");
> +}
> +
> +static void
> +define_dump_logs (bool gimple, FILE *f)
> +{
> +

extra vertical space is unwanted here.

> +  if (dbg_line_numbers.is_empty ())
> +{
> +  fprintf (f, "};\n\n");
> +  return;
> +}

shouldn't the above come after ...

> +  fprintf (f , "void\n%s_dump_logs (const char *file1, int line1_id, "
> +   "const char *file2, int line2, bool simplify)\n{\n",
> +   gimple ? "gimple" : "generic");

... this?

> +  fprintf_indent (f, 2, "static int dbg_line_numbers[%d] = {",
> + dbg_line_numbers.length ());
> +
> +  for (int i = 0; i < (int)dbg_line_numbers.length () - 1; i++)

use an unsigned int to avoid the cast?

> +{
> +  if (i % 20 == 0)
> +   fprintf (f, "\n\t");
> +
> +  fprintf (f, "%d, ", dbg_line_numbers[i]);
> +}
> +  fprintf (f, "%d\n  };\n\n", dbg_line_numbers.last ());
> +
> +
> +  fprintf_indent (f, 2, "fprintf (dump_file, \"%%s "
> + "%%s:%%d, %%s:%%d\\n\",\n");
> +  fprintf_indent (f, 10, "simplify ? \"Applying pattern\" : "
> + "\"Matching expression\", file1, "
> + "dbg_line_numbers[line1_id], file2, line2);");
> +
> +  fprintf (f, "\n}\n\n");
> +}
> +
>  static void
>  output_line_directive (FILE *f, location_t location,
> -  bool dumpfile = false, bool fnargs = false)
> + bool dumpfile = false, bool fnargs = false,
> + bool indirect_line_numbers = false)
>  {
>const line_map_ordinary *map;
>linemap_resolve_location (line_table, location, LRK_SPELLING_LOCATION, 
> );
> @@ -239,7 +287,15 @@ output_line_directive (FILE *f, location_t location,
> ++file;
>
>if (fnargs)
> -   fprintf (f, "\"%s\", %d", file, loc.line);
> +  {
> +if (indirect_line_numbers)
> +  {
> +   fprintf (f, "\"%s\", %d", file, dbg_line_numbers.length ());
> +   dbg_line_numbers.safe_push (loc.line);
> +  }
> +else
> +  fprintf (f, "\"%s\", %d", file, loc.line);
> +  }

The indent is off here.  I notice the same lines often appear repeatedly so
a simple optimization like doing

if (indirect_line_numbers)
  {
if (!dbg_line_numbers.is_empty ()
&& dbg_line_numbers.last () == loc.line)
  ;
else
  dbg_line_numbers.safe_push (loc.line);
fprintf (f, "\"%s\", %d", file, dbg_line_numbers.length () - 1);
  }

shrinks the table quite a bit (not all duplicates are gone this way).
It doesn't
seem we can easily keep the list sorted; adding another hash-map could
avoid duplicates completely, which may be worth pursuing.
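
For completeness, a self-contained C++ sketch of the hash-map idea (an
editorial illustration using standard containers rather than genmatch's own
vec/hash_map types; dbg_line_numbers_tbl, dbg_line_number_idx and
get_dbg_line_index are invented names): each distinct line number is stored
once and callers emit only its index, which removes all duplicates from the
table.

#include <unordered_map>
#include <vector>

static std::vector<int> dbg_line_numbers_tbl;
static std::unordered_map<int, int> dbg_line_number_idx;

/* Return the table index for LINE, appending it on first use.  */
static int
get_dbg_line_index (int line)
{
  auto it = dbg_line_number_idx.find (line);
  if (it != dbg_line_number_idx.end ())
    return it->second;
  int idx = (int) dbg_line_numbers_tbl.size ();
  dbg_line_numbers_tbl.push_back (line);
  dbg_line_number_idx.emplace (line, idx);
  return idx;
}

/* output_line_directive would then print
   fprintf (f, "\"%s\", %d", file, get_dbg_line_index (loc.line));
   instead of pushing the line number unconditionally.  */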

Otherwise the patch series looks fine to me.

Thanks,
Richard.

>else
> fprintf (f, "%s:%d", file, loc.line);
>  }
> @@ -3375,20 +3431,19 @@ dt_operand::gen (FILE *f, int indent, bool gimple, 
> int depth)
>  }
>  }
>
> -/* Emit a fprintf to the debug file to the file F, with the INDENT from
> +/* Emit a logging call to the debug file to the file F, with the INDENT from
> either the RESULT location or the S's match location if RESULT is null. */
>  static void
> -emit_debug_printf (FILE *f, int indent, class simplify *s, operand *result)
> +emit_logging_call (FILE *f, int indent, class simplify *s, operand *result,
> + bool 

[avr,committed] Add some more devices to avr-mcus.def.

2023-08-04 Thread Georg-Johann Lay

This adds some more Xmega like devices to the avr backend.

Johann

AVR: Add some more devices: AVR16DD*, AVR32DD*, AVR64DD*, AVR64EA*, 
ATtiny42*, ATtiny82*, ATtiny162*, ATtiny322*, ATtiny10*.


gcc/
* config/avr/avr-mcus.def (avr64dd14, avr64dd20, avr64dd28, 
avr64dd32)
(avr64ea28, avr64ea32, avr64ea48, attiny424, attiny426, 
attiny427)
(attiny824, attiny826, attiny827, attiny1624, attiny1626, 
attiny1627)
(attiny3224, attiny3226, attiny3227, avr16dd14, avr16dd20, 
avr16dd28)

(avr16dd32, avr32dd14, avr32dd20, avr32dd28, avr32dd32)
(attiny102, attiny104): New devices.
* doc/avr-mmcu.texi: Regenerate.

AVR: Add some more devices: AVR16DD*, AVR32DD*, AVR64DD*, AVR64EA*, ATtiny42*, ATtiny82*, ATtiny162*, ATtiny322*, ATtiny10*.

gcc/
	* config/avr/avr-mcus.def (avr64dd14, avr64dd20, avr64dd28, avr64dd32)
	(avr64ea28, avr64ea32, avr64ea48, attiny424, attiny426, attiny427)
	(attiny824, attiny826, attiny827, attiny1624, attiny1626, attiny1627)
	(attiny3224, attiny3226, attiny3227, avr16dd14, avr16dd20, avr16dd28)
	(avr16dd32, avr32dd14, avr32dd20, avr32dd28, avr32dd32)
	(attiny102, attiny104): New devices.
	* doc/avr-mmcu.texi: Regenerate.

diff --git a/gcc/config/avr/avr-mcus.def b/gcc/config/avr/avr-mcus.def
index d0056c960ee..4c4269cd429 100644
--- a/gcc/config/avr/avr-mcus.def
+++ b/gcc/config/avr/avr-mcus.def
@@ -314,6 +314,13 @@ AVR_MCU ("avr64db28",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DB28__",
 AVR_MCU ("avr64db32",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DB32__",0x6000, 0x0, 0x1, 0)
 AVR_MCU ("avr64db48",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DB48__",0x6000, 0x0, 0x1, 0)
 AVR_MCU ("avr64db64",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DB64__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64dd14",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DD14__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64dd20",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DD20__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64dd28",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DD28__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64dd32",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DD32__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64ea28",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64EA28__",0x6800, 0x0, 0x1, 0)
+AVR_MCU ("avr64ea32",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64EA32__",0x6800, 0x0, 0x1, 0)
+AVR_MCU ("avr64ea48",ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64EA48__",0x6800, 0x0, 0x1, 0)
 /* Xmega, Flash + RAM < 64K, flash visible in RAM address space */
 AVR_MCU ("avrxmega3",ARCH_AVRXMEGA3, AVR_ISA_NONE,  NULL,  0x3f00, 0x0, 0x8000, 0)
 AVR_MCU ("attiny202",ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATtiny202__",   0x3f80, 0x0, 0x800,  0x8000)
@@ -342,6 +349,18 @@ AVR_MCU ("attiny1617",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny1617__"
 AVR_MCU ("attiny3214",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny3214__",  0x3800, 0x0, 0x8000, 0x8000)
 AVR_MCU ("attiny3216",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny3216__",  0x3800, 0x0, 0x8000, 0x8000)
 AVR_MCU ("attiny3217",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny3217__",  0x3800, 0x0, 0x8000, 0x8000)
+AVR_MCU ("attiny424",ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATtiny424__",   0x3e00, 0x0, 0x1000, 0x8000)
+AVR_MCU ("attiny426",ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATtiny426__",   0x3e00, 0x0, 0x1000, 0x8000)
+AVR_MCU ("attiny427",ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATtiny427__",   0x3e00, 0x0, 0x1000, 0x8000)
+AVR_MCU ("attiny824",ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATtiny824__",   0x3c00, 0x0, 0x2000, 0x8000)
+AVR_MCU ("attiny826",ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATtiny826__",   0x3c00, 0x0, 0x2000, 0x8000)
+AVR_MCU ("attiny827",ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATtiny827__",   0x3c00, 0x0, 0x2000, 0x8000)
+AVR_MCU ("attiny1624",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny1624__",  0x3800, 0x0, 0x4000, 0x8000)
+AVR_MCU ("attiny1626",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny1626__",  0x3800, 0x0, 0x4000, 0x8000)
+AVR_MCU ("attiny1627",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny1627__",  0x3800, 0x0, 0x4000, 0x8000)
+AVR_MCU ("attiny3224",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny3224__",  0x3400, 0x0, 0x8000, 0x8000)
+AVR_MCU ("attiny3226",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny3226__",  0x3400, 0x0, 0x8000, 0x8000)
+AVR_MCU ("attiny3227",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  "__AVR_ATtiny3227__",  0x3400, 0x0, 0x8000, 0x8000)
 AVR_MCU ("atmega808",ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATmega808__",   0x3c00, 0x0, 0x2000, 0x4000)
 AVR_MCU ("atmega809",ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATmega809__",   0x3c00, 0x0, 0x2000, 0x4000)
 AVR_MCU ("atmega1608",   ARCH_AVRXMEGA3, AVR_ISA_NONE,  

Re: Disable loop distribution for loops with estimated iterations 0

2023-08-04 Thread Jan Hubicka via Gcc-patches
> On Fri, Aug 4, 2023 at 9:16 AM Jan Hubicka via Gcc-patches
>  wrote:
> >
> > Hi,
> > this prevents useless loop distribution produced in hmmer.  With FDO we now
> > correctly work out that the loop created for the last iteration is not going to
> > iterate; however, loop distribution still produces a versioned loop that has no
> > chance to survive the loop vectorizer, since we only keep distributed loops
> > when loop vectorization succeeds and it requires the number of (header)
> > iterations to exceed the vectorization factor.
> >
> > Bootstrapped/regtested x86_64-linux, OK?
> 
> OK.
> 
> But why does optimize_loop_for_speed_p () return true when the loop
> isn't expected to iterate?  Wouldn't that be a better place to fix this
> and similar issues in other places then?

optimize_loop_for_speed_p checks whether the loop header is considered
hot so we want to get it running fast.  I think it is up to each loop
transform to decide whether it helps loops with low iteration counts or
high iteration counts or both.  Loop peeling and copy header are passes
that do help low iteration count loops.  I think we have more.

For example I wondered if I should also disable splitting but I think
moving the conditional out of the loop will likely help even if the loop has
a small trip count.

I briefly looked at what passes already have a cost model based on the iteration
estimate. I guess we should also tame down invariant motion and perhaps
others.

Honza
> 
> Thanks,
> Richard.
> 
> > gcc/ChangeLog:
> >
> > * tree-loop-distribution.cc (loop_distribution::execute): Disable
> > distribution for loops with estimated iterations 0.
> >
> > diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
> > index cf7c197aaf7..8ff2108f284 100644
> > --- a/gcc/tree-loop-distribution.cc
> > +++ b/gcc/tree-loop-distribution.cc
> > @@ -3871,10 +3871,20 @@ loop_distribution::execute (function *fun)
> >
> >   bool destroy_p;
> >   int nb_generated_loops, nb_generated_calls;
> > + bool only_patterns = !optimize_loop_for_speed_p (loop)
> > +  || !flag_tree_loop_distribution;
> > + /* do not try to distribute loops that are not expected to 
> > iterate.  */
> > + if (!only_patterns)
> > +   {
> > + HOST_WIDE_INT iterations = estimated_loop_iterations_int 
> > (loop);
> > + if (iterations < 0)
> > +   iterations = likely_max_loop_iterations_int (loop);
> > + if (!iterations)
> > +   only_patterns = true;
> > +   }
> >   nb_generated_loops
> > = distribute_loop (loop, work_list, cd, _generated_calls,
> > -  _p, (!optimize_loop_for_speed_p 
> > (loop)
> > -   || 
> > !flag_tree_loop_distribution));
> > +  _p, only_patterns);
> >   if (destroy_p)
> > loops_to_be_destroyed.safe_push (loop);
> >


[avr,committed] Fix some typos in avr-mcus.def

2023-08-04 Thread Georg-Johann Lay

This fixes some minor typos in avr-mcus.def.

Johann


gcc/
* config/avr/avr-mcus.def (avr128d*, avr64d*): Fix their FLASH_SIZE
and PM_OFFSET entries.

diff --git a/gcc/config/avr/avr-mcus.def b/gcc/config/avr/avr-mcus.def
index ca99116adab..d0056c960ee 100644
--- a/gcc/config/avr/avr-mcus.def
+++ b/gcc/config/avr/avr-mcus.def
@@ -291,7 +291,7 @@ AVR_MCU ("atmega2560",   ARCH_AVR6, 
AVR_ISA_NONE, "__AVR_ATmega2560__",
 AVR_MCU ("atmega2561",   ARCH_AVR6, AVR_ISA_NONE, 
"__AVR_ATmega2561__",0x0200, 0x0, 0x4, 0)
 AVR_MCU ("atmega256rfr2",ARCH_AVR6, AVR_ISA_NONE, 
"__AVR_ATmega256RFR2__", 0x0200, 0x0, 0x4, 0)
 AVR_MCU ("atmega2564rfr2",   ARCH_AVR6, AVR_ISA_NONE, 
"__AVR_ATmega2564RFR2__",0x0200, 0x0, 0x4, 0)

-/* Xmega, 16K <= Flash < 64K, RAM <= 64K */
+/* Xmega, 16K <= Flash <= 64K, RAM <= 64K */
 AVR_MCU ("avrxmega2",ARCH_AVRXMEGA2, AVR_ISA_NONE, NULL, 
0x2000, 0x0, 0x9000, 0)
 AVR_MCU ("atxmega8e5",   ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_ATxmega8E5__",   0x2000, 0x0, 0x2800, 0)
 AVR_MCU ("atxmega16a4",  ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_ATxmega16A4__",  0x2000, 0x0, 0x5000, 0)
@@ -306,14 +306,14 @@ AVR_MCU ("atxmega16c4",  ARCH_AVRXMEGA2, 
AVR_ISA_RMW,  "__AVR_ATxmega16C4__"
 AVR_MCU ("atxmega32a4u", ARCH_AVRXMEGA2, AVR_ISA_RMW, 
"__AVR_ATxmega32A4U__", 0x2000, 0x0, 0x9000, 0)
 AVR_MCU ("atxmega32c4",  ARCH_AVRXMEGA2, AVR_ISA_RMW, 
"__AVR_ATxmega32C4__",  0x2000, 0x0, 0x9000, 0)
 AVR_MCU ("atxmega32e5",  ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_ATxmega32E5__",  0x2000, 0x0, 0x9000, 0)
-AVR_MCU ("avr64da28",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DA28__",0x6000, 0x0, 0x8000, 0x1)
-AVR_MCU ("avr64da32",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DA32__",0x6000, 0x0, 0x8000, 0x1)
-AVR_MCU ("avr64da48",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DA48__",0x6000, 0x0, 0x8000, 0x1)
-AVR_MCU ("avr64da64",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DA64__",0x6000, 0x0, 0x8000, 0x1)
-AVR_MCU ("avr64db28",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DB28__",0x6000, 0x0, 0x8000, 0x1)
-AVR_MCU ("avr64db32",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DB32__",0x6000, 0x0, 0x8000, 0x1)
-AVR_MCU ("avr64db48",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DB48__",0x6000, 0x0, 0x8000, 0x1)
-AVR_MCU ("avr64db64",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DB64__",0x6000, 0x0, 0x8000, 0x1)
+AVR_MCU ("avr64da28",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DA28__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64da32",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DA32__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64da48",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DA48__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64da64",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DA64__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64db28",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DB28__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64db32",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DB32__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64db48",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DB48__",0x6000, 0x0, 0x1, 0)
+AVR_MCU ("avr64db64",ARCH_AVRXMEGA2, AVR_ISA_NONE, 
"__AVR_AVR64DB64__",0x6000, 0x0, 0x1, 0)

 /* Xmega, Flash + RAM < 64K, flash visible in RAM address space */
 AVR_MCU ("avrxmega3",ARCH_AVRXMEGA3, AVR_ISA_NONE,  NULL, 
0x3f00, 0x0, 0x8000, 0)
 AVR_MCU ("attiny202",ARCH_AVRXMEGA3, AVR_ISA_RCALL, 
"__AVR_ATtiny202__",   0x3f80, 0x0, 0x800,  0x8000)
@@ -366,14 +366,14 @@ AVR_MCU ("atxmega64b1",  ARCH_AVRXMEGA4, 
AVR_ISA_RMW,  "__AVR_ATxmega64B1__"
 AVR_MCU ("atxmega64b3",  ARCH_AVRXMEGA4, AVR_ISA_RMW, 
"__AVR_ATxmega64B3__",  0x2000, 0x0, 0x11000, 0)
 AVR_MCU ("atxmega64c3",  ARCH_AVRXMEGA4, AVR_ISA_RMW, 
"__AVR_ATxmega64C3__",  0x2000, 0x0, 0x11000, 0)
 AVR_MCU ("atxmega64d4",  ARCH_AVRXMEGA4, AVR_ISA_NONE, 
"__AVR_ATxmega64D4__",  0x2000, 0x0, 0x11000, 0)
-AVR_MCU ("avr128da28",   ARCH_AVRXMEGA4, AVR_ISA_NONE, 
"__AVR_AVR128DA28__",   0x4000, 0x0, 0x8000,  0x2)
-AVR_MCU ("avr128da32",   ARCH_AVRXMEGA4, AVR_ISA_NONE, 
"__AVR_AVR128DA32__",   0x4000, 0x0, 0x8000,  0x2)
-AVR_MCU ("avr128da48",   ARCH_AVRXMEGA4, AVR_ISA_NONE, 
"__AVR_AVR128DA48__",   0x4000, 0x0, 0x8000,  0x2)
-AVR_MCU ("avr128da64",   ARCH_AVRXMEGA4, AVR_ISA_NONE, 
"__AVR_AVR128DA64__",   0x4000, 0x0, 0x8000,  0x2)
-AVR_MCU ("avr128db28",   ARCH_AVRXMEGA4, AVR_ISA_NONE, 
"__AVR_AVR128DB28__",   0x4000, 0x0, 0x8000,  0x2)
-AVR_MCU ("avr128db32",   ARCH_AVRXMEGA4, AVR_ISA_NONE, 
"__AVR_AVR128DB32__",   0x4000, 0x0, 0x8000,  0x2)
-AVR_MCU ("avr128db48",   ARCH_AVRXMEGA4, AVR_ISA_NONE, 
"__AVR_AVR128DB48__",   0x4000, 0x0, 0x8000,  0x2)
-AVR_MCU ("avr128db64",   

Re: Fix profile update after vectorizer peeling

2023-08-04 Thread Jan Hubicka via Gcc-patches
Hi,
so I found the problem.  We duplicate multiple paths and end up with:

;; basic block 6, loop depth 0, count 365072224 (estimated locally, freq 0.3400)
;;  prev block 12, next block 7, flags: (NEW, REACHABLE, VISITED)
;;  pred:   4 [never (guessed)]  count:0 (estimated locally, freq 0.) 
(TRUE_VALUE,EXECUTABLE)
;;  10 [always]  count:365072224 (estimated locally, freq 0.3400) 
(FALLTHRU,EXECUTABLE)
# _18 = PHI <0(4), 0(10)>
# d_39 = PHI 
if (_18 == 0)
  goto ; [97.06%]
else
  goto ; [2.94%]
;;  succ:   8 [97.1% (guessed)]  count:354334801 (estimated locally, freq 
0.3300) (TRUE_VALUE,EXECUTABLE)
;;  7 [2.9% (guessed)]  count:10737423 (estimated locally, freq 
0.0100) (FALSE_VALUE,EXECUTABLE)

Here goto bb 7 is never taken but profile is wrong.

Before threading we have chain of conditionals:

  __asm__("pushf{l|d}
pushf{l|d}
pop{l}  %0
mov{l}  {%0, %1|%1, %0}
xor{l}  {%2, %0|%0, %2}
push{l} %0
popf{l|d}
pushf{l|d}
pop{l}  %0
popf{l|d}
" : "=" __eax_19, "=" __ebx_20 : "i" 2097152);
  _21 = __eax_19 ^ __ebx_20;
  _22 = _21 & 2097152;
  if (_22 == 0)
goto ; [34.00%]
  else
goto ; [66.00%]
  
   [local count: 708669602 freq: 0.66]:
  __asm__ __volatile__("cpuid
" : "=a" __eax_24, "=b" __ebx_25, "=c" __ecx_26, "=d" __edx_27 : "0" 0);
  
   [local count: 1073741826 freq: 1.00]:
  # _33 = PHI <0(2), __eax_24(3)> 
  _16 = _33 == 0;
  if (_33 == 0)
goto ; [34.00%]
  else
goto ; [66.00%]

   [local count: 708669600 freq: 0.66]:
  __asm__ __volatile__("cpuid
" : "=a" a_44, "=b" b_45, "=c" c_46, "=d" d_47 : "0" 1, "2" 0);

   [local count: 1073741824 freq: 1.00]:
  # _18 = PHI <0(4), 1(5)>
  # d_39 = PHI 
  if (_18 == 0)
goto ; [33.00%]
  else
goto ; [67.00%]


If first _22 == 0 then also _33 == 0 and _18 == 0 but the last case has
probability 33% while the first 34%, so the profile guess is not
consistent with the threaded path.  So threading is right to end up with
profile inconsistency, but it should print a reason for doing it.

One option is to disable optimization for the check.  The other option is to
get the first conditional predicted right.
Would this be OK?

gcc/ChangeLog:

* config/i386/cpuid.h: Mark CPUs not supporting cpuid as unlikely.

diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index 03fd6fc9478..9c768ac0b6d 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -295,7 +295,7 @@ __get_cpuid_max (unsigned int __ext, unsigned int *__sig)
   : "i" (0x0020));
 #endif
 
-  if (!((__eax ^ __ebx) & 0x0020))
+  if (__builtin_expect (!((__eax ^ __ebx) & 0x0020), 0))
 return 0;
 #endif
 


Re: Disable loop distribution for loops with estimated iterations 0

2023-08-04 Thread Richard Biener via Gcc-patches
On Fri, Aug 4, 2023 at 9:16 AM Jan Hubicka via Gcc-patches
 wrote:
>
> Hi,
> this prevents useless loop distribution produced in hmmer.  With FDO we now
> correctly work out that the loop created for the last iteration is not going to
> iterate; however, loop distribution still produces a versioned loop that has no
> chance to survive the loop vectorizer, since we only keep distributed loops
> when loop vectorization succeeds and it requires the number of (header) iterations
> to exceed the vectorization factor.
>
> Bootstrapped/regtested x86_64-linux, OK?

OK.

But why does optimize_loop_for_speed_p () return true when the loop
isn't expected to iterate?  Wouldn't that be a better place to fix this
and similar issues in other places then?

Thanks,
Richard.

> gcc/ChangeLog:
>
> * tree-loop-distribution.cc (loop_distribution::execute): Disable
> distribution for loops with estimated iterations 0.
>
> diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
> index cf7c197aaf7..8ff2108f284 100644
> --- a/gcc/tree-loop-distribution.cc
> +++ b/gcc/tree-loop-distribution.cc
> @@ -3871,10 +3871,20 @@ loop_distribution::execute (function *fun)
>
>   bool destroy_p;
>   int nb_generated_loops, nb_generated_calls;
> + bool only_patterns = !optimize_loop_for_speed_p (loop)
> +  || !flag_tree_loop_distribution;
> + /* do not try to distribute loops that are not expected to iterate. 
>  */
> + if (!only_patterns)
> +   {
> + HOST_WIDE_INT iterations = estimated_loop_iterations_int (loop);
> + if (iterations < 0)
> +   iterations = likely_max_loop_iterations_int (loop);
> + if (!iterations)
> +   only_patterns = true;
> +   }
>   nb_generated_loops
> = distribute_loop (loop, work_list, cd, _generated_calls,
> -  _p, (!optimize_loop_for_speed_p (loop)
> -   || !flag_tree_loop_distribution));
> +  _p, only_patterns);
>   if (destroy_p)
> loops_to_be_destroyed.safe_push (loop);
>


Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Kees Cook via Gcc-patches
On Thu, Aug 03, 2023 at 09:31:24PM +, Qing Zhao wrote:
> So, the basic question is:
> 
> Given the following:
> 
> struct fix {
>   int others;
>   int array[10];
> }
> 
> extern struct fix * alloc_buf ();
> 
> int main ()
> {
>   struct fix *p = alloc_buf ();
>   __builtin_object_size(p->array,0) == ?
> }
> 
> Given p->array, can the compiler determine that p points to an object that 
> has TYPE struct fix?
> 
> If the answer is YES, then the current __builtin_object_size algorithm can be
> improved to determine __builtin_object_size(p->array, 0) with the TYPE of
> struct fix.

I think it is fine to leave __bos(..., 0) as-is. From the Linux kernel's
use of __bos, we are almost exclusively interested in mode 1, not
mode 0. :)
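
To make the two modes concrete, here is a small editorial example based on
the struct above (the expected values follow the documented semantics:
mode 0 bounds the whole enclosing object, mode 1 the closest surrounding
subobject; whole_object_bound and member_bound are invented names):

#include <stddef.h>

struct fix {
  int others;
  int array[10];
};

extern struct fix *alloc_buf (void);

size_t whole_object_bound (struct fix *p)
{
  /* Mode 0: maximum size of the enclosing object.  The allocation is
     not visible here, so this evaluates to (size_t)-1, i.e. unknown.  */
  return __builtin_object_size (p->array, 0);
}

size_t member_bound (struct fix *p)
{
  /* Mode 1: size of the closest surrounding subobject, known from the
     member's type: 10 * sizeof (int).  */
  return __builtin_object_size (p->array, 1);
}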

-- 
Kees Cook


Re: One question on the source code of tree-object-size.cc

2023-08-04 Thread Kees Cook via Gcc-patches
On Thu, Aug 03, 2023 at 07:55:54PM +, Qing Zhao wrote:
> 
> 
> > On Aug 3, 2023, at 1:51 PM, Kees Cook  wrote:
> > 
> > On August 3, 2023 10:34:24 AM PDT, Qing Zhao  wrote:
> >> One thing I need to point out first is, currently, even for regular fixed 
> >> size array in the structure,
> >> We have this same issue, for example:
> >> 
> >> #define LENGTH 10
> >> 
> >> struct fix {
> >> size_t foo;
> >> int array[LENGTH];
> >> };
> >> 
> >> …
> >> int main ()
> >> {
> >> struct fix *p;
> >> p = alloc_buf_more ();
> >> 
> >> expect(__builtin_object_size(p->array, 1), LENGTH * sizeof(int));
> >> expect(__builtin_object_size(p->array, 0), -1);
> >> }
> >> 
> >> Currently, for __builtin_object_size(p->array, 0), GCC returns UNKNOWN for
> >> it.
> >> This is not a special issue for flexible array member.
> > 
> > Is this true with -fstrict-flex-arrays=3 ?
> 
> Yes. 

Okay, right, I understand now -- it doesn't see the allocation, therefore
max size is unknown. Sounds good.

-Kees

-- 
Kees Cook


Disable loop distribution for loops with estimated iterations 0

2023-08-04 Thread Jan Hubicka via Gcc-patches
Hi,
this prevents useless loop distribution produced in hmmer.  With FDO we now
correctly work out that the loop created for the last iteration is not going to
iterate; however, loop distribution still produces a versioned loop that has no
chance to survive the loop vectorizer, since we only keep distributed loops
when loop vectorization succeeds and it requires the number of (header) iterations
to exceed the vectorization factor.

Bootstrapped/regtested x86_64-linux, OK?

gcc/ChangeLog:

* tree-loop-distribution.cc (loop_distribution::execute): Disable
distribution for loops with estimated iterations 0.

diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index cf7c197aaf7..8ff2108f284 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -3871,10 +3871,20 @@ loop_distribution::execute (function *fun)
 
  bool destroy_p;
  int nb_generated_loops, nb_generated_calls;
+ bool only_patterns = !optimize_loop_for_speed_p (loop)
+  || !flag_tree_loop_distribution;
+ /* do not try to distribute loops that are not expected to iterate.  
*/
+ if (!only_patterns)
+   {
+ HOST_WIDE_INT iterations = estimated_loop_iterations_int (loop);
+ if (iterations < 0)
+   iterations = likely_max_loop_iterations_int (loop);
+ if (!iterations)
+   only_patterns = true;
+   }
  nb_generated_loops
= distribute_loop (loop, work_list, cd, _generated_calls,
-  _p, (!optimize_loop_for_speed_p (loop)
-   || !flag_tree_loop_distribution));
+  _p, only_patterns);
  if (destroy_p)
loops_to_be_destroyed.safe_push (loop);
 


Re: [PATCH] match.pd: Canonicalize (signed x << c) >> c [PR101955]

2023-08-04 Thread Jakub Jelinek via Gcc-patches
> Canonicalizes (signed x << c) >> c into the lowest
> precision(type) - c bits of x IF those bits have a mode precision or a
> precision of 1. Also combines this rule with (unsigned x << c) >> c -> x &
> ((unsigned)-1 >> c) to prevent a duplicate pattern. Tested successfully on
> x86_64 and x86 targets.
> 
>   PR middle-end/101955
> 
> gcc/ChangeLog:
> 
>   * match.pd ((signed x << c) >> c): New canonicalization.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.dg/pr101955.c: New test.

LGTM, I've committed this one for you.

Jakub
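
As an editorial illustration of the canonicalization described above (not
part of the patch or its testsuite; it assumes 32-bit int and 8-bit signed
char), shifting left and then arithmetically right by the same amount keeps
only the low bits and sign-extends them, and when those low bits form a
whole mode the shift pair is equivalent to a narrowing cast plus sign
extension:

/* Both functions compute the low 8 bits of x, sign-extended to int;
   the match.pd rule lets later passes see the simpler second form.  */
int shift_pair (int x)
{
  return (x << 24) >> 24;
}

int narrow_cast (int x)
{
  return (int) (signed char) x;
}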



Re: Fix profile update after vectorizer peeling

2023-08-04 Thread Jan Hubicka via Gcc-patches
> >
> > A couple of cycles ago I separated most of the code to distinguish between the
> > back and forward threaders.  There is class jt_path_registry that is
> > common to both, and {fwd,back}_jt_path_registry for the forward and
> > backward threaders respectively.  It's not perfect, but it's a start.
> 
> Yep, it's back_jt_path_registry::update_cfg / duplicate_thread_path
> that lacks the updates.

duplicate_thread_path has profile update (using
profile_bb_update_for_threading and
scale_bbs_frequencies_profile_count).  It will however silently keep
profile misupdated if the cfg were originally inconsistent with the
threaded path (in which case it is intended to keep profile
inconsistent, but we should have it logged so we know it is "okay after
all").  I will add logging same as in profile_bb_update_for_threading, so
these things are easier to figure out.

What happens in the test is that we have __builtin_constant_p that
blocks early threading and we thread only after profile is constructed.
I did not check by hand if the original profile is guessed
inconsistently.

Honza
> 
> Richard.
> 
> > Aldy
> >


Re: [PATCH] Specify signed/unsigned/dontcare in calls to extract_bit_field_1.

2023-08-04 Thread Richard Biener via Gcc-patches
On Thu, Aug 3, 2023 at 9:15 PM Roger Sayle  wrote:
>
>
> This patch is inspired by Jakub's work on PR rtl-optimization/110717.
> The bitfield example described in comment #2, looks like:
>
> struct S { __int128 a : 69; };
> unsigned type bar (struct S *p) {
>   return p->a;
> }
>
> which on x86_64 with -O2 currently generates:
>
> bar:movzbl  8(%rdi), %ecx
> movq(%rdi), %rax
> andl$31, %ecx
> movq%rcx, %rdx
> salq$59, %rdx
> sarq$59, %rdx
> ret
>
> The ANDL $31 is interesting... we first extract an unsigned 69-bit bitfield
> by masking/clearing the top bits of the most significant word, and then
> it gets sign-extended, by left shifting and arithmetic right shifting.
> Obviously, this bit-wise AND is redundant, for signed bit-fields, we don't
> require these bits to be cleared, if we're about to set them appropriately.
>
> This patch eliminates this redundancy in the middle-end, during RTL
> expansion, but extending the extract_bit_field APIs so that the integer
> UNSIGNEDP argument takes a special value; 0 indicates the field should
> be sign extended, 1 (any non-zero value) indicates the field should be
> zero extended, but -1 indicates a third option, that we don't care how
> or whether the field is extended.  By passing and checking this sentinel
> value at the appropriate places we avoid the useless bit masking (on
> all targets).
>
> For the test case above, with this patch we now generate:
>
> bar:movzbl  8(%rdi), %ecx
> movq(%rdi), %rax
> movq%rcx, %rdx
> salq$59, %rdx
> sarq$59, %rdx
> ret
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?

OK.

Thanks,
Richard.

>
> 2023-08-03  Roger Sayle  
>
> gcc/ChangeLog
> * expmed.cc (extract_bit_field_1): Document that an UNSIGNEDP
> value of -1 is equivalent to don't care.
> (extract_integral_bit_field): Indicate that we don't require
> the most significant word to be zero extended, if we're about
> to sign extend it.
> (extract_fixed_bit_field_1): Document that an UNSIGNEDP value
> of -1 is equivalent to don't care.  Don't clear the most
> significant bits with AND mask when UNSIGNEDP is -1.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/pr110717-2.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>


Re: [PATCHv2] Fix PR 110874: infinite loop in gimple_bitwise_inverted_equal_p with fre

2023-08-04 Thread Richard Biener via Gcc-patches
On Thu, Aug 3, 2023 at 6:41 PM Andrew Pinski via Gcc-patches
 wrote:
>
> This changes gimple_bitwise_inverted_equal_p to use 2 different match
> patterns
> to try to match bit_not wrapped with a possible nop_convert and a comparison
> also wrapped with a possible nop_convert. This is to avoid being recursive.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
> PR tree-optimization/110874
> * gimple-match-head.cc (gimple_bit_not_with_nop): New declaration.
> (gimple_maybe_cmp): Likewise.
> (gimple_bitwise_inverted_equal_p): Rewrite to use 
> gimple_bit_not_with_nop
> and gimple_maybe_cmp instead of being recursive.
> * match.pd (bit_not_with_nop): New match pattern.
> (maybe_cmp): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/110874
> * gcc.c-torture/compile/pr110874-a.c: New test.
> ---
>  gcc/gimple-match-head.cc  | 87 ++-
>  gcc/match.pd  | 17 
>  .../gcc.c-torture/compile/pr110874-a.c| 17 
>  3 files changed, 79 insertions(+), 42 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.c-torture/compile/pr110874-a.c
>
> diff --git a/gcc/gimple-match-head.cc b/gcc/gimple-match-head.cc
> index b1e96304d7c..a097a494c39 100644
> --- a/gcc/gimple-match-head.cc
> +++ b/gcc/gimple-match-head.cc
> @@ -270,6 +270,10 @@ gimple_bitwise_equal_p (tree expr1, tree expr2, tree 
> (*valueize) (tree))
>  #define bitwise_inverted_equal_p(expr1, expr2) \
>gimple_bitwise_inverted_equal_p (expr1, expr2, valueize)
>
> +
> +bool gimple_bit_not_with_nop (tree, tree *, tree (*) (tree));
> +bool gimple_maybe_cmp (tree, tree *, tree (*) (tree));
> +
>  /* Helper function for bitwise_equal_p macro.  */
>
>  static inline bool
> @@ -285,52 +289,51 @@ gimple_bitwise_inverted_equal_p (tree expr1, tree 
> expr2, tree (*valueize) (tree)
>  return false;
>
>tree other;
> -  if (gimple_nop_convert (expr1, , valueize)
> -  && gimple_bitwise_inverted_equal_p (other, expr2, valueize))
> -return true;
> -
> -  if (gimple_nop_convert (expr2, , valueize)
> -  && gimple_bitwise_inverted_equal_p (expr1, other, valueize))
> -return true;
> -
> -  if (TREE_CODE (expr1) != SSA_NAME
> -  || TREE_CODE (expr2) != SSA_NAME)
> -return false;
> -
> -  gimple *d1 = get_def (valueize, expr1);
> -  gassign *a1 = safe_dyn_cast  (d1);
> -  gimple *d2 = get_def (valueize, expr2);
> -  gassign *a2 = safe_dyn_cast  (d2);
> -  if (a1
> -  && gimple_assign_rhs_code (a1) == BIT_NOT_EXPR
> -  && gimple_bitwise_equal_p (do_valueize (valueize,
> - gimple_assign_rhs1 (a1)),
> -expr2, valueize))
> +  /* Try if EXPR1 was defined as ~EXPR2. */
> +  if (gimple_bit_not_with_nop (expr1, , valueize))
> +{
> +  if (operand_equal_p (other, expr2, 0))
> return true;
> -  if (a2
> -  && gimple_assign_rhs_code (a2) == BIT_NOT_EXPR
> -  && gimple_bitwise_equal_p (expr1,
> -do_valueize (valueize,
> - gimple_assign_rhs1 (a2)),
> -valueize))
> +  tree expr4;
> +  if (gimple_nop_convert (expr2, , valueize)
> + && operand_equal_p (other, expr4, 0))
> return true;
> -
> -  if (a1 && a2
> -  && TREE_CODE_CLASS (gimple_assign_rhs_code (a1)) == tcc_comparison
> -  && TREE_CODE_CLASS (gimple_assign_rhs_code (a2)) == tcc_comparison)
> +}
> +  /* Try if EXPR2 was defined as ~EXPR1. */
> +  if (gimple_bit_not_with_nop (expr2, , valueize))
>  {
> -  tree op10 = do_valueize (valueize, gimple_assign_rhs1 (a1));
> -  tree op20 = do_valueize (valueize, gimple_assign_rhs1 (a2));
> -  if (!operand_equal_p (op10, op20))
> -return false;
> -  tree op11 = do_valueize (valueize, gimple_assign_rhs2 (a1));
> -  tree op21 = do_valueize (valueize, gimple_assign_rhs2 (a2));
> -  if (!operand_equal_p (op11, op21))
> -return false;
> -  if (invert_tree_comparison (gimple_assign_rhs_code (a1),
> - HONOR_NANS (op10))
> - == gimple_assign_rhs_code (a2))
> +  if (operand_equal_p (other, expr1, 0))
> +   return true;
> +  tree expr3;
> +  if (gimple_nop_convert (expr1, , valueize)
> + && operand_equal_p (other, expr3, 0))
> return true;
>  }
> +
> +  /* If neither are defined by BIT_NOT, try to see if
> + both are defined by comparisons and see if they are
> + complementary (inversion) of each other. */
> +  tree newexpr1, newexpr2;
> +  if (!gimple_maybe_cmp (expr1, , valueize))
> +return false;
> +  if (!gimple_maybe_cmp (expr2, , valueize))
> +return false;
> +
> +  gimple *d1 = get_def (valueize, newexpr1);
> +  gassign *a1 = dyn_cast  (d1);
> +  gimple *d2 

Re: [PATCH] mid-end: Use integral time intervals in timevar.cc

2023-08-04 Thread Richard Biener via Gcc-patches
On Thu, 3 Aug 2023, Matthew Malcomson wrote:

> > 
> > I think this is undesriable.  With fused you mean we use FMA?
> > I think you could use -ffp-contract=off for the TU instead.
> > 
> > Note you can't use __attribute__((noinline)) literally since the
> > host compiler might not support this.
> > 
> > Richard.
> > 
> 
> 
> Trying to make the timevar code store integral time intervals.
> I hope this is acceptable -- I had originally planned to use
> `-ffp-contract` as agreed, but then I saw the email mentioning the old x86
> bug in the same area, which was not due to floating point contraction
> of operations (PR 99903), and figured it would be better to try to solve
> both at the same time while making things in general a bit more robust.
> 
> 
> 
> On some AArch64 bootstrapped builds, we were getting a flaky test
> because the floating point operations in `get_time` were being fused
> with the floating point operations in `timevar_accumulate`.
> 
> This meant that the rounding behaviour of our multiplication with
> `ticks_to_msec` was different when used in `timer::start` and when
> performed in `timer::stop`.  These extra inaccuracies led to the
> testcase `g++.dg/ext/timevar1.C` being flaky on some hardware.
> 
> --
> Avoiding the inlining which was agreed to be undesirable.  Three
> alternative approaches:
> 1) Use `-ffp-contract=on` to avoid this particular optimisation.
> 2) Adjusting the code so that the "tolerance" is always of the order of
>a "tick".
> 3) Recording times and elapsed differences in integral values.
>- Could be in terms of a standard measurement (e.g. nanoseconds or
>  microseconds).
>- Could be in terms of whatever integral value ("ticks" /
>  seconds / "clock ticks") is returned from the syscall
>  chosen at configure time.
> 
> While `-ffp-contract=on` removes the problem that I bumped into, there
> has been a similar bug on x86 that was due to a different floating
> point problem, one that also appears when `get_time` and
> `timevar_accumulate` are both inlined into the same function.  Hence
> it seems worth choosing a different approach.
> 
> Of the two other solutions, recording measurements in integral values
> seems the most robust against slightly "off" measurements being
> presented to the user -- even though it could avoid the ICE that creates
> a flaky test.
> 
> I considered storing time in whatever units our syscall returns and
> normalising them at the time we print out rather than normalising them
> to nanoseconds at the point we record our "current time".  The logic
> being that normalisation could have some rounding effect (e.g. if
> TICKS_PER_SECOND is 3) that would be taken into account in calculations.
> 
> I decided against it in order to give the values recorded in
> `timevar_time_def` some interpretive value so it's easier to read the
> code.  Compared to the small rounding that would represent a tiny amount
> of time and AIUI can not trigger the same kind of ICE's as we are
> attempting to fix, said interpretive value seems more valuable.
> 
> Recording time in microseconds seemed reasonable since all obvious
> values for ticks and `getrusage` are at microsecond granularity or less
> precise.  That said, since TICKS_PER_SECOND and CLOCKS_PER_SEC are both
> variables given to us by the host system, I was not sure enough of that
> to make this decision.
> 
> --
> timer::all_zero is ignoring rows which are inconsequential to the user
> and would be printed out as all zeros.  Since upon printing rows we
> convert to the same double value and print out the same precision as
> before, we return true/false based on the same amount of time as before.
> 
> timer::print_row casts to a floating point measurement in units of
> seconds as was printed out before.
> 
> timer::validate_phases -- I'm printing out nanoseconds here rather than
> floating point seconds; since this is an error message for when things
> have "gone wrong", printing out the actual nanoseconds that have been
> recorded seems like the best approach.
> N.b. since we now print out nanoseconds instead of floating point value
> the padding requirements are different.  Originally we were padding to
> 24 characters and printing 18 decimal places.  This looked odd with the
> now visually smaller values getting printed.  I judged 13 characters
> (corresponding to 2 hours) to be a reasonable point at which our
> alignment could start to degrade and this provides a more compact output
> for the majority of cases (checked by triggering the error case via
> GDB).
> 
> --
> N.b. I use a literal 1000000000 for "NANOSEC_PER_SEC".  I believe this
> would fit in an integer on all hosts that GCC supports, but I am not
> certain there are no strange integer sizes we support, hence I am pointing
> it out for special attention during review.
> 
> 
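
To illustrate the integral-nanoseconds approach described above, here is a
minimal C++ sketch (editorial only, with invented names such as
ticks_to_nanosec and sketch_time; it is not the patch itself): times are
converted to integral nanoseconds once, when they are read, so all later
accumulation is exact integer arithmetic that floating-point contraction
cannot perturb, and the rounding mentioned above happens only at conversion
time.

#include <cstdint>

/* 10^9 nanoseconds per second.  */
static const uint64_t NANOSEC_PER_SEC = 1000000000;

/* Convert a raw tick count to nanoseconds using integer arithmetic only.
   If ticks_per_second does not divide 10^9 evenly (e.g. 3), the division
   rounds here once, which is the small inaccuracy discussed above.  */
static uint64_t
ticks_to_nanosec (uint64_t ticks, uint64_t ticks_per_second)
{
  return ticks * (NANOSEC_PER_SEC / ticks_per_second);
}

/* Accumulation is exact, so start/stop can no longer round differently.  */
struct sketch_time { uint64_t wall_ns; };

static void
accumulate (sketch_time &total, uint64_t start_ns, uint64_t stop_ns)
{
  total.wall_ns += stop_ns - start_ns;
}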

Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.

2023-08-04 Thread Richard Biener via Gcc-patches
On Thu, Aug 3, 2023 at 5:21 PM Jeff Law  wrote:
>
>
>
> On 8/3/23 01:04, Richard Biener wrote:
> > On Wed, Aug 2, 2023 at 4:08 PM Manolis Tsamis  
> > wrote:
> >>
> >> Hi all,
> >>
> >> I'm pinging to discuss again if we want to move this forward for GCC14.
> >>
> >> I did some testing again and I haven't been able to find obvious
> >> regressions, including testing the code from PR86270 and PR70359 that
> >> Richard mentioned.
> >> I still believe that zero can be considered a special case even for
> >> hardware that doesn't directly benefit in the comparison.
> >> For example it happens that the testcase from the commit compiles to
> >> one instruction less in x86:
> >>
> >> .LFB0:
> >>  movl(%rdi), %eax
> >>  leal1(%rax), %edx
> >>  movl%edx, (%rdi)
> >>  testl%eax, %eax
> >>  je.L4
> >>  ret
> >> .L4:
> >>  jmpg
> >>
> >> vs
> >>
> >> .LFB0:
> >>  movl(%rdi), %eax
> >>  addl$1, %eax
> >>  movl%eax, (%rdi)
> >>  cmpl$1, %eax
> >>  je.L4
> >>  ret
> >> .L4:
> >>  xorl%eax, %eax
> >>  jmpg
> >>
> >> (The xorl is not emitted  when testl is used. LLVM uses testl but also
> >> does xor eax, eax :) )
> >> Although this is accidental, I believe it also showcases that zero is
> >> a preferential value in various ways.
> >>
> >> I'm running benchmarks comparing the effects of this change and I'm
> >> also still looking for testcases that result in problematic
> >> regressions.
> >> Any feedback or other concerns about this are appreciated!
> >
> > My comment from Apr 24th still holds, IMO this is something for
> > instruction selection (aka the ISEL pass) or the out-of-SSA tweaks
> > we do during RTL expansion (see insert_backedge_copies)
> I'm still generally supportive of biasing to zero, but as Richi has
> noted the current implementation needs to be pushed further back into
> the pipeline, preferably all the way to isel or gimple->rtl expansion.

Note the main reason is that if you only "fix" forwprop you miss other places
that will happily undo this.  As the intent is to get better
instruction selection,
doing the canonicalization at RTL expansion sounds like the best idea to me.

Richard.

> Jeff


Re: Fix profile update after vectorizer peeling

2023-08-04 Thread Richard Biener via Gcc-patches
On Thu, Aug 3, 2023 at 5:12 PM Aldy Hernandez  wrote:
>
>
>
> On 8/3/23 16:29, Jeff Law wrote:
> >
> >
> > On 8/3/23 08:23, Jan Hubicka wrote:
>  Jeff, an help would be appreciated here :)
> 
>  I will try to debug this.  One option would be to disable branch
>  prediciton on vect_check for time being - it is not inlined anyway
> >>> Not a lot of insight.  The backwards threader uses a totally
> >>> different API
> >>> for the CFG/SSA updates and that API I don't think has made any
> >>> significant
> >>> effort to keep the profile up-to-date.
> >>
> >> OK, at least some hints where the missing profile updat should be, would
> >> be good. There is update_profile in tree-ssa-threadupdate and
> >> understaning what is missing would be nice
> >> In general it would be nice to mind profile when updating CFG :)
> > THe backwards threader doesn't use much of the code in
> > tree-ssa-threadupdate IIRC.  The bulk of the work for the backwards
> > threader is done by copy_bbs.  I've actually suggested those two
> > implementations be totally separated from each other to avoid confusion.
> >   I just haven't had the time to do it (or much of anything with
> > threading) myself.
>
> A couple of cycles ago I separated most of the code to distinguish between the
> back and forward threaders.  There is class jt_path_registry that is
> common to both, and {fwd,back}_jt_path_registry for the forward and
> backward threaders respectively.  It's not perfect, but it's a start.

Yep, it's back_jt_path_registry::update_cfg / duplicate_thread_path
that lacks the updates.

Richard.

> Aldy
>


Re: [PATCH v1] RISC-V: Support RVV VFMADD rounding mode intrinsic API

2023-08-04 Thread juzhe.zh...@rivai.ai
LGTM



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-08-04 14:10
To: gcc-patches
CC: juzhe.zhong; kito.cheng; pan2.li; yanzhang.wang
Subject: [PATCH v1] RISC-V: Support RVV VFMADD rounding mode intrinsic API
From: Pan Li 
 
This patch would like to support the rounding mode API for the
VFMADD for the below samples.
 
* __riscv_vfmadd_vv_f32m1_rm
* __riscv_vfmadd_vv_f32m1_rm_m
* __riscv_vfmadd_vf_f32m1_rm
* __riscv_vfmadd_vf_f32m1_rm_m
 
Signed-off-by: Pan Li 
 
gcc/ChangeLog:
 
* config/riscv/riscv-vector-builtins-bases.cc
(class vfmadd_frm): New class for vfmadd frm.
(vfmadd_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfmadd_frm): New function definition.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/base/float-point-single-madd.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 24 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 +
.../riscv/rvv/base/float-point-single-madd.c  | 47 +++
4 files changed, 74 insertions(+)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-madd.c
 
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 9c6ca8d1ddc..5b02b04aacb 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -445,6 +445,28 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfmadd
+*/
+class vfmadd_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  bool has_merge_operand_p () const override { return false; }
+
+  rtx expand (function_expander ) const override
+  {
+if (e.op_info->op == OP_TYPE_vf)
+  return e.use_ternop_insn (
+ false, code_for_pred_mul_scalar (PLUS, e.vector_mode ()));
+if (e.op_info->op == OP_TYPE_vv)
+  return e.use_ternop_insn (
+ false, code_for_pred_mul (PLUS, e.vector_mode ()));
+gcc_unreachable ();
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2209,6 +2231,7 @@ static CONSTEXPR const vfmacc_frm vfmacc_frm_obj;
static CONSTEXPR const vfnmsac vfnmsac_obj;
static CONSTEXPR const vfnmsac_frm vfnmsac_frm_obj;
static CONSTEXPR const vfmadd vfmadd_obj;
+static CONSTEXPR const vfmadd_frm vfmadd_frm_obj;
static CONSTEXPR const vfnmsub vfnmsub_obj;
static CONSTEXPR const vfnmacc vfnmacc_obj;
static CONSTEXPR const vfnmacc_frm vfnmacc_frm_obj;
@@ -2448,6 +2471,7 @@ BASE (vfmacc_frm)
BASE (vfnmsac)
BASE (vfnmsac_frm)
BASE (vfmadd)
+BASE (vfmadd_frm)
BASE (vfnmsub)
BASE (vfnmacc)
BASE (vfnmacc_frm)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 28eec2c3e99..5850ff0cf2e 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -164,6 +164,7 @@ extern const function_base *const vfmacc_frm;
extern const function_base *const vfnmsac;
extern const function_base *const vfnmsac_frm;
extern const function_base *const vfmadd;
+extern const function_base *const vfmadd_frm;
extern const function_base *const vfnmsub;
extern const function_base *const vfnmacc;
extern const function_base *const vfnmacc_frm;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 9c964ae6fcb..c9a06e6b644 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -356,6 +356,8 @@ DEF_RVV_FUNCTION (vfmsac_frm, alu_frm, full_preds, 
f__ops)
DEF_RVV_FUNCTION (vfmsac_frm, alu_frm, full_preds, f_vvfv_ops)
DEF_RVV_FUNCTION (vfnmsac_frm, alu_frm, full_preds, f__ops)
DEF_RVV_FUNCTION (vfnmsac_frm, alu_frm, full_preds, f_vvfv_ops)
+DEF_RVV_FUNCTION (vfmadd_frm, alu_frm, full_preds, f__ops)
+DEF_RVV_FUNCTION (vfmadd_frm, alu_frm, full_preds, f_vvfv_ops)
// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
DEF_RVV_FUNCTION (vfwmacc, alu, full_preds, f_wwvv_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-madd.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-madd.c
new file mode 100644
index 000..00c9d002998
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-madd.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat32m1_t
+test_riscv_vfmadd_vv_f32m1_rm (vfloat32m1_t vd, vfloat32m1_t op1,
+vfloat32m1_t op2, size_t vl) {
+  return __riscv_vfmadd_vv_f32m1_rm (vd, op1, op2, 0, vl);
+}
+
+vfloat32m1_t
+test_vfmadd_vv_f32m1_rm_m (vbool32_t mask, vfloat32m1_t vd, vfloat32m1_t op1,
+vfloat32m1_t op2, size_t vl) {
+  return __riscv_vfmadd_vv_f32m1_rm_m (mask, 

[PATCH v1] RISC-V: Support RVV VFMADD rounding mode intrinsic API

2023-08-04 Thread Pan Li via Gcc-patches
From: Pan Li 

This patch would like to support the rounding mode API for the
VFMADD for the below samples.

* __riscv_vfmadd_vv_f32m1_rm
* __riscv_vfmadd_vv_f32m1_rm_m
* __riscv_vfmadd_vf_f32m1_rm
* __riscv_vfmadd_vf_f32m1_rm_m

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class vfmadd_frm): New class for vfmadd frm.
(vfmadd_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfmadd_frm): New function definition.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-single-madd.c: New test.
---
 .../riscv/riscv-vector-builtins-bases.cc  | 24 ++
 .../riscv/riscv-vector-builtins-bases.h   |  1 +
 .../riscv/riscv-vector-builtins-functions.def |  2 +
 .../riscv/rvv/base/float-point-single-madd.c  | 47 +++
 4 files changed, 74 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-madd.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 9c6ca8d1ddc..5b02b04aacb 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -445,6 +445,28 @@ public:
   }
 };
 
+/* Implements below instructions for frm
+   - vfmadd
+*/
+class vfmadd_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  bool has_merge_operand_p () const override { return false; }
+
+  rtx expand (function_expander ) const override
+  {
+if (e.op_info->op == OP_TYPE_vf)
+  return e.use_ternop_insn (
+   false, code_for_pred_mul_scalar (PLUS, e.vector_mode ()));
+if (e.op_info->op == OP_TYPE_vv)
+  return e.use_ternop_insn (
+   false, code_for_pred_mul (PLUS, e.vector_mode ()));
+gcc_unreachable ();
+  }
+};
+
 /* Implements vrsub.  */
 class vrsub : public function_base
 {
@@ -2209,6 +2231,7 @@ static CONSTEXPR const vfmacc_frm vfmacc_frm_obj;
 static CONSTEXPR const vfnmsac vfnmsac_obj;
 static CONSTEXPR const vfnmsac_frm vfnmsac_frm_obj;
 static CONSTEXPR const vfmadd vfmadd_obj;
+static CONSTEXPR const vfmadd_frm vfmadd_frm_obj;
 static CONSTEXPR const vfnmsub vfnmsub_obj;
 static CONSTEXPR const vfnmacc vfnmacc_obj;
 static CONSTEXPR const vfnmacc_frm vfnmacc_frm_obj;
@@ -2448,6 +2471,7 @@ BASE (vfmacc_frm)
 BASE (vfnmsac)
 BASE (vfnmsac_frm)
 BASE (vfmadd)
+BASE (vfmadd_frm)
 BASE (vfnmsub)
 BASE (vfnmacc)
 BASE (vfnmacc_frm)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h 
b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 28eec2c3e99..5850ff0cf2e 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -164,6 +164,7 @@ extern const function_base *const vfmacc_frm;
 extern const function_base *const vfnmsac;
 extern const function_base *const vfnmsac_frm;
 extern const function_base *const vfmadd;
+extern const function_base *const vfmadd_frm;
 extern const function_base *const vfnmsub;
 extern const function_base *const vfnmacc;
 extern const function_base *const vfnmacc_frm;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def 
b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 9c964ae6fcb..c9a06e6b644 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -356,6 +356,8 @@ DEF_RVV_FUNCTION (vfmsac_frm, alu_frm, full_preds, 
f__ops)
 DEF_RVV_FUNCTION (vfmsac_frm, alu_frm, full_preds, f_vvfv_ops)
 DEF_RVV_FUNCTION (vfnmsac_frm, alu_frm, full_preds, f__ops)
 DEF_RVV_FUNCTION (vfnmsac_frm, alu_frm, full_preds, f_vvfv_ops)
+DEF_RVV_FUNCTION (vfmadd_frm, alu_frm, full_preds, f__ops)
+DEF_RVV_FUNCTION (vfmadd_frm, alu_frm, full_preds, f_vvfv_ops)
 
 // 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
 DEF_RVV_FUNCTION (vfwmacc, alu, full_preds, f_wwvv_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-madd.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-madd.c
new file mode 100644
index 000..00c9d002998
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-madd.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat32m1_t
+test_riscv_vfmadd_vv_f32m1_rm (vfloat32m1_t vd, vfloat32m1_t op1,
+  vfloat32m1_t op2, size_t vl) {
+  return __riscv_vfmadd_vv_f32m1_rm (vd, op1, op2, 0, vl);
+}
+
+vfloat32m1_t
+test_vfmadd_vv_f32m1_rm_m (vbool32_t mask, vfloat32m1_t vd, vfloat32m1_t op1,
+  vfloat32m1_t op2, size_t vl) {
+  return __riscv_vfmadd_vv_f32m1_rm_m (mask, vd, op1, op2, 1, vl);
+}
+
+vfloat32m1_t

[C PATCH] _Generic should not warn in non-active branches [PR68193,PR97100]

2023-08-04 Thread Martin Uecker via Gcc-patches



Here is a patch to reduce false positives in _Generic.

Bootstrapped and regression tested on x86_64-linux.

Martin

c: _Generic should not warn in non-active branches [PR68193,PR97100]

To avoid false diagnostics, use c_inhibit_evaluation_warnings when
a generic association is known to match during parsing.  We may still
generate false positives if the default branch comes earlier than
a specific association that matches.

PR c/68193
PR c/97100

gcc/c/:
* c-parser.cc (c_parser_generic_selection): Inhibit evaluation
warnings for branches that are known not to be taken during parsing.

gcc/testsuite/ChangeLog:
* gcc.dg/pr68193.c: New test.
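
To illustrate the remaining false-positive case mentioned in the commit
message above (an editorial example with an invented function name, not
part of the patch or its testsuite): the default association is parsed
before the parser has seen the later, matching int association, so its
expression is still parsed with warnings enabled even though that branch is
never selected:

/* With -Wall on an LP64 target this may still warn about the overflowing
   conversion in the default branch, even though 'int' matches.  */
int remaining_false_positive (void)
{
  int i = 0;
  int j = _Generic (i,
                    default: (i = (long int) 9223372036854775808UL),
                    int: 0);
  return i + j;
}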


diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
index 24a6eb6e459..d1863b301e0 100644
--- a/gcc/c/c-parser.cc
+++ b/gcc/c/c-parser.cc
@@ -9350,7 +9350,17 @@ c_parser_generic_selection (c_parser *parser)
  return error_expr;
}
 
+  bool match = assoc.type == NULL_TREE
+  || comptypes (assoc.type, selector_type);
+
+  if (!match)
+   c_inhibit_evaluation_warnings++;
+
   assoc.expression = c_parser_expr_no_commas (parser, NULL);
+
+  if (!match)
+ c_inhibit_evaluation_warnings--;
+
   if (assoc.expression.value == error_mark_node)
{
  c_parser_skip_until_found (parser, CPP_CLOSE_PAREN, NULL);
@@ -9387,7 +9397,7 @@ c_parser_generic_selection (c_parser *parser)
  match_found = associations.length ();
}
}
-  else if (comptypes (assoc.type, selector_type))
+  else if (match)
{
  if (match_found < 0 || matched_assoc.type == NULL_TREE)
{
diff --git a/gcc/testsuite/gcc.dg/pr68193.c b/gcc/testsuite/gcc.dg/pr68193.c
new file mode 100644
index 000..2267593e363
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr68193.c
@@ -0,0 +1,15 @@
+/*  pr68193 */
+/* { dg-do compile } */
+/* { dg-options "-Wall" } */
+
+int
+main (void)
+{
+  int i = 0;
+  int j = _Generic (i,
+   int: 0,
+   long int: (i = (long int) 9223372036854775808UL));
+  return i + j;
+}
+
+




RE: [PATCH v1] RISC-V: Support RVV VFNMSAC rounding mode intrinsic API

2023-08-04 Thread Li, Pan2 via Gcc-patches
Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Friday, August 4, 2023 1:46 PM
To: Li, Pan2 ; gcc-patches 
Cc: Kito.cheng ; Li, Pan2 ; Wang, 
Yanzhang 
Subject: Re: [PATCH v1] RISC-V: Support RVV VFNMSAC rounding mode intrinsic API

ok


juzhe.zh...@rivai.ai

From: pan2.li
Date: 2023-08-04 11:28
To: gcc-patches
CC: juzhe.zhong; 
kito.cheng; pan2.li; 
yanzhang.wang
Subject: [PATCH v1] RISC-V: Support RVV VFNMSAC rounding mode intrinsic API
From: Pan Li mailto:pan2...@intel.com>>

This patch would like to support the rounding mode API for the
VFNMSAC for the below samples.

* __riscv_vfnmsac_vv_f32m1_rm
* __riscv_vfnmsac_vv_f32m1_rm_m
* __riscv_vfnmsac_vf_f32m1_rm
* __riscv_vfnmsac_vf_f32m1_rm_m

Signed-off-by: Pan Li mailto:pan2...@intel.com>>

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class vfnmsac_frm): New class for vfnmsac frm.
(vfnmsac_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfnmsac_frm): New function definition.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-single-negate-multiply-sub.c:
New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 24 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 +
.../float-point-single-negate-multiply-sub.c  | 47 +++
4 files changed, 74 insertions(+)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-negate-multiply-sub.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index e73051bbd89..9c6ca8d1ddc 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -423,6 +423,28 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfnmsac
+*/
+class vfnmsac_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  bool has_merge_operand_p () const override { return false; }
+
+  rtx expand (function_expander &e) const override
+  {
+if (e.op_info->op == OP_TYPE_vf)
+  return e.use_ternop_insn (
+ true, code_for_pred_mul_neg_scalar (PLUS, e.vector_mode ()));
+if (e.op_info->op == OP_TYPE_vv)
+  return e.use_ternop_insn (
+ true, code_for_pred_mul_neg (PLUS, e.vector_mode ()));
+gcc_unreachable ();
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2185,6 +2207,7 @@ static CONSTEXPR const widen_binop_frm vfwmul_frm_obj;
static CONSTEXPR const vfmacc vfmacc_obj;
static CONSTEXPR const vfmacc_frm vfmacc_frm_obj;
static CONSTEXPR const vfnmsac vfnmsac_obj;
+static CONSTEXPR const vfnmsac_frm vfnmsac_frm_obj;
static CONSTEXPR const vfmadd vfmadd_obj;
static CONSTEXPR const vfnmsub vfnmsub_obj;
static CONSTEXPR const vfnmacc vfnmacc_obj;
@@ -2423,6 +2446,7 @@ BASE (vfwmul_frm)
BASE (vfmacc)
BASE (vfmacc_frm)
BASE (vfnmsac)
+BASE (vfnmsac_frm)
BASE (vfmadd)
BASE (vfnmsub)
BASE (vfnmacc)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h b/gcc/config/riscv/riscv-vector-builtins-bases.h
index ca8a6dc1cc3..28eec2c3e99 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -162,6 +162,7 @@ extern const function_base *const vfwmul_frm;
extern const function_base *const vfmacc;
extern const function_base *const vfmacc_frm;
extern const function_base *const vfnmsac;
+extern const function_base *const vfnmsac_frm;
extern const function_base *const vfmadd;
extern const function_base *const vfnmsub;
extern const function_base *const vfnmacc;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 8bae7e616ba..9c964ae6fcb 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -354,6 +354,8 @@ DEF_RVV_FUNCTION (vfnmacc_frm, alu_frm, full_preds, f__ops)
DEF_RVV_FUNCTION (vfnmacc_frm, alu_frm, full_preds, f_vvfv_ops)
DEF_RVV_FUNCTION (vfmsac_frm, alu_frm, full_preds, f__ops)
DEF_RVV_FUNCTION (vfmsac_frm, alu_frm, full_preds, f_vvfv_ops)
+DEF_RVV_FUNCTION (vfnmsac_frm, alu_frm, full_preds, f__ops)
+DEF_RVV_FUNCTION (vfnmsac_frm, alu_frm, full_preds, f_vvfv_ops)
// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
DEF_RVV_FUNCTION (vfwmacc, alu, full_preds, f_wwvv_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-negate-multiply-sub.c b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-negate-multiply-sub.c
new file mode 100644
index 000..c3089234272
--- /dev/null

RE: [PATCH v1] RISC-V: Support RVV VFMSAC rounding mode intrinsic API

2023-08-04 Thread Li, Pan2 via Gcc-patches
Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Friday, August 4, 2023 1:46 PM
To: Li, Pan2 ; gcc-patches 
Cc: Kito.cheng ; Li, Pan2 ; Wang, Yanzhang 
Subject: Re: [PATCH v1] RISC-V: Support RVV VFMSAC rounding mode intrinsic API

ok


juzhe.zh...@rivai.ai

From: pan2.li
Date: 2023-08-04 10:58
To: gcc-patches
CC: juzhe.zhong; kito.cheng; pan2.li; yanzhang.wang
Subject: [PATCH v1] RISC-V: Support RVV VFMSAC rounding mode intrinsic API
From: Pan Li <pan2...@intel.com>

This patch would like to support the rounding mode API for the
VFMSAC for the below samples.

* __riscv_vfmsac_vv_f32m1_rm
* __riscv_vfmsac_vv_f32m1_rm_m
* __riscv_vfmsac_vf_f32m1_rm
* __riscv_vfmsac_vf_f32m1_rm_m
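
As with the vfnmsac patch above, a hedged usage sketch (not from the
patch; __RISCV_FRM_RNE and the placement of the rounding-mode argument
just before vl are assumptions based on the rest of this series):

#include <riscv_vector.h>

/* vd[i] = (vs1[i] * vs2[i]) - vd[i], with an explicit rounding mode
   instead of the one currently held in the frm register.  */
vfloat32m1_t
test_vfmsac_rm (vfloat32m1_t vd, vfloat32m1_t vs1, vfloat32m1_t vs2,
                size_t vl)
{
  return __riscv_vfmsac_vv_f32m1_rm (vd, vs1, vs2, __RISCV_FRM_RNE, vl);
}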

Signed-off-by: Pan Li <pan2...@intel.com>

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc
(class vfmsac_frm): New class for vfmsac frm.
(vfmsac_frm_obj): New declaration.
(BASE): Ditto.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def
(vfmsac_frm): New function definition.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-single-multiply-sub.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc  | 24 ++
.../riscv/riscv-vector-builtins-bases.h   |  1 +
.../riscv/riscv-vector-builtins-functions.def |  2 +
.../base/float-point-single-multiply-sub.c| 47 +++
4 files changed, 74 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-multiply-sub.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 1d4a5a18bf9..e73051bbd89 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -401,6 +401,28 @@ public:
   }
};
+/* Implements below instructions for frm
+   - vfmsac
+*/
+class vfmsac_frm : public function_base
+{
+public:
+  bool has_rounding_mode_operand_p () const override { return true; }
+
+  bool has_merge_operand_p () const override { return false; }
+
+  rtx expand (function_expander &e) const override
+  {
+if (e.op_info->op == OP_TYPE_vf)
+  return e.use_ternop_insn (
+ true, code_for_pred_mul_scalar (MINUS, e.vector_mode ()));
+if (e.op_info->op == OP_TYPE_vv)
+  return e.use_ternop_insn (
+ true, code_for_pred_mul (MINUS, e.vector_mode ()));
+gcc_unreachable ();
+  }
+};
+
/* Implements vrsub.  */
class vrsub : public function_base
{
@@ -2168,6 +2190,7 @@ static CONSTEXPR const vfnmsub vfnmsub_obj;
static CONSTEXPR const vfnmacc vfnmacc_obj;
static CONSTEXPR const vfnmacc_frm vfnmacc_frm_obj;
static CONSTEXPR const vfmsac vfmsac_obj;
+static CONSTEXPR const vfmsac_frm vfmsac_frm_obj;
static CONSTEXPR const vfnmadd vfnmadd_obj;
static CONSTEXPR const vfmsub vfmsub_obj;
static CONSTEXPR const vfwmacc vfwmacc_obj;
@@ -2405,6 +2428,7 @@ BASE (vfnmsub)
BASE (vfnmacc)
BASE (vfnmacc_frm)
BASE (vfmsac)
+BASE (vfmsac_frm)
BASE (vfnmadd)
BASE (vfmsub)
BASE (vfwmacc)
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.h b/gcc/config/riscv/riscv-vector-builtins-bases.h
index 247074d0868..ca8a6dc1cc3 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.h
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.h
@@ -167,6 +167,7 @@ extern const function_base *const vfnmsub;
extern const function_base *const vfnmacc;
extern const function_base *const vfnmacc_frm;
extern const function_base *const vfmsac;
+extern const function_base *const vfmsac_frm;
extern const function_base *const vfnmadd;
extern const function_base *const vfmsub;
extern const function_base *const vfwmacc;
diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def b/gcc/config/riscv/riscv-vector-builtins-functions.def
index 223e8346cd8..8bae7e616ba 100644
--- a/gcc/config/riscv/riscv-vector-builtins-functions.def
+++ b/gcc/config/riscv/riscv-vector-builtins-functions.def
@@ -352,6 +352,8 @@ DEF_RVV_FUNCTION (vfmacc_frm, alu_frm, full_preds, f__ops)
DEF_RVV_FUNCTION (vfmacc_frm, alu_frm, full_preds, f_vvfv_ops)
DEF_RVV_FUNCTION (vfnmacc_frm, alu_frm, full_preds, f__ops)
DEF_RVV_FUNCTION (vfnmacc_frm, alu_frm, full_preds, f_vvfv_ops)
+DEF_RVV_FUNCTION (vfmsac_frm, alu_frm, full_preds, f__ops)
+DEF_RVV_FUNCTION (vfmsac_frm, alu_frm, full_preds, f_vvfv_ops)
// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
DEF_RVV_FUNCTION (vfwmacc, alu, full_preds, f_wwvv_ops)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-multiply-sub.c b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-single-multiply-sub.c
new file mode 100644
index 000..8fee552dd30
--- /dev/null
+++