[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Cleanup some temporary files [NFC]

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:586e678cd18c8d7a72e5f785094d911a098092ff

commit 586e678cd18c8d7a72e5f785094d911a098092ff
Author: Pan Li 
Date:   Fri May 17 07:45:19 2024 +0800

RISC-V: Cleanup some temporary files [NFC]

Noticed some temporary files under gcc/config/riscv;
deleted them as useless.

* Empty file j.
* Vim swap file.

gcc/ChangeLog:

* config/riscv/.riscv.cc.swo: Removed.
* config/riscv/j: Removed.

Signed-off-by: Pan Li 
(cherry picked from commit d477d683d5c6db90c80d348c795709ae6444ba7a)

Diff:
---
 gcc/config/riscv/.riscv.cc.swo | Bin 417792 -> 0 bytes
 gcc/config/riscv/j |   0
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/gcc/config/riscv/.riscv.cc.swo b/gcc/config/riscv/.riscv.cc.swo
deleted file mode 100644
index 77ed37353bee..
Binary files a/gcc/config/riscv/.riscv.cc.swo and /dev/null differ
diff --git a/gcc/config/riscv/j b/gcc/config/riscv/j
deleted file mode 100644
index e69de29bb2d1..


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Enable vectorizable early exit testsuite

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:c1ad575242ff3dee66f2775412b1c65efbc2269b

commit c1ad575242ff3dee66f2775412b1c65efbc2269b
Author: Pan Li 
Date:   Thu May 16 10:04:10 2024 +0800

RISC-V: Enable vectorizable early exit testsuite

After we supported vectorizable early exit in RISC-V, we would like to
enable the gcc vect tests for vectorizable early exit.

The vect-early-break_124-pr114403.c test fails to vectorize for now,
because the 8-byte __builtin_memcpy is not folded into an int64
assignment during ccp1.  We will improve that first and mark
this test as xfail for RISC-V.
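
A minimal sketch (an assumed illustration, not the testcase itself) of the
shape that should fold during ccp1:

/* An 8-byte __builtin_memcpy that ideally folds into a single int64
   assignment; read_u64 and p are illustrative names.  */
unsigned long long
read_u64 (const char *p)
{
  unsigned long long v;
  __builtin_memcpy (&v, p, sizeof (v));
  return v;
}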

The below tests passed for this patch:
1. The riscv full regression tests.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/slp-mask-store-1.c: Add pragma novector as it will
report LOOP VECTORIZED twice on RISC-V.
* gcc.dg/vect/vect-early-break_124-pr114403.c: Xfail for the
riscv backend.
* lib/target-supports.exp: Add RISC-V backend.

Signed-off-by: Pan Li 
(cherry picked from commit 556e777298dac8574533935000c57335c5232921)

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c  | 2 ++
 gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c | 2 +-
 gcc/testsuite/lib/target-supports.exp | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c 
b/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c
index fdd9032da98a..2f80bf89e5e6 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c
@@ -28,6 +28,8 @@ main ()
 
   if (__builtin_memcmp (x, res, sizeof (x)) != 0)
 abort ();
+
+#pragma GCC novector
   for (int i = 0; i < 32; ++i)
 if (flag[i] != 0 && flag[i] != 1)
   abort ();
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
index 51abf245ccb5..101ae1e0eaa1 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
@@ -2,7 +2,7 @@
 /* { dg-require-effective-target vect_early_break_hw } */
 /* { dg-require-effective-target vect_long_long } */
 
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { xfail riscv*-*-* } } } */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 3a55b2a4159c..6c828b73ded3 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4105,6 +4105,7 @@ proc check_effective_target_vect_early_break { } {
|| [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
|| [istarget amdgcn-*-*]
+   || [check_effective_target_riscv_v]
}}]
 }
 
@@ -4120,6 +4121,7 @@ proc check_effective_target_vect_early_break_hw { } {
|| [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
|| [istarget amdgcn-*-*]
+   || [check_effective_target_riscv_v_ok]
}}]
 }


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Implement vectorizable early exit with vcond_mask_len

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:b1aab03aed7f3d8c9b104b5f596e7e9853b8d5e6

commit b1aab03aed7f3d8c9b104b5f596e7e9853b8d5e6
Author: Pan Li 
Date:   Thu May 16 10:02:40 2024 +0800

RISC-V: Implement vectorizable early exit with vcond_mask_len

After we support loop lens for vectorizable early exit, we would like to
implement the feature for the RISC-V target.  Given the below example:

unsigned vect_a[1923];
unsigned vect_b[1923];
unsigned ret;

void test (unsigned limit, int n)
{
  for (int i = 0; i < n; i++)
    {
      vect_b[i] = limit + i;

      if (vect_a[i] > limit)
        {
          ret = vect_b[i];
          return;
        }

      vect_a[i] = limit;
    }
}

Before this patch:
  ...
.L8:
  sw    a3,0(a5)
  addiw a0,a0,1
  addi  a4,a4,4
  addi  a5,a5,4
  beq   a1,a0,.L2
.L4:
  sw    a0,0(a4)
  lw    a2,0(a5)
  bleu  a2,a3,.L8
  ret

After this patch:
  ...
.L5:
  vsetvli   a5,a3,e8,mf4,ta,ma
  vmv1r.v   v4,v2
  vsetvli   t4,zero,e32,m1,ta,ma
  vmv.v.x   v1,a5
  vadd.vv   v2,v2,v1
  vsetvli   zero,a5,e32,m1,ta,ma
  vadd.vv   v5,v4,v3
  slli  a6,a5,2
  vle32.v   v1,0(t1)
  vmsltu.vv v1,v3,v1
  vcpop.m   t4,v1
  beq   t4,zero,.L4
  vmv.x.s   a4,v4
.L3:
  ...

The below tests passed for this patch:
1. The riscv full regression tests.

gcc/ChangeLog:

* config/riscv/autovec-opt.md (*vcond_mask_len_popcount_<mode>):
New pattern of vcond_mask_len_popcount for vector bool mode.
* config/riscv/autovec.md (vcond_mask_len_<mode>): New pattern of
vcond_mask_len for vector bool mode.
(cbranch<mode>4): New pattern for vector bool mode.
* config/riscv/vector-iterators.md: Add new unspec
UNSPEC_SELECT_MASK.
* config/riscv/vector.md (@pred_popcount): Add VLS mode
to popcount pattern.
(@pred_popcount): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/early-break-1.c: New test.
* gcc.target/riscv/rvv/autovec/early-break-2.c: New test.

Signed-off-by: Pan Li 
(cherry picked from commit 6c1de786e53a11150feb16ba990d0d6c6fd910db)

Diff:
---
 gcc/config/riscv/autovec-opt.md| 33 
 gcc/config/riscv/autovec.md| 61 ++
 gcc/config/riscv/vector-iterators.md   |  1 +
 gcc/config/riscv/vector.md | 18 +++
 .../gcc.target/riscv/rvv/autovec/early-break-1.c   | 34 
 .../gcc.target/riscv/rvv/autovec/early-break-2.c   | 37 +
 6 files changed, 175 insertions(+), 9 deletions(-)

diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index 645dc53d8680..04f85d8e4553 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1436,3 +1436,36 @@
 DONE;
   }
   [(set_attr "type" "vmalu")])
+
+;; Optimization pattern for early break auto-vectorization
+;; vcond_mask_len (mask, ones, zeros, len, bias) + vlmax popcount
+;; -> non vlmax popcount (mask, len)
+(define_insn_and_split "*vcond_mask_len_popcount_"
+  [(set (match_operand:P 0 "register_operand")
+(popcount:P
+ (unspec:VB_VLS [
+  (unspec:VB_VLS [
+   (match_operand:VB_VLS 1 "register_operand")
+   (match_operand:VB_VLS 2 "const_1_operand")
+   (match_operand:VB_VLS 3 "const_0_operand")
+   (match_operand 4 "autovec_length_operand")
+   (match_operand 5 "const_0_operand")] UNSPEC_SELECT_MASK)
+  (match_operand 6 "autovec_length_operand")
+  (const_int 1)
+  (reg:SI VL_REGNUM)
+  (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)))]
+  "TARGET_VECTOR
+   && can_create_pseudo_p ()
+   && riscv_vector::get_vector_mode (Pmode, GET_MODE_NUNITS 
(mode)).exists ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+riscv_vector::emit_nonvlmax_insn (
+   code_for_pred_popcount (<MODE>mode, Pmode),
+   riscv_vector::CPOP_OP,
+   operands, operands[4]);
+DONE;
+  }
+  [(set_attr "type" "vector")]
+)
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index aa1ae0fe075b..1ee3c8052fb4 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2612,3 +2612,64 @@
 DONE;
   }
 )
+
+;; =========================================================================
+;; == Early break auto-vectorization patterns
+;; =========================================================================
+
+;; vcond_mask_len (mask, 1s, 0s, len, bias)
+;; => mask[i] = mask[i] && i < len ? 1 : 0
+(define_insn_and_split "vcond_mask_len_"
+  [(set (match_operand:VB 0 "register_operand")
+(unspec: VB [
+ (match_operand:VB 1 "register_operand")
+ (match_operand:VB 2 "const_1_operand")
+ (match_operand:VB 3 "const_0_operand")
+ (match_operand 4 

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] Vect: Support loop len in vectorizable early exit

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:4ec3a6b6022c1853cfd5866dea0324a4002413b2

commit 4ec3a6b6022c1853cfd5866dea0324a4002413b2
Author: Pan Li 
Date:   Thu May 16 09:58:13 2024 +0800

Vect: Support loop len in vectorizable early exit

This patch adds early break auto-vectorization support for target which
use length on partial vectorization.  Consider this following example:

unsigned vect_a[802];
unsigned vect_b[802];

void test (unsigned x, int n)
{
  for (int i = 0; i < n; i++)
  {
vect_b[i] = x + i;

if (vect_a[i] > x)
  break;

vect_a[i] = x;
  }
}

We use VCOND_MASK_LEN to generate the (mask && i < len + bias) predicate,
and then the RVV IR looks like below:

  ...
  _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
  _55 = (int) _87;
  ...
  mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
  vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
{0, ... }, _87, 0);
  if (vec_len_mask_72 != { 0, ... })
goto ; [5.50%]
  else
goto ; [94.50%]
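
A scalar sketch of the .VCOND_MASK_LEN semantics used above (all names are
illustrative, not from the patch):

/* Element i selects ones[i] when mask[i] is set and i < len + bias,
   otherwise zeros[i].  */
void vcond_mask_len_ref (int vf, int *out, const int *mask,
                         const int *ones, const int *zeros,
                         int len, int bias)
{
  for (int i = 0; i < vf; i++)
    out[i] = (mask[i] && i < len + bias) ? ones[i] : zeros[i];
}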

The below tests passed for this patch:
1. The riscv full regression tests.
2. The x86 bootstrap tests.
3. The x86 full regression tests.

gcc/ChangeLog:

* tree-vect-loop.cc (vect_gen_loop_len_mask): New func to gen
the loop len mask.
* tree-vect-stmts.cc (vectorizable_early_exit): Invoke the
vect_gen_loop_len_mask for 1 or more stmt(s).
* tree-vectorizer.h (vect_gen_loop_len_mask): New func decl
for vect_gen_loop_len_mask.

Signed-off-by: Pan Li 
(cherry picked from commit 57f8a2f67c1536be23231808ab00613ab69193ed)

Diff:
---
 gcc/tree-vect-loop.cc  | 27 +++
 gcc/tree-vect-stmts.cc | 17 +++--
 gcc/tree-vectorizer.h  |  4 
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 29c03c246d45..6ff3ca09dc6a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -11394,6 +11394,33 @@ vect_get_loop_len (loop_vec_info loop_vinfo, 
gimple_stmt_iterator *gsi,
   return loop_len;
 }
 
+/* Generate the tree for the loop len mask and return it.  Given the lens,
+   nvectors, vectype, index and factor to gen the len mask as below.
+
+   tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
+*/
+tree
+vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+   gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
+   unsigned int nvectors, tree vectype, tree stmt,
+   unsigned int index, unsigned int factor)
+{
+  tree all_one_mask = build_all_ones_cst (vectype);
+  tree all_zero_mask = build_zero_cst (vectype);
+  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, 
index,
+   factor);
+  tree bias = build_int_cst (intQI_type_node,
+LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
+  tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
+  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
+   all_one_mask, all_zero_mask, len,
+   bias);
+  gimple_call_set_lhs (call, len_mask);
+  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
+
+  return len_mask;
+}
+
 /* Scale profiling counters by estimation for LOOP which is vectorized
by factor VF.
If FLAT is true, the loop we started with had unrealistically flat
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index f8d8636b139a..d592dff73e33 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12893,7 +12893,9 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info 
stmt_info,
 ncopies = vect_get_num_copies (loop_vinfo, vectype);
 
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* Now build the new conditional.  Pattern gimple_conds get dropped during
  codegen so we must replace the original insn.  */
@@ -12957,12 +12959,11 @@ vectorizable_early_exit (vec_info *vinfo, 
stmt_vec_info stmt_info,
{
  if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
  OPTIMIZE_FOR_SPEED))
-   return false;
+   vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
  else
vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
}
 
-
   return true;
 }
 
@@ -13015,6 +13016,15 @@ vectorizable_early_exit (vec_info *vinfo, 
stmt_vec_info stmt_info,
  

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:51b69c80a76ba767ed166e93a569a84dae445b23

commit 51b69c80a76ba767ed166e93a569a84dae445b23
Author: Pan Li 
Date:   Wed May 15 10:14:05 2024 +0800

Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

This patch would like to add the middle-end representation for the
saturation add, i.e. set the result of the add to the max value on
overflow.  It takes a pattern similar to the below.

SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))

Taking uint8_t as an example, we will have:

* SAT_ADD (1, 254)   => 255.
* SAT_ADD (1, 255)   => 255.
* SAT_ADD (2, 255)   => 255.
* SAT_ADD (255, 255) => 255.
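
Working through SAT_ADD (2, 255) with the formula above: (x + y) wraps to 1,
(TYPE)(x + y) < x is true, so -(TYPE)1 is 0xff, and 1 | 0xff yields 255.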

Given below example for the unsigned scalar integer uint64_t:

uint64_t sat_add_u64 (uint64_t x, uint64_t y)
{
  return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
}

Before this patch:
uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
{
  long unsigned int _1;
  _Bool _2;
  long unsigned int _3;
  long unsigned int _4;
  uint64_t _7;
  long unsigned int _10;
  __complex__ long unsigned int _11;

;;   basic block 2, loop depth 0
;;pred:   ENTRY
  _11 = .ADD_OVERFLOW (x_5(D), y_6(D));
  _1 = REALPART_EXPR <_11>;
  _10 = IMAGPART_EXPR <_11>;
  _2 = _10 != 0;
  _3 = (long unsigned int) _2;
  _4 = -_3;
  _7 = _1 | _4;
  return _7;
;;succ:   EXIT

}

After this patch:
uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
{
  uint64_t _7;

;;   basic block 2, loop depth 0
;;pred:   ENTRY
  _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call]
  return _7;
;;succ:   EXIT
}

The below tests passed for this patch:
1. The riscv full regression tests.
2. The x86 bootstrap tests.
3. The x86 full regression tests.

PR target/51492
PR target/112600

gcc/ChangeLog:

* internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD
to the return true switch case(s).
* internal-fn.def (SAT_ADD):  Add new signed optab SAT_ADD.
* match.pd: Add unsigned SAT_ADD match(es).
* optabs.def (OPTAB_NL): Remove fixed-point limitation for
us/ssadd.
* tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New
extern func decl generated in match.pd match.
(match_saturation_arith): New func impl to match the saturation 
arith.
(math_opts_dom_walker::after_dom_children): Try match saturation
arith when IOR expr.

Signed-off-by: Pan Li 
(cherry picked from commit 52b0536710ff3f3ace72ab00ce9ef6c630cd1183)

Diff:
---
 gcc/internal-fn.cc|  1 +
 gcc/internal-fn.def   |  2 ++
 gcc/match.pd  | 51 +++
 gcc/optabs.def|  4 ++--
 gcc/tree-ssa-math-opts.cc | 32 +
 5 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 0a7053c2286c..73045ca8c8c1 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn)
 case IFN_UBSAN_CHECK_MUL:
 case IFN_ADD_OVERFLOW:
 case IFN_MUL_OVERFLOW:
+case IFN_SAT_ADD:
 case IFN_VEC_WIDEN_PLUS:
 case IFN_VEC_WIDEN_PLUS_LO:
 case IFN_VEC_WIDEN_PLUS_HI:
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 848bb9dbff3f..25badbb86e56 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | 
ECF_NOTHROW, first,
 DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
  smulhrs, umulhrs, binary)
 
+DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, binary)
+
 DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
 DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
 DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index d401e7503e62..aa1e2875c604 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3043,6 +3043,57 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
|| POINTER_TYPE_P (itype))
   && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype))
 
+/* Unsigned Saturation Add */
+(match (usadd_left_part_1 @0 @1)
+ (plus:c @0 @1)
+ (if (INTEGRAL_TYPE_P (type)
+  && TYPE_UNSIGNED (TREE_TYPE (@0))
+  && types_match (type, TREE_TYPE (@0))
+  && types_match (type, TREE_TYPE (@1)
+
+(match (usadd_left_part_2 @0 @1)
+ (realpart (IFN_ADD_OVERFLOW:c @0 @1))
+ (if (INTEGRAL_TYPE_P (type)
+  && TYPE_UNSIGNED (TREE_TYPE (@0))
+  && types_match (type, TREE_TYPE (@0))
+  && types_match (type, TREE_TYPE (@1)
+
+(match (usadd_right_part_1 @0 @1)
+ (negate (convert (lt (plus:c @0 @1) @0)))
+ (if (INTEGRAL_TYPE_P 

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] Vect: Support new IFN SAT_ADD for unsigned vector int

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:674362d73e964815cdb700edd9fedbfc34c24c21

commit 674362d73e964815cdb700edd9fedbfc34c24c21
Author: Pan Li 
Date:   Wed May 15 10:14:06 2024 +0800

Vect: Support new IFN SAT_ADD for unsigned vector int

For vectorization, we leverage the existing vect pattern recog to find
the pattern similar to the scalar one, and let the vectorizer perform
the rest for the standard name usadd<mode>3 in vector mode.
The riscv vector backend has the insn "Vector Single-Width Saturating
Add and Subtract", which can be leveraged when expanding usadd<mode>3
in vector mode.  For example:

void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
{
  unsigned i;

  for (i = 0; i < n; i++)
out[i] = (x[i] + y[i]) | (- (uint64_t)((uint64_t)(x[i] + y[i]) < x[i]));
}

Before this patch:
void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
{
  ...
  _80 = .SELECT_VL (ivtmp_78, POLY_INT_CST [2, 2]);
  ivtmp_58 = _80 * 8;
  vect__4.7_61 = .MASK_LEN_LOAD (vectp_x.5_59, 64B, { -1, ... }, _80, 0);
  vect__6.10_65 = .MASK_LEN_LOAD (vectp_y.8_63, 64B, { -1, ... }, _80, 0);
  vect__7.11_66 = vect__4.7_61 + vect__6.10_65;
  mask__8.12_67 = vect__4.7_61 > vect__7.11_66;
  vect__12.15_72 = .VCOND_MASK (mask__8.12_67, { 18446744073709551615,
... }, vect__7.11_66);
  .MASK_LEN_STORE (vectp_out.16_74, 64B, { -1, ... }, _80, 0, 
vect__12.15_72);
  vectp_x.5_60 = vectp_x.5_59 + ivtmp_58;
  vectp_y.8_64 = vectp_y.8_63 + ivtmp_58;
  vectp_out.16_75 = vectp_out.16_74 + ivtmp_58;
  ivtmp_79 = ivtmp_78 - _80;
  ...
}

After this patch:
void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
{
  ...
  _62 = .SELECT_VL (ivtmp_60, POLY_INT_CST [2, 2]);
  ivtmp_46 = _62 * 8;
  vect__4.7_49 = .MASK_LEN_LOAD (vectp_x.5_47, 64B, { -1, ... }, _62, 0);
  vect__6.10_53 = .MASK_LEN_LOAD (vectp_y.8_51, 64B, { -1, ... }, _62, 0);
  vect__12.11_54 = .SAT_ADD (vect__4.7_49, vect__6.10_53);
  .MASK_LEN_STORE (vectp_out.12_56, 64B, { -1, ... }, _62, 0, 
vect__12.11_54);
  ...
}

The below test suites passed for this patch.
* The riscv full regression tests.
* The x86 bootstrap tests.
* The x86 full regression tests.

PR target/51492
PR target/112600

gcc/ChangeLog:

* tree-vect-patterns.cc (gimple_unsigned_integer_sat_add): New
func decl generated by match.pd match.
(vect_recog_sat_add_pattern): New func impl to recog the pattern
for unsigned SAT_ADD.

Signed-off-by: Pan Li 
(cherry picked from commit d4dee347b3fe1982bab26485ff31cd039c9df010)

Diff:
---
 gcc/tree-vect-patterns.cc | 52 +++
 1 file changed, 52 insertions(+)

diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 87c2acff386d..6fd2373644f4 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -4487,6 +4487,57 @@ vect_recog_mult_pattern (vec_info *vinfo,
   return pattern_stmt;
 }
 
+extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
+
+/*
+ * Try to detect saturation add pattern (SAT_ADD), aka below gimple:
+ *   _7 = _4 + _6;
+ *   _8 = _4 > _7;
+ *   _9 = (long unsigned int) _8;
+ *   _10 = -_9;
+ *   _12 = _7 | _10;
+ *
+ * And then simplified to
+ *   _12 = .SAT_ADD (_4, _6);
+ */
+
+static gimple *
+vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
+   tree *type_out)
+{
+  gimple *last_stmt = STMT_VINFO_STMT (stmt_vinfo);
+
+  if (!is_gimple_assign (last_stmt))
+return NULL;
+
+  tree res_ops[2];
+  tree lhs = gimple_assign_lhs (last_stmt);
+
+  if (gimple_unsigned_integer_sat_add (lhs, res_ops, NULL))
+{
+  tree itype = TREE_TYPE (res_ops[0]);
+  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
+
+  if (vtype != NULL_TREE
+   && direct_internal_fn_supported_p (IFN_SAT_ADD, vtype,
+  OPTIMIZE_FOR_BOTH))
+   {
+ *type_out = vtype;
+ gcall *call = gimple_build_call_internal (IFN_SAT_ADD, 2, res_ops[0],
+   res_ops[1]);
+
+ gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
+ gimple_call_set_nothrow (call, /* nothrow_p */ false);
+ gimple_set_location (call, gimple_location (last_stmt));
+
+ vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
+ return call;
+   }
+}
+
+  return NULL;
+}
+
 /* Detect a signed division by a constant that wouldn't be
otherwise vectorized:
 
@@ -6987,6 +7038,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
   { vect_recog_divmod_pattern, "divmod" },
   { 

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: testsuite: Drop march-string in cmpmemsi/cpymemsi tests

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:faf2f9ed73969d838026027566473bde14db748b

commit faf2f9ed73969d838026027566473bde14db748b
Author: Christoph Müllner 
Date:   Thu May 16 09:53:47 2024 +0200

RISC-V: testsuite: Drop march-string in cmpmemsi/cpymemsi tests

The tests cmpmemsi-1.c and cpymemsi-1.c are execution ("dg-do run")
tests, which do not have any restrictions on the enabled extensions.
Further, none of the other listed options are required.
Let's drop the options, so that the tests can also be executed on
non-f and non-d targets.  However, we need to set the options to the
defaults without '-ansi', because the included test file uses the
'asm' keyword, which is not part of ANSI C.
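
A minimal illustration (assumed, not taken from the included test) of why
'-ansi' has to be avoided:

void clobber (void *p)
{
  /* Plain 'asm' is a GNU keyword; under -ansi only __asm__ is accepted.  */
  asm volatile ("" : : "r" (p) : "memory");
}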

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmpmemsi-1.c: Drop options.
* gcc.target/riscv/cpymemsi-1.c: Likewise.

Signed-off-by: Christoph Müllner 
(cherry picked from commit b8b82bb05c10544da05cd0d3d39e6bc3763a8d9f)

Diff:
---
 gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c | 3 +--
 gcc/testsuite/gcc.target/riscv/cpymemsi-1.c | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c 
b/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
index d7e0bc474073..698f27d89fbf 100644
--- a/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
@@ -1,6 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-march=rv32gc_zbb -save-temps -g0 -fno-lto" { target { rv32 } 
} } */
-/* { dg-options "-march=rv64gc_zbb -save-temps -g0 -fno-lto" { target { rv64 } 
} } */
+/* { dg-options "-pedantic-errors" } */
 /* { dg-timeout-factor 2 } */
 
 #include "../../gcc.dg/memcmp-1.c"
diff --git a/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c 
b/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
index 983b564ccaf7..30e9f119bedc 100644
--- a/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
@@ -1,7 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-march=rv32gc -save-temps -g0 -fno-lto" { target { rv32 } } } 
*/
-/* { dg-options "-march=rv64gc -save-temps -g0 -fno-lto" { target { rv64 } } } 
*/
-/* { dg-additional-options "-DRUN_FRACTION=11" { target simulator } } */
+/* { dg-options "-pedantic-errors" } */
 /* { dg-timeout-factor 2 } */
 
 #include "../../gcc.dg/memcmp-1.c"


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Add Zvfbfwma extension to the -march= option

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:67195fbc4deac8659d8f65ab922416ac451ae5bb

commit 67195fbc4deac8659d8f65ab922416ac451ae5bb
Author: Xiao Zeng 
Date:   Wed May 15 10:03:40 2024 +0800

RISC-V: Add Zvfbfwma extension to the -march= option

This patch would like to add a new sub-extension (aka Zvfbfwma) to the
-march= option.  It introduces a new data type BF16.

1 In spec: "Zvfbfwma requires the Zvfbfmin extension and the Zfbfmin 
extension."
  1.1 In EmbeddedProcessor: Zvfbfwma -> Zvfbfmin -> Zve32f
  1.2 In Application Processor: Zvfbfwma -> Zvfbfmin -> V
  1.3 In both scenarios, Zvfbfwma -> Zfbfmin

2 Zvfbfmin's information is in:



3 Zfbfmin's information is in:



4 Depending on the usage scenario, the Zvfbfwma extension may
depend on 'V' or 'Zve32f'.  This patch only implements the dependency in
the Embedded Processor scenario, consistent with the processing
strategy for Zvfbfmin.  In the Application Processor scenario, it is
necessary to explicitly indicate the dependent 'V' extension.

5 You can locate more information about Zvfbfwma from below spec doc:


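For illustration (assumed command lines, not part of the patch), the two
scenarios look like:

riscv64-unknown-elf-gcc -march=rv32imac_zvfbfwma ...
  (Embedded Processor: Zvfbfmin, Zve32f and Zfbfmin are implied)
riscv64-unknown-elf-gcc -march=rv64gcv_zvfbfwma ...
  (Application Processor: 'V' is spelled out explicitly)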

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc:
(riscv_implied_info): Add zvfbfwma item.
(riscv_ext_version_table): Ditto.
(riscv_ext_flag_table): Ditto.
* config/riscv/riscv.opt:
(MASK_ZVFBFWMA): New macro.
(TARGET_ZVFBFWMA): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/arch-37.c: New test.
* gcc.target/riscv/arch-38.c: New test.
* gcc.target/riscv/predef-36.c: New test.
* gcc.target/riscv/predef-37.c: New test.

(cherry picked from commit 38dd4e26e07c6be7cf4d169141ee4f3a03f3a09d)

Diff:
---
 gcc/common/config/riscv/riscv-common.cc|  5 
 gcc/config/riscv/riscv.opt |  2 ++
 gcc/testsuite/gcc.target/riscv/arch-37.c   |  5 
 gcc/testsuite/gcc.target/riscv/arch-38.c   |  5 
 gcc/testsuite/gcc.target/riscv/predef-36.c | 48 ++
 gcc/testsuite/gcc.target/riscv/predef-37.c | 48 ++
 6 files changed, 113 insertions(+)

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index fb76017ffbc0..88204393fde0 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -162,6 +162,8 @@ static const riscv_implied_info_t riscv_implied_info[] =
   {"zfa", "f"},
 
   {"zvfbfmin", "zve32f"},
+  {"zvfbfwma", "zvfbfmin"},
+  {"zvfbfwma", "zfbfmin"},
   {"zvfhmin", "zve32f"},
   {"zvfh", "zve32f"},
   {"zvfh", "zfhmin"},
@@ -336,6 +338,7 @@ static const struct riscv_ext_version 
riscv_ext_version_table[] =
   {"zfh",   ISA_SPEC_CLASS_NONE, 1, 0},
   {"zfhmin",ISA_SPEC_CLASS_NONE, 1, 0},
   {"zvfbfmin",  ISA_SPEC_CLASS_NONE, 1, 0},
+  {"zvfbfwma",  ISA_SPEC_CLASS_NONE, 1, 0},
   {"zvfhmin",   ISA_SPEC_CLASS_NONE, 1, 0},
   {"zvfh",  ISA_SPEC_CLASS_NONE, 1, 0},
 
@@ -1667,6 +1670,7 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =
   {"zve64f",   _options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_FP_32},
   {"zve64d",   _options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_FP_64},
   {"zvfbfmin", _options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_BF_16},
+  {"zvfbfwma", _options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_BF_16},
   {"zvfhmin",  _options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_FP_16},
   {"zvfh", _options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_FP_16},
 
@@ -1704,6 +1708,7 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =
   {"zfhmin",_options::x_riscv_zf_subext, MASK_ZFHMIN},
   {"zfh",   _options::x_riscv_zf_subext, MASK_ZFH},
   {"zvfbfmin",  _options::x_riscv_zf_subext, MASK_ZVFBFMIN},
+  {"zvfbfwma",  _options::x_riscv_zf_subext, MASK_ZVFBFWMA},
   {"zvfhmin",   _options::x_riscv_zf_subext, MASK_ZVFHMIN},
   {"zvfh",  _options::x_riscv_zf_subext, MASK_ZVFH},
 
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 1252834aec5b..d209ac896fde 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -401,6 +401,8 @@ Mask(ZFH) Var(riscv_zf_subext)
 
 Mask(ZVFBFMIN) Var(riscv_zf_subext)
 
+Mask(ZVFBFWMA) Var(riscv_zf_subext)
+
 Mask(ZVFHMIN) Var(riscv_zf_subext)
 
Mask(ZVFH) Var(riscv_zf_subext)
diff --git a/gcc/testsuite/gcc.target/riscv/arch-37.c 
b/gcc/testsuite/gcc.target/riscv/arch-37.c
new file mode 100644
index ..5b19a73c5567
--- /dev/null
+++ 

Re: [PATCH] RISC-V: Fix "Nan-box the result of movbf on soft-bf16"

2024-05-17 Thread Jeff Law




On 5/15/24 7:55 PM, Xiao Zeng wrote:

1 According to unpriv-isa spec:

   1.1 "FMV.H.X moves the half-precision value encoded in IEEE 754-2008
   standard encoding from the lower 16 bits of integer register rs1
   to the floating-point register rd, NaN-boxing the result."
   1.2 "FMV.W.X moves the single-precision value encoded in IEEE 754-2008
   standard encoding from the lower 32 bits of integer register rs1
   to the floating-point register rd. The bits are not modified in the
   transfer, and in particular, the payloads of non-canonical NaNs are 
preserved."

2 When (!TARGET_ZFHMIN == true && TARGET_HARD_FLOAT == true), an instruction
needs to be added to complete the Nan-box, as done in
"RISC-V: Nan-box the result of movhf on soft-fp16":


3 Consider the "RISC-V: Nan-box the result of movbf on soft-bf16" in:

It ignores that both HF16 and BF16 are 16-bit floating-point formats.

4 zfbfmin -> zfhmin in:


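As a concrete illustration of the NaN-boxing described in point 1 (a hedged
sketch, not the patch's code):

/* NaN-box 16-bit FP value bits into a 64-bit FP register image:
   all upper bits are set to ones.  */
unsigned long long
nanbox16 (unsigned short bits)
{
  return 0xffffffffffff0000ull | bits;
}
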
gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_legitimize_move): Optimize movbf
with Nan-boxing value.
* config/riscv/riscv.md (*movhf_softfloat_boxing): Expand movbf
with Nan-boxing value.
(*mov<mode>_softfloat_boxing): Ditto.
(*movbf_softfloat_boxing): Delete abandoned pattern.
---
  gcc/config/riscv/riscv.cc | 15 +--
  gcc/config/riscv/riscv.md | 19 +--
  2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 4067505270e..04513537aad 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -3178,13 +3178,10 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx 
src)
   (set (reg:SI/DI mask) (const_int -65536)
   (set (reg:SI/DI temp) (zero_extend:SI/DI (subreg:HI (reg:HF/BF src) 0)))
   (set (reg:SI/DI temp) (ior:SI/DI (reg:SI/DI mask) (reg:SI/DI temp)))
- (set (reg:HF/BF dest) (unspec:HF/BF[ (reg:SI/DI temp) ]
-   UNSPEC_FMV_SFP16_X/UNSPEC_FMV_SBF16_X))
- */
+ (set (reg:HF/BF dest) (unspec:HF/BF[ (reg:SI/DI temp) ] UNSPEC_FMV_FP16_X))
+  */
  
-  if (TARGET_HARD_FLOAT
-  && ((!TARGET_ZFHMIN && mode == HFmode)
- || (!TARGET_ZFBFMIN && mode == BFmode))
+  if (TARGET_HARD_FLOAT && !TARGET_ZFHMIN && (mode == HFmode || mode == BFmode)
We generally prefer not to mix && and || operators on the same line. 
I'd suggest


if (TARGET_HARD_FLOAT
&& !TARGET_ZFHMIN
&& (mode == HFmode || mode == BFmode)
[ ... ]



@@ -1959,23 +1958,15 @@
 (set_attr "type" "fmove,move,load,store,mtc,mfc")
 (set_attr "mode" "")])
  
-(define_insn "*movhf_softfloat_boxing"

-  [(set (match_operand:HF 0 "register_operand""=f")
-(unspec:HF [(match_operand:X 1 "register_operand" " r")] 
UNSPEC_FMV_SFP16_X))]
+(define_insn "*mov_softfloat_boxing"
+  [(set (match_operand:HFBF 0 "register_operand" "=f")
+(unspec:HFBF [(match_operand:X 1 "register_operand" " r")]
+UNSPEC_FMV_FP16_X))]
"!TARGET_ZFHMIN"
I think the linter complained about having 8 spaces instead of a tab in 
one of the lines above.


With those fixes, this is fine for the trunk.

jeff


Re: [PATCH] RISC-V: Modify _Bfloat16 to __bf16

2024-05-17 Thread Jeff Law




On 5/17/24 2:19 AM, Kito Cheng wrote:

LGTM, thanks for fixing this :)
And just to be clear for Xiao, you can go ahead and commit this patch to 
the trunk.  An ACK from Kito, Juzhe, Palmer, Robin or myself is all you 
need for a change that is isolated to RISC-V code.


jeff



Re: [PATCH] RISC-V: Remove dead perm series code and document.

2024-05-17 Thread Jeff Law




On 5/17/24 9:27 AM, Robin Dapp wrote:

Hi,

with the introduction of shuffle_series_patterns the explicit handler
code for a perm series is dead.  This patch removes it and also adds
a function-level comment to shuffle_series_patterns.

Regtested on rv64gcv_zvfh_zvbb.

Regards
  Robin

gcc/ChangeLog:

* config/riscv/riscv-v.cc (expand_const_vector): Document.
(shuffle_extract_and_slide1up_patterns): Remove.

OK.

Jeff



Re: [PATCH v1] RISC-V: Cleanup some temporary files [NFC]

2024-05-17 Thread Jeff Law




On 5/16/24 6:12 PM, Li, Pan2 wrote:

Committed, thanks Juzhe.

Thanks for cleaning up my little mess!  Sorry about that.

jeff



Re: [PATCH gcc-13] Fix RISC-V missing stack tie

2024-05-16 Thread Jeff Law




On 5/16/24 12:24 PM, Palmer Dabbelt wrote:



gcc/
* config/riscv/riscv.cc (riscv_expand_prologue): Add missing stack
tie for scalable and final stack adjustment if needed.

Co-authored-by: Raphael Zinsly 

(cherry picked from commit c65046ff2ef0a9a46e59bc0b3369b2d226f6a239)
---
I've only build tested this one, but it's tripping up some of the Fedora
folks here https://bugzilla.redhat.com/show_bug.cgi?id=2242327 so I
figured it's worth backporting.
Yes, that's the original report from Florian that led Raphael and me
to dive in.  Definitely worth backporting.
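
For readers following along: a stack tie is a blockage-style insn, roughly
(set (mem:BLK (scratch)) (unspec:BLK [(reg sp) (reg fp)] UNSPEC_TIE)) in RTL
(an illustrative shape, not the exact pattern), which keeps the scheduler and
alias analysis from moving frame accesses across the stack-pointer adjustment.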


jeff



Re: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

2024-05-16 Thread Jeff Law




On 5/16/24 5:58 AM, Richard Biener wrote:

On Thu, May 16, 2024 at 11:35 AM Li, Pan2  wrote:



OK.


Thanks Richard for the help and coaching. To double confirm, are you OK with
this patch only, or with the whole SAT middle-end patch series?
Thanks again for reviewing and suggestions.


For the series, the riscv specific part of course needs riscv approval.
Yea, we'll take a look at it.  Tons of stuff to go through, but this is 
definitely on the list.


jeff



Re: [PATCH] tree-optimization/13962 - handle ptr-ptr compares in ptrs_compare_unequal

2024-05-16 Thread Jeff Law




On 5/16/24 6:03 AM, Richard Biener wrote:

Now that we handle pt.null conservatively we can implement the missing
tracking of constant pool entries (aka STRING_CST) and handle
ptr-ptr compares using points-to info in ptrs_compare_unequal.
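
A minimal sketch (assumed; the actual testcase is alias-39.c) of the kind of
compare this lets us fold:

/* p and q point to distinct objects, so points-to info can prove the
   pointers unequal.  */
static int a, b;
int f (void) { int *p = &a, *q = &b; return p != q; }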

Bootstrapped on x86_64-unknown-linux-gnu, (re-)testing in progress.

Richard.

PR tree-optimization/13962
PR tree-optimization/96564
* tree-ssa-alias.h (pt_solution::const_pool): New flag.
* tree-ssa-alias.cc (ptrs_compare_unequal): Handle pointer-pointer
compares.
(dump_points_to_solution): Dump the const_pool flag, fix guard
of flag dumping.
* gimple-pretty-print.cc (pp_points_to_solution): Likewise.
* tree-ssa-structalias.cc (find_what_var_points_to): Set
the const_pool flag for STRING.
(pt_solution_ior_into): Handle the const_pool flag.
(ipa_escaped_pt): Initialize it.

* gcc.dg/tree-ssa/alias-39.c: New testcase.
* g++.dg/vect/pr68145.cc: Use -fno-tree-pta to avoid UB
to manifest in transforms no longer vectorizing this testcase
for an ICE.
You might want to test this against 92539 as well.  There's a nonzero 
chance it'll resolve that one.


jeff



[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] Add missing hunk in recent change.

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:45c5684c8242add5e97a392374dc160a6e68f2f0

commit 45c5684c8242add5e97a392374dc160a6e68f2f0
Author: Jeff Law 
Date:   Wed May 15 17:05:24 2024 -0600

Add missing hunk in recent change.

gcc/
* config/riscv/riscv-string.cc: Add missing hunk from last change.

(cherry picked from commit d7e6fe0f72ad41b8361f927d2796dbc275347297)

Diff:
---
 gcc/config/riscv/riscv-string.cc | 177 +++
 1 file changed, 177 insertions(+)

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index cbb9724d2308..83e7afbd693b 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -627,6 +627,183 @@ riscv_expand_strlen (rtx result, rtx src, rtx 
search_char, rtx align)
   return false;
 }
 
+/* Generate the sequence of load and compares for memcmp using Zbb.
+
+   RESULT is the register where the return value of memcmp will be stored.
+   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).
+   DATA1 and DATA2 are registers where the data chunks will be stored.
+   DIFF_LABEL is the location of the code that calculates the return value.
+   FINAL_LABEL is the location of the code that comes after the calculation
+   of the return value.  */
+
+static void
+emit_memcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
+unsigned HOST_WIDE_INT nbytes,
+rtx data1, rtx data2,
+rtx diff_label, rtx final_label)
+{
+  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
+  unsigned HOST_WIDE_INT offset = 0;
+
+  while (nbytes > 0)
+{
+  unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
+  machine_mode load_mode;
+
+  /* Special cases to avoid masking of trailing bytes.  */
+  if (cmp_bytes == 1)
+   load_mode = QImode;
+  else if (cmp_bytes == 2)
+   load_mode = HImode;
+  else if (cmp_bytes == 4)
+   load_mode = SImode;
+  else
+   load_mode = Xmode;
+
+  rtx addr1 = adjust_address (src1, load_mode, offset);
+  do_load (load_mode, data1, addr1);
+  rtx addr2 = adjust_address (src2, load_mode, offset);
+  do_load (load_mode, data2, addr2);
+
+  /* Fast-path for a single byte.  */
+  if (cmp_bytes == 1)
+   {
+ rtx tmp = gen_reg_rtx (Xmode);
+ do_sub3 (tmp, data1, data2);
+ emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
+ emit_jump_insn (gen_jump (final_label));
+ emit_barrier (); /* No fall-through.  */
+ return;
+   }
+
+  /* Shift off trailing bytes in words if needed.  */
+  unsigned int load_bytes = GET_MODE_SIZE (load_mode).to_constant ();
+  if (cmp_bytes < load_bytes)
+   {
+ int shamt = (load_bytes - cmp_bytes) * BITS_PER_UNIT;
+ do_ashl3 (data1, data1, GEN_INT (shamt));
+ do_ashl3 (data2, data2, GEN_INT (shamt));
+   }
+
+  /* Break out if data1 != data2 */
+  rtx cond = gen_rtx_NE (VOIDmode, data1, data2);
+  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond, data1,
+data2, diff_label));
+  /* Fall-through on equality.  */
+
+  offset += cmp_bytes;
+  nbytes -= cmp_bytes;
+}
+}
+
+/* memcmp result calculation.
+
+   RESULT is the register where the return value will be stored.
+   The two data chunks are in DATA1 and DATA2.  */
+
+static void
+emit_memcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2)
+{
+  /* Get bytes in big-endian order and compare as words.  */
+  do_bswap2 (data1, data1);
+  do_bswap2 (data2, data2);
+  /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
+  rtx tmp = gen_reg_rtx (Xmode);
+  emit_insn (gen_slt_3 (LTU, Xmode, Xmode, tmp, data1, data2));
+  do_neg2 (tmp, tmp);
+  do_ior3 (tmp, tmp, const1_rtx);
+  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
+}
+
+/* Expand memcmp using scalar instructions (incl. Zbb).
+
+   RESULT is the register where the return value will be stored.
+   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).  */
+
+static bool
+riscv_expand_block_compare_scalar (rtx result, rtx src1, rtx src2, rtx nbytes)
+{
+  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
+
+  if (optimize_function_for_size_p (cfun))
+return false;
+
+  /* We don't support big endian.  */
+  if (BYTES_BIG_ENDIAN)
+return false;
+
+  if (!CONST_INT_P (nbytes))
+return false;
+
+  /* We need the rev (bswap) instruction.  */
+  if (!TARGET_ZBB)
+return false;
+
+  unsigned HOST_WIDE_INT length = UINTVAL (nbytes);
+
+  /* Limit to 12-bits (maximum load-offset).  */
+  if (length > IMM_REACH)
+length = IMM_REACH;
+
+  /* We need xlen-aligned memory.  */
+  unsigned HOST_WIDE_INT align = MIN (MEM_ALIGN (src1), MEM_ALIGN (src2));
+  if (align < (xlen * BITS_PER_

[gcc r15-527] Add missing hunk in recent change.

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:d7e6fe0f72ad41b8361f927d2796dbc275347297

commit r15-527-gd7e6fe0f72ad41b8361f927d2796dbc275347297
Author: Jeff Law 
Date:   Wed May 15 17:05:24 2024 -0600

Add missing hunk in recent change.

gcc/
* config/riscv/riscv-string.cc: Add missing hunk from last change.

Diff:
---
 gcc/config/riscv/riscv-string.cc | 177 +++
 1 file changed, 177 insertions(+)

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index cbb9724d2308..83e7afbd693b 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -627,6 +627,183 @@ riscv_expand_strlen (rtx result, rtx src, rtx 
search_char, rtx align)
   return false;
 }
 
+/* Generate the sequence of load and compares for memcmp using Zbb.
+
+   RESULT is the register where the return value of memcmp will be stored.
+   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).
+   DATA1 and DATA2 are registers where the data chunks will be stored.
+   DIFF_LABEL is the location of the code that calculates the return value.
+   FINAL_LABEL is the location of the code that comes after the calculation
+   of the return value.  */
+
+static void
+emit_memcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
+unsigned HOST_WIDE_INT nbytes,
+rtx data1, rtx data2,
+rtx diff_label, rtx final_label)
+{
+  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
+  unsigned HOST_WIDE_INT offset = 0;
+
+  while (nbytes > 0)
+{
+  unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
+  machine_mode load_mode;
+
+  /* Special cases to avoid masking of trailing bytes.  */
+  if (cmp_bytes == 1)
+   load_mode = QImode;
+  else if (cmp_bytes == 2)
+   load_mode = HImode;
+  else if (cmp_bytes == 4)
+   load_mode = SImode;
+  else
+   load_mode = Xmode;
+
+  rtx addr1 = adjust_address (src1, load_mode, offset);
+  do_load (load_mode, data1, addr1);
+  rtx addr2 = adjust_address (src2, load_mode, offset);
+  do_load (load_mode, data2, addr2);
+
+  /* Fast-path for a single byte.  */
+  if (cmp_bytes == 1)
+   {
+ rtx tmp = gen_reg_rtx (Xmode);
+ do_sub3 (tmp, data1, data2);
+ emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
+ emit_jump_insn (gen_jump (final_label));
+ emit_barrier (); /* No fall-through.  */
+ return;
+   }
+
+  /* Shift off trailing bytes in words if needed.  */
+  unsigned int load_bytes = GET_MODE_SIZE (load_mode).to_constant ();
+  if (cmp_bytes < load_bytes)
+   {
+ int shamt = (load_bytes - cmp_bytes) * BITS_PER_UNIT;
+ do_ashl3 (data1, data1, GEN_INT (shamt));
+ do_ashl3 (data2, data2, GEN_INT (shamt));
+   }
+
+  /* Break out if data1 != data2 */
+  rtx cond = gen_rtx_NE (VOIDmode, data1, data2);
+  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond, data1,
+data2, diff_label));
+  /* Fall-through on equality.  */
+
+  offset += cmp_bytes;
+  nbytes -= cmp_bytes;
+}
+}
+
+/* memcmp result calculation.
+
+   RESULT is the register where the return value will be stored.
+   The two data chunks are in DATA1 and DATA2.  */
+
+static void
+emit_memcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2)
+{
+  /* Get bytes in big-endian order and compare as words.  */
+  do_bswap2 (data1, data1);
+  do_bswap2 (data2, data2);
+  /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
+  rtx tmp = gen_reg_rtx (Xmode);
+  emit_insn (gen_slt_3 (LTU, Xmode, Xmode, tmp, data1, data2));
+  do_neg2 (tmp, tmp);
+  do_ior3 (tmp, tmp, const1_rtx);
+  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
+}
+
+/* Expand memcmp using scalar instructions (incl. Zbb).
+
+   RESULT is the register where the return value will be stored.
+   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).  */
+
+static bool
+riscv_expand_block_compare_scalar (rtx result, rtx src1, rtx src2, rtx nbytes)
+{
+  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
+
+  if (optimize_function_for_size_p (cfun))
+return false;
+
+  /* We don't support big endian.  */
+  if (BYTES_BIG_ENDIAN)
+return false;
+
+  if (!CONST_INT_P (nbytes))
+return false;
+
+  /* We need the rev (bswap) instruction.  */
+  if (!TARGET_ZBB)
+return false;
+
+  unsigned HOST_WIDE_INT length = UINTVAL (nbytes);
+
+  /* Limit to 12-bits (maximum load-offset).  */
+  if (length > IMM_REACH)
+length = IMM_REACH;
+
+  /* We need xlen-aligned memory.  */
+  unsigned HOST_WIDE_INT align = MIN (MEM_ALIGN (src1), MEM_ALIGN (src2));
+  if (align < (xlen * BITS_PER_UNIT))
+return false;
+
+  if (length > RISCV_MAX_MOVE_BYTES_STRAIG

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [v2, 2/2] RISC-V: strcmp expansion: Use adjust_address() for address calculation

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:72e6ff2bcf293116099988ebd367182cba699e9b

commit 72e6ff2bcf293116099988ebd367182cba699e9b
Author: Christoph Müllner 
Date:   Wed May 15 12:19:40 2024 -0600

[v2,2/2] RISC-V: strcmp expansion: Use adjust_address() for address 
calculation

We have an arch-independent routine to generate an address with an offset.
Let's use that instead of doing the calculation in the backend.

gcc/ChangeLog:

* config/riscv/riscv-string.cc 
(emit_strcmp_scalar_load_and_compare):
Use adjust_address() to calculate MEM-PLUS pattern.

(cherry picked from commit 1fbbae1d4ba3618a3da829a6d7e11a1606a583b3)

Diff:
---
 gcc/config/riscv/riscv-string.cc | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 8f3b6f925e01..cbb9724d2308 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -227,8 +227,6 @@ emit_strcmp_scalar_load_and_compare (rtx result, rtx src1, 
rtx src2,
 rtx final_label)
 {
   const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
-  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
-  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
   unsigned HOST_WIDE_INT offset = 0;
 
   rtx testval = gen_reg_rtx (Xmode);
@@ -246,10 +244,10 @@ emit_strcmp_scalar_load_and_compare (rtx result, rtx 
src1, rtx src2,
   else
load_mode = Xmode;
 
-  rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
-  do_load_from_addr (load_mode, data1, addr1, src1);
-  rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset));
-  do_load_from_addr (load_mode, data2, addr2, src2);
+  rtx addr1 = adjust_address (src1, load_mode, offset);
+  do_load (load_mode, data1, addr1);
+  rtx addr2 = adjust_address (src2, load_mode, offset);
+  do_load (load_mode, data2, addr2);
 
   if (cmp_bytes == 1)
{


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [v2, 1/2] RISC-V: Add cmpmemsi expansion

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:d57dfea6e051695349fb9f6da1c30899b7f5

commit d57dfea6e051695349fb9f6da1c30899b7f5
Author: Christoph Müllner 
Date:   Wed May 15 12:18:20 2024 -0600

[v2,1/2] RISC-V: Add cmpmemsi expansion

GCC has a generic cmpmemsi expansion via the by-pieces framework,
which shows some room for target-specific optimizations.
E.g. for comparing two aligned memory blocks of 15 bytes
we get the following sequence:

my_mem_cmp_aligned_15:
li  a4,0
j   .L2
.L8:
bgeu a4,a7,.L7
.L2:
add a2,a0,a4
add a3,a1,a4
lbu a5,0(a2)
lbu a6,0(a3)
addi a4,a4,1
li  a7,15 // missed hoisting
subw a5,a5,a6
andi a5,a5,0xff // useless
beq a5,zero,.L8
lbu a0,0(a2) // loading again!
lbu a5,0(a3) // loading again!
subw a0,a0,a5
ret
.L7:
li  a0,0
ret

Diff first byte: 15 insns
Diff second byte: 25 insns
No diff: 25 insns
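
The function being compiled is presumably of this shape (the name is taken
from the labels above; the exact source is an assumption):

int my_mem_cmp_aligned_15 (const void *a, const void *b)
{
  return __builtin_memcmp (a, b, 15);
}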

Possible improvements:
* unroll the loop and use load-with-displacement to avoid offset increments
* load and compare multiple (aligned) bytes at once
* Use the bitmanip/strcmp result calculation (reverse words and
  synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence)

When applying these improvements we get the following sequence:

my_mem_cmp_aligned_15:
ld  a5,0(a0)
ld  a4,0(a1)
bne a5,a4,.L2
ld  a5,8(a0)
ld  a4,8(a1)
slli a5,a5,8
slli a4,a4,8
bne a5,a4,.L2
li  a0,0
.L3:
sext.w  a0,a0
ret
.L2:
rev8 a5,a5
rev8 a4,a4
sltu a5,a5,a4
neg a5,a5
ori a0,a5,1
j   .L3

Diff first byte: 11 insns
Diff second byte: 16 insns
No diff: 11 insns

This patch implements these improvements.

The tests consist of an execution test (similar to
gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c) and a few tests
that test the expansion conditions (known length and alignment).

Similar to the cpymemsi expansion this patch does not introduce any
gating for the cmpmemsi expansion (on top of requiring the known length,
alignment and Zbb).

Bootstrapped and SPEC CPU 2017 tested.

gcc/ChangeLog:

* config/riscv/riscv-protos.h (riscv_expand_block_compare): New
prototype.
* config/riscv/riscv-string.cc (GEN_EMIT_HELPER2): New helper
for zero_extendhi.
(do_load_from_addr): Add support for HI and SI/64 modes.
(do_load): Add helper for zero-extended loads.
(emit_memcmp_scalar_load_and_compare): New helper to emit memcmp.
(emit_memcmp_scalar_result_calculation): Likewise.
(riscv_expand_block_compare_scalar): Likewise.
(riscv_expand_block_compare): New RISC-V expander for memory 
compare.
* config/riscv/riscv.md (cmpmemsi): New cmpmem expansion.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmpmemsi-1.c: New test.
* gcc.target/riscv/cmpmemsi-2.c: New test.
* gcc.target/riscv/cmpmemsi-3.c: New test.
* gcc.target/riscv/cmpmemsi.c: New test.

(cherry picked from commit 4bf1aa1ab90dd487fadc27c86523ec3562b2d2fe)

Diff:
---
 gcc/config/riscv/riscv-protos.h |  1 +
 gcc/config/riscv/riscv-string.cc| 40 +--
 gcc/config/riscv/riscv.md   | 15 ++
 gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c |  6 
 gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c | 42 
 gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c | 43 +
 gcc/testsuite/gcc.target/riscv/cmpmemsi.c   | 22 +++
 7 files changed, 155 insertions(+), 14 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 5c8a52b78a22..565ead1382a7 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -189,6 +189,7 @@ rtl_opt_pass * make_pass_avlprop (gcc::context *ctxt);
 rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
 
 /* Routines implemented in riscv-string.c.  */
+extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx);
 extern bool riscv_expand_block_move (rtx, rtx, rtx);
 extern bool riscv_expand_block_clear (rtx, rtx);
 
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 96394844bbb6..8f3b6f925e01 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -86,35 +86,47 @@ GEN_EMIT_HELPER2(th_rev) /* do_th_rev2  */
 

Re: [PATCH v2 1/2] RISC-V: Add cmpmemsi expansion

2024-05-15 Thread Jeff Law




On 5/15/24 12:49 AM, Christoph Müllner wrote:

GCC has a generic cmpmemsi expansion via the by-pieces framework,
which shows some room for target-specific optimizations.
E.g. for comparing two aligned memory blocks of 15 bytes
we get the following sequence:

my_mem_cmp_aligned_15:
 li  a4,0
 j   .L2
.L8:
 bgeu a4,a7,.L7
.L2:
 add a2,a0,a4
 add a3,a1,a4
 lbu a5,0(a2)
 lbu a6,0(a3)
 addi a4,a4,1
 li  a7,15 // missed hoisting
 subw a5,a5,a6
 andi a5,a5,0xff // useless
 beq a5,zero,.L8
 lbu a0,0(a2) // loading again!
 lbu a5,0(a3) // loading again!
 subw a0,a0,a5
 ret
.L7:
 li  a0,0
 ret

Diff first byte: 15 insns
Diff second byte: 25 insns
No diff: 25 insns

Possible improvements:
* unroll the loop and use load-with-displacement to avoid offset increments
* load and compare multiple (aligned) bytes at once
* Use the bitmanip/strcmp result calculation (reverse words and
   synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence)

When applying these improvements we get the following sequence:

my_mem_cmp_aligned_15:
 ld  a5,0(a0)
 ld  a4,0(a1)
 bne a5,a4,.L2
 ld  a5,8(a0)
 ld  a4,8(a1)
 slli a5,a5,8
 slli a4,a4,8
 bne a5,a4,.L2
 li  a0,0
.L3:
 sext.w  a0,a0
 ret
.L2:
 rev8 a5,a5
 rev8 a4,a4
 sltu a5,a5,a4
 neg a5,a5
 ori a0,a5,1
 j   .L3

Diff first byte: 11 insns
Diff second byte: 16 insns
No diff: 11 insns

This patch implements these improvements.

The tests consist of an execution test (similar to
gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c) and a few tests
that test the expansion conditions (known length and alignment).

Similar to the cpymemsi expansion this patch does not introduce any
gating for the cmpmemsi expansion (on top of requiring the known length,
alignment and Zbb).

Bootstrapped and SPEC CPU 2017 tested.

gcc/ChangeLog:

* config/riscv/riscv-protos.h (riscv_expand_block_compare): New
prototype.
* config/riscv/riscv-string.cc (GEN_EMIT_HELPER2): New helper
for zero_extendhi.
(do_load_from_addr): Add support for HI and SI/64 modes.
(do_load): Add helper for zero-extended loads.
(emit_memcmp_scalar_load_and_compare): New helper to emit memcmp.
(emit_memcmp_scalar_result_calculation): Likewise.
(riscv_expand_block_compare_scalar): Likewise.
(riscv_expand_block_compare): New RISC-V expander for memory compare.
* config/riscv/riscv.md (cmpmemsi): New cmpmem expansion.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmpmemsi-1.c: New test.
* gcc.target/riscv/cmpmemsi-2.c: New test.
* gcc.target/riscv/cmpmemsi-3.c: New test.
* gcc.target/riscv/cmpmemsi.c: New test.

[ ... ]
I fixed some of the nits from the linter (whitespace stuff) and pushed 
both patches of this series.


Jeff



[gcc r15-525] [v2, 2/2] RISC-V: strcmp expansion: Use adjust_address() for address calculation

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:1fbbae1d4ba3618a3da829a6d7e11a1606a583b3

commit r15-525-g1fbbae1d4ba3618a3da829a6d7e11a1606a583b3
Author: Christoph Müllner 
Date:   Wed May 15 12:19:40 2024 -0600

[v2,2/2] RISC-V: strcmp expansion: Use adjust_address() for address 
calculation

We have an arch-independent routine to generate an address with an offset.
Let's use that instead of doing the calculation in the backend.

gcc/ChangeLog:

* config/riscv/riscv-string.cc 
(emit_strcmp_scalar_load_and_compare):
Use adjust_address() to calculate MEM-PLUS pattern.

Diff:
---
 gcc/config/riscv/riscv-string.cc | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 8f3b6f925e01..cbb9724d2308 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -227,8 +227,6 @@ emit_strcmp_scalar_load_and_compare (rtx result, rtx src1, 
rtx src2,
 rtx final_label)
 {
   const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
-  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
-  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
   unsigned HOST_WIDE_INT offset = 0;
 
   rtx testval = gen_reg_rtx (Xmode);
@@ -246,10 +244,10 @@ emit_strcmp_scalar_load_and_compare (rtx result, rtx 
src1, rtx src2,
   else
load_mode = Xmode;
 
-  rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
-  do_load_from_addr (load_mode, data1, addr1, src1);
-  rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset));
-  do_load_from_addr (load_mode, data2, addr2, src2);
+  rtx addr1 = adjust_address (src1, load_mode, offset);
+  do_load (load_mode, data1, addr1);
+  rtx addr2 = adjust_address (src2, load_mode, offset);
+  do_load (load_mode, data2, addr2);
 
   if (cmp_bytes == 1)
{


[gcc r15-524] [v2,1/2] RISC-V: Add cmpmemsi expansion

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:4bf1aa1ab90dd487fadc27c86523ec3562b2d2fe

commit r15-524-g4bf1aa1ab90dd487fadc27c86523ec3562b2d2fe
Author: Christoph Müllner 
Date:   Wed May 15 12:18:20 2024 -0600

[v2,1/2] RISC-V: Add cmpmemsi expansion

GCC has a generic cmpmemsi expansion via the by-pieces framework,
which shows some room for target-specific optimizations.
E.g. for comparing two aligned memory blocks of 15 bytes
we get the following sequence:

my_mem_cmp_aligned_15:
li  a4,0
j   .L2
.L8:
bgeu a4,a7,.L7
.L2:
add a2,a0,a4
add a3,a1,a4
lbu a5,0(a2)
lbu a6,0(a3)
addi a4,a4,1
li  a7,15 // missed hoisting
subw a5,a5,a6
andi a5,a5,0xff // useless
beq a5,zero,.L8
lbu a0,0(a2) // loading again!
lbu a5,0(a3) // loading again!
subw a0,a0,a5
ret
.L7:
li  a0,0
ret

Diff first byte: 15 insns
Diff second byte: 25 insns
No diff: 25 insns

Possible improvements:
* unroll the loop and use load-with-displacement to avoid offset increments
* load and compare multiple (aligned) bytes at once
* Use the bitmanip/strcmp result calculation (reverse words and
  synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence)

When applying these improvements we get the following sequence:

my_mem_cmp_aligned_15:
ld  a5,0(a0)
ld  a4,0(a1)
bne a5,a4,.L2
ld  a5,8(a0)
ld  a4,8(a1)
slli a5,a5,8
slli a4,a4,8
bne a5,a4,.L2
li  a0,0
.L3:
sext.w  a0,a0
ret
.L2:
rev8 a5,a5
rev8 a4,a4
sltu a5,a5,a4
neg a5,a5
ori a0,a5,1
j   .L3

Diff first byte: 11 insns
Diff second byte: 16 insns
No diff: 11 insns

This patch implements these improvements.

The tests consist of an execution test (similar to
gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c) and a few tests
that exercise the expansion conditions (known length and alignment).

Similar to the cpymemsi expansion, this patch does not introduce any
gating for the cmpmemsi expansion (beyond requiring a known length,
alignment and Zbb).

Bootstrapped and SPEC CPU 2017 tested.

gcc/ChangeLog:

* config/riscv/riscv-protos.h (riscv_expand_block_compare): New
prototype.
* config/riscv/riscv-string.cc (GEN_EMIT_HELPER2): New helper
for zero_extendhi.
(do_load_from_addr): Add support for HI and SI/64 modes.
(do_load): Add helper for zero-extended loads.
(emit_memcmp_scalar_load_and_compare): New helper to emit memcmp.
(emit_memcmp_scalar_result_calculation): Likewise.
(riscv_expand_block_compare_scalar): Likewise.
(riscv_expand_block_compare): New RISC-V expander for memory 
compare.
* config/riscv/riscv.md (cmpmemsi): New cmpmem expansion.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmpmemsi-1.c: New test.
* gcc.target/riscv/cmpmemsi-2.c: New test.
* gcc.target/riscv/cmpmemsi-3.c: New test.
* gcc.target/riscv/cmpmemsi.c: New test.

Diff:
---
 gcc/config/riscv/riscv-protos.h |  1 +
 gcc/config/riscv/riscv-string.cc| 40 +--
 gcc/config/riscv/riscv.md   | 15 ++
 gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c |  6 
 gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c | 42 
 gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c | 43 +
 gcc/testsuite/gcc.target/riscv/cmpmemsi.c   | 22 +++
 7 files changed, 155 insertions(+), 14 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 5c8a52b78a22..565ead1382a7 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -189,6 +189,7 @@ rtl_opt_pass * make_pass_avlprop (gcc::context *ctxt);
 rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
 
 /* Routines implemented in riscv-string.c.  */
+extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx);
 extern bool riscv_expand_block_move (rtx, rtx, rtx);
 extern bool riscv_expand_block_clear (rtx, rtx);
 
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 96394844bbb6..8f3b6f925e01 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -86,35 +86,47 @@ GEN_EMIT_HELPER2(th_rev) /* do_th_rev2  */
 GEN_EMIT_HELPER2(th_tstnbz) /* do_th_tstnbz2  */
 GEN_EMIT_HELPER3(xor) /* 

Re: [PATCH] RISC-V: prologue/epilogue expansion code minor changes [NFC]

2024-05-15 Thread Jeff Law




On 5/15/24 12:55 PM, Vineet Gupta wrote:

Saw this little room for improvement while debugging the current
prologue/epilogue expansion code.

---

Use the following pattern consistently
`RTX_FRAME_RELATED_P (emit_insn (insn)) = 1`

vs. calling emit_insn () around a priori gen_xxx_insn () calls.

This reduces the weird indentation, which was applied inconsistently.

Also move the RTX_FRAME_RELATED_P () markings immediately after those
gen_xxx_insn () calls.
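
A minimal sketch of the before/after shape (insn names illustrative;
"adjust" is a hypothetical rtx, not taken from the patch):

    /* Before: generate, emit and mark in separate steps, with awkward
       wrapping.  */
    rtx_insn *insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
                                               stack_pointer_rtx,
                                               adjust));
    RTX_FRAME_RELATED_P (insn) = 1;

    /* After: mark the insn right where it is emitted.  */
    RTX_FRAME_RELATED_P (emit_insn (gen_add3_insn (stack_pointer_rtx,
                                                   stack_pointer_rtx,
                                                   adjust))) = 1;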

gcc/ChangeLog:
* config/riscv/riscv.cc (riscv_expand_epilogue): Use pattern
described above.
(riscv_expand_prologue): Ditto.
(riscv_for_each_saved_v_reg): Ditto.

Thanks for cleaning this up.  Just having consistency is helpful.

All this gets scrambled again with stack-clash protection :(  But that's 
just the nature of the beast.


jeff


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Test cbo.zero expansion for rv32

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:f3d5808070acf09d4ca1da5f5e692be52e3a73a6

commit f3d5808070acf09d4ca1da5f5e692be52e3a73a6
Author: Christoph Müllner 
Date:   Wed May 15 01:34:54 2024 +0200

RISC-V: Test cbo.zero expansion for rv32

We had an issue when expanding via cmo-zero for RV32.
This was fixed upstream, but we don't have a RV32 test.
Therefore, this patch introduces such a test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmo-zicboz-zic64-1.c: Fix for rv32.

Signed-off-by: Christoph Müllner 
(cherry picked from commit 5609d77e683944439fae38323ecabc44a1eb4671)

Diff:
---
 .../gcc.target/riscv/cmo-zicboz-zic64-1.c  | 37 +++---
 1 file changed, 11 insertions(+), 26 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
index 6d4535287d08..9192b391b11d 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
@@ -1,24 +1,9 @@
 /* { dg-do compile } */
-/* { dg-options "-march=rv64gc_zic64b_zicboz -mabi=lp64d" } */
+/* { dg-options "-march=rv32gc_zic64b_zicboz" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc_zic64b_zicboz" { target { rv64 } } } */
 /* { dg-skip-if "" { *-*-* } {"-O0" "-Os" "-Og" "-Oz" "-flto" } } */
-/* { dg-final { check-function-bodies "**" "" } } */
-/* { dg-allow-blank-lines-in-output 1 } */
 
-/*
-**clear_buf_123:
-**...
-**cbo\.zero\t0\(a[0-9]+\)
-**sd\tzero,64\(a[0-9]+\)
-**sd\tzero,72\(a[0-9]+\)
-**sd\tzero,80\(a[0-9]+\)
-**sd\tzero,88\(a[0-9]+\)
-**sd\tzero,96\(a[0-9]+\)
-**sd\tzero,104\(a[0-9]+\)
-**sd\tzero,112\(a[0-9]+\)
-**sh\tzero,120\(a[0-9]+\)
-**sb\tzero,122\(a[0-9]+\)
-**...
-*/
+// 1x cbo.zero, 7x sd (rv64) or 14x sw (rv32), 1x sh, 1x sb
 int
 clear_buf_123 (void *p)
 {
@@ -26,17 +11,17 @@ clear_buf_123 (void *p)
   __builtin_memset (p, 0, 123);
 }
 
-/*
-**clear_buf_128:
-**...
-**cbo\.zero\t0\(a[0-9]+\)
-**addi\ta[0-9]+,a[0-9]+,64
-**cbo\.zero\t0\(a[0-9]+\)
-**...
-*/
+// 2x cbo.zero, 1x addi 64
 int
 clear_buf_128 (void *p)
 {
   p = __builtin_assume_aligned(p, 64);
   __builtin_memset (p, 0, 128);
 }
+
+/* { dg-final { scan-assembler-times "cbo\.zero\t" 3 } } */
+/* { dg-final { scan-assembler-times "addi\ta\[0-9\]+,a\[0-9\]+,64" 1 } } */
+/* { dg-final { scan-assembler-times "sd\t" 7 { target { rv64 } } } } */
+/* { dg-final { scan-assembler-times "sw\t" 14 { target { rv32 } } } } */
+/* { dg-final { scan-assembler-times "sh\t" 1 } } */
+/* { dg-final { scan-assembler-times "sb\t" 1 } } */


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Allow by-pieces to do overlapping accesses in block_move_straight

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:59e6343f99eb53da07bbd6198f083ce1bbdf20d8

commit 59e6343f99eb53da07bbd6198f083ce1bbdf20d8
Author: Christoph Müllner 
Date:   Mon Apr 29 02:53:20 2024 +0200

RISC-V: Allow by-pieces to do overlapping accesses in block_move_straight

The current implementation of riscv_block_move_straight() emits a couple
of loads/stores with maximum width (e.g. 8 bytes for RV64).
The remainder is handed over to move_by_pieces().
The by-pieces framework utilizes target hooks to decide about the emitted
instructions (e.g. unaligned accesses or overlapping accesses).

Since the current implementation will always request less than XLEN bytes
to be handled by the by-pieces infrastructure, it is impossible that
overlapping memory accesses can ever be emitted (the by-pieces code does
not know of any previous instructions that were emitted by the backend).

This patch changes the implementation of riscv_block_move_straight()
such that it utilizes the by-pieces framework if the remaining data
is less than 2*XLEN bytes, which is sufficient to enable overlapping
memory accesses (if the requirements for them are given).

The changes in the expansion can be seen in the adjustments of the
cpymem-NN-ooo test cases. The changes in the cpymem-NN tests are
caused by the different instruction ordering of the code emitted
by the by-pieces infrastructure, which emits alternating load/store
sequences.
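
Schematically, for a 15-byte copy on rv64 this turns the trailing
lbu/sb tail into one overlapping ld/sd pair (registers illustrative):

    copy_15:
            ld      a4,0(a1)
            ld      a5,7(a1)    # bytes 7..14; byte 7 is copied twice
            sd      a4,0(a0)
            sd      a5,7(a0)
            ret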

gcc/ChangeLog:

* config/riscv/riscv-string.cc (riscv_block_move_straight):
Hand over up to 2xXLEN bytes to move_by_pieces().

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cpymem-32-ooo.c: Adjustments for overlapping
access.
* gcc.target/riscv/cpymem-32.c: Adjustments for code emitted by
by-pieces.
* gcc.target/riscv/cpymem-64-ooo.c: Adjustments for overlapping
access.
* gcc.target/riscv/cpymem-64.c: Adjustments for code emitted by
by-pieces.

Signed-off-by: Christoph Müllner 
(cherry picked from commit ad22c607f3e17f2c6ca45699c1d88adaa618c23c)

Diff:
---
 gcc/config/riscv/riscv-string.cc   |  6 +++---
 gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c | 16 
 gcc/testsuite/gcc.target/riscv/cpymem-32.c | 10 --
 gcc/testsuite/gcc.target/riscv/cpymem-64-ooo.c |  8 
 gcc/testsuite/gcc.target/riscv/cpymem-64.c |  9 +++--
 5 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index b6cd70323563..96394844bbb6 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -637,18 +637,18 @@ riscv_block_move_straight (rtx dest, rtx src, unsigned 
HOST_WIDE_INT length,
   delta = bits / BITS_PER_UNIT;
 
   /* Allocate a buffer for the temporary registers.  */
-  regs = XALLOCAVEC (rtx, length / delta);
+  regs = XALLOCAVEC (rtx, length / delta - 1);
 
   /* Load as many BITS-sized chunks as possible.  Use a normal load if
  the source has enough alignment, otherwise use left/right pairs.  */
-  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
+  for (offset = 0, i = 0; offset + 2 * delta <= length; offset += delta, i++)
 {
   regs[i] = gen_reg_rtx (mode);
   riscv_emit_move (regs[i], adjust_address (src, mode, offset));
 }
 
   /* Copy the chunks to the destination.  */
-  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
+  for (offset = 0, i = 0; offset + 2 * delta <= length; offset += delta, i++)
 riscv_emit_move (adjust_address (dest, mode, offset), regs[i]);
 
   /* Mop up any left-over bytes.  */
diff --git a/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c 
b/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c
index 947d58c30fa3..2a48567353a6 100644
--- a/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c
+++ b/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c
@@ -91,8 +91,8 @@ COPY_ALIGNED_N(11)
 **...
 **sw\t[at][0-9],0\([at][0-9]\)
 **...
-**lbu\t[at][0-9],14\([at][0-9]\)
-**sb\t[at][0-9],14\([at][0-9]\)
+**lw\t[at][0-9],11\([at][0-9]\)
+**sw\t[at][0-9],11\([at][0-9]\)
 **...
 */
 COPY_N(15)
@@ -104,8 +104,8 @@ COPY_N(15)
 **...
 **sw\t[at][0-9],0\([at][0-9]\)
 **...
-**lbu\t[at][0-9],14\([at][0-9]\)
-**sb\t[at][0-9],14\([at][0-9]\)
+**lw\t[at][0-9],11\([at][0-9]\)
+**sw\t[at][0-9],11\([at][0-9]\)
 **...
 */
 COPY_ALIGNED_N(15)
@@ -117,8 +117,8 @@ COPY_ALIGNED_N(15)
 **...
 **sw\t[at][0-9],20\([at][0-9]\)
 **...
-**lbu\t[at][0-9],26\([at][0-9]\)
-**sb\t[at][0-9],26\([at][0-9]\)
+**lw\t[at][0-9],23\([at][0-9]\)
+**sw\t[at][0-9],23\([at][0-9]\)
 **...
 */
 COPY_N(27)
@@ -130,8 +130,8 @@ COPY_N(27)
 **...
 **sw\t[at][0-9],20\([at][0-9]\)
 **...
-**

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: add tests for overlapping mem ops

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:ad0413b832400aa9e81e20070b3ef6b0a9a6d888

commit ad0413b832400aa9e81e20070b3ef6b0a9a6d888
Author: Christoph Müllner 
Date:   Mon Apr 29 03:06:52 2024 +0200

RISC-V: add tests for overlapping mem ops

A recent patch added the field overlap_op_by_pieces to the struct
riscv_tune_param, which is used by the TARGET_OVERLAP_OP_BY_PIECES_P()
hook. This hook is used by the by-pieces infrastructure to decide
if overlapping memory accesses should be emitted.

The changes in the expansion can be seen in the adjustments of the
cpymem test cases. These tests also reveal a limitation in the
RISC-V cpymem expansion that prevents this optimization as only
by-pieces cpymem expansions emit overlapping memory accesses.
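
A minimal sketch of the hook (the exact wiring is an assumption here;
the field is the riscv_tune_param member described above):

    /* Implement TARGET_OVERLAP_OP_BY_PIECES_P.  */

    static bool
    riscv_overlap_op_by_pieces (void)
    {
      /* Whether overlapping accesses are wanted comes straight from
         the active tuning parameters.  */
      return tune_param->overlap_op_by_pieces;
    }

    #undef TARGET_OVERLAP_OP_BY_PIECES_P
    #define TARGET_OVERLAP_OP_BY_PIECES_P riscv_overlap_op_by_pieces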

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cpymem-32-ooo.c: Adjust for overlapping
access.
* gcc.target/riscv/cpymem-64-ooo.c: Likewise.

Signed-off-by: Christoph Müllner 
(cherry picked from commit 5814437b4fcc550697d6e286f49a2f8b108815bf)

Diff:
---
 gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c | 20 +++-
 gcc/testsuite/gcc.target/riscv/cpymem-64-ooo.c | 33 ++
 2 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c 
b/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c
index 946a773f77a0..947d58c30fa3 100644
--- a/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c
+++ b/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c
@@ -24,9 +24,8 @@ void copy_aligned_##N (void *to, void *from)  \
 **...
 **lw\t[at][0-9],0\([at][0-9]\)
 **sw\t[at][0-9],0\([at][0-9]\)
-**...
-**lbu\t[at][0-9],6\([at][0-9]\)
-**sb\t[at][0-9],6\([at][0-9]\)
+**lw\t[at][0-9],3\([at][0-9]\)
+**sw\t[at][0-9],3\([at][0-9]\)
 **...
 */
 COPY_N(7)
@@ -36,9 +35,8 @@ COPY_N(7)
 **...
 **lw\t[at][0-9],0\([at][0-9]\)
 **sw\t[at][0-9],0\([at][0-9]\)
-**...
-**lbu\t[at][0-9],6\([at][0-9]\)
-**sb\t[at][0-9],6\([at][0-9]\)
+**lw\t[at][0-9],3\([at][0-9]\)
+**sw\t[at][0-9],3\([at][0-9]\)
 **...
 */
 COPY_ALIGNED_N(7)
@@ -66,11 +64,10 @@ COPY_ALIGNED_N(8)
 **...
 **...
 **lw\t[at][0-9],0\([at][0-9]\)
-**...
 **sw\t[at][0-9],0\([at][0-9]\)
 **...
-**lbu\t[at][0-9],10\([at][0-9]\)
-**sb\t[at][0-9],10\([at][0-9]\)
+**lw\t[at][0-9],7\([at][0-9]\)
+**sw\t[at][0-9],7\([at][0-9]\)
 **...
 */
 COPY_N(11)
@@ -79,11 +76,10 @@ COPY_N(11)
 **copy_aligned_11:
 **...
 **lw\t[at][0-9],0\([at][0-9]\)
-**...
 **sw\t[at][0-9],0\([at][0-9]\)
 **...
-**lbu\t[at][0-9],10\([at][0-9]\)
-**sb\t[at][0-9],10\([at][0-9]\)
+**lw\t[at][0-9],7\([at][0-9]\)
+**sw\t[at][0-9],7\([at][0-9]\)
 **...
 */
 COPY_ALIGNED_N(11)
diff --git a/gcc/testsuite/gcc.target/riscv/cpymem-64-ooo.c 
b/gcc/testsuite/gcc.target/riscv/cpymem-64-ooo.c
index 08a927b94835..108748690cd3 100644
--- a/gcc/testsuite/gcc.target/riscv/cpymem-64-ooo.c
+++ b/gcc/testsuite/gcc.target/riscv/cpymem-64-ooo.c
@@ -24,9 +24,8 @@ void copy_aligned_##N (void *to, void *from)  \
 **...
 **lw\t[at][0-9],0\([at][0-9]\)
 **sw\t[at][0-9],0\([at][0-9]\)
-**...
-**lbu\t[at][0-9],6\([at][0-9]\)
-**sb\t[at][0-9],6\([at][0-9]\)
+**lw\t[at][0-9],3\([at][0-9]\)
+**sw\t[at][0-9],3\([at][0-9]\)
 **...
 */
 COPY_N(7)
@@ -36,9 +35,8 @@ COPY_N(7)
 **...
 **lw\t[at][0-9],0\([at][0-9]\)
 **sw\t[at][0-9],0\([at][0-9]\)
-**...
-**lbu\t[at][0-9],6\([at][0-9]\)
-**sb\t[at][0-9],6\([at][0-9]\)
+**lw\t[at][0-9],3\([at][0-9]\)
+**sw\t[at][0-9],3\([at][0-9]\)
 **...
 */
 COPY_ALIGNED_N(7)
@@ -66,9 +64,8 @@ COPY_ALIGNED_N(8)
 **...
 **ld\t[at][0-9],0\([at][0-9]\)
 **sd\t[at][0-9],0\([at][0-9]\)
-**...
-**lbu\t[at][0-9],10\([at][0-9]\)
-**sb\t[at][0-9],10\([at][0-9]\)
+**lw\t[at][0-9],7\([at][0-9]\)
+**sw\t[at][0-9],7\([at][0-9]\)
 **...
 */
 COPY_N(11)
@@ -77,11 +74,9 @@ COPY_N(11)
 **copy_aligned_11:
 **...
 **ld\t[at][0-9],0\([at][0-9]\)
-**...
 **sd\t[at][0-9],0\([at][0-9]\)
-**...
-**lbu\t[at][0-9],10\([at][0-9]\)
-**sb\t[at][0-9],10\([at][0-9]\)
+**lw\t[at][0-9],7\([at][0-9]\)
+**sw\t[at][0-9],7\([at][0-9]\)
 **...
 */
 COPY_ALIGNED_N(11)
@@ -90,11 +85,9 @@ COPY_ALIGNED_N(11)
 **copy_15:
 **...
 **ld\t[at][0-9],0\([at][0-9]\)
-**...
 **sd\t[at][0-9],0\([at][0-9]\)
-**...
-**lbu\t[at][0-9],14\([at][0-9]\)
-**sb\t[at][0-9],14\([at][0-9]\)
+**ld\t[at][0-9],7\([at][0-9]\)
+**sd\t[at][0-9],7\([at][0-9]\)
 **...
 */
 COPY_N(15)
@@ -103,11 +96,9 @@ COPY_N(15)
 **copy_aligned_15:
 **...
 **ld\t[at][0-9],0\([at][0-9]\)
-**...
 **sd\t[at][0-9],0\([at][0-9]\)
-**...
-**lbu\t[at][0-9],14\([at][0-9]\)
-**sb\t[at][0-9],14\([at][0-9]\)
+**ld\t[at][0-9],7\([at][0-9]\)
+**

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Add test cases for cpymem expansion

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:0dcd2d26d0da77af7f173b6c0d79a7f5ea25c642

commit 0dcd2d26d0da77af7f173b6c0d79a7f5ea25c642
Author: Christoph Müllner 
Date:   Wed May 1 16:54:42 2024 +0200

RISC-V: Add test cases for cpymem expansion

We have two mechanisms in the RISC-V backend that expand
cpymem pattern: a) by-pieces, b) riscv_expand_block_move()
in riscv-string.cc. The by-pieces framework has higher priority
and emits a sequence of up to 15 instructions
(see use_by_pieces_infrastructure_p() for more details).

As a rule-of-thumb, by-pieces emits alternating load/store sequences
and the setmem expansion in the backend emits a sequence of loads
followed by a sequence of stores.

Let's add some test cases to document the current behaviour
and to have tests to identify regressions.
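
Schematically, for an 8-byte copy on rv32 (registers illustrative):

    by-pieces (alternating)        riscv_expand_block_move (grouped)

        lw      a4,0(a1)               lw      a4,0(a1)
        sw      a4,0(a0)               lw      a5,4(a1)
        lw      a5,4(a1)               sw      a4,0(a0)
        sw      a5,4(a0)               sw      a5,4(a0)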

Signed-off-by: Christoph Müllner 

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cpymem-32-ooo.c: New test.
* gcc.target/riscv/cpymem-32.c: New test.
* gcc.target/riscv/cpymem-64-ooo.c: New test.
* gcc.target/riscv/cpymem-64.c: New test.

(cherry picked from commit 00029408387e9cc64e135324c22d15cd5a70e946)

Diff:
---
 gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c | 131 +++
 gcc/testsuite/gcc.target/riscv/cpymem-32.c | 138 +
 gcc/testsuite/gcc.target/riscv/cpymem-64-ooo.c | 129 +++
 gcc/testsuite/gcc.target/riscv/cpymem-64.c | 138 +
 4 files changed, 536 insertions(+)

diff --git a/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c 
b/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c
new file mode 100644
index ..33fb9891d823
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c
@@ -0,0 +1,131 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv32 } */
+/* { dg-options "-march=rv32gc -mabi=ilp32d -mtune=generic-ooo" } */
+/* { dg-skip-if "" { *-*-* } {"-O0" "-Os" "-Og" "-Oz" "-flto" } } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-allow-blank-lines-in-output 1 } */
+
+#define COPY_N(N)  \
+void copy_##N (void *to, void *from)   \
+{  \
+  __builtin_memcpy (to, from, N);  \
+}
+
+#define COPY_ALIGNED_N(N)  \
+void copy_aligned_##N (void *to, void *from)   \
+{  \
+  to = __builtin_assume_aligned(to, sizeof(long)); \
+  from = __builtin_assume_aligned(from, sizeof(long)); \
+  __builtin_memcpy (to, from, N);  \
+}
+
+/*
+**copy_7:
+**...
+**lw\t[at][0-9],0\([at][0-9]\)
+**sw\t[at][0-9],0\([at][0-9]\)
+**...
+**lbu\t[at][0-9],6\([at][0-9]\)
+**sb\t[at][0-9],6\([at][0-9]\)
+**...
+*/
+COPY_N(7)
+
+/*
+**copy_aligned_7:
+**...
+**lw\t[at][0-9],0\([at][0-9]\)
+**sw\t[at][0-9],0\([at][0-9]\)
+**...
+**lbu\t[at][0-9],6\([at][0-9]\)
+**sb\t[at][0-9],6\([at][0-9]\)
+**...
+*/
+COPY_ALIGNED_N(7)
+
+/*
+**copy_8:
+**...
+**lw\ta[0-9],0\(a[0-9]\)
+**sw\ta[0-9],0\(a[0-9]\)
+**...
+*/
+COPY_N(8)
+
+/*
+**copy_aligned_8:
+**...
+**lw\ta[0-9],0\(a[0-9]\)
+**sw\ta[0-9],0\(a[0-9]\)
+**...
+*/
+COPY_ALIGNED_N(8)
+
+/*
+**copy_11:
+**...
+**lbu\t[at][0-9],0\([at][0-9]\)
+**...
+**lbu\t[at][0-9],10\([at][0-9]\)
+**...
+**sb\t[at][0-9],0\([at][0-9]\)
+**...
+**sb\t[at][0-9],10\([at][0-9]\)
+**...
+*/
+COPY_N(11)
+
+/*
+**copy_aligned_11:
+**...
+**lw\t[at][0-9],0\([at][0-9]\)
+**...
+**sw\t[at][0-9],0\([at][0-9]\)
+**...
+**lbu\t[at][0-9],10\([at][0-9]\)
+**sb\t[at][0-9],10\([at][0-9]\)
+**...
+*/
+COPY_ALIGNED_N(11)
+
+/*
+**copy_15:
+**...
+**(call|tail)\tmemcpy
+**...
+*/
+COPY_N(15)
+
+/*
+**copy_aligned_15:
+**...
+**lw\t[at][0-9],0\([at][0-9]\)
+**...
+**sw\t[at][0-9],0\([at][0-9]\)
+**...
+**lbu\t[at][0-9],14\([at][0-9]\)
+**sb\t[at][0-9],14\([at][0-9]\)
+**...
+*/
+COPY_ALIGNED_N(15)
+
+/*
+**copy_27:
+**...
+**(call|tail)\tmemcpy
+**...
+*/
+COPY_N(27)
+
+/*
+**copy_aligned_27:
+**...
+**lw\t[at][0-9],20\([at][0-9]\)
+**...
+**sw\t[at][0-9],20\([at][0-9]\)
+**...
+**lbu\t[at][0-9],26\([at][0-9]\)
+**sb\t[at][0-9],26\([at][0-9]\)
+**...
+*/
+COPY_ALIGNED_N(27)
diff --git a/gcc/testsuite/gcc.target/riscv/cpymem-32.c 
b/gcc/testsuite/gcc.target/riscv/cpymem-32.c
new file mode 100644
index ..44ba14a1d51f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cpymem-32.c
@@ -0,0 +1,138 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv32 } */
+/* { dg-options "-march=rv32gc -mabi=ilp32d -mtune=rocket" } */
+/* { dg-skip-if "" { *-*-* } {"-O0" "-Os" "-Og" "-Oz" "-flto" } } */
+/* { dg-final { 

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Allow unaligned accesses in cpymemsi expansion

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:69408db9b2b3ede055f4392f9d30be33804eec77

commit 69408db9b2b3ede055f4392f9d30be33804eec77
Author: Christoph Müllner 
Date:   Wed May 1 18:50:38 2024 +0200

RISC-V: Allow unaligned accesses in cpymemsi expansion

The RISC-V cpymemsi expansion is called whenever the by-pieces
infrastructure will not take care of the builtin expansion.
The by-pieces infrastructure may emit code that includes
unaligned accesses if riscv_slow_unaligned_access_p
is false.

The RISC-V cpymemsi expansion is handled via riscv_expand_block_move().
The current implementation of this function does not check
riscv_slow_unaligned_access_p and never emits unaligned accesses.

Since by-pieces emits unaligned accesses, it is reasonable to implement
the same behaviour in the cpymemsi expansion. And that's what this patch
is doing.

The patch checks riscv_slow_unaligned_access_p at the entry and sets
the allowed alignment accordingly. This alignment is then propagated
down to the routines that emit the actual instructions.

The changes introduced by this patch can be seen in the adjustments
of the cpymem tests.
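
A sketch of the entry-point logic (simplified; the exact bound used for
fast-unaligned targets is an assumption):

    unsigned HOST_WIDE_INT align = MIN (MEM_ALIGN (src), MEM_ALIGN (dest));

    /* If unaligned accesses are fast, treat the data as word aligned
       so the emitted loads/stores may be unaligned.  */
    if (!riscv_slow_unaligned_access_p)
      align = BITS_PER_WORD;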

gcc/ChangeLog:

* config/riscv/riscv-string.cc (riscv_block_move_straight): Add
parameter align.
(riscv_adjust_block_mem): Replace parameter length by align.
(riscv_block_move_loop): Add parameter align.
(riscv_expand_block_move_scalar): Set alignment properly if the
target has fast unaligned access.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cpymem-32-ooo.c: Adjust for unaligned access.
* gcc.target/riscv/cpymem-64-ooo.c: Likewise.

Signed-off-by: Christoph Müllner 
(cherry picked from commit 04cd8ccaec90405ccf7471252c0e06ba7f5437dc)

Diff:
---
 gcc/config/riscv/riscv-string.cc   | 54 --
 gcc/testsuite/gcc.target/riscv/cpymem-32-ooo.c | 20 +++---
 gcc/testsuite/gcc.target/riscv/cpymem-64-ooo.c | 14 ++-
 3 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index b515f44d17ae..b6cd70323563 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -617,11 +617,13 @@ riscv_expand_strlen (rtx result, rtx src, rtx 
search_char, rtx align)
   return false;
 }
 
-/* Emit straight-line code to move LENGTH bytes from SRC to DEST.
+/* Emit straight-line code to move LENGTH bytes from SRC to DEST
+   with accesses that are ALIGN bytes aligned.
Assume that the areas do not overlap.  */
 
 static void
-riscv_block_move_straight (rtx dest, rtx src, unsigned HOST_WIDE_INT length)
+riscv_block_move_straight (rtx dest, rtx src, unsigned HOST_WIDE_INT length,
+  unsigned HOST_WIDE_INT align)
 {
   unsigned HOST_WIDE_INT offset, delta;
   unsigned HOST_WIDE_INT bits;
@@ -629,8 +631,7 @@ riscv_block_move_straight (rtx dest, rtx src, unsigned 
HOST_WIDE_INT length)
   enum machine_mode mode;
   rtx *regs;
 
-  bits = MAX (BITS_PER_UNIT,
- MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest;
+  bits = MAX (BITS_PER_UNIT, MIN (BITS_PER_WORD, align));
 
   mode = mode_for_size (bits, MODE_INT, 0).require ();
   delta = bits / BITS_PER_UNIT;
@@ -655,21 +656,20 @@ riscv_block_move_straight (rtx dest, rtx src, unsigned 
HOST_WIDE_INT length)
 {
   src = adjust_address (src, BLKmode, offset);
   dest = adjust_address (dest, BLKmode, offset);
-  move_by_pieces (dest, src, length - offset,
- MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), RETURN_BEGIN);
+  move_by_pieces (dest, src, length - offset, align, RETURN_BEGIN);
 }
 }
 
 /* Helper function for doing a loop-based block operation on memory
-   reference MEM.  Each iteration of the loop will operate on LENGTH
-   bytes of MEM.
+   reference MEM.
 
Create a new base register for use within the loop and point it to
the start of MEM.  Create a new memory reference that uses this
-   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.  */
+   register and has an alignment of ALIGN.  Store them in *LOOP_REG
+   and *LOOP_MEM respectively.  */
 
 static void
-riscv_adjust_block_mem (rtx mem, unsigned HOST_WIDE_INT length,
+riscv_adjust_block_mem (rtx mem, unsigned HOST_WIDE_INT align,
rtx *loop_reg, rtx *loop_mem)
 {
   *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
@@ -677,15 +677,17 @@ riscv_adjust_block_mem (rtx mem, unsigned HOST_WIDE_INT 
length,
   /* Although the new mem does not refer to a known location,
  it does keep up to LENGTH bytes of alignment.  */
   *loop_mem = change_address (mem, BLKmode, *loop_reg);
-  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
+  set_mem_align (*loop_mem, align);
 }
 
 /* Move LENGTH bytes from SRC 

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [committed] Fix rv32 issues with recent zicboz work

2024-05-15 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:75a06302ef660397001d67afc1fb4d22e6da5870

commit 75a06302ef660397001d67afc1fb4d22e6da5870
Author: Jeff Law 
Date:   Tue May 14 22:50:15 2024 -0600

[committed] Fix rv32 issues with recent zicboz work

I should have double-checked the CI system before pushing Christoph's 
patches
for memset-zero.  While I thought I'd checked CI state, I must have been
looking at the wrong patch from Christoph.

Anyway, this fixes the rv32 ICEs and disables one of the tests for rv32.

The test would need a revamp for rv32 as the expected output is all rv64 
code
using "sd" instructions.  I'm just not vested deeply enough into rv32 to 
adjust
the test to work in that environment though it should be fairly trivial to 
copy
the test and provide new expected output if someone cares enough.

Verified this fixes the rv32 failures in my tester:
> New tests that FAIL (6 tests):
>
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1  
(internal compiler error: in extract_insn, at recog.cc:2812)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1  
(test for excess errors)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2  
(internal compiler error: in extract_insn, at recog.cc:2812)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2  
(test for excess errors)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
(internal compiler error: in extract_insn, at recog.cc:2812)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
(test for excess errors)

And after the ICE is fixed, these are eliminated by only running the test 
for
rv64:

> New tests that FAIL (3 tests):
>
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1   
check-function-bodies clear_buf_123
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2   
check-function-bodies clear_buf_123
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
 check-function-bodies clear_buf_123

gcc/
* config/riscv/riscv-string.cc
(riscv_expand_block_clear_zicboz_zic64b): Handle rv32 correctly.

gcc/testsuite

* gcc.target/riscv/cmo-zicboz-zic64-1.c: Don't run on rv32.

(cherry picked from commit e410ad74e5e4589aeb666aa298b2f933e7b5d9e7)

Diff:
---
 gcc/config/riscv/riscv-string.cc| 5 -
 gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c | 3 +--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 87f5fdee3c14..b515f44d17ae 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -827,7 +827,10 @@ riscv_expand_block_clear_zicboz_zic64b (rtx dest, rtx 
length)
 {
   rtx mem = adjust_address (dest, BLKmode, offset);
   rtx addr = force_reg (Pmode, XEXP (mem, 0));
-  emit_insn (gen_riscv_zero_di (addr));
+  if (TARGET_64BIT)
+   emit_insn (gen_riscv_zero_di (addr));
+  else
+   emit_insn (gen_riscv_zero_si (addr));
   offset += cbo_bytes;
 }
 
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
index c2d79eb7ae68..6d4535287d08 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
@@ -1,6 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=rv64gc_zic64b_zicboz" { target { rv64 } } } */
-/* { dg-options "-march=rv32gc_zic64b_zicboz" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc_zic64b_zicboz -mabi=lp64d" } */
 /* { dg-skip-if "" { *-*-* } {"-O0" "-Os" "-Og" "-Oz" "-flto" } } */
 /* { dg-final { check-function-bodies "**" "" } } */
 /* { dg-allow-blank-lines-in-output 1 } */


[to-be-committed][RISC-V] Improve some shift-add sequences

2024-05-15 Thread Jeff Law


So this is a minor fix/improvement for shift-add sequences.  This was 
supposed to help xz in a minor way IIRC.


Combine may present us with (x << C1) + C2 which was canonicalized from 
(x + C2') << C1.


Depending on the precise values of C2 and C2' one form may be better 
than the other.  We can (somewhat awkwardly) use riscv_const_insns to 
test for which sequence would be preferred.
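
The identity being exploited, with concrete values taken from the new
shift-add-1.c test below (C1 = 10, C2' = -0xD800):

    (x + C2') << C1  ==  (x << C1) + (C2' << C1)  ==  (x << C1) + C2

    (x - 0xD800) << 10  ==  (x << 10) - 0x3600000

Which form wins depends on whether C2 or C2' is cheaper to synthesize.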


Tested on Ventana's CI system as well as my own.  Waiting on CI results 
from Rivos's tester before moving forward.


Jeff




gcc/
* config/riscv/riscv.md: Add new patterns to allow selection
between (x << C1) + C2 vs (x + C2') << C1 depending on the
cost C2 vs C2'.

gcc/testsuite

* gcc.target/riscv/shift-add-1.c: New test.

commit 03933cf8813b28587ceb7f6f66ac03d08c5de58b
Author: Jeff Law 
Date:   Thu Apr 4 13:35:54 2024 -0600

Optimize (x << C1) + C2 after canonicalization of ((x + C2') << C1).

C2 may have a lower cost to synthesize than C2'.  Reassociate to take
advantage of that.

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index ffb09a4109d..69c80bc4a86 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -4416,6 +4416,62 @@ (define_insn_and_split ""
   "{ operands[6] = gen_lowpart (SImode, operands[5]); }"
   [(set_attr "type" "arith")])
 
+;; These are forms of (x << C1) + C2, potentially canonicalized from
+;; ((x + C2') << C1).  Depending on the cost to load C2 vs C2' we may
+;; want to go ahead and recognize this form as C2 may be cheaper to
+;; synthesize than C2'.
+;;
+;; It might be better to refactor riscv_const_insns a bit so that we
+;; can have an API that passes integer values around rather than
+;; constructing a lot of garbage RTL.
+;;
+;; The mvconst_internal pattern in effect requires this pattern to
+;; also be a define_insn_and_split due to insn count costing when
+;; splitting in combine.
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (plus:DI (ashift:DI (match_operand:DI 1 "register_operand" "r")
+   (match_operand 2 "const_int_operand" "n"))
+(match_operand 3 "const_int_operand" "n")))
+   (clobber (match_scratch:DI 4 "=&r"))]
+  "(TARGET_64BIT
+    && riscv_const_insns (operands[3])
+    && ((riscv_const_insns (operands[3])
+        < riscv_const_insns (GEN_INT (INTVAL (operands[3]) >> INTVAL (operands[2]))))
+       || riscv_const_insns (GEN_INT (INTVAL (operands[3]) >> INTVAL (operands[2]))) == 0))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (ashift:DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 4) (match_dup 3))
+   (set (match_dup 0) (plus:DI (match_dup 0) (match_dup 4)))]
+  ""
+  [(set_attr "type" "arith")])
+
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (sign_extend:DI (plus:SI (ashift:SI
+  (match_operand:SI 1 "register_operand" "r")
+  (match_operand 2 "const_int_operand" "n"))
+                                   (match_operand 3 "const_int_operand" "n"))))
+   (clobber (match_scratch:DI 4 "=&r"))]
+  "(TARGET_64BIT
+    && riscv_const_insns (operands[3])
+    && ((riscv_const_insns (operands[3])
+        < riscv_const_insns (GEN_INT (INTVAL (operands[3]) >> INTVAL (operands[2]))))
+       || riscv_const_insns (GEN_INT (INTVAL (operands[3]) >> INTVAL (operands[2]))) == 0))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (ashift:DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 4) (match_dup 3))
+   (set (match_dup 0) (sign_extend:DI (plus:SI (match_dup 5) (match_dup 6))))]
+  "{
+ operands[1] = gen_lowpart (DImode, operands[1]);
+ operands[5] = gen_lowpart (SImode, operands[0]);
+ operands[6] = gen_lowpart (SImode, operands[4]);
+   }"
+  [(set_attr "type" "arith")])
+
+
 (include "bitmanip.md")
 (include "crypto.md")
 (include "sync.md")
diff --git a/gcc/testsuite/gcc.target/riscv/shift-add-1.c 
b/gcc/testsuite/gcc.target/riscv/shift-add-1.c
new file mode 100644
index 000..d98875c3271
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/shift-add-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+int composeFromSurrogate(const unsigned short high) {
+
+return  ((high - 0xD800) << 10) ;
+}
+
+
+long composeFromSurrogate_2(const unsigned long high) {
+
+return  ((high - 0xD800) << 10) ;
+}
+
+
+/* { dg-final { scan-assembler-times "\tli\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tslli\t" 2 } } */
+/* { dg-final { scan-assembler-times "\taddw\t" 1 } } */
+/* { dg-final { scan-assembler-times "\tadd\t" 1 } } */
+


Re: [PATCH] RISC-V: Fix cbo.zero expansion for rv32

2024-05-15 Thread Jeff Law




On 5/15/24 12:48 AM, Christoph Müllner wrote:

Emitting a DI pattern won't find a match for rv32 and manifests in
the failing test case gcc.target/riscv/cmo-zicboz-zic64-1.c.
Let's fix this in the expansion and also address the different
code that gets generated for rv32/rv64.

gcc/ChangeLog:

* config/riscv/riscv-string.cc (riscv_expand_block_clear_zicboz_zic64b):
Fix expansion for rv32.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmo-zicboz-zic64-1.c: Fix for rv32.
The exact change I made yesterday for the code generator.  Glad to see I 
didn't muck it up :-)  And thanks for fixing the test to have some 
coverage on rv32.


Jeff



Re: [PATCH] RISC-V: Test cbo.zero expansion for rv32

2024-05-15 Thread Jeff Law




On 5/15/24 1:28 AM, Christoph Müllner wrote:

We had an issue when expanding via cmo-zero for RV32.
This was fixed upstream, but we don't have a RV32 test.
Therefore, this patch introduces such a test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmo-zicboz-zic64-1.c: Fix for rv32.

OK.  Thanks!

jeff



[committed] Fix rv32 issues with recent zicboz work

2024-05-14 Thread Jeff Law
I should have double-checked the CI system before pushing Christoph's 
patches for memset-zero.  While I thought I'd checked CI state, I must 
have been looking at the wrong patch from Christoph.


Anyway, this fixes the rv32 ICEs and disables one of the tests for rv32.

The test would need a revamp for rv32 as the expected output is all rv64 
code using "sd" instructions.  I'm just not vested deeply enough into 
rv32 to adjust the test to work in that environment though it should be 
fairly trivial to copy the test and provide new expected output if 
someone cares enough.





Verified this fixes the rv32 failures in my tester:

New tests that FAIL (6 tests):

unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1  
(internal compiler error: in extract_insn, at recog.cc:2812)
unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1  (test 
for excess errors)
unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2  
(internal compiler error: in extract_insn, at recog.cc:2812)
unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2  (test 
for excess errors)
unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
(internal compiler error: in extract_insn, at recog.cc:2812)
unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  (test 
for excess errors)



And after the ICE is fixed, these are eliminated by only running the 
test for rv64:



New tests that FAIL (3 tests):

unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1   
check-function-bodies clear_buf_123
unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2   
check-function-bodies clear_buf_123
unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g   
check-function-bodies clear_buf_123


Pushed to the trunk.

Jeff

commit e410ad74e5e4589aeb666aa298b2f933e7b5d9e7
Author: Jeff Law 
Date:   Tue May 14 22:50:15 2024 -0600

[committed] Fix rv32 issues with recent zicboz work

I should have double-checked the CI system before pushing Christoph's 
patches
for memset-zero.  While I thought I'd checked CI state, I must have been
looking at the wrong patch from Christoph.

Anyway, this fixes the rv32 ICEs and disables one of the tests for rv32.

The test would need a revamp for rv32 as the expected output is all rv64 
code
using "sd" instructions.  I'm just not vested deeply enough into rv32 to 
adjust
the test to work in that environment though it should be fairly trivial to 
copy
the test and provide new expected output if someone cares enough.

Verified this fixes the rv32 failures in my tester:
> New tests that FAIL (6 tests):
>
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1  
(internal compiler error: in extract_insn, at recog.cc:2812)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1  
(test for excess errors)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2  
(internal compiler error: in extract_insn, at recog.cc:2812)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2  
(test for excess errors)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
(internal compiler error: in extract_insn, at recog.cc:2812)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
(test for excess errors)

And after the ICE is fixed, these are eliminated by only running the test 
for
rv64:

> New tests that FAIL (3 tests):
>
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1   
check-function-bodies clear_buf_123
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2   
check-function-bodies clear_buf_123
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
 check-function-bodies clear_buf_123

gcc/
* config/riscv/riscv-string.cc
(riscv_expand_block_clear_zicboz_zic64b): Handle rv32 correctly.

gcc/testsuite

* gcc.target/riscv/cmo-zicboz-zic64-1.c: Don't run on rv32.

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 87f5fdee3c1..b515f44d17a 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -827,7 +827,10 @@ riscv_expand_block_clear_zicboz_zic64b (rtx dest, rtx 
length)
 {
   rtx mem = adjust_address (dest, BLKmode, offset);
   rtx addr = force_reg (Pmode, XEXP (mem, 0));
-  emit_insn (gen_riscv_zero_di (addr));
+  if (TARGET_64BIT)
+   emit_insn (gen_riscv_zero_di (addr));
+  else
+   emit_insn (gen_riscv_zero_si (addr));
   offset += cbo_bytes;
 }
 
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
index c2d

[gcc r15-500] [committed] Fix rv32 issues with recent zicboz work

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:e410ad74e5e4589aeb666aa298b2f933e7b5d9e7

commit r15-500-ge410ad74e5e4589aeb666aa298b2f933e7b5d9e7
Author: Jeff Law 
Date:   Tue May 14 22:50:15 2024 -0600

[committed] Fix rv32 issues with recent zicboz work

I should have double-checked the CI system before pushing Christoph's 
patches
for memset-zero.  While I thought I'd checked CI state, I must have been
looking at the wrong patch from Christoph.

Anyway, this fixes the rv32 ICEs and disables one of the tests for rv32.

The test would need a revamp for rv32 as the expected output is all rv64 
code
using "sd" instructions.  I'm just not vested deeply enough into rv32 to 
adjust
the test to work in that environment though it should be fairly trivial to 
copy
the test and provide new expected output if someone cares enough.

Verified this fixes the rv32 failures in my tester:
> New tests that FAIL (6 tests):
>
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1  
(internal compiler error: in extract_insn, at recog.cc:2812)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1  
(test for excess errors)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2  
(internal compiler error: in extract_insn, at recog.cc:2812)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2  
(test for excess errors)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
(internal compiler error: in extract_insn, at recog.cc:2812)
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
(test for excess errors)

And after the ICE is fixed, these are eliminated by only running the test 
for
rv64:

> New tests that FAIL (3 tests):
>
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O1   
check-function-bodies clear_buf_123
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O2   
check-function-bodies clear_buf_123
> unix/-march=rv32gcv: gcc: gcc.target/riscv/cmo-zicboz-zic64-1.c   -O3 -g  
 check-function-bodies clear_buf_123

gcc/
* config/riscv/riscv-string.cc
(riscv_expand_block_clear_zicboz_zic64b): Handle rv32 correctly.

gcc/testsuite

* gcc.target/riscv/cmo-zicboz-zic64-1.c: Don't run on rv32.

Diff:
---
 gcc/config/riscv/riscv-string.cc| 5 -
 gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c | 3 +--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 87f5fdee3c14..b515f44d17ae 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -827,7 +827,10 @@ riscv_expand_block_clear_zicboz_zic64b (rtx dest, rtx 
length)
 {
   rtx mem = adjust_address (dest, BLKmode, offset);
   rtx addr = force_reg (Pmode, XEXP (mem, 0));
-  emit_insn (gen_riscv_zero_di (addr));
+  if (TARGET_64BIT)
+   emit_insn (gen_riscv_zero_di (addr));
+  else
+   emit_insn (gen_riscv_zero_si (addr));
   offset += cbo_bytes;
 }
 
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
index c2d79eb7ae68..6d4535287d08 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
@@ -1,6 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=rv64gc_zic64b_zicboz" { target { rv64 } } } */
-/* { dg-options "-march=rv32gc_zic64b_zicboz" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc_zic64b_zicboz -mabi=lp64d" } */
 /* { dg-skip-if "" { *-*-* } {"-O0" "-Os" "-Og" "-Oz" "-flto" } } */
 /* { dg-final { check-function-bodies "**" "" } } */
 /* { dg-allow-blank-lines-in-output 1 } */


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [to-be-committed, RISC-V] Remove redundant AND in shift-add sequence

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:9de32107d731fbbf15096d065bf706bb9aff94f6

commit 9de32107d731fbbf15096d065bf706bb9aff94f6
Author: Jeff Law 
Date:   Tue May 14 18:17:59 2024 -0600

[to-be-committed,RISC-V] Remove redundant AND in shift-add sequence

So this patch allows us to eliminate a redundant AND in some shift-add
style sequences.   I think the testcase was reduced from xz by the RAU
team, but I'm not highly confident of that.

Specifically the AND is masking off the upper 32 bits of the un-shifted
value and there's an outer SIGN_EXTEND from SI to DI.  However, in the
RTL it's working on the post-shifted value, so the constant is left
shifted and we have to account for that in the pattern's condition.

We can just drop the AND in this case.  So instead we do a 64bit shift,
then a sign extending ADD utilizing the low part of that 64bit shift result.
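
Concretely, with a shift count of C1 == 1 the condition accepts e.g.
C3 == 0xfffffffe (values illustrative):

    ((x << 1) & 0xfffffffe)   ; AND clears bit 0 (already zero after
                              ; the shift) and bits 32..63 (rewritten
                              ; by the SImode subreg + sign_extend)

so the AND can simply be dropped.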

This has run through Ventana's CI as well as my own.  I'll wait for it
to run through the larger CI system before pushing.

Jeff

gcc/
* config/riscv/riscv.md: Add pattern for sign extended shift-add
sequence with a masked input.

gcc/testsuite

* gcc.target/riscv/shift-add-2.c: New test.

(cherry picked from commit 32ff344d57d56fddb66c4976b5651345d40b7157)

Diff:
---
 gcc/config/riscv/riscv.md| 25 +
 gcc/testsuite/gcc.target/riscv/shift-add-2.c | 16 
 2 files changed, 41 insertions(+)

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 893040f28541..ee15c63db107 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -4120,6 +4120,31 @@
   [(set_attr "type" "load")
(set (attr "length") (const_int 8))])
 
+;; The AND is redundant here.  It always turns off the high 32 bits and the
+;; low number of bits equal to the shift count.  Those upper 32 bits will be
+;; reset by the SIGN_EXTEND at the end.
+;;
+;; One could argue combine should have realized this and simplified what it
+;; presented to the backend.  But we can obviously cope with what it gave us.
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (sign_extend:DI
+ (plus:SI (subreg:SI
+(and:DI
+  (ashift:DI (match_operand:DI 1 "register_operand" "r")
+ (match_operand 2 "const_int_operand" "n"))
+  (match_operand 3 "const_int_operand" "n")) 0)
+                   (match_operand:SI 4 "register_operand" "r"))))
+   (clobber (match_scratch:DI 5 "=&r"))]
+  "TARGET_64BIT
+   && (INTVAL (operands[3]) | ((1 << INTVAL (operands[2])) - 1)) == 0xffffffff"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 5) (ashift:DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 0) (sign_extend:DI (plus:SI (match_dup 6) (match_dup 4))))]
+  "{ operands[6] = gen_lowpart (SImode, operands[5]); }"
+  [(set_attr "type" "arith")])
+
 (include "bitmanip.md")
 (include "crypto.md")
 (include "sync.md")
diff --git a/gcc/testsuite/gcc.target/riscv/shift-add-2.c 
b/gcc/testsuite/gcc.target/riscv/shift-add-2.c
new file mode 100644
index ..87439858e59e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/shift-add-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+int sub2(int a, long long b) {
+  b = (b << 32) >> 31;
+  unsigned int x = a + b;
+  return x;
+}
+
+
+/* { dg-final { scan-assembler-times "\tslli\t" 1 } } */
+/* { dg-final { scan-assembler-times "\taddw\t" 1 } } */
+/* { dg-final { scan-assembler-not "\tsrai\t" } } */
+/* { dg-final { scan-assembler-not "\tsh.add\t" } } */
+


Re: [PATCH] RISC-V: Implement -m{,no}fence-tso

2024-05-14 Thread Jeff Law




On 5/14/24 5:13 PM, Palmer Dabbelt wrote:

Some processors from T-Head don't implement the `fence.tso` instruction
natively and instead trap to firmware.  This breaks some users who
haven't yet updated the firmware and one could imagine it breaking users
who are trying to build firmware if they're using the C memory model.

So just add an option to disable emitting it, in a similar fashion to
how we allow users to forbid other instructions.

gcc/ChangeLog:

* config/riscv/riscv.opt: Add -mno-fence-tso.
* config/riscv/sync-rvwmo.md (mem_thread_fence_rvwmo): Respect
-mno-fence-tso.
* doc/invoke.texi (RISC-V): Document -mno-fence-tso.

Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1070959
---
I've just smoke tested this one, but

 void func(void) { __atomic_thread_fence(__ATOMIC_ACQ_REL); }

generates `fence.tso` without the argument and `fence rw,rw` with
`-mno-fence-tso`, so it seems to be at least mostly there.  I figured
I'd just send it up for comments before putting together the DG bits:
it's kind of a pain to carry around these workarounds for unimplemented
instructions, but it's in HW so there's not much we can do about that.
Seems reasonable.  We might consider adding a comment in the code 
indicating this is for a particular set of thead systems.  10 years from 
now when someone else looks at the code they'll know why this is in 
there and they won't have to do the archaeology.
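
Something like this near the emission site would do (hypothetical
wording, not from the patch):

    ;; Some T-Head cores do not implement fence.tso natively and trap
    ;; to firmware instead; -mno-fence-tso lets users avoid emitting it.
    ;; See https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1070959.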


Jeff


[gcc r15-497] [to-be-committed, RISC-V] Remove redundant AND in shift-add sequence

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:32ff344d57d56fddb66c4976b5651345d40b7157

commit r15-497-g32ff344d57d56fddb66c4976b5651345d40b7157
Author: Jeff Law 
Date:   Tue May 14 18:17:59 2024 -0600

[to-be-committed,RISC-V] Remove redundant AND in shift-add sequence

So this patch allows us to eliminate a redundant AND in some shift-add
style sequences.   I think the testcase was reduced from xz by the RAU
team, but I'm not highly confident of that.

Specifically the AND is masking off the upper 32 bits of the un-shifted
value and there's an outer SIGN_EXTEND from SI to DI.  However, in the
RTL it's working on the post-shifted value, so the constant is left
shifted and we have to account for that in the pattern's condition.

We can just drop the AND in this case.  So instead we do a 64bit shift,
then a sign extending ADD utilizing the low part of that 64bit shift result.

This has run through Ventana's CI as well as my own.  I'll wait for it
to run through the larger CI system before pushing.

Jeff

gcc/
* config/riscv/riscv.md: Add pattern for sign extended shift-add
sequence with a masked input.

gcc/testsuite

* gcc.target/riscv/shift-add-2.c: New test.

Diff:
---
 gcc/config/riscv/riscv.md| 25 +
 gcc/testsuite/gcc.target/riscv/shift-add-2.c | 16 
 2 files changed, 41 insertions(+)

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 893040f28541..ee15c63db107 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -4120,6 +4120,31 @@
   [(set_attr "type" "load")
(set (attr "length") (const_int 8))])
 
+;; The AND is redundant here.  It always turns off the high 32 bits and the
+;; low number of bits equal to the shift count.  Those upper 32 bits will be
+;; reset by the SIGN_EXTEND at the end.
+;;
+;; One could argue combine should have realized this and simplified what it
+;; presented to the backend.  But we can obviously cope with what it gave us.
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (sign_extend:DI
+ (plus:SI (subreg:SI
+(and:DI
+  (ashift:DI (match_operand:DI 1 "register_operand" "r")
+ (match_operand 2 "const_int_operand" "n"))
+  (match_operand 3 "const_int_operand" "n")) 0)
+                   (match_operand:SI 4 "register_operand" "r"))))
+   (clobber (match_scratch:DI 5 "=&r"))]
+  "TARGET_64BIT
+   && (INTVAL (operands[3]) | ((1 << INTVAL (operands[2])) - 1)) == 0xffffffff"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 5) (ashift:DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 0) (sign_extend:DI (plus:SI (match_dup 6) (match_dup 4))))]
+  "{ operands[6] = gen_lowpart (SImode, operands[5]); }"
+  [(set_attr "type" "arith")])
+
 (include "bitmanip.md")
 (include "crypto.md")
 (include "sync.md")
diff --git a/gcc/testsuite/gcc.target/riscv/shift-add-2.c 
b/gcc/testsuite/gcc.target/riscv/shift-add-2.c
new file mode 100644
index ..87439858e59e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/shift-add-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+int sub2(int a, long long b) {
+  b = (b << 32) >> 31;
+  unsigned int x = a + b;
+  return x;
+}
+
+
+/* { dg-final { scan-assembler-times "\tslli\t" 1 } } */
+/* { dg-final { scan-assembler-times "\taddw\t" 1 } } */
+/* { dg-final { scan-assembler-not "\tsrai\t" } } */
+/* { dg-final { scan-assembler-not "\tsh.add\t" } } */
+


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: avoid LUI based const materialization ... [part of PR/106265]

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:de257cc78146b0e518b272de5afc3faa9bbf3669

commit de257cc78146b0e518b272de5afc3faa9bbf3669
Author: Vineet Gupta 
Date:   Mon May 13 11:45:55 2024 -0700

RISC-V: avoid LUI based const materialization ... [part of PR/106265]

... if the constant can be represented as a sum of two S12 values.
The two S12 values could instead be fused with the subsequent ADD insn.
This helps
 - avoid an additional LUI insn
 - as a side benefit, avoid clobbering a reg

e.g.
                            w/o patch            w/ patch
    long                    |                    |
    plus(unsigned long i)   |  li   a5,4096      |
    {                       |  addi a5,a5,-2032  |  addi a0, a0, 2047
      return i + 2064;      |  add  a0,a0,a5     |  addi a0, a0, 17
    }                       |  ret               |  ret

NOTE: In theory not having the const in a standalone reg might seem less
  CSE friendly, but for the workloads in consideration these
  materializations are from very late LRA reloads and follow-on GCSE is
  not doing much currently.

The real benefit however is seen in base+offset computation for array
accesses and especially for stack accesses, which are finalized late in
the optim pipeline, during LRA register allocation. Often the finalized
offsets trigger LRA reloads, resulting in mind-boggling repetition of the
exact same insn sequence, including LUI-based constant materialization.
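
A minimal sketch of such a predicate (name and exact bounds are
assumptions; each S12 addend lies in [-2048, 2047], so the sum lies in
[-4096, 4094]):

    (define_predicate "const_two_s12_operand"
      (match_code "const_int")
    {
      /* Reject values a single addi can already handle.  */
      HOST_WIDE_INT val = INTVAL (op);
      return (!SMALL_OPERAND (val) && IN_RANGE (val, -4096, 4094));
    })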

This shaves off 290 billion dynamic instructions (QEMU icounts) in the
SPEC 2017 Cactu benchmark, which is over 10% of the workload. In the rest
of the suite, an additional 10 billion are shaved, with both gains and
losses in individual workloads, as is usual with compiler changes.

 500.perlbench_r-0 |  1,214,534,029,025 | 1,212,887,959,387 |
 500.perlbench_r-1 |740,383,419,739 |   739,280,308,163 |
 500.perlbench_r-2 |692,074,638,817 |   691,118,734,547 |
 502.gcc_r-0   |190,820,141,435 |   190,857,065,988 |
 502.gcc_r-1   |225,747,660,839 |   225,809,444,357 | <- -0.02%
 502.gcc_r-2   |220,370,089,641 |   220,406,367,876 | <- -0.03%
 502.gcc_r-3   |179,111,460,458 |   179,135,609,723 | <- -0.02%
 502.gcc_r-4   |219,301,546,340 |   219,320,416,956 | <- -0.01%
 503.bwaves_r-0|278,733,324,691 |   278,733,323,575 | <- -0.01%
 503.bwaves_r-1|442,397,521,282 |   442,397,519,616 |
 503.bwaves_r-2|344,112,218,206 |   344,112,216,760 |
 503.bwaves_r-3|417,561,469,153 |   417,561,467,597 |
 505.mcf_r |669,319,257,525 |   669,318,763,084 |
 507.cactuBSSN_r   |  2,852,767,394,456 | 2,564,736,063,742 | <+ 10.10%
 508.namd_r|  1,855,884,342,110 | 1,855,881,110,934 |
 510.parest_r  |  1,654,525,521,053 | 1,654,402,859,174 |
 511.povray_r  |  2,990,146,655,619 | 2,990,060,324,589 |
 519.lbm_r |  1,158,337,294,525 | 1,158,337,294,529 |
 520.omnetpp_r |  1,021,765,791,283 | 1,026,165,661,394 |
 521.wrf_r |  1,715,955,652,503 | 1,714,352,737,385 |
 523.xalancbmk_r   |849,846,008,075 |   849,836,851,752 |
 525.x264_r-0  |277,801,762,763 |   277,488,776,427 |
 525.x264_r-1  |927,281,789,540 |   926,751,516,742 |
 525.x264_r-2  |915,352,631,375 |   914,667,785,953 |
 526.blender_r |  1,652,839,180,887 | 1,653,260,825,512 |
 527.cam4_r|  1,487,053,494,925 | 1,484,526,670,770 |
 531.deepsjeng_r   |  1,641,969,526,837 | 1,642,126,598,866 |
 538.imagick_r |  2,098,016,546,691 | 2,097,997,929,125 |
 541.leela_r   |  1,983,557,323,877 | 1,983,531,314,526 |
 544.nab_r |  1,516,061,611,233 | 1,516,061,407,715 |
 548.exchange2_r   |  2,072,594,330,215 | 2,072,591,648,318 |
 549.fotonik3d_r   |  1,001,499,307,366 | 1,001,478,944,189 |
 554.roms_r|  1,028,799,739,111 | 1,028,780,904,061 |
 557.xz_r-0|363,827,039,684 |   363,057,014,260 |
 557.xz_r-1|906,649,112,601 |   905,928,888,732 |
 557.xz_r-2|509,023,898,187 |   508,140,356,932 |
 997.specrand_fr   |402,535,577 |   403,052,561 |
 999.specrand_ir   |402,535,577 |   403,052,561 |

This should still be considered damage control as the real/deeper fix
would be to reduce the number of LRA reloads or CSE/anchor those during
LRA constraint sub-pass (re)runs (that's a different PR, 114729).

Implementation Details (for posterity)
--
 - basic idea is to have a splitter selected via a new predicate for a
   constant being a possible sum of two S12 values, and provide the
   transform.  This is however a 2 -> 2 transform, which combine can't
   handle, so we specify it using a define_insn_and_split.

 - the initial loose "i" constraint caused LRA to accept invalid insns thus
  

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [PATCH 3/3] RISC-V: Add memset-zero expansion to cbo.zero

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:f9a0426cdbd0d1e796cd0a9bcd39d31e3d2df018

commit f9a0426cdbd0d1e796cd0a9bcd39d31e3d2df018
Author: Christoph Müllner 
Date:   Tue May 14 09:21:17 2024 -0600

[PATCH 3/3] RISC-V: Add memset-zero expansion to cbo.zero

The Zicboz extension offers the cbo.zero instruction, which can be used
to zero a memory region corresponding to a cache block.
The Zic64b extension defines the cache block size to 64 byte.
If both extensions are available, it is possible to use cbo.zero
to clear memory, if the alignment and size constraints are met.
This patch implements this.
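
For example, with -march=rv64gc_zic64b_zicboz a call like the following
(illustrative source, not one of the new tests verbatim) can be expanded
to two cbo.zero instructions plus an address bump:

    void
    clear_two_blocks (void *p)
    {
      p = __builtin_assume_aligned (p, 64);  /* block-aligned, as required */
      __builtin_memset (p, 0, 128);          /* two 64-byte cache blocks */
    }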

gcc/ChangeLog:

* config/riscv/riscv-protos.h (riscv_expand_block_clear): New 
prototype.
* config/riscv/riscv-string.cc 
(riscv_expand_block_clear_zicboz_zic64b):
New function to expand a block-clear with cbo.zero.
(riscv_expand_block_clear): New RISC-V block-clear expansion 
function.
* config/riscv/riscv.md (setmem): New setmem expansion.

(cherry picked from commit 54ba8d44bbd703bca6984700b4d6f978890097e2)

Diff:
---
 gcc/config/riscv/riscv-protos.h|  1 +
 gcc/config/riscv/riscv-string.cc   | 59 ++
 gcc/config/riscv/riscv.md  | 24 +
 .../gcc.target/riscv/cmo-zicboz-zic64-1.c  | 43 
 4 files changed, 127 insertions(+)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index e5aebf3fc3d5..255fd6a0de97 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -189,6 +189,7 @@ rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
 
 /* Routines implemented in riscv-string.c.  */
 extern bool riscv_expand_block_move (rtx, rtx, rtx);
+extern bool riscv_expand_block_clear (rtx, rtx);
 
 /* Information about one CPU we know about.  */
 struct riscv_cpu_info {
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 41cb061c746d..87f5fdee3c14 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -794,6 +794,65 @@ riscv_expand_block_move (rtx dest, rtx src, rtx length)
   return false;
 }
 
+/* Expand a block-clear instruction via cbo.zero instructions.  */
+
+static bool
+riscv_expand_block_clear_zicboz_zic64b (rtx dest, rtx length)
+{
+  unsigned HOST_WIDE_INT hwi_length;
+  unsigned HOST_WIDE_INT align;
+  const unsigned HOST_WIDE_INT cbo_bytes = 64;
+
+  gcc_assert (TARGET_ZICBOZ && TARGET_ZIC64B);
+
+  if (!CONST_INT_P (length))
+return false;
+
+  hwi_length = UINTVAL (length);
+  if (hwi_length < cbo_bytes)
+return false;
+
+  align = MEM_ALIGN (dest) / BITS_PER_UNIT;
+  if (align < cbo_bytes)
+return false;
+
+  /* We don't emit loops.  Instead apply move-bytes limitation.  */
+  unsigned HOST_WIDE_INT max_bytes = RISCV_MAX_MOVE_BYTES_STRAIGHT /
+ UNITS_PER_WORD * cbo_bytes;
+  if (hwi_length > max_bytes)
+return false;
+
+  unsigned HOST_WIDE_INT offset = 0;
+  while (offset + cbo_bytes <= hwi_length)
+{
+  rtx mem = adjust_address (dest, BLKmode, offset);
+  rtx addr = force_reg (Pmode, XEXP (mem, 0));
+  emit_insn (gen_riscv_zero_di (addr));
+  offset += cbo_bytes;
+}
+
+  if (offset < hwi_length)
+{
+  rtx mem = adjust_address (dest, BLKmode, offset);
+  clear_by_pieces (mem, hwi_length - offset, align);
+}
+
+  return true;
+}
+
+bool
+riscv_expand_block_clear (rtx dest, rtx length)
+{
+  /* Only use setmem-zero expansion for Zicboz + Zic64b.  */
+  if (!TARGET_ZICBOZ || !TARGET_ZIC64B)
+return false;
+
+  if (optimize_function_for_size_p (cfun))
+return false;
+
+  return riscv_expand_block_clear_zicboz_zic64b (dest, length);
+}
+
 /* --- Vector expanders --- */
 
 namespace riscv_vector {
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 4d6de9925572..c45b1129b0a0 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2608,6 +2608,30 @@
 FAIL;
 })
 
+;; Fill memory with constant byte.
+;; Argument 0 is the destination
+;; Argument 1 is the constant byte
+;; Argument 2 is the length
+;; Argument 3 is the alignment
+
+(define_expand "setmem"
+  [(parallel [(set (match_operand:BLK 0 "memory_operand")
+  (match_operand:QI 2 "const_int_operand"))
+ (use (match_operand:P 1 ""))
+ (use (match_operand:SI 3 "const_int_operand"))])]
+ ""
+ {
+  /* If value to set is not zero, use the library routine.  */
+  if (operands[2] != const0_rtx)
+FAIL;
+
+  if (riscv_expand_block_clear (operands[0], operands[1]))
+DONE;
+  else
+FAIL;
+})
+
+
 ;; Expand in-line code to clear the instruction cache between operand[0] and
 ;; operand[1].
 (define_expand "clear_cache"
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
new file mode 100644
index 

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [PATCH 2/3] RISC-V: testsuite: Make cmo tests LTO safe

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:0db572dff53572f4c471ec588c7328a33f2cb6ab

commit 0db572dff53572f4c471ec588c7328a33f2cb6ab
Author: Christoph Müllner 
Date:   Tue May 14 09:20:18 2024 -0600

[PATCH 2/3] RISC-V: testsuite: Make cmo tests LTO safe

Let's add '\t' to the instruction match pattern to avoid false positive
matches when compiling with -flto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmo-zicbom-1.c: Add \t to test pattern.
* gcc.target/riscv/cmo-zicbom-2.c: Likewise.
* gcc.target/riscv/cmo-zicbop-1.c: Likewise.
* gcc.target/riscv/cmo-zicbop-2.c: Likewise.
* gcc.target/riscv/cmo-zicboz-1.c: Likewise.
* gcc.target/riscv/cmo-zicboz-2.c: Likewise.

(cherry picked from commit 21855f960141c1811d6a5f6ad3b2065f20d4b353)

Diff:
---
 gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c | 6 +++---
 gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c | 6 +++---
 gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c | 6 +++---
 gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c | 6 +++---
 gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c | 2 +-
 gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c | 2 +-
 6 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c
index 6341f7874d3e..02c38e201fae 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c
@@ -24,6 +24,6 @@ void foo3()
 __builtin_riscv_zicbom_cbo_inval((void*)0x111);
 }
 
-/* { dg-final { scan-assembler-times "cbo.clean" 3 } } */
-/* { dg-final { scan-assembler-times "cbo.flush" 3 } } */
-/* { dg-final { scan-assembler-times "cbo.inval" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.clean\t" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.flush\t" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.inval\t" 3 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c
index a04f106c8b0e..040b96952bc3 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c
@@ -24,6 +24,6 @@ void foo3()
 __builtin_riscv_zicbom_cbo_inval((void*)0x111);
 }
 
-/* { dg-final { scan-assembler-times "cbo.clean" 3 } } */
-/* { dg-final { scan-assembler-times "cbo.flush" 3 } } */
-/* { dg-final { scan-assembler-times "cbo.inval" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.clean\t" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.flush\t" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.inval\t" 3 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c
index c5d78c1763d3..97181154d85b 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c
@@ -18,6 +18,6 @@ int foo1()
   return __builtin_riscv_zicbop_cbo_prefetchi(1);
 }
 
-/* { dg-final { scan-assembler-times "prefetch.i" 1 } } */
-/* { dg-final { scan-assembler-times "prefetch.r" 4 } } */
-/* { dg-final { scan-assembler-times "prefetch.w" 4 } } */
+/* { dg-final { scan-assembler-times "prefetch.i\t" 1 } } */
+/* { dg-final { scan-assembler-times "prefetch.r\t" 4 } } */
+/* { dg-final { scan-assembler-times "prefetch.w\t" 4 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c
index 6576365b39ca..4871a97b21aa 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c
@@ -18,6 +18,6 @@ int foo1()
   return __builtin_riscv_zicbop_cbo_prefetchi(1);
 }
 
-/* { dg-final { scan-assembler-times "prefetch.i" 1 } } */
-/* { dg-final { scan-assembler-times "prefetch.r" 4 } } */
-/* { dg-final { scan-assembler-times "prefetch.w" 4 } } */ 
+/* { dg-final { scan-assembler-times "prefetch.i\t" 1 } } */
+/* { dg-final { scan-assembler-times "prefetch.r\t" 4 } } */
+/* { dg-final { scan-assembler-times "prefetch.w\t" 4 } } */ 
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c
index 5eb78ab94b5a..63b8782bf89e 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c
@@ -10,4 +10,4 @@ void foo1()
 __builtin_riscv_zicboz_cbo_zero((void*)0x121);
 }
 
-/* { dg-final { scan-assembler-times "cbo.zero" 3 } } */ 
+/* { dg-final { scan-assembler-times "cbo.zero\t" 3 } } */ 
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c
index fdc9c719669c..cc3bd505ec09 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c
@@ -10,4 +10,4 @@ void foo1()
 __builtin_riscv_zicboz_cbo_zero((void*)0x121);
 }
 
-/* { dg-final { scan-assembler-times "cbo.zero" 3 } } */ 
+/* { dg-final { scan-assembler-times "cbo.zero\t" 3 } } */


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [1/3] expr: Export clear_by_pieces()

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:5b00e29d1833dee69e1146f13a8d8a37dadfa31a

commit 5b00e29d1833dee69e1146f13a8d8a37dadfa31a
Author: Christoph Müllner 
Date:   Tue May 14 09:19:13 2024 -0600

[1/3] expr: Export clear_by_pieces()

Make clear_by_pieces() available to other parts of the compiler,
similar to store_by_pieces().

gcc/ChangeLog:

* expr.cc (clear_by_pieces): Remove static from clear_by_pieces.
* expr.h (clear_by_pieces): Add prototype for clear_by_pieces.

(cherry picked from commit e6e41b68fd805ab126895a20bb9670442b198f62)

Diff:
---
 gcc/expr.cc | 6 +-
 gcc/expr.h  | 5 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/gcc/expr.cc b/gcc/expr.cc
index d4414e242cb9..eaf86d3d8429 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -85,7 +85,6 @@ static void emit_block_move_via_sized_loop (rtx, rtx, rtx, 
unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, 
unsigned);
 static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
unsigned, unsigned);
-static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
 static rtx store_field (rtx, poly_int64, poly_int64, poly_uint64, poly_uint64,
@@ -1832,10 +1831,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
 return to;
 }
 
-/* Generate several move instructions to clear LEN bytes of block TO.  (A MEM
-   rtx with BLKmode).  ALIGN is maximum alignment we can assume.  */
-
-static void
+void
 clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
 {
   if (len == 0)
diff --git a/gcc/expr.h b/gcc/expr.h
index 64956f630297..751815841083 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -245,6 +245,11 @@ extern bool can_store_by_pieces (unsigned HOST_WIDE_INT,
 extern rtx store_by_pieces (rtx, unsigned HOST_WIDE_INT, by_pieces_constfn,
void *, unsigned int, bool, memop_ret);
 
+/* Generate several move instructions to clear LEN bytes of block TO.  (A MEM
+   rtx with BLKmode).  ALIGN is maximum alignment we can assume.  */
+
+extern void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
+
 /* If can_store_by_pieces passes for worst-case values near MAX_LEN, call
store_by_pieces within conditionals so as to handle variable LEN 
efficiently,
storing VAL, if non-NULL_RTX, or valc instead.  */


Re: [PATCH v2 2/2] RISC-V: avoid LUI based const mat in prologue/epilogue expansion [PR/105733]

2024-05-14 Thread Jeff Law




On 5/14/24 10:36 AM, Vineet Gupta wrote:



On 5/14/24 08:44, Jeff Law wrote:

On 5/14/24 8:51 AM, Patrick O'Neill wrote:

I was able to find the summary info:


Tests that now fail, but worked before (15 tests):
libgomp: libgomp.fortran/simd7.f90   -O0  execution test
libgomp: libgomp.fortran/task2.f90   -O0  execution test
libgomp: libgomp.fortran/vla2.f90   -O0  execution test
libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer -
funroll-loops -fpeel-loops -ftracer -finline-functions execution test
libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
libgomp: libgomp.fortran/vla4.f90   -O1  execution test
libgomp: libgomp.fortran/vla4.f90   -O2  execution test
libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer -
funroll-loops -fpeel-loops -ftracer -finline-functions execution test
libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
libgomp: libgomp.fortran/vla4.f90   -Os  execution test
libgomp: libgomp.fortran/vla5.f90   -O1  execution test
libgomp: libgomp.fortran/vla5.f90   -O2  execution test
libgomp: libgomp.fortran/vla5.f90   -O3 -fomit-frame-pointer -
funroll-loops -fpeel-loops -ftracer -finline-functions execution test
libgomp: libgomp.fortran/vla5.f90   -O3 -g  execution test
libgomp: libgomp.fortran/vla5.f90   -Os  execution test

So if you could check on those, it'd be appreciated.

I checked rv64gcv linux and those do not currently run in CI.

So just ran with Vineet's patch in our CI system.  His patch is still
triggering those regressions.  So we need to get that resolved before
that second patch can go in.


And just for reproducibility, what exact --with-arch build is this from?

This run was with "--with-arch=rv64gc_zba_zbb_zbc_zbkb_zbs_zfa_zicond"

I think we likely saw it without zbkb & zfa when we first looked at this 
a few months back.


jeff



Re: [PATCH v2 2/2] RISC-V: avoid LUI based const mat in prologue/epilogue expansion [PR/105733]

2024-05-14 Thread Jeff Law




On 5/14/24 8:51 AM, Patrick O'Neill wrote:





I was able to find the summary info:


Tests that now fail, but worked before (15 tests):
libgomp: libgomp.fortran/simd7.f90   -O0  execution test
libgomp: libgomp.fortran/task2.f90   -O0  execution test
libgomp: libgomp.fortran/vla2.f90   -O0  execution test
libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer - 
funroll-loops -fpeel-loops -ftracer -finline-functions execution test

libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
libgomp: libgomp.fortran/vla4.f90   -O1  execution test
libgomp: libgomp.fortran/vla4.f90   -O2  execution test
libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer - 
funroll-loops -fpeel-loops -ftracer -finline-functions execution test

libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
libgomp: libgomp.fortran/vla4.f90   -Os  execution test
libgomp: libgomp.fortran/vla5.f90   -O1  execution test
libgomp: libgomp.fortran/vla5.f90   -O2  execution test
libgomp: libgomp.fortran/vla5.f90   -O3 -fomit-frame-pointer - 
funroll-loops -fpeel-loops -ftracer -finline-functions execution test

libgomp: libgomp.fortran/vla5.f90   -O3 -g  execution test
libgomp: libgomp.fortran/vla5.f90   -Os  execution test


So if you could check on those, it'd be appreciated.


I checked rv64gcv linux and those do not currently run in CI.
So just ran with Vineet's patch in our CI system.  His patch is still 
triggering those regressions.  So we need to get that resolved before 
that second patch can go in.


jeff



Re: [PATCH 1/3] expr: Export clear_by_pieces()

2024-05-14 Thread Jeff Law




On 5/7/24 11:38 PM, Christoph Müllner wrote:

Make clear_by_pieces() available to other parts of the compiler,
similar to store_by_pieces().

gcc/ChangeLog:

* expr.cc (clear_by_pieces): Remove static from clear_by_pieces.
* expr.h (clear_by_pieces): Add prototype for clear_by_pieces.
I'm going to push this series.  It's fully ack'd, tested and is going to 
interact with Sergei's work on vector variants of relevant patterns.


Jeff


[gcc r15-489] [PATCH 2/3] RISC-V: testsuite: Make cmo tests LTO safe

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:21855f960141c1811d6a5f6ad3b2065f20d4b353

commit r15-489-g21855f960141c1811d6a5f6ad3b2065f20d4b353
Author: Christoph Müllner 
Date:   Tue May 14 09:20:18 2024 -0600

[PATCH 2/3] RISC-V: testsuite: Make cmo tests LTO safe

Let's add '\t' to the instruction match pattern to avoid false positive
matches when compiling with -flto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmo-zicbom-1.c: Add \t to test pattern.
* gcc.target/riscv/cmo-zicbom-2.c: Likewise.
* gcc.target/riscv/cmo-zicbop-1.c: Likewise.
* gcc.target/riscv/cmo-zicbop-2.c: Likewise.
* gcc.target/riscv/cmo-zicboz-1.c: Likewise.
* gcc.target/riscv/cmo-zicboz-2.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c | 6 +++---
 gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c | 6 +++---
 gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c | 6 +++---
 gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c | 6 +++---
 gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c | 2 +-
 gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c | 2 +-
 6 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c
index 6341f7874d3e..02c38e201fae 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicbom-1.c
@@ -24,6 +24,6 @@ void foo3()
 __builtin_riscv_zicbom_cbo_inval((void*)0x111);
 }
 
-/* { dg-final { scan-assembler-times "cbo.clean" 3 } } */
-/* { dg-final { scan-assembler-times "cbo.flush" 3 } } */
-/* { dg-final { scan-assembler-times "cbo.inval" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.clean\t" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.flush\t" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.inval\t" 3 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c
index a04f106c8b0e..040b96952bc3 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicbom-2.c
@@ -24,6 +24,6 @@ void foo3()
 __builtin_riscv_zicbom_cbo_inval((void*)0x111);
 }
 
-/* { dg-final { scan-assembler-times "cbo.clean" 3 } } */
-/* { dg-final { scan-assembler-times "cbo.flush" 3 } } */
-/* { dg-final { scan-assembler-times "cbo.inval" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.clean\t" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.flush\t" 3 } } */
+/* { dg-final { scan-assembler-times "cbo.inval\t" 3 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c
index c5d78c1763d3..97181154d85b 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicbop-1.c
@@ -18,6 +18,6 @@ int foo1()
   return __builtin_riscv_zicbop_cbo_prefetchi(1);
 }
 
-/* { dg-final { scan-assembler-times "prefetch.i" 1 } } */
-/* { dg-final { scan-assembler-times "prefetch.r" 4 } } */
-/* { dg-final { scan-assembler-times "prefetch.w" 4 } } */
+/* { dg-final { scan-assembler-times "prefetch.i\t" 1 } } */
+/* { dg-final { scan-assembler-times "prefetch.r\t" 4 } } */
+/* { dg-final { scan-assembler-times "prefetch.w\t" 4 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c
index 6576365b39ca..4871a97b21aa 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicbop-2.c
@@ -18,6 +18,6 @@ int foo1()
   return __builtin_riscv_zicbop_cbo_prefetchi(1);
 }
 
-/* { dg-final { scan-assembler-times "prefetch.i" 1 } } */
-/* { dg-final { scan-assembler-times "prefetch.r" 4 } } */
-/* { dg-final { scan-assembler-times "prefetch.w" 4 } } */ 
+/* { dg-final { scan-assembler-times "prefetch.i\t" 1 } } */
+/* { dg-final { scan-assembler-times "prefetch.r\t" 4 } } */
+/* { dg-final { scan-assembler-times "prefetch.w\t" 4 } } */ 
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c
index 5eb78ab94b5a..63b8782bf89e 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-1.c
@@ -10,4 +10,4 @@ void foo1()
 __builtin_riscv_zicboz_cbo_zero((void*)0x121);
 }
 
-/* { dg-final { scan-assembler-times "cbo.zero" 3 } } */ 
+/* { dg-final { scan-assembler-times "cbo.zero\t" 3 } } */ 
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c
index fdc9c719669c..cc3bd505ec09 100644
--- a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c
+++ b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-2.c
@@ -10,4 +10,4 @@ void foo1()
 __builtin_riscv_zicboz_cbo_zero((void*)0x121);
 }
 
-/* { dg-final { scan-assembler-times "cbo.zero" 3 } } */ 
+/* { dg-final { scan-assembler-times "cbo.zero\t" 3 } } */


[gcc r15-490] [PATCH 3/3] RISC-V: Add memset-zero expansion to cbo.zero

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:54ba8d44bbd703bca6984700b4d6f978890097e2

commit r15-490-g54ba8d44bbd703bca6984700b4d6f978890097e2
Author: Christoph Müllner 
Date:   Tue May 14 09:21:17 2024 -0600

[PATCH 3/3] RISC-V: Add memset-zero expansion to cbo.zero

The Zicboz extension offers the cbo.zero instruction, which can be used
to zero a memory region corresponding to a cache block.
The Zic64b extension defines the cache block size to be 64 bytes.
If both extensions are available, it is possible to use cbo.zero
to clear memory, provided the alignment and size constraints are met.
This patch implements this.

gcc/ChangeLog:

* config/riscv/riscv-protos.h (riscv_expand_block_clear): New 
prototype.
* config/riscv/riscv-string.cc 
(riscv_expand_block_clear_zicboz_zic64b):
New function to expand a block-clear with cbo.zero.
(riscv_expand_block_clear): New RISC-V block-clear expansion 
function.
* config/riscv/riscv.md (setmem): New setmem expansion.

Diff:
---
 gcc/config/riscv/riscv-protos.h|  1 +
 gcc/config/riscv/riscv-string.cc   | 59 ++
 gcc/config/riscv/riscv.md  | 24 +
 .../gcc.target/riscv/cmo-zicboz-zic64-1.c  | 43 
 4 files changed, 127 insertions(+)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index e5aebf3fc3d5..255fd6a0de97 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -189,6 +189,7 @@ rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
 
 /* Routines implemented in riscv-string.c.  */
 extern bool riscv_expand_block_move (rtx, rtx, rtx);
+extern bool riscv_expand_block_clear (rtx, rtx);
 
 /* Information about one CPU we know about.  */
 struct riscv_cpu_info {
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 41cb061c746d..87f5fdee3c14 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -794,6 +794,65 @@ riscv_expand_block_move (rtx dest, rtx src, rtx length)
   return false;
 }
 
+/* Expand a block-clear instruction via cbo.zero instructions.  */
+
+static bool
+riscv_expand_block_clear_zicboz_zic64b (rtx dest, rtx length)
+{
+  unsigned HOST_WIDE_INT hwi_length;
+  unsigned HOST_WIDE_INT align;
+  const unsigned HOST_WIDE_INT cbo_bytes = 64;
+
+  gcc_assert (TARGET_ZICBOZ && TARGET_ZIC64B);
+
+  if (!CONST_INT_P (length))
+return false;
+
+  hwi_length = UINTVAL (length);
+  if (hwi_length < cbo_bytes)
+return false;
+
+  align = MEM_ALIGN (dest) / BITS_PER_UNIT;
+  if (align < cbo_bytes)
+return false;
+
+  /* We don't emit loops.  Instead apply move-bytes limitation.  */
+  unsigned HOST_WIDE_INT max_bytes = RISCV_MAX_MOVE_BYTES_STRAIGHT /
+ UNITS_PER_WORD * cbo_bytes;
+  if (hwi_length > max_bytes)
+return false;
+
+  unsigned HOST_WIDE_INT offset = 0;
+  while (offset + cbo_bytes <= hwi_length)
+{
+  rtx mem = adjust_address (dest, BLKmode, offset);
+  rtx addr = force_reg (Pmode, XEXP (mem, 0));
+  emit_insn (gen_riscv_zero_di (addr));
+  offset += cbo_bytes;
+}
+
+  if (offset < hwi_length)
+{
+  rtx mem = adjust_address (dest, BLKmode, offset);
+  clear_by_pieces (mem, hwi_length - offset, align);
+}
+
+  return true;
+}
+
+bool
+riscv_expand_block_clear (rtx dest, rtx length)
+{
+  /* Only use setmem-zero expansion for Zicboz + Zic64b.  */
+  if (!TARGET_ZICBOZ || !TARGET_ZIC64B)
+return false;
+
+  if (optimize_function_for_size_p (cfun))
+return false;
+
+  return riscv_expand_block_clear_zicboz_zic64b (dest, length);
+}
+
 /* --- Vector expanders --- */
 
 namespace riscv_vector {
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 4d6de9925572..c45b1129b0a0 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2608,6 +2608,30 @@
 FAIL;
 })
 
+;; Fill memory with constant byte.
+;; Argument 0 is the destination
+;; Argument 1 is the constant byte
+;; Argument 2 is the length
+;; Argument 3 is the alignment
+
+(define_expand "setmem"
+  [(parallel [(set (match_operand:BLK 0 "memory_operand")
+  (match_operand:QI 2 "const_int_operand"))
+ (use (match_operand:P 1 ""))
+ (use (match_operand:SI 3 "const_int_operand"))])]
+ ""
+ {
+  /* If value to set is not zero, use the library routine.  */
+  if (operands[2] != const0_rtx)
+FAIL;
+
+  if (riscv_expand_block_clear (operands[0], operands[1]))
+DONE;
+  else
+FAIL;
+})
+
+
 ;; Expand in-line code to clear the instruction cache between operand[0] and
 ;; operand[1].
 (define_expand "clear_cache"
diff --git a/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c 
b/gcc/testsuite/gcc.target/riscv/cmo-zicboz-zic64-1.c
new file mode 100644
index ..c2d79eb7ae68
--- /dev/null
+++ 

[gcc r15-488] [1/3] expr: Export clear_by_pieces()

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:e6e41b68fd805ab126895a20bb9670442b198f62

commit r15-488-ge6e41b68fd805ab126895a20bb9670442b198f62
Author: Christoph Müllner 
Date:   Tue May 14 09:19:13 2024 -0600

[1/3] expr: Export clear_by_pieces()

Make clear_by_pieces() available to other parts of the compiler,
similar to store_by_pieces().

gcc/ChangeLog:

* expr.cc (clear_by_pieces): Remove static from clear_by_pieces.
* expr.h (clear_by_pieces): Add prototype for clear_by_pieces.

Diff:
---
 gcc/expr.cc | 6 +-
 gcc/expr.h  | 5 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/gcc/expr.cc b/gcc/expr.cc
index 9f66d4794459..1baa39b98eba 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -85,7 +85,6 @@ static void emit_block_move_via_sized_loop (rtx, rtx, rtx, 
unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, 
unsigned);
 static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
unsigned, unsigned);
-static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
 static rtx store_field (rtx, poly_int64, poly_int64, poly_uint64, poly_uint64,
@@ -1840,10 +1839,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
 return to;
 }
 
-/* Generate several move instructions to clear LEN bytes of block TO.  (A MEM
-   rtx with BLKmode).  ALIGN is maximum alignment we can assume.  */
-
-static void
+void
 clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
 {
   if (len == 0)
diff --git a/gcc/expr.h b/gcc/expr.h
index 64956f630297..751815841083 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -245,6 +245,11 @@ extern bool can_store_by_pieces (unsigned HOST_WIDE_INT,
 extern rtx store_by_pieces (rtx, unsigned HOST_WIDE_INT, by_pieces_constfn,
void *, unsigned int, bool, memop_ret);
 
+/* Generate several move instructions to clear LEN bytes of block TO.  (A MEM
+   rtx with BLKmode).  ALIGN is maximum alignment we can assume.  */
+
+extern void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
+
 /* If can_store_by_pieces passes for worst-case values near MAX_LEN, call
store_by_pieces within conditionals so as to handle variable LEN 
efficiently,
storing VAL, if non-NULL_RTX, or valc instead.  */


Re: [RFC][PATCH] PR tree-optimization/109071 - -Warray-bounds false positive warnings due to code duplication from jump threading

2024-05-14 Thread Jeff Law




On 5/14/24 8:57 AM, Qing Zhao wrote:




On May 13, 2024, at 20:14, Kees Cook  wrote:

On Tue, May 14, 2024 at 01:38:49AM +0200, Andrew Pinski wrote:

On Mon, May 13, 2024, 11:41 PM Kees Cook  wrote:

But it makes no sense to warn about:

void sparx5_set (int *ptr, int *val, struct nums *sg, int index)
{
  if (index >= 4)
    warn ();
  *ptr = 0;
  *val = sg->vals[index];
  if (index >= 4)
    warn ();
  *ptr = *val;
}

Because at "*val = sg->vals[index];" the actual value range tracking for
index is _still_ [INT_MIN,INT_MAX]. (Only within the "then" side of the
"if" statements is the range tracking [4,INT_MAX].)

However, in the case where jump threading has split the execution flow
and produced a copy of "*val = sg->vals[index];" where the value range
tracking for "index" is now [4,INT_MAX], is the warning valid. But it
is only for that instance. Reporting it for effectively both (there is
only 1 source line for the array indexing) is misleading because there
is nothing the user can do about it -- the compiler created the copy and
then noticed it had a range it could apply to that array index.
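
A hedged sketch (mine) of what the threaded code effectively looks
like: the second "index >= 4" test is resolved in each copy, so the
array access is duplicated and only the copy guarded by the first test
carries the [4, INT_MAX] range:

    struct nums { int vals[4]; };
    extern void warn (void);

    void sparx5_set_threaded (int *ptr, int *val, struct nums *sg, int index)
    {
      if (index >= 4)
        {
          warn ();
          *ptr = 0;
          *val = sg->vals[index];   /* index known to be in [4, INT_MAX] */
          warn ();                  /* second test known true */
        }
      else
        {
          *ptr = 0;
          *val = sg->vals[index];   /* index known to be in [INT_MIN, 3] */
        }
      *ptr = *val;
    }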



"there is nothing the user can do about it" is very much false. They could
change warn call into a noreturn function call instead.  (In the case of
the Linux kernel panic). There are things the user can do to fix the
warning and even get better code generation out of the compilers.


This isn't about warn() not being noreturn. The warn() could be any
function call; the jump threading still happens.


When the program is executed on the “if (index >= 4)” path, the value of
“index” is definitely >= 4.  When sg->vals[index] is referenced on this
path (the case when the routine “warn” is NOT noreturn), it’s definitely
an out-of-bounds array access.  So, the compiler’s warning is correct.
And this warning does catch a potential issue in the source code that
needs to be fixed by either of the following two solutions:

1. Make the routine “warn” noreturn and mark it as such;
This would be my recommendation.  We're about to execute undefined 
behavior.  I don't see a way to necessarily recover safely here, so I'd 
suggest having warn() not return and mark it appropriately.
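
A minimal sketch of that recommendation (assuming warn() really never
returns):

    __attribute__ ((noreturn)) extern void warn (void);

With that attribute the guarded out-of-bounds access becomes provably
unreachable, so the threaded-path warning disappears.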


That'll have numerous secondary benefits as well.

jeff



Re: [PATCH v2 2/2] RISC-V: avoid LUI based const mat in prologue/epilogue expansion [PR/105733]

2024-05-14 Thread Jeff Law




On 5/14/24 8:51 AM, Patrick O'Neill wrote:


On 5/13/24 20:36, Jeff Law wrote:



On 5/13/24 6:54 PM, Patrick O'Neill wrote:


On 5/13/24 13:28, Jeff Law wrote:



On 5/13/24 12:49 PM, Vineet Gupta wrote:
If the constant used for stack offset can be expressed as the sum of
two S12 values, the constant need not be materialized (in a reg) and
instead the two S12 values can be added to instructions involved with
the frame pointer.
This avoids burning a register and more importantly can often get down
to 2 insns vs. 3.

The prev patches to generally avoid LUI based const materialization
didn't fix this PR and needed this directed fix in function
prologue/epilogue expansion.

This fix doesn't move the needle for SPEC, at all, but it is still a
win considering gcc generates one insn fewer than llvm for the test ;-)


   gcc-13.1 release     | gcc 230823          |   This patch      | clang/llvm
                        | g6619b3d4c15c       |                   |
------------------------------------------------------------------------------
li      t0,-4096        | li    t0,-4096      | addi sp,sp,-2048  | addi sp,sp,-2048
addi    t0,t0,2016      | addi  t0,t0,2032    | add  sp,sp,-16    | addi sp,sp,-32
li      a4,4096         | add   sp,sp,t0      | add  a5,sp,a0     | add  a1,sp,16
add     sp,sp,t0        | addi  a5,sp,-2032   | sb   zero,0(a5)   | add  a0,a0,a1
li      a5,-4096        | add   a0,a5,a0      | addi sp,sp,2032   | sb   zero,0(a0)
addi    a4,a4,-2032     | li    t0,4096       | addi sp,sp,32     | addi sp,sp,2032
add     a4,a4,a5        | sb    zero,2032(a0) | ret               | addi sp,sp,48
addi    a5,sp,16        | addi  t0,t0,-2032   |                   | ret
add     a5,a4,a5        | add   sp,sp,t0      |                   |
add     a0,a5,a0        | ret                 |                   |
li      t0,4096         |                     |                   |
sd      a5,8(sp)        |                     |                   |
sb      zero,2032(a0)   |                     |                   |
addi    t0,t0,-2016     |                     |                   |
add     sp,sp,t0        |                     |                   |
ret                     |                     |                   |

gcc/ChangeLog:
PR target/105733
* config/riscv/riscv.h: New macros for dealing with aligned offsets.
* config/riscv/riscv.cc (riscv_split_sum_of_two_s12): New
function to split a sum of two s12 values into constituents.
(riscv_expand_prologue): Handle offset being sum of two S12.
(riscv_expand_epilogue): Ditto.
* config/riscv/riscv-protos.h (riscv_split_sum_of_two_s12): New.

gcc/testsuite/ChangeLog:
* gcc.target/riscv/pr105733.c: New Test.
* gcc.target/riscv/rvv/autovec/vls/spill-1.c: Adjust to not
expect LUI 4096.
* gcc.target/riscv/rvv/autovec/vls/spill-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-6.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-7.c: Ditto.





@@ -8074,14 +8111,26 @@ riscv_expand_epilogue (int style)
  }
    else
  {
-  if (!SMALL_OPERAND (adjust_offset.to_constant ()))
+  HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
+  if (SMALL_OPERAND (adj_off_value))
+    {
+  adjust = GEN_INT (adj_off_value);
+    }
+  else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
+    {
+  HOST_WIDE_INT base, off;
+  riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
+  insn = gen_add3_insn (stack_pointer_rtx, hard_frame_pointer_rtx,
+    GEN_INT (base));
+  RTX_FRAME_RELATED_P (insn) = 1;
+  adjust = GEN_INT (off);
+    }
So this was the hunk that we identified internally as causing 
problems with libgomp's testsuite.  We never fully chased it down as 
this hunk didn't seem terribly important performance wise -- we just 
set it aside.  The thing is it looked basically correct to me.  So 
the failure was certainly unexpected, but it was consistent.


So I think the question is whether or not the CI system runs the 
libgomp testsuite, particularly in the rv64 linux configuration. If 
it does, and it passes, then we're good. I'm still finding my way 
around the configuration, so I don't know if the CI system Edwin & 
Patrick have built tests libgomp or not.


I poked around the .sum files in pre/postcommit and we do run tests 
like:


PASS: c-c++-common/gomp/affinity-2.c  (test for errors, line 45)

I was able to find the summary info:


Tests that now fail, but worked before (15 tests):
libgomp: libgomp.fortran/simd7.f90   -O0  execution test
libgomp: libgomp.fortran/task2.f90   -O0  execution test
libgomp: libgomp.fortran/vla2.f90   -O0  execution test
libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer - 
funroll-loops -fpeel-loops -ftracer -finline-functions execution test

libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
libgomp: libgomp.fortran/vla4.f90   -O1  execution test
libgomp: libgomp.fortran/vla4.f90   -O2  execution test
libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer - 
funroll-loops -fpeel-loops -ftracer -finline-functions execution test

libgomp: libgomp.fortran/vla4.f90   -O

[to-be-committed][RISC-V] Remove redundant AND in shift-add sequence

2024-05-14 Thread Jeff Law
So this patch allows us to eliminate a redundant AND in some shift-add
style sequences.  I think the testcase was reduced from xz by the RAU
team, but I'm not highly confident of that.


Specifically the AND is masking off the upper 32 bits of the un-shifted 
value and there's an outer SIGN_EXTEND from SI to DI.  However in the 
RTL it's working on the post-shifted value, so the constant is left 
shifted, so we have to account for that in the pattern's condition.


We can just drop the AND in this case.  So instead we do a 64bit shift, 
then a sign extending ADD utilizing the low part of that 64bit shift result.
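
A hedged sketch (mine) of the expected codegen for the new test below,
where b = (b << 32) >> 31 doubles the sign-extended low half of b:

    slli  a1, a1, 1      # 64-bit shift; low 32 bits now hold (int)b << 1
    addw  a0, a1, a0     # 32-bit add of the low halves, sign-extended

The srai and the AND-based masking disappear entirely.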



This has run through Ventana's CI as well as my own.  I'll wait for it 
to run through the larger CI system before pushing.


Jeff
gcc/
* config/riscv/riscv.md: Add pattern for sign extended shift-add 
sequence with a masked input.

gcc/testsuite

* gcc.target/riscv/shift-add-2.c: New test.

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 4d6de992557..520c0f54150 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -4056,6 +4056,31 @@ (define_insn "*large_load_address"
   [(set_attr "type" "load")
(set (attr "length") (const_int 8))])
 
+;; The AND is redundant here.  It always turns off the high 32 bits and the
+;; low number of bits equal to the shift count.  Those upper 32 bits will be
+;; reset by the SIGN_EXTEND at the end.
+;;
+;; One could argue combine should have realized this and simplified what it
+;; presented to the backend.  But we can obviously cope with what it gave us.
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (sign_extend:DI
+ (plus:SI (subreg:SI
+(and:DI
+  (ashift:DI (match_operand:DI 1 "register_operand" "r")
+ (match_operand 2 "const_int_operand" "n"))
+  (match_operand 3 "const_int_operand" "n")) 0)
+  (match_operand:SI 4 "register_operand" "r"
+   (clobber (match_scratch:DI 5 "="))]
+  "TARGET_64BIT
+   && (INTVAL (operands[3]) | ((1 << INTVAL (operands[2])) - 1)) == 0x"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 5) (ashift:DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 0) (sign_extend:DI (plus:SI (match_dup 6) (match_dup 4))))]
+  "{ operands[6] = gen_lowpart (SImode, operands[5]); }"
+  [(set_attr "type" "arith")])
+
 (include "bitmanip.md")
 (include "crypto.md")
 (include "sync.md")
diff --git a/gcc/testsuite/gcc.target/riscv/shift-add-2.c 
b/gcc/testsuite/gcc.target/riscv/shift-add-2.c
new file mode 100644
index 000..87439858e59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/shift-add-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+int sub2(int a, long long b) {
+  b = (b << 32) >> 31;
+  unsigned int x = a + b;
+  return x;
+}
+
+
+/* { dg-final { scan-assembler-times "\tslli\t" 1 } } */
+/* { dg-final { scan-assembler-times "\taddw\t" 1 } } */
+/* { dg-final { scan-assembler-not "\tsrai\t" } } */
+/* { dg-final { scan-assembler-not "\tsh.add\t" } } */
+


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [to-be-committed] RISC-V Fix minor regression in synthesis WRT bseti usage

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:a39fd3b589c57f3d2860f73d255902bbdef1a51c

commit a39fd3b589c57f3d2860f73d255902bbdef1a51c
Author: Jeff Law 
Date:   Sun May 12 07:05:43 2024 -0600

[to-be-committed] RISC-V Fix minor regression in synthesis WRT bseti usage

Overnight testing showed a small number of cases where constant
synthesis was doing something dumb.  Specifically generating more
instructions than the number of bits set in the constant.

It was a minor goof in the recent bseti code.  In the code to first
figure out what bits LUI could set, I included one bit outside the
space LUI operates.  For some dumb reason I kept thinking in terms of
11 low bits belonging to addi, but it's actually 12 bits.  The net is
what we thought should be a single LUI for costing turned into
LUI+ADDI.

I didn't let the test run to completion, but over the course of 12 hours it
found 9 cases.  Given we know that the triggers all have 0x800 set, I bet we
could likely find more, but I doubt it's that critical to cover every 
possible
constant that regressed.
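
A hedged worked example (mine, not from the commit), using one of the
new test constants: for 0x640800, LUI can only set bits 12..31, so it
can produce 0x640000 but not the 0x800 bit.  With the old mask the
0x800 bit was treated as LUI-settable, so 0x640800 was costed as a
single LUI yet actually required LUI+ADDI to emit.  With the corrected
mask the synthesis is LUI (for 0x640000) followed by bseti of bit 11,
matching the costed two operations.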

gcc/
* config/riscv/riscv.cc (riscv_build_integer_1): Fix thinko in 
testing
when lui can be used to set several bits in bseti path.

gcc/testsuite

* gcc.target/riscv/synthesis-4.c: New test

(cherry picked from commit 77a28ed91b2a527b9006ee1a220b468756b43eca)

Diff:
---
 gcc/config/riscv/riscv.cc|  6 ++---
 gcc/testsuite/gcc.target/riscv/synthesis-4.c | 34 
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 9c98b1da0357..049f8f8cb9fc 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -921,12 +921,12 @@ riscv_build_integer_1 (struct riscv_integer_op 
codes[RISCV_MAX_INTEGER_OPS],
 
   /* First handle any bits set by LUI.  Be careful of the
 SImode sign bit!.  */
-  if (value & 0x7800)
+  if (value & 0x7000)
{
  alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
- alt_codes[i].value = value & 0x7800;
+ alt_codes[i].value = value & 0x7000;
  alt_codes[i].use_uw = false;
- value &= ~0x7800;
+ value &= ~0x7000;
   i++;
}
 
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-4.c 
b/gcc/testsuite/gcc.target/riscv/synthesis-4.c
new file mode 100644
index ..328a55b9e6e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-4.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } 
*/
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions. 
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times 
"\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 45 } } */
+
+
+unsigned long foo_0x640800(void) { return 0x640800UL; }
+
+unsigned long foo_0xc40800(void) { return 0xc40800UL; }
+
+unsigned long foo_0x1840800(void) { return 0x1840800UL; }
+
+unsigned long foo_0x3040800(void) { return 0x3040800UL; }
+
+unsigned long foo_0x6040800(void) { return 0x6040800UL; }
+
+unsigned long foo_0xc040800(void) { return 0xc040800UL; }
+
+unsigned long foo_0x18040800(void) { return 0x18040800UL; }
+
+unsigned long foo_0x30040800(void) { return 0x30040800UL; }
+
+unsigned long foo_0x60040800(void) { return 0x60040800UL; }


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Fix format issue for trailing operator [NFC]

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:0c1a07d6e5b30aad71798aa7c37fc80bd19b7f63

commit 0c1a07d6e5b30aad71798aa7c37fc80bd19b7f63
Author: Pan Li 
Date:   Tue May 14 09:38:55 2024 +0800

RISC-V: Fix format issue for trailing operator [NFC]

This patch would like to fix the trailing-operator format issues below.

=== ERROR type #1: trailing operator (4 error(s)) ===
gcc/config/riscv/riscv-vector-builtins.cc:4641:39:  if ((exts &
RVV_REQUIRE_ELEN_FP_16) &&
gcc/config/riscv/riscv-vector-builtins.cc:4651:39:  if ((exts &
RVV_REQUIRE_ELEN_FP_32) &&
gcc/config/riscv/riscv-vector-builtins.cc:4661:39:  if ((exts &
RVV_REQUIRE_ELEN_FP_64) &&
gcc/config/riscv/riscv-vector-builtins.cc:4670:36:  if ((exts &
RVV_REQUIRE_ELEN_64) &&

Passed ./contrib/check_GNU_style.sh for this patch, and double
checked there is no other format issue in the original patch.

Committed as format change.

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins.cc
(validate_instance_type_required_extensions): Remove the
trailing operator and put it on a new line.

Signed-off-by: Pan Li 
(cherry picked from commit b6dc8464e613d1da2b28235bbd2f9c3fd4bc386b)

Diff:
---
 gcc/config/riscv/riscv-vector-builtins.cc | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/config/riscv/riscv-vector-builtins.cc 
b/gcc/config/riscv/riscv-vector-builtins.cc
index 3fdb4400d70d..c08d87a26807 100644
--- a/gcc/config/riscv/riscv-vector-builtins.cc
+++ b/gcc/config/riscv/riscv-vector-builtins.cc
@@ -4638,8 +4638,8 @@ validate_instance_type_required_extensions (const 
rvv_type_info type,
 {
   uint64_t exts = type.required_extensions;
 
-  if ((exts & RVV_REQUIRE_ELEN_FP_16) &&
-!TARGET_VECTOR_ELEN_FP_16_P (riscv_vector_elen_flags))
+  if ((exts & RVV_REQUIRE_ELEN_FP_16)
+&& !TARGET_VECTOR_ELEN_FP_16_P (riscv_vector_elen_flags))
 {
   error_at (EXPR_LOCATION (exp),
"built-in function %qE requires the "
@@ -4648,8 +4648,8 @@ validate_instance_type_required_extensions (const 
rvv_type_info type,
   return false;
 }
 
-  if ((exts & RVV_REQUIRE_ELEN_FP_32) &&
-!TARGET_VECTOR_ELEN_FP_32_P (riscv_vector_elen_flags))
+  if ((exts & RVV_REQUIRE_ELEN_FP_32)
+&& !TARGET_VECTOR_ELEN_FP_32_P (riscv_vector_elen_flags))
 {
   error_at (EXPR_LOCATION (exp),
"built-in function %qE requires the "
@@ -4658,8 +4658,8 @@ validate_instance_type_required_extensions (const 
rvv_type_info type,
   return false;
 }
 
-  if ((exts & RVV_REQUIRE_ELEN_FP_64) &&
-!TARGET_VECTOR_ELEN_FP_64_P (riscv_vector_elen_flags))
+  if ((exts & RVV_REQUIRE_ELEN_FP_64)
+&& !TARGET_VECTOR_ELEN_FP_64_P (riscv_vector_elen_flags))
 {
   error_at (EXPR_LOCATION (exp),
"built-in function %qE requires the zve64d or v ISA extension",
@@ -4667,8 +4667,8 @@ validate_instance_type_required_extensions (const 
rvv_type_info type,
   return false;
 }
 
-  if ((exts & RVV_REQUIRE_ELEN_64) &&
-!TARGET_VECTOR_ELEN_64_P (riscv_vector_elen_flags))
+  if ((exts & RVV_REQUIRE_ELEN_64)
+&& !TARGET_VECTOR_ELEN_64_P (riscv_vector_elen_flags))
 {
   error_at (EXPR_LOCATION (exp),
"built-in function %qE requires the "


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [to-be-committed, RISC-V] Improve AND with some constants

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:c6ed1bc38b17d650ae678b7cac28ce8c2692eb09

commit c6ed1bc38b17d650ae678b7cac28ce8c2692eb09
Author: Jeff Law 
Date:   Mon May 13 17:37:46 2024 -0600

[to-be-committed,RISC-V] Improve AND with some constants

If we have an AND with a constant operand and the constant operand
requires synthesis, then we may be able to generate more efficient code
than we do now.

Essentially the need for constant synthesis gives us a budget for
alternative ways to clear bits, which zext.w can do for bits 32..63
trivially.  So if we clear bits 32..63 via zext.w, the constant for the
remaining bits to clear may be simple enough to use with andi or bclri.
That will save us an instruction.
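
A hedged example (mine, mirroring the new test below): for
x & ~(1U << 30) on a 64-bit value the mask is 0x00000000bfffffff,
whose synthesis would otherwise take multiple instructions before the
AND; instead the split emits roughly

    zext.w  a0, a0        # clear bits 32..63
    bclri   a0, a0, 30    # clear bit 30

saving an instruction over materializing the constant and ANDing.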

This has been tested in Ventana's CI system as well as my own.  I'll wait for
the upstream CI tester to report success before committing.

Jeff
gcc/
* config/riscv/bitmanip.md: Add new splitter for AND with
a constant that masks off bits 32..63 and needs synthesis.

gcc/testsuite/

* gcc.target/riscv/zba_zbs_and-1.c: New test.

(cherry picked from commit 158aa1b65ce29d5e58182782de66292c51774d71)

Diff:
---
 gcc/config/riscv/bitmanip.md   | 34 ++
 gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c | 22 +
 2 files changed, 56 insertions(+)

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index 724511b6df3b..8769a6b818b7 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -843,6 +843,40 @@
 }
 [(set_attr "type" "bitmanip")])
 
+;; If we have the ZBA extension, then we can clear the upper half of a 64
+;; bit object with a zext.w.  So if we have AND where the constant would
+;; require synthesis of two or more instructions, but 32->64 sign extension
+;; of the constant is a simm12, then we can use zext.w+andi.  If the adjusted
+;; constant is a single bit constant, then we can use zext.w+bclri
+;;
+;; With the mvconst_internal pattern claiming a single insn to synthesize
+;; constants, this must be a define_insn_and_split.
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (and:DI (match_operand:DI 1 "register_operand" "r")
+   (match_operand 2 "const_int_operand" "n")))]
+  "TARGET_64BIT
+   && TARGET_ZBA
+   && !paradoxical_subreg_p (operands[1])
+   /* Only profitable if synthesis takes more than one insn.  */
+   && riscv_const_insns (operands[2]) != 1
+   /* We need the upper half to be zero.  */
+   && (INTVAL (operands[2]) & HOST_WIDE_INT_C (0x)) == 0
+   /* And the the adjusted constant must either be something we can
+  implement with andi or bclri.  */
+   && ((SMALL_OPERAND (sext_hwi (INTVAL (operands[2]), 32))
+|| (TARGET_ZBS && popcount_hwi (INTVAL (operands[2])) == 31))
+   && INTVAL (operands[2]) != 0x7fff)"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (zero_extend:DI (match_dup 3)))
+   (set (match_dup 0) (and:DI (match_dup 0) (match_dup 2)))]
+  "{
+ operands[3] = gen_lowpart (SImode, operands[1]);
+ operands[2] = GEN_INT (sext_hwi (INTVAL (operands[2]), 32));
+   }"
+  [(set_attr "type" "bitmanip")])
+
 ;; IF_THEN_ELSE: test for 2 bits of opposite polarity
 (define_insn_and_split "*branch_mask_twobits_equals_singlebit"
   [(set (pc)
diff --git a/gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c 
b/gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c
new file mode 100644
index ..23fd769449ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+
+unsigned long long w32mem_1(unsigned long long w32)
+{
+return w32 & ~(1U << 0);
+}
+
+unsigned long long w32mem_2(unsigned long long w32)
+{
+return w32 & ~(1U << 30);
+}
+
+unsigned long long w32mem_3(unsigned long long w32)
+{
+return w32 & ~(1U << 31);
+}
+
+/* If we do synthesis, then we'd see an addi.  */
+/* { dg-final { scan-assembler-not "addi\t" } } */


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Bugfix ICE for RVV intrinsic vfw on _Float16 scalar

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:7d135c53cf480c99b6fa883569e9b8d55ed92ea5

commit 7d135c53cf480c99b6fa883569e9b8d55ed92ea5
Author: Pan Li 
Date:   Sat May 11 15:25:28 2024 +0800

RISC-V: Bugfix ICE for RVV intrinsic vfw on _Float16 scalar

For the vfw vx format RVV intrinsic, the scalar type _Float16 also
requires the zvfh extension.  Unfortunately,  we only check the
vector tree type and miss the scalar _Float16 type checking.  For
example:

vfloat32mf2_t test_vfwsub_wf_f32mf2(vfloat32mf2_t vs2, _Float16 rs1, size_t 
vl)
{
  return __riscv_vfwsub_wf_f32mf2(vs2, rs1, vl);
}

It should report an error message like "zvfh extension is required"
instead of an ICE on an unrecognizable insn.

This patch would like to add such validation for _Float16 in the RVV
intrinsic API.  It will report an error like the one below when zvfh
is not enabled.

error: built-in function '__riscv_vfwsub_wf_f32mf2(vs2,  rs1,  vl)'
  requires the zvfhmin or zvfh ISA extension

Passed the rv64gcv full regression tests, including c/c++/fortran.

PR target/114988

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins.cc
(validate_instance_type_required_extensions): New func impl to
validate the intrinsic func type ops.
(expand_builtin): Validate instance type before expand.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/pr114988-1.c: New test.
* gcc.target/riscv/rvv/base/pr114988-2.c: New test.

Signed-off-by: Pan Li 
(cherry picked from commit 41b3cf262e61aee9d26380f1c820e0eaae740f50)

Diff:
---
 gcc/config/riscv/riscv-vector-builtins.cc  | 51 ++
 .../gcc.target/riscv/rvv/base/pr114988-1.c |  9 
 .../gcc.target/riscv/rvv/base/pr114988-2.c |  9 
 3 files changed, 69 insertions(+)

diff --git a/gcc/config/riscv/riscv-vector-builtins.cc 
b/gcc/config/riscv/riscv-vector-builtins.cc
index 192a6c230d1c..3fdb4400d70d 100644
--- a/gcc/config/riscv/riscv-vector-builtins.cc
+++ b/gcc/config/riscv/riscv-vector-builtins.cc
@@ -4632,6 +4632,54 @@ gimple_fold_builtin (unsigned int code, 
gimple_stmt_iterator *gsi, gcall *stmt)
   return gimple_folder (rfn.instance, rfn.decl, gsi, stmt).fold ();
 }
 
+static bool
+validate_instance_type_required_extensions (const rvv_type_info type,
+   tree exp)
+{
+  uint64_t exts = type.required_extensions;
+
+  if ((exts & RVV_REQUIRE_ELEN_FP_16) &&
+!TARGET_VECTOR_ELEN_FP_16_P (riscv_vector_elen_flags))
+{
+  error_at (EXPR_LOCATION (exp),
+   "built-in function %qE requires the "
+   "zvfhmin or zvfh ISA extension",
+   exp);
+  return false;
+}
+
+  if ((exts & RVV_REQUIRE_ELEN_FP_32) &&
+!TARGET_VECTOR_ELEN_FP_32_P (riscv_vector_elen_flags))
+{
+  error_at (EXPR_LOCATION (exp),
+   "built-in function %qE requires the "
+   "zve32f, zve64f, zve64d or v ISA extension",
+   exp);
+  return false;
+}
+
+  if ((exts & RVV_REQUIRE_ELEN_FP_64) &&
+!TARGET_VECTOR_ELEN_FP_64_P (riscv_vector_elen_flags))
+{
+  error_at (EXPR_LOCATION (exp),
+   "built-in function %qE requires the zve64d or v ISA extension",
+   exp);
+  return false;
+}
+
+  if ((exts & RVV_REQUIRE_ELEN_64) &&
+!TARGET_VECTOR_ELEN_64_P (riscv_vector_elen_flags))
+{
+  error_at (EXPR_LOCATION (exp),
+   "built-in function %qE requires the "
+   "zve64x, zve64f, zve64d or v ISA extension",
+   exp);
+  return false;
+}
+
+  return true;
+}
+
 /* Expand a call to the RVV function with subcode CODE.  EXP is the call
expression and TARGET is the preferred location for the result.
Return the value of the lhs.  */
@@ -4649,6 +4697,9 @@ expand_builtin (unsigned int code, tree exp, rtx target)
   return target;
 }
 
+  if (!validate_instance_type_required_extensions (rfn.instance.type, exp))
+return target;
+
   return function_expander (rfn.instance, rfn.decl, exp, target).expand ();
 }
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr114988-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/pr114988-1.c
new file mode 100644
index ..b8474804c880
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr114988-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+vfloat32mf2_t test_vfwsub_wf_f32mf2(vfloat32mf2_t vs2, _Float16 rs1, size_t vl)
+{
+  return __riscv_vfwsub_wf_f32mf2(vs2, rs1, vl); /* { dg-error {built-in 
function '__riscv_vfwsub_wf_f32mf2\(vs2,  rs1,  vl\)' requires the zvfhmin or 
zvfh ISA extension} } */
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr114988-2.c 

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [to-be-committed, RISC-V] Improve single inverted bit extraction - v3

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:8be088a0f395047189e139d4c791cfc2275898b9

commit 8be088a0f395047189e139d4c791cfc2275898b9
Author: Jeff Law 
Date:   Mon May 13 07:14:08 2024 -0600

[to-be-committed,RISC-V] Improve single inverted bit extraction - v3

So this patch fixes a minor code generation inefficiency that (IIRC) the
RAU team discovered a while ago in spec.

If we want the inverted value of a single bit we can use bext to extract
the bit, then seqz to invert the value (if viewed as a 0/1 truth value).

The RTL is fairly convoluted, but it's basically a right shift to get
the bit into position, bitwise-not then masking off all but the low bit.
So it's a 3->2 combine, hidden by the fact that and-not is a
define_insn_and_split, so it actually looks like a 2->2 combine.
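
A hedged illustration (mine, matching the first new test): for
((fMap & (1 << ch)) == 0) with ch in a0 and fMap in a1, the expected
sequence is roughly

    bext  a0, a1, a0      # a0 = (fMap >> ch) & 1
    seqz  a0, a0          # invert the extracted bit as a 0/1 value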

We've run this through Ventana's internal CI (which includes
zba_zbb_zbs) and I've run it in my own tester (rv64gc, rv32gcv).  I'll
wait for the upstream CI to finish with positive results before pushing.

gcc/
* config/riscv/bitmanip.md (bextseqzdisi): New patterns.

gcc/testsuite/

* gcc.target/riscv/zbs-bext-2.c: New test.
* gcc.target/riscv/zbs-bext.c: Fix one of the possible expectes 
sequences.

(cherry picked from commit 0c585c8d0dd85601a8d116ada99126a48c8ce9fd)

Diff:
---
 gcc/config/riscv/.riscv.cc.swo  | Bin 0 -> 417792 bytes
 gcc/config/riscv/bitmanip.md|  43 
 gcc/config/riscv/j  |   0
 gcc/testsuite/gcc.target/riscv/zbs-bext-2.c |  19 
 gcc/testsuite/gcc.target/riscv/zbs-bext.c   |   2 +-
 5 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/.riscv.cc.swo b/gcc/config/riscv/.riscv.cc.swo
new file mode 100644
index ..77ed37353bee
Binary files /dev/null and b/gcc/config/riscv/.riscv.cc.swo differ
diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index d76a72d30e02..724511b6df3b 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -711,6 +711,49 @@
   "bext\t%0,%1,%2"
   [(set_attr "type" "bitmanip")])
 
+;; This is a bext followed by a seqz.  Normally this would be a 3->2 split
+;; But the and-not pattern with a constant operand is a define_insn_and_split,
+;; so this looks like a 2->2 split, which combine rejects.  So implement it
+;; as a define_insn_and_split as well.
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (and:DI
+ (not:DI
+   (subreg:DI
+ (lshiftrt:SI
+   (match_operand:SI 1 "register_operand" "r")
+   (match_operand:QI 2 "register_operand" "r")) 0))
+ (const_int 1)))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (zero_extract:DI (match_dup 1)
+(const_int 1)
+(zero_extend:DI (match_dup 2))))
+   (set (match_dup 0) (eq:DI (match_dup 0) (const_int 0)))]
+  "operands[1] = gen_lowpart (word_mode, operands[1]);"
+  [(set_attr "type" "bitmanip")])
+
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:X 0 "register_operand" "=r")
+   (and:X
+ (not:X
+   (lshiftrt:X
+ (match_operand:X 1 "register_operand" "r")
+ (match_operand:QI 2 "register_operand" "r")))
+ (const_int 1)))]
+  "TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (zero_extract:X (match_dup 1)
+   (const_int 1)
+   (zero_extend:X (match_dup 2))))
+   (set (match_dup 0) (eq:X (match_dup 0) (const_int 0)))]
+  "operands[1] = gen_lowpart (word_mode, operands[1]);"
+  [(set_attr "type" "bitmanip")])
+
 ;; When performing `(a & (1UL << bitno)) ? 0 : -1` the combiner
 ;; usually has the `bitno` typed as X-mode (i.e. no further
 ;; zero-extension is performed around the bitno).
diff --git a/gcc/config/riscv/j b/gcc/config/riscv/j
new file mode 100644
index ..e69de29bb2d1
diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c 
b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
new file mode 100644
index ..79f120b22863
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+
+_Bool match(const int ch, int fMap) {
+return ((fMap & (1<<(ch))) == 0);
+}
+
+_Bool match2(const int ch, int fMap) {
+return ((fMap & (1UL<<(ch))) == 0);
+}
+
+
+/* { dg-final { scan-assembler-times "bext\t" 2 } } */
+/* { dg-final { scan-assembler-times "seqz\t|xori\t" 2 } } */
+/* { dg-final { scan-assembler-not "sraw\t" } } */
+/* { dg-final { scan-assembler-not "not\t" } } */
+/* { dg-final { scan-assembler-not "andi\t" } } */

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [to-be-committed, RISC-V] Improve usage of slli.uw in constant synthesis

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:71edaf678fae607d7c8ce28ace9a321af711799b

commit 71edaf678fae607d7c8ce28ace9a321af711799b
Author: Jeff Law 
Date:   Sun May 12 07:12:04 2024 -0600

[to-be-committed,RISC-V] Improve usage of slli.uw in constant synthesis

And an improvement to using slli.uw...

I recently added the ability to use slli.uw in the synthesis path.  That
code was conditional on the right justified constant being a LUI_OPERAND
after sign extending from bit 31 to bit 63.

That code is working fine, but could be improved.  Specifically there's
no reason it shouldn't work for LUI+ADDI under the same circumstances.
So rather than testing that the sign-extended, right-justified constant is a
LUI_OPERAND, we can just test that the right-justified constant has
precisely 32 leading zeros.
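
As a concrete illustration (an editor-supplied sketch; the constant comes
from the new test): 0x80180001000 right-justified by its 12 trailing zero
bits is 0x80180001, which has exactly 32 leading zeros but is not a LUI
operand on its own (its low 12 bits are nonzero).  With this change the
synthesis can be:

  lui     a0,0x80180      # a0 = 0xffffffff80180000 (sign-extended)
  addi    a0,a0,1         # a0 = 0xffffffff80180001
  slli.uw a0,a0,12        # zero-extend the low 32 bits, then shift:
                          # a0 = 0x80180001 << 12 = 0x80180001000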

gcc/
* config/riscv/riscv.cc (riscv_build_integer_1): Use slli.uw more.

gcc/testsuite
* gcc.target/riscv/synthesis-5.c: New test.

(cherry picked from commit 83fb5e6f382ea99ca0e2a0afeb25a9f78909f25f)

Diff:
---
 gcc/config/riscv/riscv.cc|   9 +-
 gcc/testsuite/gcc.target/riscv/synthesis-5.c | 294 +++
 2 files changed, 299 insertions(+), 4 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 049f8f8cb9fc..a1e5a014bedf 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -819,13 +819,14 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
   & ~HOST_WIDE_INT_C (0x80000000)))))
 shift -= IMM_BITS, x <<= IMM_BITS;
 
-  /* Adjust X if it isn't a LUI operand in isolation, but we can use
-a subsequent "uw" instruction form to mask off the undesirable
-bits.  */
+  /* If X has bits 32..63 clear and bit 31 set, then go ahead and mark
+it as desiring a "uw" operation for the shift.  That way we can have
+LUI+ADDI to generate the constant, then shift it into position
+clearing out the undesirable bits.  */
   if (!LUI_OPERAND (x)
  && TARGET_64BIT
  && TARGET_ZBA
- && LUI_OPERAND (x & ~HOST_WIDE_INT_C (0x80000000UL)))
+ && clz_hwi (x) == 32)
{
  x = sext_hwi (x, 32);
  use_uw = true;
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-5.c 
b/gcc/testsuite/gcc.target/riscv/synthesis-5.c
new file mode 100644
index ..4d81565b563b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-5.c
@@ -0,0 +1,294 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 556 } } */
+
+unsigned long foo_0x80180001000(void) { return 0x80180001000UL; }
+
+unsigned long foo_0x80280001000(void) { return 0x80280001000UL; }
+
+unsigned long foo_0x80480001000(void) { return 0x80480001000UL; }
+
+unsigned long foo_0x80880001000(void) { return 0x80880001000UL; }
+
+unsigned long foo_0x81080001000(void) { return 0x81080001000UL; }
+
+unsigned long foo_0x82080001000(void) { return 0x82080001000UL; }
+
+unsigned long foo_0x84080001000(void) { return 0x84080001000UL; }
+
+unsigned long foo_0x88080001000(void) { return 0x88080001000UL; }
+
+unsigned long foo_0x90080001000(void) { return 0x90080001000UL; }
+
+unsigned long foo_0xa0080001000(void) { return 0xa0080001000UL; }
+
+unsigned long foo_0x8031000(void) { return 0x8031000UL; }
+
+unsigned long foo_0x8051000(void) { return 0x8051000UL; }
+
+unsigned long foo_0x8091000(void) { return 0x8091000UL; }
+
+unsigned long foo_0x8111000(void) { return 0x8111000UL; }
+
+unsigned long foo_0x8211000(void) { return 0x8211000UL; }
+
+unsigned long foo_0x8411000(void) { return 0x8411000UL; }
+
+unsigned long foo_0x8811000(void) { return 0x8811000UL; }
+
+unsigned long foo_0x9011000(void) { return 0x9011000UL; }
+
+unsigned long foo_0xa011000(void) { return 0xa011000UL; }
+
+unsigned long foo_0xc011000(void) { return 0xc011000UL; }
+
+unsigned long foo_

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [RISC-V] Use shNadd for constant synthesis

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:28db6a196e3251908df676d7c5e9626d29d37e5e

commit 28db6a196e3251908df676d7c5e9626d29d37e5e
Author: Jeff Law 
Date:   Fri May 10 13:49:44 2024 -0600

[RISC-V] Use shNadd for constant synthesis

So here's the next idiom to improve constant synthesis.

The basic idea here is to try and use shNadd to generate the constant when 
profitable.

Let's take 0x300000801.  Right now that generates:

li  a0,3145728
addi    a0,a0,1
slli    a0,a0,12
addi    a0,a0,-2047

But we can do better.  The constant is evenly divisible by 9, resulting in
0x55555639, which doesn't look terribly interesting.  But that constant can be
generated with two instructions, then we can use a sh3add to multiply it by 9.
So the updated sequence looks like:

li  a0,1431654400
addi    a0,a0,1593
sh3add  a0,a0,a0

This doesn't trigger a whole lot, but I haven't really set up a test to 
explore
the most likely space where this might be useful.  The tests were found
exploring a different class of constant synthesis problems.

If you were to dive into the before/after you'd see that the shNadd 
interacts
quite nicely with the recent bseti work.   The joys of recursion.

Probably the most controversial thing in here is using the "FMA" opcode to
stand in for when we want to use shNadd.  Essentially when we synthesize a
constant we generate a series of RTL opcodes and constants for emission by
another routine.   We don't really have a way to say we want a shift-add.  
But
you can think of shift-add as a limited form of multiply-accumulate.  It's a
bit of a stretch, but not crazy bad IMHO.

Other approaches would be to store our own enum rather than an RTL opcode.  
Or
store an actual generator function rather than any kind of opcode.

It wouldn't take much pushback over (ab)using FMA in this manner to get me
to use our own enums rather than RTL opcodes for this stuff.
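
To recap the worked example in one place (editor's annotation; the arithmetic
checks out: 0x55555639 * 9 == 0x300000801):

  lui    a0,0x55555       # a0 = 0x55555000 (the li above, 1431654400)
  addi   a0,a0,1593       # a0 = 0x55555639 = 0x300000801 / 9
  sh3add a0,a0,a0         # a0 = (a0 << 3) + a0 = a0 * 9 = 0x300000801

sh1add/sh2add/sh3add compute rs1 * 2/4/8 + rs2, so feeding the same register
into both operands multiplies by 3, 5 or 9 in a single instruction -- exactly
the set of divisors the synthesis code probes.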

gcc/

* config/riscv/riscv.cc (riscv_build_integer_1): Recognize cases 
where
we can use shNadd to improve constant synthesis.
(riscv_move_integer): Handle code generation for shNadd.

gcc/testsuite
* gcc.target/riscv/synthesis-1.c: Also count shNadd instructions.
* gcc.target/riscv/synthesis-3.c: New test.

(cherry picked from commit dbbd059b49edc936769737204f5c270d8d6ff553)

Diff:
---
 gcc/config/riscv/riscv.cc| 42 +++
 gcc/testsuite/gcc.target/riscv/synthesis-1.c |  2 +-
 gcc/testsuite/gcc.target/riscv/synthesis-3.c | 81 
 3 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 2860137af718..9c98b1da0357 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -880,6 +880,40 @@ riscv_build_integer_1 (struct riscv_integer_op 
codes[RISCV_MAX_INTEGER_OPS],
}
 }
 
+  if (cost > 2 && TARGET_64BIT && TARGET_ZBA)
+{
+  if ((value % 9) == 0
+ && (alt_cost
+ = riscv_build_integer_1 (alt_codes, value / 9, mode) + 1) < cost)
+   {
+  alt_codes[alt_cost - 1].code = FMA;
+  alt_codes[alt_cost - 1].value = 9;
+  alt_codes[alt_cost - 1].use_uw = false;
+  memcpy (codes, alt_codes, sizeof (alt_codes));
+  cost = alt_cost;
+   }
+  if ((value % 5) == 0
+ && (alt_cost
+ = riscv_build_integer_1 (alt_codes, value / 5, mode) + 1) < cost)
+   {
+  alt_codes[alt_cost - 1].code = FMA;
+  alt_codes[alt_cost - 1].value = 5;
+  alt_codes[alt_cost - 1].use_uw = false;
+  memcpy (codes, alt_codes, sizeof (alt_codes));
+  cost = alt_cost;
+   }
+  if ((value % 3) == 0
+ && (alt_cost
+ = riscv_build_integer_1 (alt_codes, value / 3, mode) + 1) < cost)
+   {
+  alt_codes[alt_cost - 1].code = FMA;
+  alt_codes[alt_cost - 1].value = 3;
+  alt_codes[alt_cost - 1].use_uw = false;
+  memcpy (codes, alt_codes, sizeof (alt_codes));
+  cost = alt_cost;
+   }
+}
+
   /* Final cases, particularly focused on bseti.  */
   if (cost > 2 && TARGET_ZBS)
 {
@@ -2542,6 +2576,14 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT 
value,
  x = gen_rtx_fmt_ee (AND, mode, x, GEN_INT (value));
  x = riscv_emit_set (t, x);
}
+ else if (codes[i].code == FMA)
+   {
+ HOST_WIDE_INT value = exact_log2 (codes[i].value - 1);
+ rtx ashift = gen_rtx_fmt_ee (ASHIFT, mode, x, GEN_INT (value));
+ x = gen_rtx_fmt_ee (PLUS, mode, ashift, x);
+ rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
+ x = riscv_emit_set (t, x);
+   }

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Fix typos in code or comment [NFC]

2024-05-14 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:11c50b62894f2f9487063b1b2906bb0c8981fa5d

commit 11c50b62894f2f9487063b1b2906bb0c8981fa5d
Author: Kito Cheng 
Date:   Tue May 7 10:18:58 2024 +0800

RISC-V: Fix typos in code or comment [NFC]

Just found some typos when fixing bugs and then used aspell to find a few
more; this patch doesn't do anything other than fix typos.

gcc/ChangeLog:

* config/riscv/riscv-vsetvl.cc: Fix typos in comments.
(get_all_predecessors): Ditto.
(pre_vsetvl::m_unknow_info): Rename to...
(pre_vsetvl::m_unknown_info): this.
(pre_vsetvl::compute_vsetvl_def_data): Rename m_unknow_info to
m_unknown_info.
(pre_vsetvl::cleaup): Rename to...
(pre_vsetvl::cleanup): this.
(pre_vsetvl::compute_vsetvl_def_data): Fix typos.
(pass_vsetvl::lazy_vsetvl): Update function name and fix typos.
* config/riscv/riscv.cc: Fix typos in comments.
(struct machine_function): Fix typo in comments.
(riscv_valid_lo_sum_p): Ditto.
(riscv_force_address): Ditto.
(riscv_immediate_operand_p): Ditto.
(riscv_in_small_data_p): Ditto.
(riscv_first_stack_step): Ditto.
(riscv_expand_prologue): Ditto.
(riscv_convert_vector_chunks): Ditto.
(riscv_override_options_internal): Ditto.
(get_common_costs): Ditto.

(cherry picked from commit d83070aebdb810e38f12d008e7a10acf1063f456)

Diff:
---
 gcc/config/riscv/riscv-vsetvl.cc | 64 
 gcc/config/riscv/riscv.cc| 36 +++---
 2 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 48ce757a6ee5..bbea2b5fd4f3 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -95,7 +95,7 @@ using namespace riscv_vector;
It's a bit different from bitmap_union_of_preds in cfganal.cc. This function
takes into account the case where pred is ENTRY basic block. The main reason
for this difference is to make it easier to insert some special value into
-   the ENTRY base block. For example, vsetvl_info with a status of UNKNOW.  */
+   the ENTRY base block. For example, vsetvl_info with a status of UNKNOWN.  */
 static void
 bitmap_union_of_preds_with_entry (sbitmap dst, sbitmap *src, basic_block b)
 {
@@ -126,9 +126,9 @@ bitmap_union_of_preds_with_entry (sbitmap dst, sbitmap 
*src, basic_block b)
   }
 }
 
-/* Compute the reaching defintion in and out based on the gen and KILL
-   informations in each Base Blocks.
-   This function references the compute_avaiable implementation in lcm.cc  */
+/* Compute the reaching definition in and out based on the gen and KILL
+   information's in each Base Blocks.
+   This function references the compute_available implementation in lcm.cc  */
 static void
 compute_reaching_defintion (sbitmap *gen, sbitmap *kill, sbitmap *in,
sbitmap *out)
@@ -719,7 +719,7 @@ get_all_predecessors (basic_block bb)
require SEW and LMUL to be fixed.
Therefore, if the former RVV instruction needs DEMAND_RATIO_P and the latter
instruction needs DEMAND_SEW_LMUL_P and its SEW/LMUL is the same as that of
-   the former instruction, then we can make the minimu demand of the former
+   the former instruction, then we can make the minimum demand of the former
instruction strict to DEMAND_SEW_LMUL_P, and its required SEW and LMUL are
the SEW and LMUL of the latter instruction, and the vsetvl instruction
generated according to the new demand can also be used for the latter
@@ -741,7 +741,7 @@ enum demand_flags : unsigned
 /* We split the demand information into three parts. They are sew and lmul
related (sew_lmul_demand_type), tail and mask policy related
(policy_demand_type) and avl related (avl_demand_type). Then we define three
-   interfaces avaiable_with, compatible_p and merge. avaiable_with is
+   interfaces available_p, compatible_p and merge. available_p is
used to determine whether the two vsetvl infos prev_info and next_info are
available or not. If prev_info is available for next_info, it means that the
RVV insn corresponding to next_info on the path from prev_info to next_info
@@ -1361,17 +1361,17 @@ public:
 
 /* Demand system is the RVV-based VSETVL info analysis tools wrapper.
It defines compatible rules for SEW/LMUL, POLICY and AVL.
-   Also, it provides 3 iterfaces avaiable_p, compatible_p and
+   Also, it provides 3 interfaces available_p, compatible_p and
merge for the VSETVL PASS analysis and optimization.
 
- - avaiable_p: Determine whether the next info can get the
-   avaiable VSETVL status from previous info.
+ - available_p: Determine whether the next info can get the
+   available VSETVL status from previous info.
e.g. bb 

[gcc r15-443] Revert "[PATCH v2 1/3] RISC-V: movmem for RISCV with V extension"

2024-05-13 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:04ee1f788ceaa4c7f777ff3b9441ae076191439c

commit r15-443-g04ee1f788ceaa4c7f777ff3b9441ae076191439c
Author: Jeff Law 
Date:   Mon May 13 21:42:38 2024 -0600

Revert "[PATCH v2 1/3] RISC-V: movmem for RISCV with V extension"

This reverts commit df15eb15b5f820321c81efc75f0af13ff8c0dd5b.

Diff:
---
 gcc/config/riscv/riscv.md  | 23 -
 gcc/testsuite/gcc.target/riscv/rvv/base/movmem-1.c | 59 --
 2 files changed, 82 deletions(-)

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 696d911a7e95..4d6de9925572 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2608,29 +2608,6 @@
 FAIL;
 })
 
-;; Inlining general memmove is a pessimisation as we can't avoid having to
-;; decide which direction to go at runtime, which can be costly.  Until we
-;; can benchmark implementations on real V hardware implement a conservative
-;; approach of inlining cases which can be performed with a single vector
-;; load + store.  For tiny moves, fallback to scalar.
-(define_expand "movmem"
-  [(parallel [(set (match_operand:BLK 0 "general_operand")
-  (match_operand:BLK 1 "general_operand"))
- (use (match_operand:P 2 "const_int_operand"))
- (use (match_operand:SI 3 "const_int_operand"))])]
-  "TARGET_VECTOR"
-{
-  if (CONST_INT_P (operands[2])
-  && INTVAL (operands[2]) >= TARGET_MIN_VLEN / 8
-  && INTVAL (operands[2]) <= TARGET_MIN_VLEN
-  && riscv_vector::expand_block_move (operands[0],
- operands[1],
- operands[2]))
-DONE;
-  else
-FAIL;
-})
-
 ;; Expand in-line code to clear the instruction cache between operand[0] and
 ;; operand[1].
 (define_expand "clear_cache"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/movmem-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/movmem-1.c
deleted file mode 100644
index b930241ae5d9..
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/movmem-1.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/* { dg-do compile } */
-/* { dg-add-options riscv_v } */
-/* { dg-additional-options "-O3" } */
-/* { dg-final { check-function-bodies "**" "" } } */
-
-#include 
-
-#define MIN_VECTOR_BYTES (__riscv_v_min_vlen/8)
-
-/* tiny memmoves should not be vectorised
-** f1:
-**  li\s+a2,15
-**  tail\s+memmove
-*/
-char * f1 (char *a, char const *b)
-{
-  return memmove (a, b, 15);
-}
-
-/* vectorise+inline minimum vector register width with LMUL=1
-** f2:
-**  (
-**  vsetivli\s+zero,16,e8,m1,ta,ma
-**  |
-**  li\s+[ta][0-7],\d+
-**  vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma
-**  )
-**  vle8\.v\s+v\d+,0\(a1\)
-**  vse8\.v\s+v\d+,0\(a0\)
-**  ret
-*/
-char * f2 (char *a, char const *b)
-{
-  return memmove (a, b, MIN_VECTOR_BYTES);
-}
-
-/* vectorise+inline up to LMUL=8
-** f3:
-**  li\s+[ta][0-7],\d+
-**  vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
-**  vle8\.v\s+v\d+,0\(a1\)
-**  vse8\.v\s+v\d+,0\(a0\)
-**  ret
-*/
-char * f3 (char *a, char const *b)
-{
-  return memmove (a, b, MIN_VECTOR_BYTES*8);
-}
-
-/* don't vectorise if the move is too large for one operation
-** f4:
-**  li\s+a2,\d+
-**  tail\s+memmove
-*/
-char * f4 (char *a, char const *b)
-{
-  return memmove (a, b, MIN_VECTOR_BYTES*8+1);
-}
-


Re: [PATCH v2 2/2] RISC-V: avoid LUI based const mat in prologue/epilogue expansion [PR/105733]

2024-05-13 Thread Jeff Law




On 5/13/24 6:54 PM, Patrick O'Neill wrote:


On 5/13/24 13:28, Jeff Law wrote:



On 5/13/24 12:49 PM, Vineet Gupta wrote:

If the constant used for stack offset can be expressed as sum of two S12
values, the constant need not be materialized (in a reg) and instead the
two S12 bits can be added to instructions involved with frame pointer.
This avoids burning a register and more importantly can often get down
to be 2 insn vs. 3.

The prev patches to generally avoid LUI based const materialization didn't
fix this PR and need this directed fix in function prologue/epilogue
expansion.

This fix doesn't move the needle for SPEC, at all, but it is still a
win considering gcc generates one insn fewer than llvm for the test ;-)

gcc-13.1 release     |  gcc 230823         |                   |
                     |  g6619b3d4c15c      |   This patch      |  clang/llvm
---------------------------------------------------------------------------------
li   t0,-4096        | li    t0,-4096      | addi  sp,sp,-2048 | addi sp,sp,-2048
addi t0,t0,2016      | addi  t0,t0,2032    | add   sp,sp,-16   | addi sp,sp,-32
li   a4,4096         | add   sp,sp,t0      | add   a5,sp,a0    | add  a1,sp,16
add  sp,sp,t0        | addi  a5,sp,-2032   | sb    zero,0(a5)  | add  a0,a0,a1
li   a5,-4096        | add   a0,a5,a0      | addi  sp,sp,2032  | sb   zero,0(a0)
addi a4,a4,-2032     | li    t0, 4096      | addi  sp,sp,32    | addi sp,sp,2032
add  a4,a4,a5        | sb    zero,2032(a0) | ret               | addi sp,sp,48
addi a5,sp,16        | addi  t0,t0,-2032   |                   | ret
add  a5,a4,a5        | add   sp,sp,t0      |
add  a0,a5,a0        | ret                 |
li   t0,4096         |
sd   a5,8(sp)        |
sb   zero,2032(a0)   |
addi t0,t0,-2016     |
add  sp,sp,t0        |
ret                  |

gcc/ChangeLog:
PR target/105733
* config/riscv/riscv.h: New macros for with aligned offsets.
* config/riscv/riscv.cc (riscv_split_sum_of_two_s12): New
function to split a sum of two s12 values into constituents.
(riscv_expand_prologue): Handle offset being sum of two S12.
(riscv_expand_epilogue): Ditto.
* config/riscv/riscv-protos.h (riscv_split_sum_of_two_s12): New.

gcc/testsuite/ChangeLog:
* gcc.target/riscv/pr105733.c: New Test.
* gcc.target/riscv/rvv/autovec/vls/spill-1.c: Adjust to not
expect LUI 4096.
* gcc.target/riscv/rvv/autovec/vls/spill-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-6.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-7.c: Ditto.





@@ -8074,14 +8111,26 @@ riscv_expand_epilogue (int style)
  }
    else
  {
-  if (!SMALL_OPERAND (adjust_offset.to_constant ()))
+  HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
+  if (SMALL_OPERAND (adj_off_value))
+    {
+  adjust = GEN_INT (adj_off_value);
+    }
+  else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
+    {
+  HOST_WIDE_INT base, off;
+  riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
+  insn = gen_add3_insn (stack_pointer_rtx, hard_frame_pointer_rtx,
+    GEN_INT (base));
+  RTX_FRAME_RELATED_P (insn) = 1;
+  adjust = GEN_INT (off);
+    }
So this was the hunk that we identified internally as causing problems 
with libgomp's testsuite.  We never fully chased it down as this hunk 
didn't seem terribly important performance wise -- we just set it 
aside.  The thing is it looked basically correct to me.  So the 
failure was certainly unexpected, but it was consistent.


So I think the question is whether or not the CI system runs the 
libgomp testsuite, particularly in the rv64 linux configuration. If it 
does, and it passes, then we're good.  I'm still finding my way around 
the configuration, so I don't know if the CI system Edwin & Patrick 
have built tests libgomp or not.


I poked around the .sum files in pre/postcommit and we do run tests like:

PASS: c-c++-common/gomp/affinity-2.c  (test for errors, line 45)

I was able to find the summary info:


Tests that now fail, but worked before (15 tests):
libgomp: libgomp.fortran/simd7.f90   -O0  execution test
libgomp: libgomp.fortran/task2.f90   -O0  execution test
libgomp: libgomp.fortran/vla2.f90   -O0  execution test
libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer -funroll-loops 
-fpeel-loops -ftracer -finline-functions  execution test
libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
libgomp: libgomp.fortran/vla4.f90   -O1  execution test
libgomp: libgomp.fortran/vla4.f90   -O2  execution test
libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer -funroll-loops 
-fpeel-loops -ftracer -finline-functions  execution test
libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
libgomp: libgomp.fortran/vla4.f90   -Os  execution test

[gcc r15-440] [to-be-committed, RISC-V] Improve AND with some constants

2024-05-13 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:158aa1b65ce29d5e58182782de66292c51774d71

commit r15-440-g158aa1b65ce29d5e58182782de66292c51774d71
Author: Jeff Law 
Date:   Mon May 13 17:37:46 2024 -0600

[to-be-committed,RISC-V] Improve AND with some constants

If we have an AND with a constant operand and the constant operand
requires synthesis, then we may be able to generate more efficient code
than we do now.

Essentially the need for constant synthesis gives us a budget for
alternative ways to clear bits, which zext.w can do for bits 32..63
trivially.   So if we clear 32..63  via zext.w, the constant for the
remaining bits to clear may be simple enough to use with andi or bseti.
That will save us an instruction.
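
A worked example (editor's sketch, mirroring w32mem_2 in the new test):
`w32 & ~(1U << 30)` is a DImode AND with 0x00000000bfffffff.  That constant
needs multi-instruction synthesis, but its upper half is already zero and
popcount (0xbfffffff) == 31, so the splitter can emit:

  zext.w a0,a0        # clear bits 32..63
  bclri  a0,a0,30     # clear bit 30

two instructions, with no constant synthesis at all.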

This has been tested in Ventana's CI system as well as my own.  I'll wait for
the upstream CI tester to report success before committing.

Jeff
gcc/
* config/riscv/bitmanip.md: Add new splitter for AND with
a constant that masks off bits 32..63 and needs synthesis.

gcc/testsuite/

* gcc.target/riscv/zba_zbs_and-1.c: New test.

Diff:
---
 gcc/config/riscv/bitmanip.md   | 34 ++
 gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c | 22 +
 2 files changed, 56 insertions(+)

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index 724511b6df3b..8769a6b818b7 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -843,6 +843,40 @@
 }
 [(set_attr "type" "bitmanip")])
 
+;; If we have the ZBA extension, then we can clear the upper half of a 64
+;; bit object with a zext.w.  So if we have AND where the constant would
+;; require synthesis of two or more instructions, but 32->64 sign extension
+;; of the constant is a simm12, then we can use zext.w+andi.  If the adjusted
+;; constant is a single bit constant, then we can use zext.w+bclri
+;;
+;; With the mvconst_internal pattern claiming a single insn to synthesize
+;; constants, this must be a define_insn_and_split.
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (and:DI (match_operand:DI 1 "register_operand" "r")
+   (match_operand 2 "const_int_operand" "n")))]
+  "TARGET_64BIT
+   && TARGET_ZBA
+   && !paradoxical_subreg_p (operands[1])
+   /* Only profitable if synthesis takes more than one insn.  */
+   && riscv_const_insns (operands[2]) != 1
+   /* We need the upper half to be zero.  */
+   && (INTVAL (operands[2]) & HOST_WIDE_INT_C (0xffffffff00000000)) == 0
+   /* And the adjusted constant must either be something we can
+  implement with andi or bclri.  */
+   && ((SMALL_OPERAND (sext_hwi (INTVAL (operands[2]), 32))
+|| (TARGET_ZBS && popcount_hwi (INTVAL (operands[2])) == 31))
+   && INTVAL (operands[2]) != 0x7fffffff)"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (zero_extend:DI (match_dup 3)))
+   (set (match_dup 0) (and:DI (match_dup 0) (match_dup 2)))]
+  "{
+ operands[3] = gen_lowpart (SImode, operands[1]);
+ operands[2] = GEN_INT (sext_hwi (INTVAL (operands[2]), 32));
+   }"
+  [(set_attr "type" "bitmanip")])
+
 ;; IF_THEN_ELSE: test for 2 bits of opposite polarity
 (define_insn_and_split "*branch_mask_twobits_equals_singlebit"
   [(set (pc)
diff --git a/gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c 
b/gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c
new file mode 100644
index ..23fd769449ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+
+unsigned long long w32mem_1(unsigned long long w32)
+{
+return w32 & ~(1U << 0);
+}
+
+unsigned long long w32mem_2(unsigned long long w32)
+{
+return w32 & ~(1U << 30);
+}
+
+unsigned long long w32mem_3(unsigned long long w32)
+{
+return w32 & ~(1U << 31);
+}
+
+/* If we do synthesis, then we'd see an addi.  */
+/* { dg-final { scan-assembler-not "addi\t" } } */


Re: [PATCH v2 1/3] RISC-V: movmem for RISCV with V extension

2024-05-13 Thread Jeff Law




On 12/19/23 10:28 PM, Jeff Law wrote:



On 12/19/23 02:53, Sergei Lewis wrote:

gcc/ChangeLog

 * config/riscv/riscv.md (movmem): Use 
riscv_vector::expand_block_move,
 if and only if we know the entire operation can be performed 
using one vector

 load followed by one vector store

gcc/testsuite/ChangeLog

 PR target/112109
 * gcc.target/riscv/rvv/base/movmem-1.c: New test
So this needs to be regression tested.  Given that it only affects RVV, 
I would suggest testing rv64gcv or rv32gcv.





+(define_expand "movmem"
+  [(parallel [(set (match_operand:BLK 0 "general_operand")
+   (match_operand:BLK 1 "general_operand"))
+    (use (match_operand:P 2 "const_int_operand"))
+    (use (match_operand:SI 3 "const_int_operand"))])]
+  "TARGET_VECTOR"
+{
+  if ((INTVAL (operands[2]) >= TARGET_MIN_VLEN/8)
+    && (INTVAL (operands[2]) <= TARGET_MIN_VLEN)
+    && riscv_vector::expand_block_move (operands[0], operands[1],
+ operands[2]))
+    DONE;
+  else
+    FAIL;
+})

Just a formatting nit.  A space on each side of the '/' operator above.
So I've fixed the formatting nit and tested on rv64gc and rv32gcv.  I 
hadn't planned to push it, but muscle memory kicked in and 1/3 has been 
pushed.


I'll be looking at 2/3 and 3/3 tomorrow (or possibly a bit tonight to 
take advantage of overnight CI runs).


jeff



[gcc r15-439] [PATCH v2 1/3] RISC-V: movmem for RISCV with V extension

2024-05-13 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:df15eb15b5f820321c81efc75f0af13ff8c0dd5b

commit r15-439-gdf15eb15b5f820321c81efc75f0af13ff8c0dd5b
Author: Sergei Lewis 
Date:   Mon May 13 17:32:24 2024 -0600

[PATCH v2 1/3] RISC-V: movmem for RISCV with V extension

This patchset permits generation of inlined vectorised code for movmem,
setmem and cmpmem, if and only if the operation size is
at least one and at most eight vector registers' worth of data.

Further vectorisation rapidly becomes debatable due to code size concerns;
however, for these simple cases we do have an unambiguous performance win
without sacrificing too much code size compared to a libc call.

Changes in v2:

* run clang-format over the code in addition to the
  contrib/check_GNU_style.sh that was used for v1

* remove string.h include and refer to __builtin_* memory functions
  in multilib tests

* respect stringop_strategy (don't vectorise if it doesn't include VECTOR)

* use an integer constraint for movmem length parameter

* use TARGET_MAX_LMUL unless riscv-autovec-lmul=dynamic
  to ensure we respect the user's wishes if they request specific lmul

* add new unit tests to check that riscv-autovec-lmul is respected

* PR target/112109 added to changelog for patch 1/3 as requested

Sergei Lewis (3):
  RISC-V: movmem for RISCV with V extension
  RISC-V: setmem for RISCV with V extension
  RISC-V: cmpmem for RISCV with V extension

gcc/ChangeLog

* config/riscv/riscv.md (movmem): Use 
riscv_vector::expand_block_move,
if and only if we know the entire operation can be performed using 
one vector
load followed by one vector store

gcc/testsuite/ChangeLog

PR target/112109
* gcc.target/riscv/rvv/base/movmem-1.c: New test

Diff:
---
 gcc/config/riscv/riscv.md  | 23 +
 gcc/testsuite/gcc.target/riscv/rvv/base/movmem-1.c | 59 ++
 2 files changed, 82 insertions(+)

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 4d6de9925572..696d911a7e95 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2608,6 +2608,29 @@
 FAIL;
 })
 
+;; Inlining general memmove is a pessimisation as we can't avoid having to
+;; decide which direction to go at runtime, which can be costly.  Until we
+;; can benchmark implementations on real V hardware implement a conservative
+;; approach of inlining cases which can be performed with a single vector
+;; load + store.  For tiny moves, fallback to scalar.
+(define_expand "movmem"
+  [(parallel [(set (match_operand:BLK 0 "general_operand")
+  (match_operand:BLK 1 "general_operand"))
+ (use (match_operand:P 2 "const_int_operand"))
+ (use (match_operand:SI 3 "const_int_operand"))])]
+  "TARGET_VECTOR"
+{
+  if (CONST_INT_P (operands[2])
+  && INTVAL (operands[2]) >= TARGET_MIN_VLEN / 8
+  && INTVAL (operands[2]) <= TARGET_MIN_VLEN
+  && riscv_vector::expand_block_move (operands[0],
+ operands[1],
+ operands[2]))
+DONE;
+  else
+FAIL;
+})
+
 ;; Expand in-line code to clear the instruction cache between operand[0] and
 ;; operand[1].
 (define_expand "clear_cache"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/movmem-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/movmem-1.c
new file mode 100644
index ..b930241ae5d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/movmem-1.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include 
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen/8)
+
+/* tiny memmoves should not be vectorised
+** f1:
+**  li\s+a2,15
+**  tail\s+memmove
+*/
+char * f1 (char *a, char const *b)
+{
+  return memmove (a, b, 15);
+}
+
+/* vectorise+inline minimum vector register width with LMUL=1
+** f2:
+**  (
+**  vsetivli\s+zero,16,e8,m1,ta,ma
+**  |
+**  li\s+[ta][0-7],\d+
+**  vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma
+**  )
+**  vle8\.v\s+v\d+,0\(a1\)
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+char * f2 (char *a, char const *b)
+{
+  return memmove (a, b, MIN_VECTOR_BYTES);
+}
+
+/* vectorise+inline up to LMUL=8
+** f3:
+**  li\s+[ta][0-7],\d+
+**  vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+**  vle8\.v\s+v\d+,0\(a1\)
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+char * f3 (char *a, char const *b)
+{
+  return memmove (a, b, MIN_VECTOR_BYTES*8);
+}
+
+/* don't vectorise if the move is too large for one operation
+** f4:
+**  li\s+a2,\d+
+**  tail\s+memmove
+*/
+char * f4 (char *a, char const *b)
+{
+  return memmove (a, b, MIN_VECTOR_BYTES*8+1);
+}
+


Re: Follow up #1 (was Re: [PATCH v2 1/2] RISC-V: avoid LUI based const materialization ... [part of PR/106265])

2024-05-13 Thread Jeff Law




On 5/13/24 3:13 PM, Vineet Gupta wrote:

On 5/13/24 11:49, Vineet Gupta wrote:

  500.perlbench_r-0 |  1,214,534,029,025 | 1,212,887,959,387 |
  500.perlbench_r-1 |740,383,419,739 |   739,280,308,163 |
  500.perlbench_r-2 |692,074,638,817 |   691,118,734,547 |
  502.gcc_r-0   |190,820,141,435 |   190,857,065,988 |
  502.gcc_r-1   |225,747,660,839 |   225,809,444,357 | <- -0.02%
  502.gcc_r-2   |220,370,089,641 |   220,406,367,876 | <- -0.03%
  502.gcc_r-3   |179,111,460,458 |   179,135,609,723 | <- -0.02%
  502.gcc_r-4   |219,301,546,340 |   219,320,416,956 | <- -0.01%
  503.bwaves_r-0|278,733,324,691 |   278,733,323,575 | <- -0.01%
  503.bwaves_r-1|442,397,521,282 |   442,397,519,616 |
  503.bwaves_r-2|344,112,218,206 |   344,112,216,760 |
  503.bwaves_r-3|417,561,469,153 |   417,561,467,597 |
  505.mcf_r |669,319,257,525 |   669,318,763,084 |
  507.cactuBSSN_r   |  2,852,767,394,456 | 2,564,736,063,742 | <+ 10.10%


The small gcc regression seems like a tooling issue of some sort.
Looking at the topblocks, the insn sequences are exactly the same, only
the counts differ and it's not obvious why.
Here's for gcc_r-1.


 > Block 0 @ 0x170ca, 12 insns, 87854493 times, 0.47%:

 000170ca :
    170ca:    7179        add    sp,sp,-48
    170cc:    ec26        sd    s1,24(sp)
    170ce:    e84a        sd    s2,16(sp)
    170d0:    e44e        sd    s3,8(sp)
    170d2:    f406        sd    ra,40(sp)
    170d4:    f022        sd    s0,32(sp)
    170d6:    84aa        mv    s1,a0
    170d8:    03200913      li    s2,50
    170dc:    03d00993      li    s3,61
    170e0:    8526        mv    a0,s1
    170e2:    001cd097      auipc    ra,0x1cd
    170e6:    bac080e7      jalr    -1108(ra) # 1e3c8e
 

 > Block 1 @ 0x706d0a, 3 insns, 274713936 times, 0.37%:
 >  Block 2 @ 0x1e3c8e, 9 insns, 88507109 times, 0.35%:
 ...

 < Block 0 @ 0x170ca, 12 insns, 87869602 times, 0.47%:
 < Block 1 @ 0x706d42, 3 insns, 274608893 times, 0.36%:
 < Block 2 @ 0x1e3c94, 9 insns, 88526354 times, 0.35%:


FWIW, Greg internally has been looking at some of this and found some
issues in the bbv tooling, but I wish all of this was  shared/upstream
(QEMU bbv plugin) for people to compare notes and not discover/fix the
same issues over and again.
Yea, we all meant to coordinate on those plugins.  The one we've got had 
some problems with hash collisions and when there's a hash collision it 
just produces total junk data.  I chased a few of these down and fixed 
them about a year ago.


The other thing is qemu will split up blocks based on its internal 
notion of a translation page.   So if you're looking at block level data 
you'll stumble over that as well.  This aspect is the most troublesome 
problem I'm aware of right now.






Jeff


Re: [RFC][PATCH] PR tree-optimization/109071 - -Warray-bounds false positive warnings due to code duplication from jump threading

2024-05-13 Thread Jeff Law




On 5/13/24 1:48 PM, Qing Zhao wrote:

-Warray-bounds is an important option to enable the Linux kernel to keep
the array out-of-bound errors out of the source tree.

However, due to the false positive warnings reported in PR109071
(-Warray-bounds false positive warnings due to code duplication from
jump threading), -Warray-bounds=1 cannot be added on by default.

Although it's impossible to eliminate all the false positive warnings
from -Warray-bounds=1 (See PR104355 Misleading -Warray-bounds
documentation says "always out of bounds"), we should minimize the
false positive warnings in -Warray-bounds=1.

The root reason for the false positive warnings reported in PR109071 is:

When the thread jump optimization tries to reduce the # of branches
inside the routine, sometimes it needs to duplicate the code and
split into two conditional paths. For example:

The original code:

void sparx5_set (int * ptr, struct nums * sg, int index)
{
   if (index >= 4)
 warn ();
   *ptr = 0;
   *val = sg->vals[index];
   if (index >= 4)
 warn ();
   *ptr = *val;

   return;
}

With the thread jump, the above becomes:

void sparx5_set (int * ptr, struct nums * sg, int index)
{
   if (index >= 4)
 {
   warn ();
   *ptr = 0;// Code duplications since "warn" does return;
   *val = sg->vals[index];   // same for this line.
// In this path, since it's under the condition
// "index >= 4", the compiler knows the value
// of "index" is larger then 4, therefore the
// out-of-bound warning.
   warn ();
 }
   else
 {
   *ptr = 0;
   *val = sg->vals[index];
 }
   *ptr = *val;
   return;
}

We can see that after the thread jump optimization, the # of branches inside
the routine "sparx5_set" is reduced from 2 to 1.  However, due to the
code duplication (which is needed for the correctness of the code), we
get a false positive out-of-bound warning.

In order to eliminate such false positive out-of-bound warning,

A. Add one more flag for GIMPLE: is_splitted.
B. During the thread jump optimization, when the basic blocks are
duplicated, mark all the STMTs inside the original and duplicated
basic blocks as "is_splitted";
C. Inside the array bound checker, add the following new heuristic:

If
1. the stmt is duplicated and split into two conditional paths;
+  2. the warning level < 2;
+  3. the current block is not dominating the exit block
Then not report the warning.
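
In pseudo-code, the heuristic amounts to something like this (an
editor-supplied sketch, not the actual patch; the flag accessor name is
hypothetical):

  /* Inside the array-bounds checker, before issuing the diagnostic.  */
  if (gimple_is_splitted_p (stmt)   /* hypothetical accessor for the new flag */
      && warn_array_bounds < 2      /* only suppress at level 1 */
      && !dominated_by_p (CDI_DOMINATORS,
                          EXIT_BLOCK_PTR_FOR_FN (cfun), gimple_bb (stmt)))
    return;   /* suppress: likely an artifact of jump-threading duplication */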

The false positive warnings are moved from -Warray-bounds=1 to
  -Warray-bounds=2 now.

Bootstrapped and regression tested on both x86 and aarch64.  Adjusted
  -Warray-bounds-61.c due to the false positive warnings.

Let me know if you have any comments and suggestions.

This sounds horribly wrong.   In the code above, the warning is correct.

Jeff


Re: [PATCH v2 2/2] RISC-V: avoid LUI based const mat in prologue/epilogue expansion [PR/105733]

2024-05-13 Thread Jeff Law




On 5/13/24 12:49 PM, Vineet Gupta wrote:

If the constant used for stack offset can be expressed as sum of two S12
values, the constant need not be materialized (in a reg) and instead the
two S12 bits can be added to instructions involved with frame pointer.
This avoids burning a register and more importantly can often get down
to be 2 insn vs. 3.

The prev patches to generally avoid LUI based const materialization didn't
fix this PR and need this directed fix in function prologue/epilogue
expansion.

This fix doesn't move the needle for SPEC, at all, but it is still a
win considering gcc generates one insn fewer than llvm for the test ;-)

gcc-13.1 release     |  gcc 230823         |                   |
                     |  g6619b3d4c15c      |   This patch      |  clang/llvm
---------------------------------------------------------------------------------
li   t0,-4096        | li    t0,-4096      | addi  sp,sp,-2048 | addi sp,sp,-2048
addi t0,t0,2016      | addi  t0,t0,2032    | add   sp,sp,-16   | addi sp,sp,-32
li   a4,4096         | add   sp,sp,t0      | add   a5,sp,a0    | add  a1,sp,16
add  sp,sp,t0        | addi  a5,sp,-2032   | sb    zero,0(a5)  | add  a0,a0,a1
li   a5,-4096        | add   a0,a5,a0      | addi  sp,sp,2032  | sb   zero,0(a0)
addi a4,a4,-2032     | li    t0, 4096      | addi  sp,sp,32    | addi sp,sp,2032
add  a4,a4,a5        | sb    zero,2032(a0) | ret               | addi sp,sp,48
addi a5,sp,16        | addi  t0,t0,-2032   |                   | ret
add  a5,a4,a5        | add   sp,sp,t0      |
add  a0,a5,a0        | ret                 |
li   t0,4096         |
sd   a5,8(sp)        |
sb   zero,2032(a0)   |
addi t0,t0,-2016     |
add  sp,sp,t0        |
ret                  |

gcc/ChangeLog:
PR target/105733
* config/riscv/riscv.h: New macros for with aligned offsets.
* config/riscv/riscv.cc (riscv_split_sum_of_two_s12): New
function to split a sum of two s12 values into constituents.
(riscv_expand_prologue): Handle offset being sum of two S12.
(riscv_expand_epilogue): Ditto.
* config/riscv/riscv-protos.h (riscv_split_sum_of_two_s12): New.

gcc/testsuite/ChangeLog:
* gcc.target/riscv/pr105733.c: New Test.
* gcc.target/riscv/rvv/autovec/vls/spill-1.c: Adjust to not
expect LUI 4096.
* gcc.target/riscv/rvv/autovec/vls/spill-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-6.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/spill-7.c: Ditto.
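
A minimal sketch of what the new helper has to do (editor's illustration;
the real riscv_split_sum_of_two_s12 in riscv.cc additionally keeps the
intermediate stack pointer aligned, per the SUM_OF_TWO_S12_ALGN macro, so
its split points differ from this naive version):

  /* Split VAL into BASE + OFF with both parts in [-2048, 2047].
     Only meaningful when such a split exists (|VAL| <= ~4K).  */
  static void
  split_sum_of_two_s12 (long val, long *base, long *off)
  {
    *base = val >= 0 ? 2047 : -2048;  /* largest legal step toward VAL */
    *off = val - *base;               /* remainder is then also simm12 */
  }

This is why the sequences in the table above can retire a ~2K stack
adjustment with two addi/add instructions instead of materializing the
offset in a scratch register.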





@@ -8074,14 +8111,26 @@ riscv_expand_epilogue (int style)
}
else
{
- if (!SMALL_OPERAND (adjust_offset.to_constant ()))
+ HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
+ if (SMALL_OPERAND (adj_off_value))
+   {
+ adjust = GEN_INT (adj_off_value);
+   }
+ else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
+   {
+ HOST_WIDE_INT base, off;
+ riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
+ insn = gen_add3_insn (stack_pointer_rtx, hard_frame_pointer_rtx,
+   GEN_INT (base));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ adjust = GEN_INT (off);
+   }
So this was the hunk that we identified internally as causing problems 
with libgomp's testsuite.  We never fully chased it down as this hunk 
didn't seem terribly important performance wise -- we just set it aside. 
 The thing is it looked basically correct to me.  So the failure was 
certainly unexpected, but it was consistent.


So I think the question is whether or not the CI system runs the libgomp 
testsuite, particularly in the rv64 linux configuration.  If it does, 
and it passes, then we're good.  I'm still finding my way around the 
configuration, so I don't know if the CI system Edwin & Patrick have 
built tests libgomp or not.


If it isn't run, then we'll need to do a run to test that.  I'm set up 
here to do that if needed.   I can just drop this version into our 
internal tree, trigger an internal CI run and see if it complains :-)


If it does complain, then we know where to start investigations.




Jeff



Re: [PATCH v2 1/2] RISC-V: avoid LUI based const materialization ... [part of PR/106265]

2024-05-13 Thread Jeff Law




On 5/13/24 12:49 PM, Vineet Gupta wrote:

Apologies for the delay in getting this out. Needed to fix one ICE
with glibc build and fresh round of testing: both testsuite and SPEC
runs (which are similar to v1 in terms of Cactu gains, but some more minor
regressions elsewhere in gcc). Again those seem so small that IMHO this
should still go in.

I'll investigate those next as well as an existing weirdness in glibc tempnam
which I spotted during the debugging.

Changes since v1 [1]
  - Tighten the main conditition to avoid stack regs as destination
(to avoid making them potentially unaligned with -2047 addend:
 this might be OK execution/ABI wise, but undesirable/ugly still
 specially when coming from compiler codegen).
  - Ensure that first alternative is always split
  - Remove "&& 1" from split condition. That was tripping up glibc build
with illegal operands `add s0, s0, 2048`.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2024-March/647877.html

  
+;; Special case of adding a reg and constant if latter is sum of two S12
+;; values (in range -2048 to 2047). Avoid materializing the const and fuse
+;; into the add (with an additional add for 2nd value). Makes a 3 insn
+;; sequence into 2 insn.
+
+(define_insn_and_split "*add3_const_sum_of_two_s12"
+  [(set (match_operand:P0 "register_operand" "=r,r")
+   (plus:P (match_operand:P 1 "register_operand" " r,r")
+   (match_operand:P 2 "const_two_s12"" MiG,r")))]
+  "!riscv_reg_frame_related (operands[0])"
So that !riscv_reg_frame_related is my only concern with this patch. 
It's a destination, so it *may* be OK.


If it were a source operand, then we'd have to worry about cases where 
it was a pseudo with the same value as sp/fp/argp and subsequent copy 
propagation replacing the pseudo with sp/fp/argp causing the insn to no 
longer match.


Similarly if it were a source operand we'd have to worry about cases 
where the pseudo had a registered (or discoverable) equivalence to 
sp/fp/argp plus an offset.  IRA/LRA can replace the use with its 
equivalence in some of those cases which would have potentially caused 
headaches.
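
To illustrate the concern with an editor-supplied RTL sketch (hypothetical
pseudo-register numbers; x2 is sp on RISC-V):

  ;; A pseudo that happens to hold the same value as sp...
  (set (reg:DI 140) (reg:DI 2))
  (set (reg:DI 141) (plus:DI (reg:DI 140) (const_int 4000)))

  ;; ...can be rewritten by copy propagation into
  (set (reg:DI 141) (plus:DI (reg:DI 2) (const_int 4000)))

If the pattern's condition rejected sp in a source position, the second form
would no longer match after the substitution.  Guarding only the destination
avoids that class of problem.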


But as a destination we really just have to worry about generation in 
the prologue/epilogue and for alloca calls.  Those should be the only 
places that set one of those special registers.  They're constrained 
enough that I think we'll be OK.


I'm very slightly worried about hard register cprop, but I think it 
should be safe these days WRT those special registers in the unlikely 
event it found an opportunity to propagate them.


So a tentative OK.  If we find this tidibit is problematical in the 
future, then what I would suggest is we allow those special registers 
and dial-back the aggressiveness on the range of allowed constants. 
That would allow the first instruction in the sequence to never create a 
mis-aligned sp.  But again, that's only if we need to revisit.


Please wait for CI to report back sane results :-)

Jeff


[to-be-committed][RISC-V] Improve AND with some constants

2024-05-13 Thread Jeff Law


If we have an AND with a constant operand and the constant operand 
requires synthesis, then we may be able to generate more efficient code 
than we do now.


Essentially the need for constant synthesis gives us a budget for 
alternative ways to clear bits, which zext.w can do for bits 32..63 
trivially.   So if we clear 32..63  via zext.w, the constant for the 
remaining bits to clear may be simple enough to use with andi or bseti. 
That will save us an instruction.


This has been tested in Ventana's CI system as well as my own.  I'll wait for
the upstream CI tester to report success before committing.


Jeff
gcc/
* config/riscv/bitmanip.md: Add new splitter for AND with
a constant that masks off bits 32..63 and needs synthesis.

gcc/testsuite/

* gcc.target/riscv/zba_zbs_and-1.c: New test.

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index 724511b6df3..8769a6b818b 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -843,6 +843,40 @@ (define_insn_and_split "*andi_extrabit"
 }
 [(set_attr "type" "bitmanip")])
 
+;; If we have the ZBA extension, then we can clear the upper half of a 64
+;; bit object with a zext.w.  So if we have AND where the constant would
+;; require synthesis of two or more instructions, but 32->64 sign extension
+;; of the constant is a simm12, then we can use zext.w+andi.  If the adjusted
+;; constant is a single bit constant, then we can use zext.w+bclri
+;;
+;; With the mvconst_internal pattern claiming a single insn to synthesize
+;; constants, this must be a define_insn_and_split.
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (and:DI (match_operand:DI 1 "register_operand" "r")
+   (match_operand 2 "const_int_operand" "n")))]
+  "TARGET_64BIT
+   && TARGET_ZBA
+   && !paradoxical_subreg_p (operands[1])
+   /* Only profitable if synthesis takes more than one insn.  */
+   && riscv_const_insns (operands[2]) != 1
+   /* We need the upper half to be zero.  */
+   && (INTVAL (operands[2]) & HOST_WIDE_INT_C (0xffffffff00000000)) == 0
+   /* And the adjusted constant must either be something we can
+  implement with andi or bclri.  */
+   && ((SMALL_OPERAND (sext_hwi (INTVAL (operands[2]), 32))
+|| (TARGET_ZBS && popcount_hwi (INTVAL (operands[2])) == 31))
+   && INTVAL (operands[2]) != 0x7fffffff)"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (zero_extend:DI (match_dup 3)))
+   (set (match_dup 0) (and:DI (match_dup 0) (match_dup 2)))]
+  "{
+ operands[3] = gen_lowpart (SImode, operands[1]);
+ operands[2] = GEN_INT (sext_hwi (INTVAL (operands[2]), 32));
+   }"
+  [(set_attr "type" "bitmanip")])
+
 ;; IF_THEN_ELSE: test for 2 bits of opposite polarity
 (define_insn_and_split "*branch_mask_twobits_equals_singlebit"
   [(set (pc)
diff --git a/gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c 
b/gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c
new file mode 100644
index 000..23fd769449e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zba_zbs_and-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+
+unsigned long long w32mem_1(unsigned long long w32)
+{
+return w32 & ~(1U << 0);
+}
+
+unsigned long long w32mem_2(unsigned long long w32)
+{
+return w32 & ~(1U << 30);
+}
+
+unsigned long long w32mem_3(unsigned long long w32)
+{
+return w32 & ~(1U << 31);
+}
+
+/* If we do synthesis, then we'd see an addi.  */
+/* { dg-final { scan-assembler-not "addi\t" } } */


Re: [PATCH v1] RISC-V: Bugfix ICE for RVV intrinisc vfw on _Float16 scalar

2024-05-13 Thread Jeff Law




On 5/13/24 9:00 AM, Li, Pan2 wrote:

Committed, thanks Juzhe and Kito. Let's wait for a while before backport to 14.

Could you fix the formatting nits caught by the CI linter?

=== ERROR type #1: trailing operator (4 error(s)) ===
gcc/config/riscv/riscv-vector-builtins.cc:4641:39:  if ((exts & 
RVV_REQUIRE_ELEN_FP_16) &&
gcc/config/riscv/riscv-vector-builtins.cc:4651:39:  if ((exts & 
RVV_REQUIRE_ELEN_FP_32) &&
gcc/config/riscv/riscv-vector-builtins.cc:4661:39:  if ((exts & 
RVV_REQUIRE_ELEN_FP_64) &&
gcc/config/riscv/riscv-vector-builtins.cc:4670:36:  if ((exts & 
RVV_REQUIRE_ELEN_64) &&



The "&&" needs to come down to the next line, indented like

if ((exts & RVV_REQUIRE_ELEN_FP_16)
&& !TARGET_VECTOR_.)

Ie, the "&&" indents just inside the first open paren.  It looks like 
all the conditions in validate_instance_type_required_extensions need to 
be fixed in a similar manner.


Given this is NFC, just post it for the archiver.  No need to wait on 
review.


Jeff




[gcc r15-432] [to-be-committed, RISC-V] Improve single inverted bit extraction - v3

2024-05-13 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:0c585c8d0dd85601a8d116ada99126a48c8ce9fd

commit r15-432-g0c585c8d0dd85601a8d116ada99126a48c8ce9fd
Author: Jeff Law 
Date:   Mon May 13 07:14:08 2024 -0600

[to-be-committed,RISC-V] Improve single inverted bit extraction - v3

So this patch fixes a minor code generation inefficiency that (IIRC) the
RAU team discovered a while ago in spec.

If we want the inverted value of a single bit we can use bext to extract
the bit, then seqz to invert the value (if viewed as a 0/1 truth value).

The RTL is fairly convoluted, but it's basically a right shift to get
the bit into position, bitwise-not then masking off all but the low bit.
So it's a 3->2 combine, hidden by the fact that and-not is a
define_insn_and_split, so it actually looks like a 2->2 combine.

We've run this through Ventana's internal CI (which includes
zba_zbb_zbs) and I've run it in my own tester (rv64gc, rv32gcv).  I'll
wait for the upstream CI to finish with positive results before pushing.

gcc/
* config/riscv/bitmanip.md (bextseqzdisi): New patterns.

gcc/testsuite/

* gcc.target/riscv/zbs-bext-2.c: New test.
* gcc.target/riscv/zbs-bext.c: Fix one of the possible expected
sequences.

Diff:
---
 gcc/config/riscv/.riscv.cc.swo  | Bin 0 -> 417792 bytes
 gcc/config/riscv/bitmanip.md|  43 
 gcc/config/riscv/j  |   0
 gcc/testsuite/gcc.target/riscv/zbs-bext-2.c |  19 
 gcc/testsuite/gcc.target/riscv/zbs-bext.c   |   2 +-
 5 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/.riscv.cc.swo b/gcc/config/riscv/.riscv.cc.swo
new file mode 100644
index ..77ed37353bee
Binary files /dev/null and b/gcc/config/riscv/.riscv.cc.swo differ
diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index d76a72d30e02..724511b6df3b 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -711,6 +711,49 @@
   "bext\t%0,%1,%2"
   [(set_attr "type" "bitmanip")])
 
+;; This is a bext followed by a seqz.  Normally this would be a 3->2 split
+;; But the and-not pattern with a constant operand is a define_insn_and_split,
+;; so this looks like a 2->2 split, which combine rejects.  So implement it
+;; as a define_insn_and_split as well.
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (and:DI
+ (not:DI
+   (subreg:DI
+ (lshiftrt:SI
+   (match_operand:SI 1 "register_operand" "r")
+   (match_operand:QI 2 "register_operand" "r")) 0))
+ (const_int 1)))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (zero_extract:DI (match_dup 1)
+(const_int 1)
+(zero_extend:DI (match_dup 2))))
+   (set (match_dup 0) (eq:DI (match_dup 0) (const_int 0)))]
+  "operands[1] = gen_lowpart (word_mode, operands[1]);"
+  [(set_attr "type" "bitmanip")])
+
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:X 0 "register_operand" "=r")
+   (and:X
+ (not:X
+   (lshiftrt:X
+ (match_operand:X 1 "register_operand" "r")
+ (match_operand:QI 2 "register_operand" "r")))
+ (const_int 1)))]
+  "TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (zero_extract:X (match_dup 1)
+   (const_int 1)
+   (zero_extend:X (match_dup 2))))
+   (set (match_dup 0) (eq:X (match_dup 0) (const_int 0)))]
+  "operands[1] = gen_lowpart (word_mode, operands[1]);"
+  [(set_attr "type" "bitmanip")])
+
 ;; When performing `(a & (1UL << bitno)) ? 0 : -1` the combiner
 ;; usually has the `bitno` typed as X-mode (i.e. no further
 ;; zero-extension is performed around the bitno).
diff --git a/gcc/config/riscv/j b/gcc/config/riscv/j
new file mode 100644
index ..e69de29bb2d1
diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c 
b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
new file mode 100644
index ..79f120b22863
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+
+_Bool match(const int ch, int fMap) {
+return ((fMap & (1<<(ch))) == 0);
+}
+
+_Bool match2(const int ch, int fMap) {
+return ((fMap & (1UL<<(ch))) == 0);
+}
+
+
+/* { dg-final { scan-assembler-times "bext\t" 2 } } */
+/* { dg-final { scan-assembler-times "seqz\t|xori\t" 2 } } */
+/* { dg-final { scan-assembler-not "sraw\t" } } */
+/* { dg-final { scan-assembler-not "not\t" } } */
+/* { dg-final { scan-assembler-not "andi\t" } } */

[to-be-committed] [RISC-V] Improve single inverted bit extraction - v3

2024-05-12 Thread Jeff Law


The only change in v2 vs v3 is testsuite adjustments for the updated 
sequences and fixing the name of the second pattern.


--


So this patch fixes a minor code generation inefficiency that (IIRC) the
RAU team discovered a while ago in spec.

If we want the inverted value of a single bit we can use bext to extract
the bit, then seqz to invert the value (if viewed as a 0/1 truth value).

The RTL is fairly convoluted, but it's basically a right shift to get
the bit into position, bitwise-not then masking off all but the low bit.
So it's a 3->2 combine, hidden by the fact that and-not is a
define_insn_and_split, so it actually looks like a 2->2 combine.

We've run this through Ventana's internal CI (which includes
zba_zbb_zbs) and I've run it in my own tester (rv64gc, rv32gcv).  I'll
wait for the upstream CI to finish with positive results before pushing.

Jeff

gcc/
* config/riscv/bitmanip.md (bextseqzdisi): New patterns.

gcc/testsuite/

* gcc.target/riscv/zbs-bext-2.c: New test.
* gcc.target/riscv/zbs-bext.c: Fix one of the possible expected
sequences.


diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index d76a72d30e0..724511b6df3 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -711,6 +711,49 @@ (define_insn "*bext"
   "bext\t%0,%1,%2"
   [(set_attr "type" "bitmanip")])
 
+;; This is a bext followed by a seqz.  Normally this would be a 3->2 split
+;; But the and-not pattern with a constant operand is a define_insn_and_split,
+;; so this looks like a 2->2 split, which combine rejects.  So implement it
+;; as a define_insn_and_split as well.
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (and:DI
+ (not:DI
+   (subreg:DI
+ (lshiftrt:SI
+   (match_operand:SI 1 "register_operand" "r")
+   (match_operand:QI 2 "register_operand" "r")) 0))
+ (const_int 1)))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (zero_extract:DI (match_dup 1)
+(const_int 1)
+(zero_extend:DI (match_dup 2))))
+   (set (match_dup 0) (eq:DI (match_dup 0) (const_int 0)))]
+  "operands[1] = gen_lowpart (word_mode, operands[1]);"
+  [(set_attr "type" "bitmanip")])
+
+(define_insn_and_split "*bextseqz"
+  [(set (match_operand:X 0 "register_operand" "=r")
+   (and:X
+ (not:X
+   (lshiftrt:X
+ (match_operand:X 1 "register_operand" "r")
+ (match_operand:QI 2 "register_operand" "r")))
+ (const_int 1)))]
+  "TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (zero_extract:X (match_dup 1)
+   (const_int 1)
+   (zero_extend:X (match_dup 2))))
+   (set (match_dup 0) (eq:X (match_dup 0) (const_int 0)))]
+  "operands[1] = gen_lowpart (word_mode, operands[1]);"
+  [(set_attr "type" "bitmanip")])
+
 ;; When performing `(a & (1UL << bitno)) ? 0 : -1` the combiner
 ;; usually has the `bitno` typed as X-mode (i.e. no further
 ;; zero-extension is performed around the bitno).
diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c 
b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
new file mode 100644
index 000..79f120b2286
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+
+_Bool match(const int ch, int fMap) {
+return ((fMap & (1<<(ch))) == 0);
+}
+
+_Bool match2(const int ch, int fMap) {
+return ((fMap & (1UL<<(ch))) == 0);
+}
+
+
+/* { dg-final { scan-assembler-times "bext\t" 2 } } */
+/* { dg-final { scan-assembler-times "seqz\t|xori\t" 2 } } */
+/* { dg-final { scan-assembler-not "sraw\t" } } */
+/* { dg-final { scan-assembler-not "not\t" } } */
+/* { dg-final { scan-assembler-not "andi\t" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bext.c 
b/gcc/testsuite/gcc.target/riscv/zbs-bext.c
index ff75dad6528..0db97f5ab59 100644
--- a/gcc/testsuite/gcc.target/riscv/zbs-bext.c
+++ b/gcc/testsuite/gcc.target/riscv/zbs-bext.c
@@ -38,7 +38,7 @@ long bext64_4(long a, char bitno)
 
 /* { dg-final { scan-assembler-times "bexti\t" 1 } } */
 /* { dg-final { scan-assembler-times "bext\t" 5 } } */
-/* { dg-final { scan-assembler-times "xori\t|snez\t" 1 } } */
+/* { dg-final { scan-assembler-times "xori\t|seqz\t" 1 } } */
 /* { dg-final { scan-assembler-times "addi\t" 1 } } */
 /* { dg-final { scan-assembler-times "neg\t" 1 } } */
 /* { dg-final { scan-assembler-not {\mandi} } } */


[to-be-committed] [RISC-V] Improve single inverted bit extraction - v2

2024-05-12 Thread Jeff Law


So the first version failed CI and after looking at the patch again, I 
think it can be improved.


First, the output pattern might as well go ahead and use the 
zero_extract form.


Second, we should be able to handle cases where all the ops are in 
word_mode as well as when the shift is in a narrower mode.


Third, the testcase should cover additional modes.

Fourth, fix some lint issues with tabs vs spaces.

This has only been lightly tested, so it should be interesting to see 
what CI shows.


Jeff

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index d76a72d30e0..724511b6df3 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -711,6 +711,49 @@ (define_insn "*bext"
   "bext\t%0,%1,%2"
   [(set_attr "type" "bitmanip")])
 
+;; This is a bext followed by a seqz.  Normally this would be a 3->2 split
+;; But the and-not pattern with a constant operand is a define_insn_and_split,
+;; so this looks like a 2->2 split, which combine rejects.  So implement it
+;; as a define_insn_and_split as well.
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (and:DI
+ (not:DI
+   (subreg:DI
+ (lshiftrt:SI
+   (match_operand:SI 1 "register_operand" "r")
+   (match_operand:QI 2 "register_operand" "r")) 0))
+ (const_int 1)))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (zero_extract:DI (match_dup 1)
+(const_int 1)
+(zero_extend:DI (match_dup 2))))
+   (set (match_dup 0) (eq:DI (match_dup 0) (const_int 0)))]
+  "operands[1] = gen_lowpart (word_mode, operands[1]);"
+  [(set_attr "type" "bitmanip")])
+
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:X 0 "register_operand" "=r")
+   (and:X
+ (not:X
+   (lshiftrt:X
+ (match_operand:X 1 "register_operand" "r")
+ (match_operand:QI 2 "register_operand" "r")))
+ (const_int 1)))]
+  "TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (zero_extract:X (match_dup 1)
+   (const_int 1)
+   (zero_extend:X (match_dup 2))))
+   (set (match_dup 0) (eq:X (match_dup 0) (const_int 0)))]
+  "operands[1] = gen_lowpart (word_mode, operands[1]);"
+  [(set_attr "type" "bitmanip")])
+
 ;; When performing `(a & (1UL << bitno)) ? 0 : -1` the combiner
 ;; usually has the `bitno` typed as X-mode (i.e. no further
 ;; zero-extension is performed around the bitno).
diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c 
b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
new file mode 100644
index 000..719df442fed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+
+_Bool match(const int ch, int fMap) {
+return ((fMap & (1<<(ch))) == 0);
+}
+
+_Bool match2(const int ch, int fMap) {
+return ((fMap & (1UL<<(ch))) == 0);
+}
+
+
+/* { dg-final { scan-assembler-times "bext\t" 1 } } */
+/* { dg-final { scan-assembler-times "seqz\t" 1 } } */
+/* { dg-final { scan-assembler-not "sraw\t" } } */
+/* { dg-final { scan-assembler-not "not\t" } } */
+/* { dg-final { scan-assembler-not "andi\t" } } */


[to-be-committed] [RISC-V] Improve single inverted bit extraction

2024-05-12 Thread Jeff Law
So the first time I sent this, I attached the wrong patch.  As a result 
the CI system wasn't happy.


The second time I sent the right patch, but I don't see evidence the CI 
system ran the correct patch through.  So I'm just starting over ;-)


--

So this patch fixes a minor code generation inefficiency that (IIRC) the
RAU team discovered a while ago in spec.

If we want the inverted value of a single bit we can use bext to extract
the bit, then seqz to invert the value (if viewed as a 0/1 truth value).

The RTL is fairly convoluted, but it's basically a right shift to get
the bit into position, bitwise-not then masking off all but the low bit.
So it's a 3->2 combine, hidden by the fact that and-not is a
define_insn_and_split, so it actually looks like a 2->2 combine.

We've run this through Ventana's internal CI (which includes
zba_zbb_zbs) and I've run it in my own tester (rv64gc, rv32gcv).  I'll
wait for the upstream CI to finish with positive results before pushing.

Jeff

gcc/
* config/riscv/riscv.cc (riscv_build_integer_1): Recognize cases where
we can use shNadd to improve constant synthesis.
(riscv_move_integer): Handle code generation for shNadd.

gcc/testsuite
* gcc.target/riscv/synthesis-1.c: Also count shNadd instructions.
* gcc.target/riscv/synthesis-3.c: New test.

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index d76a72d30e0..cf2fa04d4c4 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -711,6 +711,30 @@ (define_insn "*bext"
   "bext\t%0,%1,%2"
   [(set_attr "type" "bitmanip")])
 
+;; This is a bext followed by a seqz.  Normally this would be a 3->2 split
+;; But the and-not pattern with a constant operand is a define_insn_and_split,
+;; so this looks like a 2->2 split, which combine rejects.  So implement it
+;; as a define_insn_and_split as well.
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (and:DI
+ (not:DI
+   (subreg:DI
+ (lshiftrt:SI
+   (match_operand:SI 1 "register_operand" "r")
+   (match_operand:QI 2 "register_operand" "r")) 0))
+  (const_int 1)))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (and:DI (subreg:DI
+   (lshiftrt:SI (match_dup 1)
+(match_dup 2)) 0)
+ (const_int 1)))
+   (set (match_dup 0) (eq:DI (match_dup 0) (const_int 0)))]
+  ""
+  [(set_attr "type" "bitmanip")])
+
 ;; When performing `(a & (1UL << bitno)) ? 0 : -1` the combiner
 ;; usually has the `bitno` typed as X-mode (i.e. no further
 ;; zero-extension is performed around the bitno).
diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c 
b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
new file mode 100644
index 000..53f47dc3afe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+
+_Bool match(const int ch, int fMap) {
+return ((fMap & (1<<(ch))) == 0);
+}
+
+
+/* { dg-final { scan-assembler-times "bext\t" 1 } } */
+/* { dg-final { scan-assembler-times "seqz\t" 1 } } */
+/* { dg-final { scan-assembler-not "sraw\t" } } */
+/* { dg-final { scan-assembler-not "not\t" } } */
+/* { dg-final { scan-assembler-not "andi\t" } } */


[gcc r15-389] [to-be-committed, RISC-V] Improve usage of slli.uw in constant synthesis

2024-05-12 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:83fb5e6f382ea99ca0e2a0afeb25a9f78909f25f

commit r15-389-g83fb5e6f382ea99ca0e2a0afeb25a9f78909f25f
Author: Jeff Law 
Date:   Sun May 12 07:12:04 2024 -0600

[to-be-committed,RISC-V] Improve usage of slli.uw in constant synthesis

And an improvement to using slli.uw...

I recently added the ability to use slli.uw in the synthesis path.  That
code was conditional on the right justified constant being a LUI_OPERAND
after sign extending from bit 31 to bit 63.

That code is working fine, but could be improved.  Specifically there's
no reason it shouldn't work for LUI+ADDI under the same circumstances.
So rather than testing the sign extended, right justified constant is a
LUI_OPERAND, we can just test that the right justified constant has
precisely 32 leading zeros.
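
A minimal sketch of the relaxed condition (mine, not the GCC code;
assumes the constant has already been right justified and is nonzero):

#include <stdint.h>

/* Exactly 32 leading zeros means bit 31 is set and bits 32..63 are
   clear.  Such a value can be synthesized as a 32-bit constant with
   LUI or LUI+ADDI, and the trailing slli.uw then shifts it into place
   while clearing the sign-extended copies of bit 31.  */
static int
wants_uw_shift (uint64_t x)
{
  return __builtin_clzll (x) == 32;
}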

gcc/
* config/riscv/riscv.cc (riscv_build_integer_1): Use slli.uw more.

gcc/testsuite
* gcc.target/riscv/synthesis-5.c: New test.

Diff:
---
 gcc/config/riscv/riscv.cc|   9 +-
 gcc/testsuite/gcc.target/riscv/synthesis-5.c | 294 +++
 2 files changed, 299 insertions(+), 4 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 049f8f8cb9fc..a1e5a014bedf 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -819,13 +819,14 @@ riscv_build_integer_1 (struct riscv_integer_op 
codes[RISCV_MAX_INTEGER_OPS],
  & ~HOST_WIDE_INT_C (0x80000000)
shift -= IMM_BITS, x <<= IMM_BITS;
 
-  /* Adjust X if it isn't a LUI operand in isolation, but we can use
-a subsequent "uw" instruction form to mask off the undesirable
-bits.  */
+  /* If X has bits 32..63 clear and bit 31 set, then go ahead and mark
+it as desiring a "uw" operation for the shift.  That way we can have
+LUI+ADDI to generate the constant, then shift it into position
+clearing out the undesirable bits.  */
   if (!LUI_OPERAND (x)
  && TARGET_64BIT
  && TARGET_ZBA
- && LUI_OPERAND (x & ~HOST_WIDE_INT_C (0x80000000UL)))
+ && clz_hwi (x) == 32)
{
  x = sext_hwi (x, 32);
  use_uw = true;
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-5.c 
b/gcc/testsuite/gcc.target/riscv/synthesis-5.c
new file mode 100644
index ..4d81565b563b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-5.c
@@ -0,0 +1,294 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } 
*/
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times 
"\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 556 } } */
+
+unsigned long foo_0x80180001000(void) { return 0x80180001000UL; }
+
+unsigned long foo_0x80280001000(void) { return 0x80280001000UL; }
+
+unsigned long foo_0x80480001000(void) { return 0x80480001000UL; }
+
+unsigned long foo_0x80880001000(void) { return 0x80880001000UL; }
+
+unsigned long foo_0x81080001000(void) { return 0x81080001000UL; }
+
+unsigned long foo_0x82080001000(void) { return 0x82080001000UL; }
+
+unsigned long foo_0x84080001000(void) { return 0x84080001000UL; }
+
+unsigned long foo_0x88080001000(void) { return 0x88080001000UL; }
+
+unsigned long foo_0x90080001000(void) { return 0x90080001000UL; }
+
+unsigned long foo_0xa0080001000(void) { return 0xa0080001000UL; }
+
+unsigned long foo_0x8031000(void) { return 0x8031000UL; }
+
+unsigned long foo_0x8051000(void) { return 0x8051000UL; }
+
+unsigned long foo_0x8091000(void) { return 0x8091000UL; }
+
+unsigned long foo_0x8111000(void) { return 0x8111000UL; }
+
+unsigned long foo_0x8211000(void) { return 0x8211000UL; }
+
+unsigned long foo_0x8411000(void) { return 0x8411000UL; }
+
+unsigned long foo_0x8811000(void) { return 0x8811000UL; }
+
+unsigned long foo_0x9011000(void) { return 0x9011000UL; }
+
+unsigned long foo_0xa011000(void) { return 0xa011000UL; }
+
+unsigned long foo_0xc011000(void) { return 0xc011000UL; }
+
+unsigned long foo_0x8061000(void) { return 0x8061000UL; }
+
+unsigned long foo_

[gcc r15-388] [to-be-committed] RISC-V Fix minor regression in synthesis WRT bseti usage

2024-05-12 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:77a28ed91b2a527b9006ee1a220b468756b43eca

commit r15-388-g77a28ed91b2a527b9006ee1a220b468756b43eca
Author: Jeff Law 
Date:   Sun May 12 07:05:43 2024 -0600

[to-be-committed] RISC-V Fix minor regression in synthesis WRT bseti usage

Overnight testing showed a small number of cases where constant synthesis 
was
doing something dumb.  Specifically generating more instructions than the
number of bits set in the constant.

It was a minor goof in the recent bseti code.  In the code to first figure 
out
what bits LUI could set, I included one bit outside the space LUI operates.
For some dumb reason I kept thinking in terms of 11 low bits belonging to 
addi,
but it's actually 12 bits.  The net is what we thought should be a single 
LUI
for costing turned into LUI+ADDI.

I didn't let the test run to completion, but over the course of 12 hours it
found 9 cases.  Given we know that the triggers all have 0x800 set, I bet we
could likely find more, but I doubt it's that critical to cover every 
possible
constant that regressed.

gcc/
* config/riscv/riscv.cc (riscv_build_integer_1): Fix thinko in 
testing
when lui can be used to set several bits in bseti path.

gcc/testsuite

* gcc.target/riscv/synthesis-4.c: New test

Diff:
---
 gcc/config/riscv/riscv.cc|  6 ++---
 gcc/testsuite/gcc.target/riscv/synthesis-4.c | 34 
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 9c98b1da0357..049f8f8cb9fc 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -921,12 +921,12 @@ riscv_build_integer_1 (struct riscv_integer_op 
codes[RISCV_MAX_INTEGER_OPS],
 
   /* First handle any bits set by LUI.  Be careful of the
 SImode sign bit!.  */
-  if (value & 0x7ffff800)
+  if (value & 0x7ffff000)
{
  alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
- alt_codes[i].value = value & 0x7ffff800;
+ alt_codes[i].value = value & 0x7ffff000;
  alt_codes[i].use_uw = false;
- value &= ~0x7ffff800;
+ value &= ~0x7ffff000;
   i++;
   i++;
}
 
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-4.c 
b/gcc/testsuite/gcc.target/riscv/synthesis-4.c
new file mode 100644
index ..328a55b9e6e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-4.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } 
*/
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions. 
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times 
"\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 45 } } */
+
+
+unsigned long foo_0x640800(void) { return 0x640800UL; }
+
+unsigned long foo_0xc40800(void) { return 0xc40800UL; }
+
+unsigned long foo_0x1840800(void) { return 0x1840800UL; }
+
+unsigned long foo_0x3040800(void) { return 0x3040800UL; }
+
+unsigned long foo_0x6040800(void) { return 0x6040800UL; }
+
+unsigned long foo_0xc040800(void) { return 0xc040800UL; }
+
+unsigned long foo_0x18040800(void) { return 0x18040800UL; }
+
+unsigned long foo_0x30040800(void) { return 0x30040800UL; }
+
+unsigned long foo_0x60040800(void) { return 0x60040800UL; }


[to-be-committed][RISC-V] Improve usage of slli.uw in constant synthesis

2024-05-11 Thread Jeff Law

And an improvement to using slli.uw...

I recently added the ability to use slli.uw in the synthesis path.  That 
code was conditional on the right justified constant being a LUI_OPERAND 
after sign extending from bit 31 to bit 63.


That code is working fine, but could be improved.  Specifically there's 
no reason it shouldn't work for LUI+ADDI under the same circumstances. 
So rather than testing the sign extended, right justified constant is a 
LUI_OPERAND, we can just test that the right justified constant has 
precisely 32 leading zeros.



Waiting on CI to finish, expecting to commit after it's successful.

Jeff
gcc/
* config/riscv/riscv.cc (riscv_build_integer_1): Use slli.uw more.

gcc/testsuite
* gcc.target/riscv/synthesis-5.c: New test.

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 049f8f8cb9f..a1e5a014bed 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -819,13 +819,14 @@ riscv_build_integer_1 (struct riscv_integer_op 
codes[RISCV_MAX_INTEGER_OPS],
  & ~HOST_WIDE_INT_C (0x80000000)
shift -= IMM_BITS, x <<= IMM_BITS;
 
-  /* Adjust X if it isn't a LUI operand in isolation, but we can use
-a subsequent "uw" instruction form to mask off the undesirable
-bits.  */
+  /* If X has bits 32..63 clear and bit 31 set, then go ahead and mark
+it as desiring a "uw" operation for the shift.  That way we can have
+LUI+ADDI to generate the constant, then shift it into position
+clearing out the undesirable bits.  */
   if (!LUI_OPERAND (x)
  && TARGET_64BIT
  && TARGET_ZBA
- && LUI_OPERAND (x & ~HOST_WIDE_INT_C (0x80000000UL)))
+ && clz_hwi (x) == 32)
{
  x = sext_hwi (x, 32);
  use_uw = true;
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-5.c 
b/gcc/testsuite/gcc.target/riscv/synthesis-5.c
new file mode 100644
index 000..4d81565b563
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-5.c
@@ -0,0 +1,294 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } 
*/
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times 
"\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 556 } } */
+
+unsigned long foo_0x80180001000(void) { return 0x80180001000UL; }
+
+unsigned long foo_0x80280001000(void) { return 0x80280001000UL; }
+
+unsigned long foo_0x80480001000(void) { return 0x80480001000UL; }
+
+unsigned long foo_0x80880001000(void) { return 0x80880001000UL; }
+
+unsigned long foo_0x81080001000(void) { return 0x81080001000UL; }
+
+unsigned long foo_0x82080001000(void) { return 0x82080001000UL; }
+
+unsigned long foo_0x84080001000(void) { return 0x84080001000UL; }
+
+unsigned long foo_0x88080001000(void) { return 0x88080001000UL; }
+
+unsigned long foo_0x90080001000(void) { return 0x90080001000UL; }
+
+unsigned long foo_0xa0080001000(void) { return 0xa0080001000UL; }
+
+unsigned long foo_0x8031000(void) { return 0x8031000UL; }
+
+unsigned long foo_0x8051000(void) { return 0x8051000UL; }
+
+unsigned long foo_0x8091000(void) { return 0x8091000UL; }
+
+unsigned long foo_0x8111000(void) { return 0x8111000UL; }
+
+unsigned long foo_0x8211000(void) { return 0x8211000UL; }
+
+unsigned long foo_0x8411000(void) { return 0x8411000UL; }
+
+unsigned long foo_0x8811000(void) { return 0x8811000UL; }
+
+unsigned long foo_0x9011000(void) { return 0x9011000UL; }
+
+unsigned long foo_0xa011000(void) { return 0xa011000UL; }
+
+unsigned long foo_0xc011000(void) { return 0xc011000UL; }
+
+unsigned long foo_0x8061000(void) { return 0x8061000UL; }
+
+unsigned long foo_0x80a1000(void) { return 0x80a1000UL; }
+
+unsigned long foo_0x8121000(void) { return 0x8121000UL; }
+
+unsigned long foo_0x8221000(void) { return 0x8221000UL; }
+
+unsigned long foo_0x8421000(void) { return 0x8421000UL; }
+
+unsigned long foo_0x8821000(void) { return 0x8821000UL; }
+
+unsigned long foo_0xa021000(void) { return 0xa021000UL; }
+
+unsigned long foo_0xc021000(void) { return 0xc021000UL; }
+
+unsigned long foo_0x80c1000(void) { return 0x80c1000UL; }
+
+unsigned long foo_0x8141000(void) { return 0x8141000UL; }
+
+unsigned long 

[to-be-committed] RISC-V Fix minor regression in synthesis WRT bseti usage

2024-05-11 Thread Jeff Law
Overnight testing showed a small number of cases where constant 
synthesis was doing something dumb.  Specifically generating more 
instructions than the number of bits set in the constant.


It was a minor goof in the recent bseti code.  In the code to first 
figure out what bits LUI could set, I included one bit outside the space 
LUI operates.  For some dumb reason I kept thinking in terms of 11 low 
bits belonging to addi, but it's actually 12 bits.  The net is what we 
thought should be a single LUI for costing turned into LUI+ADDI.


I didn't let the test run to completion, but over the course of 12 hours 
it found 9 cases.  Given we know that the triggers all have 0x800 set, I 
bet we could likely find more, but I doubt it's that critical to cover 
every possible constant that regressed.


This has run in my tester (rv64gc, rv32gcv), but I'll wait for the CI 
tester as it covers the bitmanip extensions much better.



Jeff

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 9c98b1da035..049f8f8cb9f 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -921,12 +921,12 @@ riscv_build_integer_1 (struct riscv_integer_op 
codes[RISCV_MAX_INTEGER_OPS],
 
   /* First handle any bits set by LUI.  Be careful of the
 SImode sign bit!.  */
-  if (value & 0x7ffff800)
+  if (value & 0x7ffff000)
{
  alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
- alt_codes[i].value = value & 0x7ffff800;
+ alt_codes[i].value = value & 0x7ffff000;
  alt_codes[i].use_uw = false;
- value &= ~0x7ffff800;
+ value &= ~0x7ffff000;
   i++;
   i++;
}
 
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-4.c 
b/gcc/testsuite/gcc.target/riscv/synthesis-4.c
new file mode 100644
index 000..328a55b9e6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-4.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } 
*/
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions. 
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times 
"\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 45 } } */
+
+
+unsigned long foo_0x640800(void) { return 0x640800UL; }
+
+unsigned long foo_0xc40800(void) { return 0xc40800UL; }
+
+unsigned long foo_0x1840800(void) { return 0x1840800UL; }
+
+unsigned long foo_0x3040800(void) { return 0x3040800UL; }
+
+unsigned long foo_0x6040800(void) { return 0x6040800UL; }
+
+unsigned long foo_0xc040800(void) { return 0xc040800UL; }
+
+unsigned long foo_0x18040800(void) { return 0x18040800UL; }
+
+unsigned long foo_0x30040800(void) { return 0x30040800UL; }
+
+unsigned long foo_0x60040800(void) { return 0x60040800UL; }


[gcc r15-384] [PATCH v4 4/4] Output S_COMPILE3 symbol in CodeView debug section

2024-05-11 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:1da83fece2963cfe0df57ac5e85dd1f92427ca70

commit r15-384-g1da83fece2963cfe0df57ac5e85dd1f92427ca70
Author: Mark Harmstone 
Date:   Sat May 11 08:24:59 2024 -0600

[PATCH v4 4/4] Output S_COMPILE3 symbol in CodeView debug section

Outputs the S_COMPILE3 symbol in the CodeView .debug$S debug section.
The DEBUG_S_SYMBOLS block added here makes up pretty much everything
that isn't data structures or line numbers; we add the S_COMPILE3 symbol
here to start it off.

This is a descriptive bit, the most interesting part of which is the
version of the compiler used.

gcc/
* dwarf2codeview.cc (DEBUG_S_SYMBOLS): Define.
(S_COMPILE3, CV_CFL_80386, CV_CFL_X64): Likewise.
(CV_CFL_C, CV_CFL_CXX): Likewise.
(SYMBOL_START_LABEL, SYMBOL_END_LABEL): Likewise.
(start_processor, language_constant): New functions.
(write_compile3_symbol, write_codeview_symbols): Likewise.
(codeview_debug_finish): Call write_codeview_symbols.

Diff:
---
 gcc/dwarf2codeview.cc | 126 ++
 1 file changed, 126 insertions(+)

diff --git a/gcc/dwarf2codeview.cc b/gcc/dwarf2codeview.cc
index 9c69ebf89983..db776d79be4a 100644
--- a/gcc/dwarf2codeview.cc
+++ b/gcc/dwarf2codeview.cc
@@ -39,14 +39,25 @@ along with GCC; see the file COPYING3.  If not see
 
 #define CV_SIGNATURE_C13   4
 
+#define DEBUG_S_SYMBOLS0xf1
 #define DEBUG_S_LINES  0xf2
 #define DEBUG_S_STRINGTABLE 0xf3
 #define DEBUG_S_FILECHKSMS  0xf4
 
 #define CHKSUM_TYPE_MD51
 
+#define S_COMPILE3 0x113c
+
+#define CV_CFL_80386   0x03
+#define CV_CFL_X64 0xD0
+
+#define CV_CFL_C   0x00
+#define CV_CFL_CXX 0x01
+
 #define LINE_LABEL "Lcvline"
 #define END_FUNC_LABEL "Lcvendfunc"
+#define SYMBOL_START_LABEL "Lcvsymstart"
+#define SYMBOL_END_LABEL   "Lcvsymend"
 
 #define HASH_SIZE 16
 
@@ -120,6 +131,7 @@ struct codeview_function
 
 static unsigned int line_label_num;
 static unsigned int func_label_num;
+static unsigned int sym_label_num;
 static codeview_source_file *files, *last_file;
 static unsigned int num_files;
 static uint32_t string_offset = 1;
@@ -592,6 +604,119 @@ codeview_end_epilogue (void)
 }
 }
 
+/* Return the CodeView constant for the selected architecture.  */
+
+static uint16_t
+target_processor (void)
+{
+  if (TARGET_64BIT)
+return CV_CFL_X64;
+  else
+return CV_CFL_80386;
+}
+
+/* Return the CodeView constant for the language being used.  */
+
+static uint32_t
+language_constant (void)
+{
+  const char *language_string = lang_hooks.name;
+
+  if (startswith (language_string, "GNU C++"))
+return CV_CFL_CXX;
+  else if (startswith (language_string, "GNU C"))
+return CV_CFL_C;
+
+  return 0;
+}
+
+/* Write a S_COMPILE3 symbol, which records the details of the compiler
+   being used.  */
+
+static void
+write_compile3_symbol (void)
+{
+  unsigned int label_num = ++sym_label_num;
+
+  static const char compiler_name[] = "GCC ";
+
+  /* This is struct COMPILESYM3 in binutils and Microsoft's cvinfo.h:
+
+ struct COMPILESYM3
+ {
+   uint16_t length;
+   uint16_t type;
+   uint32_t flags;
+   uint16_t machine;
+   uint16_t frontend_major;
+   uint16_t frontend_minor;
+   uint16_t frontend_build;
+   uint16_t frontend_qfe;
+   uint16_t backend_major;
+   uint16_t backend_minor;
+   uint16_t backend_build;
+   uint16_t backend_qfe;
+ } ATTRIBUTE_PACKED;
+  */
+
+  fputs (integer_asm_op (2, false), asm_out_file);
+  asm_fprintf (asm_out_file,
+  "%L" SYMBOL_END_LABEL "%u - %L" SYMBOL_START_LABEL "%u\n",
+  label_num, label_num);
+
+  targetm.asm_out.internal_label (asm_out_file, SYMBOL_START_LABEL, label_num);
+
+  fputs (integer_asm_op (2, false), asm_out_file);
+  fprint_whex (asm_out_file, S_COMPILE3);
+  putc ('\n', asm_out_file);
+
+  /* Microsoft has the flags as a bitfield, with the bottom 8 bits being the
+ language constant, and the rest being MSVC-specific stuff.  */
+  fputs (integer_asm_op (4, false), asm_out_file);
+  fprint_whex (asm_out_file, language_constant ());
+  putc ('\n', asm_out_file);
+
+  fputs (integer_asm_op (2, false), asm_out_file);
+  fprint_whex (asm_out_file, target_processor ());
+  putc ('\n', asm_out_file);
+
+  /* Write 8 uint16_ts for the frontend and backend versions.  As with GAS, we
+ zero these, as it's easier to record the version in the compiler
+ string.  */
+  for (unsigned int i = 0; i < 8; i++)
+{
+  fputs (integer_asm_op (2, false), asm_out_file);
+  fprint_whex (asm_out_file, 0);
+  putc ('\n', asm_out_file);
+}
+
+  ASM_OUTPUT_ASCII (asm_out_file, compiler_name, sizeof (compiler_name) - 1);
+  ASM_OUTPUT_ASCII (asm_out_file, version_string, strlen (version_string) + 1);
+
+  

[gcc r15-383] [PATCH v2 3/4] Output line numbers in CodeView section

2024-05-11 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:1f129e5e2b74c20a757f2809792af229b551b09b

commit r15-383-g1f129e5e2b74c20a757f2809792af229b551b09b
Author: Mark Harmstone 
Date:   Sat May 11 08:19:53 2024 -0600

[PATCH v2 3/4] Output line numbers in CodeView section

Outputs the DEBUG_S_LINES block in the CodeView .debug$S section, which
maps between line numbers and addresses.

You'll need a fairly recent version of GAS for the .secidx directive to
be recognized.

gcc/
* dwarf2codeview.cc (DEBUG_S_LINES, LINE_LABEL): Define.
(END_FUNC_LABEL): Likewise.
(struct codeview_line, codeview_line_block): New structures.
(codeview_function): Likewise.
(line_label_num, func_label_num, funcs, last_func): New variables.
(last_filename, last_file_id): Likewise.
(codeview_source_line, write_line_numbers): New functions.
(codeview_switch_text_section, codeview_end_epilogue): Likewise.
(codeview_debug_finish): Call write_line_numbers.
* dwarf2codeview.h (codeview_source_line): Prototype.
(codeview_switch_text_section, codeview_end_epilogue): Likewise.
* dwarf2out.cc (dwarf2_end_epilogue): Add codeview support.
(dwarf2out_switch_text_section): Likewise.
(dwarf2out_source_line): Likewise.
* opts.cc (finish_options): Handle codeview debugging symbols.

Diff:
---
 gcc/dwarf2codeview.cc | 303 ++
 gcc/dwarf2codeview.h  |   3 +
 gcc/dwarf2out.cc  |  15 +++
 gcc/opts.cc   |   2 +-
 4 files changed, 322 insertions(+), 1 deletion(-)

diff --git a/gcc/dwarf2codeview.cc b/gcc/dwarf2codeview.cc
index da8315310b50..9c69ebf89983 100644
--- a/gcc/dwarf2codeview.cc
+++ b/gcc/dwarf2codeview.cc
@@ -39,11 +39,15 @@ along with GCC; see the file COPYING3.  If not see
 
 #define CV_SIGNATURE_C13   4
 
+#define DEBUG_S_LINES  0xf2
 #define DEBUG_S_STRINGTABLE 0xf3
 #define DEBUG_S_FILECHKSMS  0xf4
 
 #define CHKSUM_TYPE_MD51
 
+#define LINE_LABEL "Lcvline"
+#define END_FUNC_LABEL "Lcvendfunc"
+
 #define HASH_SIZE 16
 
 struct codeview_string
@@ -91,11 +95,128 @@ struct codeview_source_file
   uint8_t hash[HASH_SIZE];
 };
 
+struct codeview_line
+{
+  codeview_line *next;
+  unsigned int line_no;
+  unsigned int label_num;
+};
+
+struct codeview_line_block
+{
+  codeview_line_block *next;
+  uint32_t file_id;
+  unsigned int num_lines;
+  codeview_line *lines, *last_line;
+};
+
+struct codeview_function
+{
+  codeview_function *next;
+  function *func;
+  unsigned int end_label;
+  codeview_line_block *blocks, *last_block;
+};
+
+static unsigned int line_label_num;
+static unsigned int func_label_num;
 static codeview_source_file *files, *last_file;
 static unsigned int num_files;
 static uint32_t string_offset = 1;
static hash_table<string_hasher> *strings_htab;
 static codeview_string *strings, *last_string;
+static codeview_function *funcs, *last_func;
+static const char* last_filename;
+static uint32_t last_file_id;
+
+/* Record new line number against the current function.  */
+
+void
+codeview_source_line (unsigned int line_no, const char *filename)
+{
+  codeview_line *l;
+  uint32_t file_id = last_file_id;
+  unsigned int label_num = ++line_label_num;
+
+  targetm.asm_out.internal_label (asm_out_file, LINE_LABEL, label_num);
+
+  if (!last_func || last_func->func != cfun)
+{
+  codeview_function *f = (codeview_function *)
+   xmalloc (sizeof (codeview_function));
+
+  f->next = NULL;
+  f->func = cfun;
+  f->end_label = 0;
+  f->blocks = f->last_block = NULL;
+
+  if (!funcs)
+   funcs = f;
+  else
+   last_func->next = f;
+
+  last_func = f;
+}
+
+  if (filename != last_filename)
+{
+  codeview_source_file *sf = files;
+
+  while (sf)
+   {
+ if (!strcmp (sf->filename, filename))
+   {
+ /* 0x18 is the size of the checksum entry for each file.
+0x6 bytes for the header, plus 0x10 bytes for the hash,
+then padded to a multiple of 4.  */
+
+ file_id = sf->file_num * 0x18;
+ last_filename = filename;
+ last_file_id = file_id;
+ break;
+   }
+
+ sf = sf->next;
+   }
+}
+
+  if (!last_func->last_block || last_func->last_block->file_id != file_id)
+{
+  codeview_line_block *b;
+
+  b = (codeview_line_block *) xmalloc (sizeof (codeview_line_block));
+
+  b->next = NULL;
+  b->file_id = file_id;
+  b->num_lines = 0;
+  b->lines = b->last_line = NULL;
+
+  if (!last_func->blocks)
+   last_func->blocks = b;
+  else
+   last_func->last_block->next = b;
+
+  last_func->last_block = b;
+}
+
+  if (last_func->last_block->last_line
+&& last_func->last_block->last_line->line_no == line_no)
+return;
+
+ 

[gcc r15-382] [PATCH v2 2/4] Output file checksums in CodeView section

2024-05-11 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:ed6690a0ca911138abd4d707510fd03ef188a28b

commit r15-382-ged6690a0ca911138abd4d707510fd03ef188a28b
Author: Mark Harmstone 
Date:   Sat May 11 08:15:43 2024 -0600

[PATCH v2 2/4] Output file checksums in CodeView section

Outputs the file name and MD5 hash of the main source file into the
CodeView .debug$S section, along with that of any #include'd files.

gcc/
* dwarf2codeview.cc (DEBUG_S_STRINGTABLE): Define.
(DEBUG_S_FILECHKSMS, CHKSUM_TYPE_MD5, HASH_SIZE): Likewise.
(codeview_string, codeview_source_file): New structures.
(struct string_hasher): New class for codeview_string hashing.
(files, last_file, num_files, string_offset): New variables.
(strings_htab, strings, last_string): Likewise.
(add_string, codeview_start_source_file): New functions.
(write_strings_table, write_source_files): Likewise.
(codeview_debug_finish): Call new functions.
* dwarf2codeview.h (codeview_start_source_file): Prototype.
* dwarf2out.cc (dwarf2out_start_source_file): Handle codeview.

Diff:
---
 gcc/dwarf2codeview.cc | 254 ++
 gcc/dwarf2codeview.h  |   1 +
 gcc/dwarf2out.cc  |   5 +
 3 files changed, 260 insertions(+)

diff --git a/gcc/dwarf2codeview.cc b/gcc/dwarf2codeview.cc
index f08f5d55ad7c..da8315310b50 100644
--- a/gcc/dwarf2codeview.cc
+++ b/gcc/dwarf2codeview.cc
@@ -39,6 +39,257 @@ along with GCC; see the file COPYING3.  If not see
 
 #define CV_SIGNATURE_C13   4
 
+#define DEBUG_S_STRINGTABLE 0xf3
+#define DEBUG_S_FILECHKSMS  0xf4
+
+#define CHKSUM_TYPE_MD51
+
+#define HASH_SIZE 16
+
+struct codeview_string
+{
+  codeview_string *next;
+  uint32_t offset;
+  char *string;
+};
+
+struct string_hasher : free_ptr_hash <codeview_string>
+{
+  typedef const char *compare_type;
+
+  static hashval_t hash (const codeview_string *x)
+  {
+return htab_hash_string (x->string);
+  }
+
+  static bool equal (const codeview_string *x, const char *y)
+  {
+return !strcmp (x->string, y);
+  }
+
+  static void mark_empty (codeview_string *x)
+  {
+if (x->string)
+  {
+   free (x->string);
+   x->string = NULL;
+  }
+  }
+
+  static void remove (codeview_string *x)
+  {
+free (x->string);
+  }
+};
+
+struct codeview_source_file
+{
+  codeview_source_file *next;
+  unsigned int file_num;
+  uint32_t string_offset;
+  char *filename;
+  uint8_t hash[HASH_SIZE];
+};
+
+static codeview_source_file *files, *last_file;
+static unsigned int num_files;
+static uint32_t string_offset = 1;
+static hash_table<string_hasher> *strings_htab;
+static codeview_string *strings, *last_string;
+
+/* Adds string to the string table, returning its offset.  If already present,
+   this returns the offset of the existing string.  */
+
+static uint32_t
+add_string (const char *string)
+{
+  codeview_string **slot;
+  codeview_string *s;
+  size_t len;
+
+  if (!strings_htab)
+strings_htab = new hash_table<string_hasher> (10);
+
+  slot = strings_htab->find_slot_with_hash (string, htab_hash_string (string),
+   INSERT);
+
+  if (*slot)
+return (*slot)->offset;
+
+  s = (codeview_string *) xmalloc (sizeof (codeview_string));
+  len = strlen (string);
+
+  s->next = NULL;
+
+  s->offset = string_offset;
+  string_offset += len + 1;
+
+  s->string = xstrdup (string);
+
+  if (last_string)
+last_string->next = s;
+  else
+strings = s;
+
+  last_string = s;
+
+  *slot = s;
+
+  return s->offset;
+}
+
+/* A new source file has been encountered - record the details and calculate
+   its hash.  */
+
+void
+codeview_start_source_file (const char *filename)
+{
+  codeview_source_file *sf;
+  char *path;
+  uint32_t string_offset;
+  FILE *f;
+
+  path = lrealpath (filename);
+  string_offset = add_string (path);
+  free (path);
+
+  sf = files;
+  while (sf)
+{
+  if (sf->string_offset == string_offset)
+   return;
+
+  sf = sf->next;
+}
+
+  sf = (codeview_source_file *) xmalloc (sizeof (codeview_source_file));
+  sf->next = NULL;
+  sf->file_num = num_files;
+  sf->string_offset = string_offset;
+  sf->filename = xstrdup (filename);
+
+  f = fopen (filename, "r");
+  if (!f)
+internal_error ("could not open %s for reading", filename);
+
+  if (md5_stream (f, sf->hash))
+{
+  fclose (f);
+  internal_error ("md5_stream failed");
+}
+
+  fclose (f);
+
+  if (last_file)
+last_file->next = sf;
+  else
+files = sf;
+
+  last_file = sf;
+  num_files++;
+}
+
+/* Write out the strings table into the .debug$S section.  The linker will
+   parse this, and handle the deduplication and hashing for all the object
+   files.  */
+
+static void
+write_strings_table (void)
+{
+  codeview_string *string;
+
+  fputs (integer_asm_op (4, false), asm_out_file);
+  fprint_whex (asm_out_file, DEBUG_S_STRINGTABLE);
+  putc ('\n', asm_out_file);
+
+  

[gcc r15-381] [PATCH v2 1/4] Support for CodeView debugging format

2024-05-11 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:36781ef8fd26eb9a0686957e7bac8f5ccc5ecc3f

commit r15-381-g36781ef8fd26eb9a0686957e7bac8f5ccc5ecc3f
Author: Mark Harmstone 
Date:   Sat May 11 08:08:50 2024 -0600

[PATCH v2 1/4] Support for CodeView debugging format

This patch and the following add initial support for Microsoft's
CodeView debugging format, as used by MSVC, to mingw targets.

Note that you will need a recent version of binutils for this to be
useful. The best way to view the output is to run Microsoft's
cvdump.exe, found in their microsoft-pdb repo on GitHub, against the
object files.

gcc/

* Makefile.in (OBJS): Add dwarf2codeview.o.
(GTFILES): Add dwarf2codeview.cc
* config/i386/cygming.h (CODEVIEW_DEBUGGING_INFO): Define.
* dwarf2codeview.cc: New file.
* dwarf2codeview.h: New file.
* dwarf2out.cc: Include dwarf2codeview.h.
(dwarf2out_finish): Call codeview_debug_finish as needed.
* flag-types.h (DINFO_TYPE_CODEVIEW): Add enum member.
(CODEVIEW_DEBUG): Define.
* flags.h (codeview_debuginfo_p): Prototype.
* opts.cc (debug_type_names): Add codeview.
(debug_type_masks): Add CODEVIEW_DEBUG.
(df_set_names): Add codeview.
(codeview_debuginfo_p): New function.
(dwarf_based_debuginfo_p): Add CODEVIEW clause.
(set_debug_level): Handle CODEVIEW_DEBUG.
* toplev.cc (process_options): Handle codeview.

gcc/testsuite
* gcc.dg/debug/codeview/codeview-1.c: New test.
* gcc.dg/debug/codeview/codeview.exp: New testsuite driver.

Diff:
---
 gcc/Makefile.in  |  2 +
 gcc/config/i386/cygming.h|  2 +
 gcc/dwarf2codeview.cc| 54 
 gcc/dwarf2codeview.h | 30 +
 gcc/dwarf2out.cc |  6 +++
 gcc/flag-types.h |  3 ++
 gcc/flags.h  |  4 ++
 gcc/opts.cc  | 23 +++---
 gcc/testsuite/gcc.dg/debug/codeview/codeview-1.c |  6 +++
 gcc/testsuite/gcc.dg/debug/codeview/codeview.exp | 48 +
 gcc/toplev.cc|  4 ++
 11 files changed, 177 insertions(+), 5 deletions(-)

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index ecd511463572..a7f15694c34b 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1443,6 +1443,7 @@ OBJS = \
dumpfile.o \
dwarf2asm.o \
dwarf2cfi.o \
+   dwarf2codeview.o \
dwarf2ctf.o \
dwarf2out.o \
early-remat.o \
@@ -2838,6 +2839,7 @@ GTFILES = $(CPPLIB_H) $(srcdir)/input.h 
$(srcdir)/coretypes.h \
   $(srcdir)/dwarf2out.h \
   $(srcdir)/dwarf2asm.cc \
   $(srcdir)/dwarf2cfi.cc \
+  $(srcdir)/dwarf2codeview.cc \
   $(srcdir)/dwarf2ctf.cc \
   $(srcdir)/dwarf2out.cc \
   $(srcdir)/ctfc.h \
diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h
index beedf7c398a5..98b375538e75 100644
--- a/gcc/config/i386/cygming.h
+++ b/gcc/config/i386/cygming.h
@@ -20,6 +20,8 @@ along with GCC; see the file COPYING3.  If not see
 
 #define DWARF2_DEBUGGING_INFO 1
 
+#define CODEVIEW_DEBUGGING_INFO 1
+
 #undef PREFERRED_DEBUGGING_TYPE
 #define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG
 
diff --git a/gcc/dwarf2codeview.cc b/gcc/dwarf2codeview.cc
new file mode 100644
index ..f08f5d55ad7c
--- /dev/null
+++ b/gcc/dwarf2codeview.cc
@@ -0,0 +1,54 @@
+/* Generate CodeView debugging info from the GCC DWARF.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* See gas/codeview.h in binutils for more about the constants and structs
+   listed below.  References to Microsoft files refer to Microsoft's PDB
+   repository: https://github.com/microsoft/microsoft-pdb.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "target.h"
+#include "output.h"
+#include "errors.h"
+#include "md5.h"
+#include "function.h"
+#include "version.h"
+#include "tree.h"
+#include "langhooks.h"
+#include "dwarf2out.h"
+#include "dwarf2codeview.h"
+
+#ifdef CODEVIEW_DEBUGGING_INFO
+
+#define CV_SIGNATURE_C13   4
+
+/* Finish CodeView debug 

Re: [PATCH v2 1/4] Support for CodeView debugging format

2024-05-11 Thread Jeff Law




On 10/30/23 6:28 PM, Mark Harmstone wrote:

This patch and the following add initial support for Microsoft's
CodeView debugging format, as used by MSVC, to mingw targets.

Note that you will need a recent version of binutils for this to be
useful. The best way to view the output is to run Microsoft's
cvdump.exe, found in their microsoft-pdb repo on GitHub, against the
object files.
So I'd hoped to have these wrapped up last year in time for gcc-14, but 
life got in the way.


The patches are fine for the trunk, though they are missing ChangeLog 
entries.  I'll cobble those together and push the series to the trunk.


Thanks for your patience.

jeff



Re: [to-be-committed][RISC-V] Improve extraction of inverted single bit

2024-05-10 Thread Jeff Law



On 5/10/24 4:28 PM, Jeff Law wrote:
So this patch fixes a minor code generation inefficiency that (IIRC) the 
RAU team discovered a while ago in spec.


If we want the inverted value of a single bit we can use bext to extract 
the bit, then seqz to invert the value (if viewed as a 0/1 truth value).


The RTL is fairly convoluted, but it's basically a right shift to get 
the bit into position, bitwise-not then masking off all but the low bit. 
  So it's a 3->2 combine, hidden by the fact that and-not is a 
define_insn_and_split, so it actually looks like a 2->2 combine.


We've run this through Ventana's internal CI (which includes 
zba_zbb_zbs) and I've run it in my own tester (rv64gc, rv32gcv).  I'll 
wait for the upstream CI to finish with positive results before pushing.

[ ... ]
Whoops, sent the wrong patch.  The downside of doing work on one system, 
but handling email from another :(


Here's the right patch.



gcc/
* config/riscv/bitmanip.md (*bextseqzdisi): New pattern.

gcc/testsuite/

* gcc.target/riscv/zbs-bext-2.c: New test.

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index d76a72d30e0..cf2fa04d4c4 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -711,6 +711,30 @@ (define_insn "*bext"
   "bext\t%0,%1,%2"
   [(set_attr "type" "bitmanip")])
 
+;; This is a bext followed by a seqz.  Normally this would be a 3->2 split
+;; But the and-not pattern with a constant operand is a define_insn_and_split,
+;; so this looks like a 2->2 split, which combine rejects.  So implement it
+;; as a define_insn_and_split as well.
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (and:DI
+ (not:DI
+   (subreg:DI
+ (lshiftrt:SI
+   (match_operand:SI 1 "register_operand" "r")
+   (match_operand:QI 2 "register_operand" "r")) 0))
+  (const_int 1)))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (and:DI (subreg:DI
+   (lshiftrt:SI (match_dup 1)
+(match_dup 2)) 0)
+ (const_int 1)))
+   (set (match_dup 0) (eq:DI (match_dup 0) (const_int 0)))]
+  ""
+  [(set_attr "type" "bitmanip")])
+
 ;; When performing `(a & (1UL << bitno)) ? 0 : -1` the combiner
 ;; usually has the `bitno` typed as X-mode (i.e. no further
 ;; zero-extension is performed around the bitno).
diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c 
b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
new file mode 100644
index 000..53f47dc3afe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+
+_Bool match(const int ch, int fMap) {
+return ((fMap & (1<<(ch))) == 0);
+}
+
+
+/* { dg-final { scan-assembler-times "bext\t" 1 } } */
+/* { dg-final { scan-assembler-times "seqz\t" 1 } } */
+/* { dg-final { scan-assembler-not "sraw\t" } } */
+/* { dg-final { scan-assembler-not "not\t" } } */
+/* { dg-final { scan-assembler-not "andi\t" } } */


Re: [wwwdocs] Add Cauldron2024

2024-05-10 Thread Jeff Law




On 5/7/24 4:34 AM, Jan Hubicka wrote:

Hi,
this adds Cauldron2024 to main page. OK?

OK, of course.

jeff



Re: [PATCH 4/4] RISC-V: Allow by-pieces to do overlapping accesses in block_move_straight

2024-05-10 Thread Jeff Law




On 5/7/24 11:17 PM, Christoph Müllner wrote:

The current implementation of riscv_block_move_straight() emits a couple
of loads/stores with maximum width (e.g. 8-byte for RV64).
The remainder is handed over to move_by_pieces().
The by-pieces framework utilizes target hooks to decide about the emitted
instructions (e.g. unaligned accesses or overlapping accesses).

Since the current implementation will always request less than XLEN bytes
to be handled by the by-pieces infrastructure, it is impossible that
overlapping memory accesses can ever be emitted (the by-pieces code does
not know of any previous instructions that were emitted by the backend).

This patch changes the implementation of riscv_block_move_straight()
such that it utilizes the by-pieces framework if the remaining data
is less than 2*XLEN bytes, which is sufficient to enable overlapping
memory accesses (if the requirements for them are given).

The changes in the expansion can be seen in the adjustments of the
cpymem-NN-ooo test cases. The changes in the cpymem-NN tests are
caused by the different instruction ordering of the code emitted
by the by-pieces infrastructure, which emits alternating load/store
sequences.

gcc/ChangeLog:

* config/riscv/riscv-string.cc (riscv_block_move_straight):
Hand over up to 2xXLEN bytes to move_by_pieces().

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cpymem-32-ooo.c: Adjustments for overlapping
access.
* gcc.target/riscv/cpymem-32.c: Adjustments for code emitted by
by-pieces.
* gcc.target/riscv/cpymem-64-ooo.c: Adjustments for overlapping
access.
* gcc.target/riscv/cpymem-64.c: Adjustments for code emitted by
by-pieces.

OK once any prereqs are in.

jeff



Re: [PATCH 3/4] RISC-V: tune: Add setting for overlapping mem ops to tuning struct

2024-05-10 Thread Jeff Law




On 5/7/24 11:17 PM, Christoph Müllner wrote:

This patch adds the field overlap_op_by_pieces to the struct
riscv_tune_param, which is used by the TARGET_OVERLAP_OP_BY_PIECES_P()
hook. This hook is used by the by-pieces infrastructure to decide
if overlapping memory accesses should be emitted.

The new property is set to false in all tune structs except for
generic-ooo.

The changes in the expansion can be seen in the adjustments of the
cpymem test cases. These tests also reveal a limitation in the
RISC-V cpymem expansion that prevents this optimization as only
by-pieces cpymem expansions emit overlapping memory accesses.

gcc/ChangeLog:

* config/riscv/riscv.cc (struct riscv_tune_param): New field
overlap_op_by_pieces.
(riscv_overlap_op_by_pieces): New function.
(TARGET_OVERLAP_OP_BY_PIECES_P): Connect to
riscv_overlap_op_by_pieces.
I think these are redundant with the changes I installed earlier this 
week :-)




gcc/testsuite/ChangeLog:

* gcc.target/riscv/cpymem-32-ooo.c: Adjust for overlapping
access.
* gcc.target/riscv/cpymem-64-ooo.c: Likewise.

OK once prereqs are in.

jeff



Re: [PATCH 2/4] RISC-V: Allow unaligned accesses in cpymemsi expansion

2024-05-10 Thread Jeff Law




On 5/7/24 11:17 PM, Christoph Müllner wrote:

The RISC-V cpymemsi expansion is called, whenever the by-pieces
infrastructure will not take care of the builtin expansion.
The by-pieces infrastructure may emit code that includes unaligned
accesses if riscv_slow_unaligned_access_p is false.

The RISC-V cpymemsi expansion is handled via riscv_expand_block_move().
The current implementation of this function does not check
riscv_slow_unaligned_access_p and never emits unaligned accesses.

Since by-pieces emits unaligned accesses, it is reasonable to implement
the same behaviour in the cpymemsi expansion. And that's what this patch
is doing.

The patch checks riscv_slow_unaligned_access_p at the entry and sets
the allowed alignment accordingly. This alignment is then propagated
down to the routines that emit the actual instructions.

The changes introduced by this patch can be seen in the adjustments
of the cpymem tests.

gcc/ChangeLog:

* config/riscv/riscv-string.cc (riscv_block_move_straight): Add
parameter align.
(riscv_adjust_block_mem): Replace parameter length by align.
(riscv_block_move_loop): Add parameter align.
(riscv_expand_block_move_scalar): Set alignment properly if the
target has fast unaligned access.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cpymem-32-ooo.c: Adjust for unaligned access.
* gcc.target/riscv/cpymem-64-ooo.c: Likewise.

Mostly ok.  One concern noted below.




Signed-off-by: Christoph Müllner 
---
  gcc/config/riscv/riscv-string.cc  | 53 +++
  .../gcc.target/riscv/cpymem-32-ooo.c  | 20 +--
  .../gcc.target/riscv/cpymem-64-ooo.c  | 14 -
  3 files changed, 59 insertions(+), 28 deletions(-)

@@ -730,8 +732,16 @@ riscv_expand_block_move_scalar (rtx dest, rtx src, rtx 
length)
unsigned HOST_WIDE_INT hwi_length = UINTVAL (length);
unsigned HOST_WIDE_INT factor, align;
  
-  align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD);

-  factor = BITS_PER_WORD / align;
+  if (riscv_slow_unaligned_access_p)
+{
+  align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD);
+  factor = BITS_PER_WORD / align;
+}
+  else
+{
+  align = hwi_length * BITS_PER_UNIT;
+  factor = 1;
+}
Not sure why you're using hwi_length here.  That's a property of the 
host, not the target.  ISTM you wanted BITS_PER_WORD here to encourage 
word sized moves irrespective of alignment.


OK with that change after a fresh rounding of testing.

jeff


[to-be-committed][RISC-V] Improve extraction of inverted single bit

2024-05-10 Thread Jeff Law
So this patch fixes a minor code generation inefficiency that (IIRC) the 
RAU team discovered a while ago in spec.


If we want the inverted value of a single bit we can use bext to extract 
the bit, then seqz to invert the value (if viewed as a 0/1 truth value).


The RTL is fairly convoluted, but it's basically a right shift to get 
the bit into position, bitwise-not then masking off all but the low bit. 
 So it's a 3->2 combine, hidden by the fact that and-not is a 
define_insn_and_split, so it actually looks like a 2->2 combine.


We've run this through Ventana's internal CI (which includes 
zba_zbb_zbs) and I've run it in my own tester (rv64gc, rv32gcv).  I'll 
wait for the upstream CI to finish with positive results before pushing.


Jeff

gcc/

* config/riscv/riscv.cc (riscv_build_integer_1): Recognize cases where
we can use shNadd to improve constant synthesis.
(riscv_move_integer): Handle code generation for shNadd.

gcc/testsuite
* gcc.target/riscv/synthesis-1.c: Also count shNadd instructions.
* gcc.target/riscv/synthesis-3.c: New test.


diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 2eac67b0ce0..75e828c81a7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -880,6 +880,37 @@ riscv_build_integer_1 (struct riscv_integer_op 
codes[RISCV_MAX_INTEGER_OPS],
}
 }
 
+  if (cost > 2 && TARGET_64BIT && TARGET_ZBA)
+{
+  if ((value % 9) == 0
+ && (alt_cost = riscv_build_integer_1 (alt_codes, value / 9, mode) + 
1) < cost)
+   {
+  alt_codes[alt_cost - 1].code = FMA;
+  alt_codes[alt_cost - 1].value = 9;
+  alt_codes[alt_cost - 1].use_uw = false;
+  memcpy (codes, alt_codes, sizeof (alt_codes));
+  cost = alt_cost;
+   }
+  if ((value % 5) == 0
+ && (alt_cost = riscv_build_integer_1 (alt_codes, value / 5, mode) + 
1) < cost)
+   {
+  alt_codes[alt_cost - 1].code = FMA;
+  alt_codes[alt_cost - 1].value = 5;
+  alt_codes[alt_cost - 1].use_uw = false;
+  memcpy (codes, alt_codes, sizeof (alt_codes));
+  cost = alt_cost;
+   }
+  if ((value % 3) == 0
+ && (alt_cost = riscv_build_integer_1 (alt_codes, value / 3, mode) + 
1) < cost)
+   {
+  alt_codes[alt_cost - 1].code = FMA;
+  alt_codes[alt_cost - 1].value = 3;
+  alt_codes[alt_cost - 1].use_uw = false;
+  memcpy (codes, alt_codes, sizeof (alt_codes));
+  cost = alt_cost;
+   }
+}
+
   /* Final cases, particularly focused on bseti.  */
   if (cost > 2 && TARGET_ZBS)
 {
@@ -2542,6 +2573,14 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT 
value,
  x = gen_rtx_fmt_ee (AND, mode, x, GEN_INT (value));
  x = riscv_emit_set (t, x);
}
+ else if (codes[i].code == FMA)
+   {
+ HOST_WIDE_INT value = exact_log2 (codes[i].value - 1);
+ rtx ashift = gen_rtx_fmt_ee (ASHIFT, mode, x, GEN_INT (value));
+ x = gen_rtx_fmt_ee (PLUS, mode, ashift, x);
+ rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
+ x = riscv_emit_set (t, x);
+   }
  else
x = gen_rtx_fmt_ee (codes[i].code, mode,
x, GEN_INT (codes[i].value));
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-1.c 
b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
index 3384e488ade..9176d5f4989 100644
--- a/gcc/testsuite/gcc.target/riscv/synthesis-1.c
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
@@ -12,7 +12,7 @@
total number of instructions. 
 
This isn't expected to change much and any change is worthy of a look.  */
-/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|slli)" 5822 } 
} */
+/* { dg-final { scan-assembler-times 
"\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 5822 } } */
 
  unsigned long foo_0x3(void) { return 0x3UL; }
  unsigned long foo_0x5(void) { return 0x5UL; }
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-3.c 
b/gcc/testsuite/gcc.target/riscv/synthesis-3.c
new file mode 100644
index 000..5d92ac8e309
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-3.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } 
*/
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions. 
+
+   This isn't expected to change much and any change is worthy of a 

[gcc r15-367] [RISC-V] Use shNadd for constant synthesis

2024-05-10 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:dbbd059b49edc936769737204f5c270d8d6ff553

commit r15-367-gdbbd059b49edc936769737204f5c270d8d6ff553
Author: Jeff Law 
Date:   Fri May 10 13:49:44 2024 -0600

[RISC-V] Use shNadd for constant synthesis

So here's the next idiom to improve constant synthesis.

The basic idea here is to try and use shNadd to generate the constant when 
profitable.

Let's take 0x300000801.  Right now that generates:

li  a0,3145728
addia0,a0,1
sllia0,a0,12
addia0,a0,-2047

But we can do better.  The constant is evenly divisible by 9, resulting in
0x55555639, which doesn't look terribly interesting.  But that constant can be
generated with two instructions, then we can use a sh3add to multiply it by 9.
So the updated sequence looks like:

li  a0,1431654400
addia0,a0,1593
sh3add  a0,a0,a0

This doesn't trigger a whole lot, but I haven't really set up a test to 
explore
the most likely space where this might be useful.  The tests were found
exploring a different class of constant synthesis problems.

If you were to dive into the before/after you'd see that the shNadd 
interacts
quite nicely with the recent bseti work.   The joys of recursion.

Probably the most controversial thing in here is using the "FMA" opcode to
stand in for when we want to use shNadd.  Essentially when we synthesize a
constant we generate a series of RTL opcodes and constants for emission by
another routine.   We don't really have a way to say we want a shift-add.  
But
you can think of shift-add as a limited form of multiply-accumulate.  It's a
bit of a stretch, but not crazy bad IMHO.

Other approaches would be to store our own enum rather than an RTL opcode.  
Or
store an actual generator function rather than any kind of opcode.

It wouldn't take much pushback over (ab)using FMA in this manner to get me 
to
use our own enums rather than RTL opcodes for this stuff.

gcc/

* config/riscv/riscv.cc (riscv_build_integer_1): Recognize cases 
where
we can use shNadd to improve constant synthesis.
(riscv_move_integer): Handle code generation for shNadd.

gcc/testsuite
* gcc.target/riscv/synthesis-1.c: Also count shNadd instructions.
* gcc.target/riscv/synthesis-3.c: New test.

Diff:
---
 gcc/config/riscv/riscv.cc| 42 +++
 gcc/testsuite/gcc.target/riscv/synthesis-1.c |  2 +-
 gcc/testsuite/gcc.target/riscv/synthesis-3.c | 81 
 3 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 2860137af718..9c98b1da0357 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -880,6 +880,40 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
}
 }
 
+  if (cost > 2 && TARGET_64BIT && TARGET_ZBA)
+{
+  if ((value % 9) == 0
+ && (alt_cost
+ = riscv_build_integer_1 (alt_codes, value / 9, mode) + 1) < cost)
+   {
+  alt_codes[alt_cost - 1].code = FMA;
+  alt_codes[alt_cost - 1].value = 9;
+  alt_codes[alt_cost - 1].use_uw = false;
+  memcpy (codes, alt_codes, sizeof (alt_codes));
+  cost = alt_cost;
+   }
+  if ((value % 5) == 0
+ && (alt_cost
+ = riscv_build_integer_1 (alt_codes, value / 5, mode) + 1) < cost)
+   {
+  alt_codes[alt_cost - 1].code = FMA;
+  alt_codes[alt_cost - 1].value = 5;
+  alt_codes[alt_cost - 1].use_uw = false;
+  memcpy (codes, alt_codes, sizeof (alt_codes));
+  cost = alt_cost;
+   }
+  if ((value % 3) == 0
+ && (alt_cost
+ = riscv_build_integer_1 (alt_codes, value / 3, mode) + 1) < cost)
+   {
+  alt_codes[alt_cost - 1].code = FMA;
+  alt_codes[alt_cost - 1].value = 3;
+  alt_codes[alt_cost - 1].use_uw = false;
+  memcpy (codes, alt_codes, sizeof (alt_codes));
+  cost = alt_cost;
+   }
+}
+
   /* Final cases, particularly focused on bseti.  */
   if (cost > 2 && TARGET_ZBS)
 {
@@ -2542,6 +2576,14 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
  x = gen_rtx_fmt_ee (AND, mode, x, GEN_INT (value));
  x = riscv_emit_set (t, x);
}
+ else if (codes[i].code == FMA)
+   {
+ HOST_WIDE_INT value = exact_log2 (codes[i].value - 1);
+ rtx ashift = gen_rtx_fmt_ee (ASHIFT, mode, x, GEN_INT (value));
+ x = gen_rtx_fmt_ee (PLUS, mode, ashift, x);
+ rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
+ x = riscv_emit_set (t, x);
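
As a cross-check on the example above, the two instruction sequences can be
replayed in plain C (a standalone sketch, not GCC internals; each statement
mirrors one instruction):

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  int main (void)
  {
    const int64_t value = 0x300000801;   /* the constant from the example */

    /* Old four-instruction sequence.  */
    int64_t old = 3145728;               /* li   a0,3145728  */
    old += 1;                            /* addi a0,a0,1     */
    old <<= 12;                          /* slli a0,a0,12    */
    old += -2047;                        /* addi a0,a0,-2047 */
    assert (old == value);

    /* New three-instruction sequence.  */
    int64_t t = 1431654400;              /* li   a0,1431654400 */
    t += 1593;                           /* addi a0,a0,1593    */
    assert (t == 0x55555639);            /* value / 9          */
    t = (t << 3) + t;                    /* sh3add a0,a0,a0 == t * 9 */
    assert (t == value);

    printf ("both sequences yield %#llx\n", (unsigned long long) value);
    return 0;
  }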

[RISC-V] Use shNadd for constant synthesis

2024-05-09 Thread Jeff Law

So here's the next idiom to improve constant synthesis.

The basic idea here is to try and use shNadd to generate the constant 
when profitable.


Let's take 0x300000801.  Right now that generates:

li  a0,3145728
addia0,a0,1
sllia0,a0,12
addia0,a0,-2047


But we can do better.  The constant is evenly divisible by 9, resulting
in 0x55555639, which doesn't look terribly interesting.  But that
constant can be generated with two instructions, then we can use a
sh3add to multiply it by 9.  So the updated sequence looks like:


li  a0,1431654400
addia0,a0,1593
sh3add  a0,a0,a0


This doesn't trigger a whole lot, but I haven't really set up a test to 
explore the most likely space where this might be useful.  The tests 
were found exploring a different class of constant synthesis problems.


If you were to dive into the before/after you'd see that the shNadd 
interacts quite nicely with the recent bseti work.   The joys of recursion.


Probably the most controversial thing in here is using the "FMA" opcode 
to stand in for when we want to use shNadd.  Essentially when we 
synthesize a constant we generate a series of RTL opcodes and constants 
for emission by another routine.   We don't really have a way to say we 
want a shift-add.  But you can think of shift-add as a limited form of 
multiply-accumulate.  It's a bit of a stretch, but not crazy bad IMHO.


Other approaches would be to store our own enum rather than an RTL 
opcode.  Or store an actual generator function rather than any kind of 
opcode.


It wouldn't take much pushback over (ab)using FMA in this manner to get 
me to use our own enums rather than RTL opcodes for this stuff.


Tested on rv64gc and rv32gcv.  Waiting on wider CI run before committing.

Jeff


gcc/

* config/riscv/riscv.cc (riscv_build_integer_1): Recognize cases where
we can use shNadd to improve constant synthesis.
(riscv_move_integer): Handle code generation for shNadd.

gcc/testsuite
* gcc.target/riscv/synthesis-1.c: Also count shNadd instructions.
* gcc.target/riscv/synthesis-3.c: New test.


diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 2eac67b0ce0..75e828c81a7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -880,6 +880,37 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	}
     }
 
+  if (cost > 2 && TARGET_64BIT && TARGET_ZBA)
+    {
+      if ((value % 9) == 0
+	  && (alt_cost = riscv_build_integer_1 (alt_codes, value / 9, mode) + 1) < cost)
+	{
+	  alt_codes[alt_cost - 1].code = FMA;
+	  alt_codes[alt_cost - 1].value = 9;
+	  alt_codes[alt_cost - 1].use_uw = false;
+	  memcpy (codes, alt_codes, sizeof (alt_codes));
+	  cost = alt_cost;
+	}
+      if ((value % 5) == 0
+	  && (alt_cost = riscv_build_integer_1 (alt_codes, value / 5, mode) + 1) < cost)
+	{
+	  alt_codes[alt_cost - 1].code = FMA;
+	  alt_codes[alt_cost - 1].value = 5;
+	  alt_codes[alt_cost - 1].use_uw = false;
+	  memcpy (codes, alt_codes, sizeof (alt_codes));
+	  cost = alt_cost;
+	}
+      if ((value % 3) == 0
+	  && (alt_cost = riscv_build_integer_1 (alt_codes, value / 3, mode) + 1) < cost)
+	{
+	  alt_codes[alt_cost - 1].code = FMA;
+	  alt_codes[alt_cost - 1].value = 3;
+	  alt_codes[alt_cost - 1].use_uw = false;
+	  memcpy (codes, alt_codes, sizeof (alt_codes));
+	  cost = alt_cost;
+	}
+    }
+
   /* Final cases, particularly focused on bseti.  */
   if (cost > 2 && TARGET_ZBS)
 {
@@ -2542,6 +2573,14 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
  x = gen_rtx_fmt_ee (AND, mode, x, GEN_INT (value));
  x = riscv_emit_set (t, x);
}
+ else if (codes[i].code == FMA)
+   {
+ HOST_WIDE_INT value = exact_log2 (codes[i].value - 1);
+ rtx ashift = gen_rtx_fmt_ee (ASHIFT, mode, x, GEN_INT (value));
+ x = gen_rtx_fmt_ee (PLUS, mode, ashift, x);
+ rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
+ x = riscv_emit_set (t, x);
+   }
  else
x = gen_rtx_fmt_ee (codes[i].code, mode,
x, GEN_INT (codes[i].value));
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-1.c 
b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
index 3384e488ade..9176d5f4989 100644
--- a/gcc/testsuite/gcc.target/riscv/synthesis-1.c
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
@@ -12,7 +12,7 @@
total number of instructions. 
 
This isn't expected to change much and any change is worthy of a look.  */
-/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|slli)" 5822 } } */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 5822 } } */

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [committed] [RISC-V] Provide splitting guidance to combine to facilitate shNadd.uw generation

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:13d1b47251a94a67d698d4283caf754382ee27ea

commit 13d1b47251a94a67d698d4283caf754382ee27ea
Author: Jeff Law 
Date:   Thu May 9 21:07:06 2024 -0600

[committed] [RISC-V] Provide splitting guidance to combine to facilitate shNadd.uw generation

This fixes a minor code quality issue I found while comparing GCC and LLVM.
Essentially we want to do a bit of re-association to generate shNadd.uw
instructions.

Combine does the right thing and finds all the necessary instructions,
reassociates the operands, combines constants, etc.  Where it fails is
finding a good split point.  The backend can trivially provide guidance
on how to split via a define_split pattern.

This has survived both Ventana's internal CI system (rv64gcb) and my own
(rv64gc, rv32gcv).

I'll wait for the external CI system to give the all-clear before pushing.

gcc/
* config/riscv/bitmanip.md: Add splitter for shadd feeding another
add instruction.

gcc/testsuite/

* gcc.target/riscv/zba-shadduw.c: New test.

(cherry picked from commit bfb88b1406cdd8d3f97e280b0d63529aa925f18a)

Diff:
---
 gcc/config/riscv/bitmanip.md | 17 ++
 gcc/testsuite/gcc.target/riscv/zba-shadduw.c | 35 
 2 files changed, 52 insertions(+)

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index ad3ad758959e..d76a72d30e02 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -184,6 +184,23 @@
   [(set_attr "type" "bitmanip")
(set_attr "mode" "DI")])
 
+;; Combine will reassociate the operands in the most useful way here.  We
+;; just have to give it guidance on where to split the result to facilitate
+;; shNadd.uw generation.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(plus:DI (plus:DI (and:DI (ashift:DI (match_operand:DI 1 "register_operand")
+					     (match_operand:QI 2 "imm123_operand"))
+				  (match_operand 3 "consecutive_bits32_operand"))
+			  (match_operand:DI 4 "register_operand"))
+		 (match_operand 5 "immediate_operand")))]
+  "TARGET_64BIT && TARGET_ZBA"
+  [(set (match_dup 0)
+   (plus:DI (and:DI (ashift:DI (match_dup 1) (match_dup 2))
+(match_dup 3))
+(match_dup 4)))
+   (set (match_dup 0) (plus:DI (match_dup 0) (match_dup 5)))])
+
 ;; ZBB extension.
 
 (define_expand "clzdi2"
diff --git a/gcc/testsuite/gcc.target/riscv/zba-shadduw.c 
b/gcc/testsuite/gcc.target/riscv/zba-shadduw.c
new file mode 100644
index ..5b77447e6813
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zba-shadduw.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=rv64gc_zba -mabi=lp64" } */
+
+typedef struct simple_bitmap_def
+{
+  unsigned char *popcount;
+  unsigned int n_bits;
+  unsigned int size;
+  unsigned long elms[1];
+} *sbitmap;
+typedef const struct simple_bitmap_def *const_sbitmap;
+
+typedef unsigned long *sbitmap_ptr;
+typedef const unsigned long *const_sbitmap_ptr;
+static unsigned long sbitmap_elt_popcount (unsigned long);
+
+void
+sbitmap_a_or_b (sbitmap dst, const_sbitmap a, const_sbitmap b)
+{
+  unsigned int i, n = dst->size;
+  sbitmap_ptr dstp = dst->elms;
+  const_sbitmap_ptr ap = a->elms;
+  const_sbitmap_ptr bp = b->elms;
+  unsigned char has_popcount = dst->popcount != ((void *) 0);
+
+  for (i = 0; i < n; i++)
+{
+  const unsigned long tmp = *ap++ | *bp++;
+  *dstp++ = tmp;
+}
+}
+
+
+/* { dg-final { scan-assembler "sh3add.uw" } } */
+/* { dg-final { scan-assembler-not {\mslli.uw} } } */
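
For a smaller trigger than the sbitmap loop above: under the same assumptions
(-O2, rv64 with Zba enabled), address arithmetic of the form base +
zero-extended index + constant should exercise this split, letting the
shift-add half become a sh3add.uw while the constant is added separately.
A hypothetical example, not part of the committed test:

  /* base + (uint64_t)idx * 8 + 8: combine reassociates this into a
     zero-extended shift-add plus a trailing constant, which is the
     shape the new define_split is meant to break apart.  */
  long *
  adjust (long *base, unsigned int idx)
  {
    return base + idx + 1;
  }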


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Make full-vec-move1.c test robust for optimization

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:788ed48d01b4fb85689ae1d7a0033cb05a48637f

commit 788ed48d01b4fb85689ae1d7a0033cb05a48637f
Author: Pan Li 
Date:   Thu May 9 10:56:46 2024 +0800

RISC-V: Make full-vec-move1.c test robust for optimization

While investigating support for early break autovec, we noticed that
the test full-vec-move1.c is optimized to 'return 0;' in the main
function body.  Because the value of the V-typed vector is a
compile-time constant, the second loop is treated as assert (true).

Thus, the ccp4 pass will eliminate these stmts and just return 0.

typedef int16_t V __attribute__((vector_size (128)));

int main ()
{
  V v;
  for (int i = 0; i < sizeof (v) / sizeof (v[0]); i++)
(v)[i] = i;

  V res = v;
  for (int i = 0; i < sizeof (v) / sizeof (v[0]); i++)
assert (res[i] == i); // will be optimized to assert (true)
}

This patch introduces an extern function that uses res[i], which
gets rid of the ccp4 optimization.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/full-vec-move1.c:
Introduce extern func use to get rid of ccp4 optimization.

Signed-off-by: Pan Li 
(cherry picked from commit b1520d2260c5e0cfcd7a4354fab70f66e2912ff2)

Diff:
---
 .../gcc.target/riscv/rvv/autovec/vls-vlmax/full-vec-move1.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/full-vec-move1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/full-vec-move1.c
index d73bad4af6f7..fae2ae91572f 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/full-vec-move1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/full-vec-move1.c
@@ -2,11 +2,12 @@
 /* { dg-additional-options "-std=c99 -O3 -march=rv64gcv_zvl128b -mabi=lp64d -fno-vect-cost-model -mrvv-vector-bits=zvl" } */
 
 #include <stdint.h>
-#include <assert.h>
 
 /* This would cause us to emit a vl1r.v for VNx4HImode even when
the hardware vector size vl > 64.  */
 
+extern int16_t test_element (int16_t);
+
 typedef int16_t V __attribute__((vector_size (128)));
 
 int main ()
@@ -14,9 +15,10 @@ int main ()
   V v;
   for (int i = 0; i < sizeof (v) / sizeof (v[0]); i++)
 (v)[i] = i;
+
   V res = v;
   for (int i = 0; i < sizeof (v) / sizeof (v[0]); i++)
-assert (res[i] == i);
+test_element (res[i]);
 }
 
 /* { dg-final { scan-assembler-not {vl[1248]r.v} } }  */
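
The trick generalizes beyond vectors: any value the optimizer can fully
compute stays observable if it is handed to a function the compiler cannot
see into.  A minimal sketch of the pattern, with hypothetical names:

  /* Defined in another translation unit, so nothing here can be
     proven dead and folded away entirely.  */
  extern void consume (int v);

  void
  keep_alive (void)
  {
    int acc = 0;
    for (int i = 0; i < 16; i++)
      acc += i;        /* entirely computable at compile time */
    consume (acc);     /* the call survives, so the value must be produced */
  }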


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Add tests for cpymemsi expansion

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:2bb25f97841524649fef9d58ce84ca71748e2f2b

commit 2bb25f97841524649fef9d58ce84ca71748e2f2b
Author: Christoph Müllner 
Date:   Thu Apr 11 12:07:10 2024 +0200

RISC-V: Add tests for cpymemsi expansion

cpymemsi expansion has been available for RISC-V since the initial port.
However, there are no tests to detect regressions.
This patch adds such tests.

Three of the tests target the expansion requirements (known length and
alignment). One test reuses an existing memcpy test from the by-pieces
framework (gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c).

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cpymemsi-1.c: New test.
* gcc.target/riscv/cpymemsi-2.c: New test.
* gcc.target/riscv/cpymemsi-3.c: New test.
* gcc.target/riscv/cpymemsi.c: New test.

Signed-off-by: Christoph Müllner 
(cherry picked from commit 4d38e88227ea48e559a2f354c0e62d372e181b82)

Diff:
---
 gcc/testsuite/gcc.target/riscv/cpymemsi-1.c |  9 ++
 gcc/testsuite/gcc.target/riscv/cpymemsi-2.c | 42 
 gcc/testsuite/gcc.target/riscv/cpymemsi-3.c | 43 +
 gcc/testsuite/gcc.target/riscv/cpymemsi.c   | 22 +++
 4 files changed, 116 insertions(+)

diff --git a/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c 
b/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
new file mode 100644
index ..983b564ccaf7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-march=rv32gc -save-temps -g0 -fno-lto" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc -save-temps -g0 -fno-lto" { target { rv64 } } } */
+/* { dg-additional-options "-DRUN_FRACTION=11" { target simulator } } */
+/* { dg-timeout-factor 2 } */
+
+#include "../../gcc.dg/memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
diff --git a/gcc/testsuite/gcc.target/riscv/cpymemsi-2.c 
b/gcc/testsuite/gcc.target/riscv/cpymemsi-2.c
new file mode 100644
index ..833d1c044876
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cpymemsi-2.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gc" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Og" "-Oz" } } */
+
+#include 
+#define aligned32 __attribute__ ((aligned (32)))
+
+const char myconst15[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst23[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst31[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7 };
+
+/* No expansion (unknown alignment) */
+#define MY_MEM_CPY_N(N)\
+void my_mem_cpy_##N (char *b1, const char *b2) \
+{  \
+  __builtin_memcpy (b1, b2, N);\
+}
+
+/* No expansion (unknown alignment) */
+#define MY_MEM_CPY_CONST_N(N)  \
+void my_mem_cpy_const_##N (char *b1)   \
+{  \
+  __builtin_memcpy (b1, myconst##N, sizeof(myconst##N));\
+}
+
+MY_MEM_CPY_N(15)
+MY_MEM_CPY_CONST_N(15)
+
+MY_MEM_CPY_N(23)
+MY_MEM_CPY_CONST_N(23)
+
+MY_MEM_CPY_N(31)
+MY_MEM_CPY_CONST_N(31)
+
+/* { dg-final { scan-assembler-times "\t(call|tail)\tmemcpy" 6 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cpymemsi-3.c 
b/gcc/testsuite/gcc.target/riscv/cpymemsi-3.c
new file mode 100644
index ..803765195b24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cpymemsi-3.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gc" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Og" "-Oz" } } */
+
+#include 
+#define aligned32 __attribute__ ((aligned (32)))
+
+const char myconst15[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst23[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst31[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7,
+0, 1, 2, 3, 4, 5, 6, 7 };
+
+#define MY_MEM_CPY_ALIGNED_N(N)\
+void my_mem_cpy_aligned_##N(char *b1, const char *b2)  \
+{  

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [PATCH v1 1/1] RISC-V: Nan-box the result of movbf on soft-bf16

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:b59bc760740eccd8f7b76d218ed759d9ae6604c8

commit b59bc760740eccd8f7b76d218ed759d9ae6604c8
Author: Xiao Zeng 
Date:   Wed May 8 14:00:58 2024 -0600

[PATCH v1 1/1] RISC-V: Nan-box the result of movbf on soft-bf16

1 This patch implements the Nan-box of bf16.

2 Please refer to the Nan-box implementation of hf16 in:



3 The discussion about Nan-box can be found on the website:



4 The below tests are passed for this patch
* The riscv fully regression tests.

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_legitimize_move): Expand movbf
with Nan-boxing value.
* config/riscv/riscv.md (*movbf_softfloat_boxing): New pattern.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/_Bfloat16-nanboxing.c: New test.

(cherry picked from commit ce51e6727c9d69bbab0e766c449e60fd41f5f2f9)

Diff:
---
 gcc/config/riscv/riscv.cc  | 52 --
 gcc/config/riscv/riscv.md  | 12 -
 .../gcc.target/riscv/_Bfloat16-nanboxing.c | 38 
 3 files changed, 77 insertions(+), 25 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 633b55f9707a..2eac67b0ce0a 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -3130,35 +3130,39 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx src)
 }
 
   /* In order to fit NaN boxing, expand
- (set FP_REG (reg:HF src))
+ (set FP_REG (reg:HF/BF src))
  to
  (set (reg:SI/DI mask) (const_int -65536)
- (set (reg:SI/DI temp) (zero_extend:SI/DI (subreg:HI (reg:HF src) 0)))
+ (set (reg:SI/DI temp) (zero_extend:SI/DI (subreg:HI (reg:HF/BF src) 0)))
  (set (reg:SI/DI temp) (ior:SI/DI (reg:SI/DI mask) (reg:SI/DI temp)))
- (set (reg:HF dest) (unspec:HF [ (reg:SI/DI temp) ] UNSPEC_FMV_SFP16_X))
+ (set (reg:HF/BF dest) (unspec:HF/BF[ (reg:SI/DI temp) ]
+   UNSPEC_FMV_SFP16_X/UNSPEC_FMV_SBF16_X))
  */
 
- if (TARGET_HARD_FLOAT
- && !TARGET_ZFHMIN && mode == HFmode
- && REG_P (dest) && FP_REG_P (REGNO (dest))
- && REG_P (src) && !FP_REG_P (REGNO (src))
- && can_create_pseudo_p ())
-   {
- rtx mask = force_reg (word_mode, gen_int_mode (-65536, word_mode));
- rtx temp = gen_reg_rtx (word_mode);
- emit_insn (gen_extend_insn (temp,
-simplify_gen_subreg (HImode, src, mode, 0),
-word_mode, HImode, 1));
- if (word_mode == SImode)
-   emit_insn (gen_iorsi3 (temp, mask, temp));
- else
-   emit_insn (gen_iordi3 (temp, mask, temp));
-
- riscv_emit_move (dest, gen_rtx_UNSPEC (HFmode, gen_rtvec (1, temp),
-   UNSPEC_FMV_SFP16_X));
-
- return true;
-   }
+  if (TARGET_HARD_FLOAT
+  && ((!TARGET_ZFHMIN && mode == HFmode)
+ || (!TARGET_ZFBFMIN && mode == BFmode))
+  && REG_P (dest) && FP_REG_P (REGNO (dest))
+  && REG_P (src) && !FP_REG_P (REGNO (src))
+  && can_create_pseudo_p ())
+{
+  rtx mask = force_reg (word_mode, gen_int_mode (-65536, word_mode));
+  rtx temp = gen_reg_rtx (word_mode);
+  emit_insn (gen_extend_insn (temp,
+ simplify_gen_subreg (HImode, src, mode, 0),
+ word_mode, HImode, 1));
+  if (word_mode == SImode)
+   emit_insn (gen_iorsi3 (temp, mask, temp));
+  else
+   emit_insn (gen_iordi3 (temp, mask, temp));
+
+  riscv_emit_move (dest,
+  gen_rtx_UNSPEC (mode, gen_rtvec (1, temp),
+  mode == HFmode ? UNSPEC_FMV_SFP16_X
+ : UNSPEC_FMV_SBF16_X));
+
+  return true;
+}
 
   /* We need to deal with constants that would be legitimate
  immediate_operands but aren't legitimate move_operands.  */
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 620a1b3bd32f..4d6de9925572 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -86,8 +86,9 @@
   ;; String unspecs
   UNSPEC_STRLEN
 
-  ;; Workaround for HFmode without hardware extension
+  ;; Workaround for HFmode and BFmode without hardware extension
   UNSPEC_FMV_SFP16_X
+  UNSPEC_FMV_SBF16_X
 
   ;; XTheadFmv moves
   UNSPEC_XTHEADFMV
@@ -1926,6 +1927,15 @@
   [(set_attr "type" "fmove")
(set_attr "mode" "SF")])
 
+(define_insn "*movbf_softfloat_boxing"
+  [(set (match_operand:BF 0 "register_operand"   "=f")
+   (unspec:BF [(match_operand:X 1 "register_operand" " r")]
+UNSPEC_FMV_SBF16_X))]
+  "!TARGET_ZFBFMIN"
+  "fmv.w.x\t%0,%1"
+  [(set_attr "type" "fmove")
+   (set_attr "mode" "SF")])
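
In scalar terms, the expansion quoted above just wraps the 16-bit payload in
all-ones upper bits before moving it into the FP register.  A plain-C sketch
of the mask/zero-extend/ior steps, assuming 32-bit word_mode:

  #include <stdint.h>

  /* Mirrors the expansion: mask = -65536, temp = zero-extended payload,
     result = mask | temp.  This is the bit pattern fmv.w.x then places
     in the FP register: a valid NaN box around the 16-bit value.  */
  static uint32_t
  nan_box_16 (uint16_t payload)
  {
    uint32_t mask = (uint32_t) -65536;   /* 0xffff0000 */
    uint32_t temp = payload;             /* zext HImode -> SImode */
    return mask | temp;                  /* ior */
  }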

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [RISC-V][V2] Fix incorrect if-then-else nesting of Zbs usage in constant synthesis

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:f273ad20d6b2b3f196a0c99a5a0c419cc13d862a

commit f273ad20d6b2b3f196a0c99a5a0c419cc13d862a
Author: Jeff Law 
Date:   Wed May 8 13:44:00 2024 -0600

[RISC-V][V2] Fix incorrect if-then-else nesting of Zbs usage in constant 
synthesis

Reposting without the patch that ignores whitespace.  The CI system doesn't
like including both patches; that would generate a failure to apply and none
of the tests would actually get run.

So I managed to goof the if-then-else level of the bseti bits last week.  
They
were supposed to be a last ditch effort to improve the result, but ended up
inside a conditional where they don't really belong.  I almost always use 
Zba,
Zbb and Zbs together, so it slipped by.

So it's NFC if you always test with Zbb and Zbs enabled together.  But if 
you
enabled Zbs without Zbb you'd see a failure to use bseti.

gcc/
* config/riscv/riscv.cc (riscv_build_integer_1): Fix incorrect
if-then-else nesting of Zbs code.

(cherry picked from commit 1c234097487927a4388ddcc690b63597bb3a90dc)

Diff:
---
 gcc/config/riscv/riscv.cc | 81 ---
 1 file changed, 41 insertions(+), 40 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 62207b6b2273..633b55f9707a 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -878,50 +878,51 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
  codes[1].use_uw = false;
  cost = 2;
}
-  /* Final cases, particularly focused on bseti.  */
-  else if (cost > 2 && TARGET_ZBS)
-   {
- int i = 0;
+}
 
- /* First handle any bits set by LUI.  Be careful of the
-SImode sign bit!.  */
- if (value & 0x7800)
-   {
- alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
- alt_codes[i].value = value & 0x7800;
- alt_codes[i].use_uw = false;
- value &= ~0x7800;
- i++;
-   }
+  /* Final cases, particularly focused on bseti.  */
+  if (cost > 2 && TARGET_ZBS)
+{
+  int i = 0;
 
- /* Next, any bits we can handle with addi.  */
- if (value & 0x7ff)
-   {
- alt_codes[i].code = (i == 0 ? UNKNOWN : PLUS);
- alt_codes[i].value = value & 0x7ff;
- alt_codes[i].use_uw = false;
- value &= ~0x7ff;
- i++;
-   }
+  /* First handle any bits set by LUI.  Be careful of the
+SImode sign bit!.  */
+  if (value & 0x7800)
+   {
+ alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
+ alt_codes[i].value = value & 0x7800;
+ alt_codes[i].use_uw = false;
+ value &= ~0x7800;
+  i++;
+   }
 
- /* And any residuals with bseti.  */
- while (i < cost && value)
-   {
- HOST_WIDE_INT bit = ctz_hwi (value);
- alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
- alt_codes[i].value = 1UL << bit;
- alt_codes[i].use_uw = false;
- value &= ~(1ULL << bit);
- i++;
-   }
+  /* Next, any bits we can handle with addi.  */
+  if (value & 0x7ff)
+   {
+ alt_codes[i].code = (i == 0 ? UNKNOWN : PLUS);
+ alt_codes[i].value = value & 0x7ff;
+ alt_codes[i].use_uw = false;
+ value &= ~0x7ff;
+ i++;
+   }
 
- /* If LUI+ADDI+BSETI resulted in a more efficient
-sequence, then use it.  */
- if (i < cost)
-   {
- memcpy (codes, alt_codes, sizeof (alt_codes));
- cost = i;
-   }
+  /* And any residuals with bseti.  */
+  while (i < cost && value)
+   {
+ HOST_WIDE_INT bit = ctz_hwi (value);
+ alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
+ alt_codes[i].value = 1UL << bit;
+ alt_codes[i].use_uw = false;
+ value &= ~(1ULL << bit);
+ i++;
+   }
+
+  /* If LUI+ADDI+BSETI resulted in a more efficient
+sequence, then use it.  */
+  if (i < cost)
+   {
+ memcpy (codes, alt_codes, sizeof (alt_codes));
+ cost = i;
}
 }
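
Read top to bottom, the relocated block is a three-stage greedy split: bits
coverable by LUI, then bits coverable by ADDI, then one bseti per leftover
set bit.  That last stage is the classic clear-lowest-set-bit loop; a
standalone C sketch of just that stage (assuming the LUI/ADDI-coverable bits
have already been masked out of value):

  #include <stdint.h>

  /* One bseti per remaining set bit, mirroring the ctz_hwi loop above:
     each iteration clears the lowest set bit and costs one insn.  */
  static int
  bseti_count (uint64_t residual)
  {
    int n = 0;
    while (residual)
      {
        residual &= residual - 1;   /* drop the lowest set bit */
        n++;
      }
    return n;
  }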


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Cover sign-extensions in lshr3_zero_extend_4

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:b6dc4a54639ee85a425f46b86e152d99d209ffa4

commit b6dc4a54639ee85a425f46b86e152d99d209ffa4
Author: Christoph Müllner 
Date:   Tue May 7 22:23:26 2024 +0200

RISC-V: Cover sign-extensions in lshr3_zero_extend_4

The lshr3_zero_extend_4 pattern targets bit extraction
with zero-extension.  This pattern represents the canonical form
of a zero-extension of a logical right shift.

The same optimization can be applied to sign-extensions.
Given the two optimizations are so similar, this patch converts
the existing one to cover the sign-extension case as well.

gcc/ChangeLog:

* config/riscv/iterators.md (ashiftrt): New code attribute
'extract_shift' and adding extractions to optab.
* config/riscv/riscv.md (*lshr3_zero_extend_4): Rename 
to...
(*3):...this and add support for
sign-extensions.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/extend-shift-helpers.h: Add helpers for
sign-extension.
* gcc.target/riscv/sign-extend-rshift-32.c: New test.
* gcc.target/riscv/sign-extend-rshift-64.c: New test.
* gcc.target/riscv/sign-extend-rshift.c: New test.

Signed-off-by: Christoph Müllner 
(cherry picked from commit 3ee30d7981987b86bd6a9a2675e26fadec48e5cd)

Diff:
---
 gcc/config/riscv/iterators.md  |   4 +
 gcc/config/riscv/riscv.md  |  25 +++--
 .../gcc.target/riscv/extend-shift-helpers.h|  20 
 .../gcc.target/riscv/sign-extend-rshift-32.c   |  17 +++
 .../gcc.target/riscv/sign-extend-rshift-64.c   |  17 +++
 .../gcc.target/riscv/sign-extend-rshift.c  | 123 +
 6 files changed, 198 insertions(+), 8 deletions(-)

diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
index c5ca01f382a9..8a9d1986b4ae 100644
--- a/gcc/config/riscv/iterators.md
+++ b/gcc/config/riscv/iterators.md
@@ -155,6 +155,8 @@
 (define_code_iterator any_extract [sign_extract zero_extract])
 (define_code_attr extract_sidi_shift [(sign_extract "sraiw")
  (zero_extract "srliw")])
+(define_code_attr extract_shift [(sign_extract "ashiftrt")
+(zero_extract "lshiftrt")])
 
 ;; This code iterator allows the two right shift instructions to be
 ;; generated from the same template.
@@ -261,6 +263,8 @@
 (us_minus "ussub")
 (sign_extend "extend")
 (zero_extend "zero_extend")
+(sign_extract "extract")
+(zero_extract "zero_extract")
 (fix "fix_trunc")
 (unsigned_fix "fixuns_trunc")])
 
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 58bf77122779..620a1b3bd32f 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2793,24 +2793,33 @@
   [(set_attr "type" "shift")
(set_attr "mode" "SI")])
 
-;; Canonical form for a zero-extend of a logical right shift.
-;; Special cases are handled above.
-;; Skip for single-bit extraction (Zbs/XTheadBs) and th.extu (XTheadBb)
-(define_insn_and_split "*lshr3_zero_extend_4"
+;; Canonical form for a extend of a logical shift right (sign/zero extraction).
+;; Special cases, that are ignored (handled elsewhere):
+;; * Single-bit extraction (Zbs/XTheadBs)
+;; * Single-bit extraction (Zicondops/XVentanaCondops)
+;; * Single-bit extraction (SFB)
+;; * Extraction instruction th.ext(u) (XTheadBb)
+;; * lshrsi3_extend_2 (see above)
+(define_insn_and_split "*3"
   [(set (match_operand:GPR 0 "register_operand" "=r")
-(zero_extract:GPR
+(any_extract:GPR
(match_operand:GPR 1 "register_operand" " r")
(match_operand 2 "const_int_operand")
(match_operand 3 "const_int_operand")))
(clobber (match_scratch:GPR  4 "="))]
-  "!((TARGET_ZBS || TARGET_XTHEADBS) && (INTVAL (operands[2]) == 1))
-   && !TARGET_XTHEADBB"
+  "!((TARGET_ZBS || TARGET_XTHEADBS || TARGET_ZICOND
+  || TARGET_XVENTANACONDOPS || TARGET_SFB_ALU)
+ && (INTVAL (operands[2]) == 1))
+   && !TARGET_XTHEADBB
+   && !(TARGET_64BIT
+&& (INTVAL (operands[3]) > 0)
+&& (INTVAL (operands[2]) + INTVAL (operands[3]) == 32))"
   "#"
   "&& reload_completed"
   [(set (match_dup 4)
  (ashift:GPR (match_dup 1) (match_dup 2)))
(set (match_dup 0)
- (lshiftrt:GPR (match_dup 4) (match_dup 3)))]
+ (:GPR (match_dup 4) (match_dup 3)))]
 {
   int regbits = GET_MODE_BITSIZE (GET_MODE (operands[0])).to_constant ();
   int sizebits = INTVAL (operands[2]);
diff --git a/gcc/testsuite/gcc.target/riscv/extend-shift-helpers.h 
b/gcc/testsuite/gcc.target/riscv/extend-shift-helpers.h
index 4853fe490d8e..720672de2426 100644
--- a/gcc/testsuite/gcc.target/riscv/extend-shift-helpers.h
+++ 

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Add zero_extract support for rv64gc

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:38fc117452afa21fac0ca3f743fc09d35c3f8c5c

commit 38fc117452afa21fac0ca3f743fc09d35c3f8c5c
Author: Christoph Müllner 
Date:   Mon May 6 12:33:32 2024 +0200

RISC-V: Add zero_extract support for rv64gc

The combiner attempts to optimize a zero-extension of a logical right shift
using zero_extract.  We already utilize this optimization for those cases
that result in a single instruction.  Let's add an insn_and_split
pattern that also matches the generic case, where we can emit an
optimized slli/srli sequence.

Tested with SPEC CPU 2017 (rv64gc).

PR target/111501

gcc/ChangeLog:

* config/riscv/riscv.md (*lshr3_zero_extend_4): New
pattern for zero-extraction.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/extend-shift-helpers.h: New test.
* gcc.target/riscv/pr111501.c: New test.
* gcc.target/riscv/zero-extend-rshift-32.c: New test.
* gcc.target/riscv/zero-extend-rshift-64.c: New test.
* gcc.target/riscv/zero-extend-rshift.c: New test.

Signed-off-by: Christoph Müllner 
(cherry picked from commit 3b9c760072c7792cbae6f38894756d2b96c2fd8c)

Diff:
---
 gcc/config/riscv/riscv.md  |  30 ++
 .../gcc.target/riscv/extend-shift-helpers.h|  26 +
 gcc/testsuite/gcc.target/riscv/pr111501.c  |  21 
 .../gcc.target/riscv/zero-extend-rshift-32.c   |  13 +++
 .../gcc.target/riscv/zero-extend-rshift-64.c   |  17 +++
 .../gcc.target/riscv/zero-extend-rshift.c  | 115 +
 6 files changed, 222 insertions(+)

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index b7fc13e4e611..58bf77122779 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2793,6 +2793,36 @@
   [(set_attr "type" "shift")
(set_attr "mode" "SI")])
 
+;; Canonical form for a zero-extend of a logical right shift.
+;; Special cases are handled above.
+;; Skip for single-bit extraction (Zbs/XTheadBs) and th.extu (XTheadBb)
+(define_insn_and_split "*lshr3_zero_extend_4"
+  [(set (match_operand:GPR 0 "register_operand" "=r")
+(zero_extract:GPR
+   (match_operand:GPR 1 "register_operand" " r")
+   (match_operand 2 "const_int_operand")
+   (match_operand 3 "const_int_operand")))
+   (clobber (match_scratch:GPR  4 "="))]
+  "!((TARGET_ZBS || TARGET_XTHEADBS) && (INTVAL (operands[2]) == 1))
+   && !TARGET_XTHEADBB"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+ (ashift:GPR (match_dup 1) (match_dup 2)))
+   (set (match_dup 0)
+ (lshiftrt:GPR (match_dup 4) (match_dup 3)))]
+{
+  int regbits = GET_MODE_BITSIZE (GET_MODE (operands[0])).to_constant ();
+  int sizebits = INTVAL (operands[2]);
+  int startbits = INTVAL (operands[3]);
+  int lshamt = regbits - sizebits - startbits;
+  int rshamt = lshamt + startbits;
+  operands[2] = GEN_INT (lshamt);
+  operands[3] = GEN_INT (rshamt);
+}
+  [(set_attr "type" "shift")
+   (set_attr "mode" "")])
+
 ;; Handle AND with 2^N-1 for N from 12 to XLEN.  This can be split into
 ;; two logical shifts.  Otherwise it requires 3 instructions: lui,
 ;; xor/addi/srli, and.
diff --git a/gcc/testsuite/gcc.target/riscv/extend-shift-helpers.h 
b/gcc/testsuite/gcc.target/riscv/extend-shift-helpers.h
new file mode 100644
index ..4853fe490d8e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/extend-shift-helpers.h
@@ -0,0 +1,26 @@
+#ifndef EXTEND_SHIFT_HELPERS_H
+#define EXTEND_SHIFT_HELPERS_H
+
+#define RT_EXT_CT_RSHIFT_N_AT(RTS,RT,CTS,CT,N,ATS,AT)  \
+RTS RT \
+RTS##_##RT##_ext_##CTS##_##CT##_rshift_##N##_##ATS##_##AT(ATS AT v)\
+{  \
+return (CTS CT)(v >> N);   \
+}
+
+#define ULONG_EXT_USHORT_RSHIFT_N_ULONG(N) \
+   RT_EXT_CT_RSHIFT_N_AT(unsigned,long,unsigned,short,N,unsigned,long)
+
+#define ULONG_EXT_UINT_RSHIFT_N_ULONG(N) \
+   RT_EXT_CT_RSHIFT_N_AT(unsigned,long,unsigned,int,N,unsigned,long)
+
+#define UINT_EXT_USHORT_RSHIFT_N_UINT(N) \
+   RT_EXT_CT_RSHIFT_N_AT(unsigned,int,unsigned,short,N,unsigned,int)
+
+#define UINT_EXT_USHORT_RSHIFT_N_ULONG(N) \
+   RT_EXT_CT_RSHIFT_N_AT(unsigned,int,unsigned,short,N,unsigned,long)
+
+#define ULONG_EXT_USHORT_RSHIFT_N_UINT(N) \
+   RT_EXT_CT_RSHIFT_N_AT(unsigned,long,unsigned,short,N,unsigned,int)
+
+#endif /* EXTEND_SHIFT_HELPERS_H */
diff --git a/gcc/testsuite/gcc.target/riscv/pr111501.c 
b/gcc/testsuite/gcc.target/riscv/pr111501.c
new file mode 100644
index ..db48c34ce9af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr111501.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* { dg-options "-march=rv64gc" { target { rv64 } } } */
+/* { dg-skip-if "" { 
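
The shift arithmetic in these extraction splits reads more easily in plain C:
shift the field up against the MSB, then shift it back down, with the
signedness of the right shift choosing zero- versus sign-extension.  A sketch
with xlen fixed at 64 (caveat: right-shifting a negative signed value is
implementation-defined in ISO C, though arithmetic on all relevant targets):

  #include <stdint.h>

  /* lshamt = 64 - size - start and rshamt = lshamt + start, exactly the
     operands[2]/operands[3] computation in the split above.  */
  static uint64_t
  extract_zero (uint64_t x, int start, int size)
  {
    int lshamt = 64 - size - start;
    int rshamt = lshamt + start;
    return (x << lshamt) >> rshamt;              /* slli; srli */
  }

  static int64_t
  extract_sign (uint64_t x, int start, int size)
  {
    int lshamt = 64 - size - start;
    int rshamt = lshamt + start;
    return ((int64_t) (x << lshamt)) >> rshamt;  /* slli; srai */
  }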

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Cover sign-extensions in lshrsi3_zero_extend_2

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:9041a047fe957232d9f9127791a08643b1087a36

commit 9041a047fe957232d9f9127791a08643b1087a36
Author: Christoph Müllner 
Date:   Tue May 7 23:26:02 2024 +0200

RISC-V: Cover sign-extensions in lshrsi3_zero_extend_2

The pattern lshrsi3_zero_extend_2 extracts the MSB bits of the lower
32-bit word and zero-extends it back to DImode.
This is realized using srliw, which operates on 32-bit registers.

The same optimization can be applied to sign-extensions when emitting
a sraiw instead of the srliw.

Given these two optimizations are so similar, this patch simply
converts the existing one to cover the sign-extension case as well.

gcc/ChangeLog:

* config/riscv/iterators.md (sraiw): New code iterator 
'any_extract'.
New code attribute 'extract_sidi_shift'.
* config/riscv/riscv.md (*lshrsi3_zero_extend_2): Rename to...
(*lshrsi3_extend_2):...this and add support for sign-extensions.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/sign-extend-1.c: Test sraiw 24 and sraiw 16.

Signed-off-by: Christoph Müllner 
(cherry picked from commit 4e46a3537ff57938a0d98fa524ac2fff8b08ae6d)

Diff:
---
 gcc/config/riscv/iterators.md  |  6 ++
 gcc/config/riscv/riscv.md  |  9 +
 gcc/testsuite/gcc.target/riscv/sign-extend-1.c | 14 ++
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
index 32e1b1403051..c5ca01f382a9 100644
--- a/gcc/config/riscv/iterators.md
+++ b/gcc/config/riscv/iterators.md
@@ -150,6 +150,12 @@
 ;; to use the same template.
 (define_code_iterator any_extend [sign_extend zero_extend])
 
+;; These code iterators allow unsigned and signed extraction to be generated
+;; from the same template.
+(define_code_iterator any_extract [sign_extract zero_extract])
+(define_code_attr extract_sidi_shift [(sign_extract "sraiw")
+ (zero_extract "srliw")])
+
 ;; This code iterator allows the two right shift instructions to be
 ;; generated from the same template.
 (define_code_iterator any_shiftrt [ashiftrt lshiftrt])
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 24558682eb8f..b7fc13e4e611 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2765,16 +2765,17 @@
   [(set_attr "type" "shift")
(set_attr "mode" "SI")])
 
-;; Canonical form for a zero-extend of a logical right shift.
-(define_insn "*lshrsi3_zero_extend_2"
+;; Canonical form for a sign/zero-extend of a logical right shift.
+;; Special case: extract MSB bits of lower 32-bit word
+(define_insn "*lshrsi3_extend_2"
   [(set (match_operand:DI   0 "register_operand" "=r")
-   (zero_extract:DI (match_operand:DI  1 "register_operand" " r")
+   (any_extract:DI (match_operand:DI  1 "register_operand" " r")
 (match_operand 2 "const_int_operand")
 (match_operand 3 "const_int_operand")))]
   "(TARGET_64BIT && (INTVAL (operands[3]) > 0)
 && (INTVAL (operands[2]) + INTVAL (operands[3]) == 32))"
 {
-  return "srliw\t%0,%1,%3";
+  return "\t%0,%1,%3";
 }
   [(set_attr "type" "shift")
(set_attr "mode" "SI")])
diff --git a/gcc/testsuite/gcc.target/riscv/sign-extend-1.c 
b/gcc/testsuite/gcc.target/riscv/sign-extend-1.c
index e9056ec0d424..d8c18dd1aaa7 100644
--- a/gcc/testsuite/gcc.target/riscv/sign-extend-1.c
+++ b/gcc/testsuite/gcc.target/riscv/sign-extend-1.c
@@ -9,6 +9,20 @@ foo1 (int i)
 }
 /* { dg-final { scan-assembler "sraiw\ta\[0-9\],a\[0-9\],31" } } */
 
+signed char
+sub2 (long i)
+{
+  return i >> 24;
+}
+/* { dg-final { scan-assembler "sraiw\ta\[0-9\],a\[0-9\],24" } } */
+
+signed short
+sub3 (long i)
+{
+  return i >> 16;
+}
+/* { dg-final { scan-assembler "sraiw\ta\[0-9\],a\[0-9\],16" } } */
+
 /* { dg-final { scan-assembler-not "srai\t" } } */
 /* { dg-final { scan-assembler-not "srli\t" } } */
 /* { dg-final { scan-assembler-not "srliw\t" } } */


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Add test for sraiw-31 special case

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:9a81321828844a7b663c78a9415770a247980e71

commit 9a81321828844a7b663c78a9415770a247980e71
Author: Christoph Müllner 
Date:   Tue May 7 22:59:44 2024 +0200

RISC-V: Add test for sraiw-31 special case

We already optimize a sign-extension of a right-shift by 31 in
si3_extend.  Let's add a test for that (similar to
zero-extend-1.c).

gcc/testsuite/ChangeLog:

* gcc.target/riscv/sign-extend-1.c: New test.

Signed-off-by: Christoph Müllner 
(cherry picked from commit dd388198b8be52ab378c935fc517a269e0ba741c)

Diff:
---
 gcc/testsuite/gcc.target/riscv/sign-extend-1.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/gcc/testsuite/gcc.target/riscv/sign-extend-1.c 
b/gcc/testsuite/gcc.target/riscv/sign-extend-1.c
new file mode 100644
index ..e9056ec0d424
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sign-extend-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { riscv64*-*-* } } } */
+/* { dg-options "-march=rv64gc -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } {"-O0" "-Os" "-Og" "-Oz" "-flto" } } */
+
+signed long
+foo1 (int i)
+{
+  return i >> 31;
+}
+/* { dg-final { scan-assembler "sraiw\ta\[0-9\],a\[0-9\],31" } } */
+
+/* { dg-final { scan-assembler-not "srai\t" } } */
+/* { dg-final { scan-assembler-not "srli\t" } } */
+/* { dg-final { scan-assembler-not "srliw\t" } } */


[gcc r15-354] [committed] [RISC-V] Provide splitting guidance to combine to facilitate shNadd.uw generation

2024-05-09 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:bfb88b1406cdd8d3f97e280b0d63529aa925f18a

commit r15-354-gbfb88b1406cdd8d3f97e280b0d63529aa925f18a
Author: Jeff Law 
Date:   Thu May 9 21:07:06 2024 -0600

[committed] [RISC-V] Provide splitting guidance to combine to facilitate shNadd.uw generation

This fixes a minor code quality issue I found while comparing GCC and LLVM.
Essentially we want to do a bit of re-association to generate shNadd.uw
instructions.

Combine does the right thing and finds all the necessary instructions,
reassociates the operands, combines constants, etc.  Where it fails is
finding a good split point.  The backend can trivially provide guidance
on how to split via a define_split pattern.

This has survived both Ventana's internal CI system (rv64gcb) and my own
(rv64gc, rv32gcv).

I'll wait for the external CI system to give the all-clear before pushing.

gcc/
* config/riscv/bitmanip.md: Add splitter for shadd feeding another
add instruction.

gcc/testsuite/

* gcc.target/riscv/zba-shadduw.c: New test.

Diff:
---
 gcc/config/riscv/bitmanip.md | 17 ++
 gcc/testsuite/gcc.target/riscv/zba-shadduw.c | 35 
 2 files changed, 52 insertions(+)

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index ad3ad758959e..d76a72d30e02 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -184,6 +184,23 @@
   [(set_attr "type" "bitmanip")
(set_attr "mode" "DI")])
 
+;; Combine will reassociate the operands in the most useful way here.  We
+;; just have to give it guidance on where to split the result to facilitate
+;; shNadd.uw generation.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(plus:DI (plus:DI (and:DI (ashift:DI (match_operand:DI 1 "register_operand")
+					     (match_operand:QI 2 "imm123_operand"))
+				  (match_operand 3 "consecutive_bits32_operand"))
+			  (match_operand:DI 4 "register_operand"))
+		 (match_operand 5 "immediate_operand")))]
+  "TARGET_64BIT && TARGET_ZBA"
+  [(set (match_dup 0)
+   (plus:DI (and:DI (ashift:DI (match_dup 1) (match_dup 2))
+(match_dup 3))
+(match_dup 4)))
+   (set (match_dup 0) (plus:DI (match_dup 0) (match_dup 5)))])
+
 ;; ZBB extension.
 
 (define_expand "clzdi2"
diff --git a/gcc/testsuite/gcc.target/riscv/zba-shadduw.c 
b/gcc/testsuite/gcc.target/riscv/zba-shadduw.c
new file mode 100644
index ..5b77447e6813
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zba-shadduw.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=rv64gc_zba -mabi=lp64" } */
+
+typedef struct simple_bitmap_def
+{
+  unsigned char *popcount;
+  unsigned int n_bits;
+  unsigned int size;
+  unsigned long elms[1];
+} *sbitmap;
+typedef const struct simple_bitmap_def *const_sbitmap;
+
+typedef unsigned long *sbitmap_ptr;
+typedef const unsigned long *const_sbitmap_ptr;
+static unsigned long sbitmap_elt_popcount (unsigned long);
+
+void
+sbitmap_a_or_b (sbitmap dst, const_sbitmap a, const_sbitmap b)
+{
+  unsigned int i, n = dst->size;
+  sbitmap_ptr dstp = dst->elms;
+  const_sbitmap_ptr ap = a->elms;
+  const_sbitmap_ptr bp = b->elms;
+  unsigned char has_popcount = dst->popcount != ((void *) 0);
+
+  for (i = 0; i < n; i++)
+{
+  const unsigned long tmp = *ap++ | *bp++;
+  *dstp++ = tmp;
+}
+}
+
+
+/* { dg-final { scan-assembler "sh3add.uw" } } */
+/* { dg-final { scan-assembler-not {\mslli.uw} } } */

