RE: [PATCH v1] RISC-V: Refine the code gen for ceil auto vectorization.

2023-09-22 Thread Li, Pan2
Sure thing, will send V2 for this.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Friday, September 22, 2023 7:26 PM
To: Li, Pan2 ; gcc-patches 
Cc: Li, Pan2 ; Wang, Yanzhang ; 
kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Refine the code gen for ceil auto vectorization.

I prefer change expand_vec_copysign into emit_vec_copysign。

Likewise, emit_fabs. ...etc.


juzhe.zh...@rivai.ai<mailto:juzhe.zh...@rivai.ai>

From: pan2.li<mailto:pan2...@intel.com>
Date: 2023-09-22 19:19
To: gcc-patches<mailto:gcc-patches@gcc.gnu.org>
CC: juzhe.zhong<mailto:juzhe.zh...@rivai.ai>; 
pan2.li<mailto:pan2...@intel.com>; 
yanzhang.wang<mailto:yanzhang.w...@intel.com>; 
kito.cheng<mailto:kito.ch...@gmail.com>
Subject: [PATCH v1] RISC-V: Refine the code gen for ceil auto vectorization.
From: Pan Li mailto:pan2...@intel.com>>

We vectorized below ceil code already.

void
test_ceil (float *out, float *in, int count)
{
  for (unsigned i = 0; i < count; i++)
out[i] = __builtin_ceilf (in[i]);
}

Before this patch:
vfmv.v.xv4,fa0 // can be removed
vfabs.v v0,v1
vmv1r.v v2,v1  // can be removed
vmflt.vvv0,v0,v4   // can be refined to vmflt.vf
vfcvt.x.f.v v3,v1,v0.t
vfcvt.f.x.v v2,v3,v0.t
vfsgnj.vv   v2,v2,v1

After this patch:
vfabs.v v1,v2
vmflt.vfv0,v1,fa5
vfcvt.x.f.v v3,v2,v0.t
vfcvt.f.x.v v1,v3,v0.t
vfsgnj.vv   v1,v1,v2

We can generate better code include below items.

* Remove vfmv.v.f.
* Take vmflt.vf instead of vmflt.vv.
* Remove vmv1r.v.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (expand_vec_float_cmp_mask): Refactor.
(expand_vec_abs): New function impl.
(expand_vec_cvt_x_f): Ditto.
(expand_vec_cvt_f_x): Ditto.
(expand_vec_ceil): Refine.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/unop/math-ceil-0.c: Adjust body check.
* gcc.target/riscv/rvv/autovec/unop/math-ceil-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/unop/math-ceil-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/unop/math-ceil-3.c: Ditto.

Signed-off-by: Pan Li mailto:pan2...@intel.com>>
---
gcc/config/riscv/riscv-v.cc   | 71 ---
.../riscv/rvv/autovec/unop/math-ceil-0.c  |  5 +-
.../riscv/rvv/autovec/unop/math-ceil-1.c  |  5 +-
.../riscv/rvv/autovec/unop/math-ceil-2.c  |  5 +-
.../riscv/rvv/autovec/unop/math-ceil-3.c  |  5 +-
5 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 4d0e1d8d1a9..ea2b01f6a6e 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3560,26 +3560,17 @@ static rtx
expand_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
   machine_mode vec_fp_mode)
{
-  /* Step-1: Get the abs float value for mask generation.  */
-  rtx tmp = gen_reg_rtx (vec_fp_mode);
-  rtx abs_ops[] = {tmp, fp_vector};
-  insn_code icode = code_for_pred (ABS, vec_fp_mode);
-  emit_vlmax_insn (icode, UNARY_OP, abs_ops);
-
-  /* Step-2: Prepare the scalar float compare register.  */
+  /* Step-1: Prepare the scalar float compare register.  */
   rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
   emit_insn (gen_move_insn (fp_reg, fp_scalar));
-  /* Step-3: Prepare the vector float compare register.  */
-  rtx vec_dup = gen_reg_rtx (vec_fp_mode);
-  icode = code_for_pred_broadcast (vec_fp_mode);
-  rtx vfmv_ops[] = {vec_dup, fp_reg};
-  emit_vlmax_insn (icode, UNARY_OP, vfmv_ops);
-
-  /* Step-4: Generate the mask.  */
+  /* Step-2: Generate the mask.  */
   machine_mode mask_mode = get_mask_mode (vec_fp_mode);
   rtx mask = gen_reg_rtx (mask_mode);
-  expand_vec_cmp (mask, code, tmp, vec_dup);
+  rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
+  rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
+  insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
+  emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
   return mask;
}
@@ -3594,29 +3585,57 @@ expand_vec_copysign (rtx op_dest, rtx op_src_0, rtx 
op_src_1,
   emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
}
+static void
+expand_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
+{
+  rtx abs_ops[] = {op_dest, op_src};
+  insn_code icode = code_for_pred (ABS, vec_mode);
+
+  emit_vlmax_insn (icode, UNARY_OP, abs_ops);
+}
+
+static void
+expand_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
+ insn_type type, machine_mode vec_mode)
+{
+  rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
+  insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
+
+  emit_vlmax_insn (icode, type, cvt_x_ops);
+}
+
+static void
+expand_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
+ insn_type type, machine_mode vec_mode)
+{
+  rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
+  insn_code icode = code_for_pred (FLOAT, vec_mode);
+
+  emit_vlmax_insn (icode, type, cvt_fp_ops);
+}
+
void
expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
machine_mode vec_int_mode)
{
-  /* Step-1: Generate the mask on con

Re: [PATCH v1] RISC-V: Refine the code gen for ceil auto vectorization.

2023-09-22 Thread juzhe.zh...@rivai.ai
I prefer change expand_vec_copysign into emit_vec_copysign。

Likewise, emit_fabs. ...etc.



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-22 19:19
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Refine the code gen for ceil auto vectorization.
From: Pan Li 
 
We vectorized below ceil code already.
 
void
test_ceil (float *out, float *in, int count)
{
  for (unsigned i = 0; i < count; i++)
out[i] = __builtin_ceilf (in[i]);
}
 
Before this patch:
vfmv.v.xv4,fa0 // can be removed
vfabs.v v0,v1
vmv1r.v v2,v1  // can be removed
vmflt.vvv0,v0,v4   // can be refined to vmflt.vf
vfcvt.x.f.v v3,v1,v0.t
vfcvt.f.x.v v2,v3,v0.t
vfsgnj.vv   v2,v2,v1
 
After this patch:
vfabs.v v1,v2
vmflt.vfv0,v1,fa5
vfcvt.x.f.v v3,v2,v0.t
vfcvt.f.x.v v1,v3,v0.t
vfsgnj.vv   v1,v1,v2
 
We can generate better code include below items.
 
* Remove vfmv.v.f.
* Take vmflt.vf instead of vmflt.vv.
* Remove vmv1r.v.
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (expand_vec_float_cmp_mask): Refactor.
(expand_vec_abs): New function impl.
(expand_vec_cvt_x_f): Ditto.
(expand_vec_cvt_f_x): Ditto.
(expand_vec_ceil): Refine.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/unop/math-ceil-0.c: Adjust body check.
* gcc.target/riscv/rvv/autovec/unop/math-ceil-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/unop/math-ceil-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/unop/math-ceil-3.c: Ditto.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/riscv-v.cc   | 71 ---
.../riscv/rvv/autovec/unop/math-ceil-0.c  |  5 +-
.../riscv/rvv/autovec/unop/math-ceil-1.c  |  5 +-
.../riscv/rvv/autovec/unop/math-ceil-2.c  |  5 +-
.../riscv/rvv/autovec/unop/math-ceil-3.c  |  5 +-
5 files changed, 49 insertions(+), 42 deletions(-)
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 4d0e1d8d1a9..ea2b01f6a6e 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3560,26 +3560,17 @@ static rtx
expand_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
   machine_mode vec_fp_mode)
{
-  /* Step-1: Get the abs float value for mask generation.  */
-  rtx tmp = gen_reg_rtx (vec_fp_mode);
-  rtx abs_ops[] = {tmp, fp_vector};
-  insn_code icode = code_for_pred (ABS, vec_fp_mode);
-  emit_vlmax_insn (icode, UNARY_OP, abs_ops);
-
-  /* Step-2: Prepare the scalar float compare register.  */
+  /* Step-1: Prepare the scalar float compare register.  */
   rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
   emit_insn (gen_move_insn (fp_reg, fp_scalar));
-  /* Step-3: Prepare the vector float compare register.  */
-  rtx vec_dup = gen_reg_rtx (vec_fp_mode);
-  icode = code_for_pred_broadcast (vec_fp_mode);
-  rtx vfmv_ops[] = {vec_dup, fp_reg};
-  emit_vlmax_insn (icode, UNARY_OP, vfmv_ops);
-
-  /* Step-4: Generate the mask.  */
+  /* Step-2: Generate the mask.  */
   machine_mode mask_mode = get_mask_mode (vec_fp_mode);
   rtx mask = gen_reg_rtx (mask_mode);
-  expand_vec_cmp (mask, code, tmp, vec_dup);
+  rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
+  rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
+  insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
+  emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
   return mask;
}
@@ -3594,29 +3585,57 @@ expand_vec_copysign (rtx op_dest, rtx op_src_0, rtx 
op_src_1,
   emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
}
+static void
+expand_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
+{
+  rtx abs_ops[] = {op_dest, op_src};
+  insn_code icode = code_for_pred (ABS, vec_mode);
+
+  emit_vlmax_insn (icode, UNARY_OP, abs_ops);
+}
+
+static void
+expand_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
+ insn_type type, machine_mode vec_mode)
+{
+  rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
+  insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
+
+  emit_vlmax_insn (icode, type, cvt_x_ops);
+}
+
+static void
+expand_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
+ insn_type type, machine_mode vec_mode)
+{
+  rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
+  insn_code icode = code_for_pred (FLOAT, vec_mode);
+
+  emit_vlmax_insn (icode, type, cvt_fp_ops);
+}
+
void
expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
machine_mode vec_int_mode)
{
-  /* Step-1: Generate the mask on const fp.  */
+  /* Step-1: Get the abs float value for mask generation.  */
+  expand_vec_abs (op_0, op_1, vec_fp_mode);
+
+  /* Step-2: Generate the mask on const fp.  */
   rtx const_fp = gen_ceil_const_fp (GET_MODE_INNER (vec_fp_mode));
-  rtx mask = expand_vec_float_cmp_mask (op_1, LT, const_fp, vec_fp_mode);
+  rtx mask = expand_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
-  /* Step-2: Convert to integer on mask, with rounding up (aka ceil).  */
+  /* Step-3: Convert to integer on mask, with rounding up (aka ceil).  */
   rtx tmp = gen_reg_rtx (vec_int_mode);
-  rtx cvt_x_ops[] =