On 8/4/23 17:10, 钟居哲 wrote:
Could you add testcases for this patch?
Testing what specifically? Are you asking for correctness tests, or performance/code-quality tests?



+;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
+;; optimizers from changing cpymem_loop_* into this.
+(define_insn "@cpymem_straight<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
+       (mem:BLK (match_operand:P 1 "register_operand" "r,r")))
+   (use (and (match_dup 1) (const_int 127)))
+   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr,&vr"))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+  "@vsetvli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)
+   vsetivli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)"
+)
+
+(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+       (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+                  "vle<sew>.v %3,(%1)\;"
+                  "sub %2,%2,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("add %1,%1,%4\;"
+                  "vse<sew>.v %3,(%0)\;"
+                  "add %0,%0,%4\;"
+                  "bnez %2,0b", operands);
+  return "";
+})
+
+;; This pattern (at bltu) assumes pointers can be treated as unsigned,
+;; i.e.  objects can't straddle 0xffffffffffffffff / 0x0000000000000000 .
+(define_insn "@cpymem_loop_fast<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+       (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_scratch:P 5 "=&r"))
+   (clobber (match_scratch:P 6 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{
+  output_asm_insn ("vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+                  "beq %4,%2,1f\;"
+                  "add %5,%0,%2\;"
+                  "sub %6,%5,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("\n0:\t" "vle<sew>.v %3,(%1)\;"
+                  "add %1,%1,%4\;"
+                  "vse<sew>.v %3,(%0)\;"
+                  "add %0,%0,%4\;"
                   "bltu %0,%6,0b\;"
                   "sub %5,%5,%0", operands);
  if (<sew> != 8)
    {
      rtx xop[2];
      xop[0] = operands[4];
      xop[1] = GEN_INT (exact_log2 (<sew>/8));
      output_asm_insn ("srli %0,%0,%1", xop);
     }
  output_asm_insn ("vsetvli %4,%5,e<sew>,m8,ta,ma\n"
            "1:\t" "vle<sew>.v %3,(%1)\;"
                   "vse<sew>.v %3,(%0)", operands);
  return "";
})

I don't think they are necessary.
What specifically do you think is not necessary?



Just post the update for archival purposes and consider
it pre-approved for the trunk.

I am sorry, but I disagree; this patch was approved too fast.
Umm, this patch has been queued up for at least a couple weeks now.


It should be well tested.
If you refer to Joern's message, he indicated how it was tested. Joern is a long-time GCC developer and is well aware of how to test code.


It was tested on this set of multilibs without regressions:

    riscv-sim
    riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
    riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
    riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
    riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
    riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
    riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
    riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d

We should at least test the following 2 situations (a test-case sketch covering both is given after situation 2):

1. An unknown number of bytes to memcpy; the codegen should be as follows:

vsetvl a5,a2,e8,m8,ta,ma
vle
vse
bump counter
branch

2. A known number of bytes to memcpy, where the count lets us find a VLS mode to hold it.

For example, a memcpy of 16 QImode bytes: we can use V16QImode directly, and the codegen should be:

vsetvli zero,16,....
vle
vse

Three simple instructions are enough.
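To make the request concrete, here is a minimal sketch of what such tests might look like; it assumes compile-only dejagnu tests, and the function names, the -O2 -march=rv64gcv -mabi=lp64d options, and the scan-assembler patterns are only illustrative assumptions, not something taken from this patch:

/* { dg-do compile } */
/* { dg-options "-O2 -march=rv64gcv -mabi=lp64d" } */

/* Situation 1: byte count only known at run time; expect the
   vsetvli/vle/vse loop with pointer bumps and a backward branch.  */
void
copy_var (char *dst, const char *src, __SIZE_TYPE__ n)
{
  __builtin_memcpy (dst, src, n);
}

/* Situation 2: 16 bytes known at compile time; ideally a single
   vsetvli zero,16,e8,m8,ta,ma / vle8.v / vse8.v sequence (or a plain
   V16QI load/store), with no loop.  */
void
copy_16 (char *dst, const char *src)
{
  __builtin_memcpy (dst, src, 16);
}

/* Illustrative checks only; the exact mnemonics to scan for depend on
   what the expander actually emits.  */
/* { dg-final { scan-assembler {vle8\.v} } } */
/* { dg-final { scan-assembler {vse8\.v} } } */

Whether situation 2 is checked with scan-assembler-times or by matching the full three-instruction sequence is a judgment call for whoever writes the real tests.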


This patch should be well tested with these 2 situations before being approved, since LLVM does the same thing.

We should be able to have the same behavior as LLVM.
I'm not sure that's strictly necessary, and I don't mind iterating a bit on performance issues as long as we don't have correctness problems.

But since you've raised concerns -- Joern, don't install this until we've resolved the questions at hand. Thanks.

jeff
