On 8/4/23 17:10, 钟居哲 wrote:
Could you add testcases for this patch?
Testing what specifically? Are you asking for correctness tests,
or performance/code quality tests?
+;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
+;; optimizers from changing cpymem_loop_* into this.
+(define_insn "@cpymem_straight<P:mode><V_WHOLE:mode>"
+ [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
+ (mem:BLK (match_operand:P 1 "register_operand" "r,r")))
+ (use (and (match_dup 1) (const_int 127)))
+ (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
+ (clobber (match_scratch:V_WHOLE 3 "=&vr,&vr"))
+ (clobber (reg:SI VL_REGNUM))
+ (clobber (reg:SI VTYPE_REGNUM))]
+ "TARGET_VECTOR"
+ "@vsetvli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)
+ vsetivli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)"
+)
+
+(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
+ [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+ (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+ (use (match_operand:P 2 "register_operand" "+r"))
+ (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+ (clobber (match_scratch:P 4 "=&r"))
+ (clobber (match_dup 0))
+ (clobber (match_dup 1))
+ (clobber (match_dup 2))
+ (clobber (reg:SI VL_REGNUM))
+ (clobber (reg:SI VTYPE_REGNUM))]
+ "TARGET_VECTOR"
+{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+ "vle<sew>.v %3,(%1)\;"
+ "sub %2,%2,%4", operands);
+ if (<sew> != 8)
+ {
+ rtx xop[2];
+ xop[0] = operands[4];
+ xop[1] = GEN_INT (exact_log2 (<sew>/8));
+ output_asm_insn ("slli %0,%0,%1", xop);
+ }
+ output_asm_insn ("add %1,%1,%4\;"
+ "vse<sew>.v %3,(%0)\;"
+ "add %0,%0,%4\;"
+ "bnez %2,0b", operands);
+ return "";
+})
+
+;; This pattern (at bltu) assumes pointers can be treated as unsigned,
+;; i.e. objects can't straddle 0xffffffffffffffff / 0x0000000000000000 .
+(define_insn "@cpymem_loop_fast<P:mode><V_WHOLE:mode>"
+ [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+ (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+ (use (match_operand:P 2 "register_operand" "+r"))
+ (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+ (clobber (match_scratch:P 4 "=&r"))
+ (clobber (match_scratch:P 5 "=&r"))
+ (clobber (match_scratch:P 6 "=&r"))
+ (clobber (match_dup 0))
+ (clobber (match_dup 1))
+ (clobber (match_dup 2))
+ (clobber (reg:SI VL_REGNUM))
+ (clobber (reg:SI VTYPE_REGNUM))]
+ "TARGET_VECTOR"
+{
+ output_asm_insn ("vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+ "beq %4,%2,1f\;"
+ "add %5,%0,%2\;"
+ "sub %6,%5,%4", operands);
+ if (<sew> != 8)
+ {
+ rtx xop[2];
+ xop[0] = operands[4];
+ xop[1] = GEN_INT (exact_log2 (<sew>/8));
+ output_asm_insn ("slli %0,%0,%1", xop);
+ }
+ output_asm_insn ("\n0:\t" "vle<sew>.v %3,(%1)\;"
+ "add %1,%1,%4\;"
+ "vse<sew>.v %3,(%0)\;"
+ "add %0,%0,%4\;"
"bltu %0,%6,0b\;"
"sub %5,%5,%0", operands);
if (<sew> != 8)
{
rtx xop[2];
xop[0] = operands[4];
xop[1] = GEN_INT (exact_log2 (<sew>/8));
output_asm_insn ("srli %0,%0,%1", xop);
}
output_asm_insn ("vsetvli %4,%5,e<sew>,m8,ta,ma\n"
"1:\t" "vle<sew>.v %3,(%1)\;"
"vse<sew>.v %3,(%0)", operands);
return "";
})
I don't think they are necessary.
What specifically do you think is not necessary?
Just post the update for archival purposes and consider
it pre-approved for the trunk.
I am sorry, but I disagree; this patch was approved too fast.
Umm, this patch has been queued up for at least a couple weeks now.
It should be well tested.
If you refer to Joern's message, he indicated how it was tested. Joern
is a long-time GCC developer and is well aware of how to test code.
It was tested on this set of multilibs without regressions:
riscv-sim
riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
We should at least test the 2 following situations:
1. An unknown number of bytes to memcpy; the codegen should be as follows:
vsetvl a5,a2,e8,m8,ta,ma
vle
vse
bump counter
branch
2. A known number of bytes to memcpy, where the number of bytes allows us to
find a VLS mode to hold it.
For example, a memcpy of 16 bytes in QImode.
Then we can use V16QImode directly, and the codegen should be:
vsetvli zero,16,....
vle
vse
These 3 simple instructions are enough.
This patch should be well tested for these 2 situations before being approved,
since LLVM does the same thing.
We should be able to match LLVM's behavior.
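For concreteness, a minimal sketch of what such testcases might look like
(the dg- directives, option strings, function names and scan patterns below
are assumptions for illustration, not something taken from the posted patch):

/* { dg-do compile } */
/* { dg-options "-O2 -march=rv64gcv -mabi=lp64d" } */

/* Situation 1: the number of bytes is unknown at compile time; this should
   expand to the vsetvli / vle / vse loop with a pointer bump and a branch.  */
void
copy_unknown (char *dst, const char *src, __SIZE_TYPE__ n)
{
  __builtin_memcpy (dst, src, n);
}

/* Situation 2: 16 bytes, known at compile time; this could use a single
   VLS-style copy, i.e. vsetivli zero,16,... ; vle8.v ; vse8.v.  */
void
copy_16 (char *dst, const char *src)
{
  __builtin_memcpy (dst, src, 16);
}

/* Possible scan patterns (assumed; adjust to the actual codegen):
   { dg-final { scan-assembler {vle8\.v} } }
   { dg-final { scan-assembler {vse8\.v} } }  */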
I'm not sure that's strictly necessary and I don't mind iterating a bit
on performance issues as long as we don't have correctness problems.
But since you've raised concerns -- Joern, don't install until we've
resolved the questions at hand. Thanks.
jeff