On Thu, Oct 7, 2021 at 8:58 AM Alistair Francis <alistair.fran...@opensource.wdc.com> wrote: > > From: Philipp Tomsich <philipp.toms...@vrull.eu> > > The 1.0.0 version of Zbb does not contain gorc/gorci. Instead, a > orc.b instruction (equivalent to the orc.b pseudo-instruction built on > gorci from pre-0.93 draft-B) is available, mainly targeting > string-processing workloads. > > This commit adds the new orc.b instruction and removed gorc/gorci. > > Signed-off-by: Philipp Tomsich <philipp.toms...@vrull.eu> > Reviewed-by: Richard Henderson <richard.hender...@linaro.org> > Reviewed-by: Alistair Francis <alistair.fran...@wdc.com> > Message-id: 20210911140016.834071-12-philipp.toms...@vrull.eu > Signed-off-by: Alistair Francis <alistair.fran...@wdc.com> > --- > target/riscv/helper.h | 2 -- > target/riscv/insn32.decode | 6 +--- > target/riscv/bitmanip_helper.c | 26 ----------------- > target/riscv/insn_trans/trans_rvb.c.inc | 39 +++++++++++-------------- > 4 files changed, 18 insertions(+), 55 deletions(-) > > diff --git a/target/riscv/helper.h b/target/riscv/helper.h > index 8a318a2dbc..a9bda2c8ac 100644 > --- a/target/riscv/helper.h > +++ b/target/riscv/helper.h > @@ -61,8 +61,6 @@ DEF_HELPER_FLAGS_1(fclass_d, TCG_CALL_NO_RWG_SE, tl, i64) > /* Bitmanip */ > DEF_HELPER_FLAGS_2(grev, TCG_CALL_NO_RWG_SE, tl, tl, tl) > DEF_HELPER_FLAGS_2(grevw, TCG_CALL_NO_RWG_SE, tl, tl, tl) > -DEF_HELPER_FLAGS_2(gorc, TCG_CALL_NO_RWG_SE, tl, tl, tl) > -DEF_HELPER_FLAGS_2(gorcw, TCG_CALL_NO_RWG_SE, tl, tl, tl) > DEF_HELPER_FLAGS_2(clmul, TCG_CALL_NO_RWG_SE, tl, tl, tl) > DEF_HELPER_FLAGS_2(clmulr, TCG_CALL_NO_RWG_SE, tl, tl, tl) > > diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode > index a509cfee11..59202196dc 100644 > --- a/target/riscv/insn32.decode > +++ b/target/riscv/insn32.decode > @@ -681,6 +681,7 @@ max 0000101 .......... 110 ..... 0110011 @r > maxu 0000101 .......... 111 ..... 0110011 @r > min 0000101 .......... 100 ..... 0110011 @r > minu 0000101 .......... 101 ..... 0110011 @r > +orc_b 001010 000111 ..... 101 ..... 0010011 @r2 > orn 0100000 .......... 110 ..... 0110011 @r > rol 0110000 .......... 001 ..... 0110011 @r > ror 0110000 .......... 101 ..... 0110011 @r > @@ -702,19 +703,14 @@ pack 0000100 .......... 100 ..... 0110011 @r > packu 0100100 .......... 100 ..... 0110011 @r > packh 0000100 .......... 111 ..... 0110011 @r > grev 0110100 .......... 101 ..... 0110011 @r > -gorc 0010100 .......... 101 ..... 0110011 @r > - > grevi 01101. ........... 101 ..... 0010011 @sh > -gorci 00101. ........... 101 ..... 0010011 @sh > > # *** RV64B Standard Extension (in addition to RV32B) *** > packw 0000100 .......... 100 ..... 0111011 @r > packuw 0100100 .......... 100 ..... 0111011 @r > grevw 0110100 .......... 101 ..... 0111011 @r > -gorcw 0010100 .......... 101 ..... 0111011 @r > > greviw 0110100 .......... 101 ..... 0011011 @sh5 > -gorciw 0010100 .......... 101 ..... 0011011 @sh5 > > # *** RV32 Zbc Standard Extension *** > clmul 0000101 .......... 001 ..... 0110011 @r > diff --git a/target/riscv/bitmanip_helper.c b/target/riscv/bitmanip_helper.c > index 73be5a81c7..bb48388fcd 100644 > --- a/target/riscv/bitmanip_helper.c > +++ b/target/riscv/bitmanip_helper.c > @@ -64,32 +64,6 @@ target_ulong HELPER(grevw)(target_ulong rs1, target_ulong > rs2) > return do_grev(rs1, rs2, 32); > } > > -static target_ulong do_gorc(target_ulong rs1, > - target_ulong rs2, > - int bits) > -{ > - target_ulong x = rs1; > - int i, shift; > - > - for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) { > - if (rs2 & shift) { > - x |= do_swap(x, adjacent_masks[i], shift); > - } > - } > - > - return x; > -} > - > -target_ulong HELPER(gorc)(target_ulong rs1, target_ulong rs2) > -{ > - return do_gorc(rs1, rs2, TARGET_LONG_BITS); > -} > - > -target_ulong HELPER(gorcw)(target_ulong rs1, target_ulong rs2) > -{ > - return do_gorc(rs1, rs2, 32); > -} > - > target_ulong HELPER(clmul)(target_ulong rs1, target_ulong rs2) > { > target_ulong result = 0; > diff --git a/target/riscv/insn_trans/trans_rvb.c.inc > b/target/riscv/insn_trans/trans_rvb.c.inc > index bdfb495f24..d32af5915a 100644 > --- a/target/riscv/insn_trans/trans_rvb.c.inc > +++ b/target/riscv/insn_trans/trans_rvb.c.inc > @@ -295,16 +295,27 @@ static bool trans_grevi(DisasContext *ctx, arg_grevi *a) > return gen_shift_imm_fn(ctx, a, EXT_NONE, gen_grevi); > } > > -static bool trans_gorc(DisasContext *ctx, arg_gorc *a) > +static void gen_orc_b(TCGv ret, TCGv source1) > { > - REQUIRE_EXT(ctx, RVB); > - return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc); > + TCGv tmp = tcg_temp_new(); > + TCGv ones = tcg_constant_tl(dup_const_tl(MO_8, 0x01)); > + > + /* Set lsb in each byte if the byte was zero. */ > + tcg_gen_sub_tl(tmp, source1, ones); > + tcg_gen_andc_tl(tmp, tmp, source1); > + tcg_gen_shri_tl(tmp, tmp, 7); > + tcg_gen_andc_tl(tmp, ones, tmp); > + > + /* Replicate the lsb of each byte across the byte. */ > + tcg_gen_muli_tl(ret, tmp, 0xff); > + > + tcg_temp_free(tmp); > }
It seems there is a bug in the current orc.b implementation, the following 7 hexadecimal patterns return one wrong byte (0x00 instead of 0xff): orc.b(0x............01..) = 0x............00.. (instead of 0x............ff..) orc.b(0x..........01....) = 0x..........00.... (instead of 0x..........ff....) orc.b(0x........01......) = 0x........00...... (instead of 0x........ff......) orc.b(0x......01........) = 0x......00........ (instead of 0x......ff........) orc.b(0x....01..........) = 0x....00.......... (instead of 0x....ff..........) orc.b(0x..01............) = 0x..00............ (instead of 0x..ff............) orc.b(0x01..............) = 0x00.............. (instead of 0xff..............) (see test cases below) The issue seems to be related to the propagation of the carry. I had a hard time fixing it. With some help, I have added a prolog which basically computes: (source1 | ((source1 << 1) & ~ones)) in order to avoid the carry. I will send the patch as a follow-up in this thread as 'Patch 1A'. But it's notably less optimized than the current code, so feel free to come up with better options. Actually my initial stab at fixing it was implementing a more straightforward but less astute 'divide and conquer' method where bits are or'ed by pairs, then the pairs are or'ed by pair ... using the following formula: tmp = source1 | (source1 >> 1) tmp = tmp | (tmp >> 2) tmp = tmp | (tmp >> 4) ret = tmp & 0x0101010101010101 ret = tmp * 0xff as it's notably less optimized than the current code when converted in tcg_gen_ primitives but de par with the fixed version. I'm also sending in this thread as 'Patch 1B' as I feel it's slightly easier to grasp. Test cases run on current implementation: orc.b(0x0000000000000000) = 0x0000000000000000 OK (expect 0x0000000000000000) orc.b(0x0000000000000001) = 0x00000000000000ff OK (expect 0x00000000000000ff) orc.b(0x0000000000000002) = 0x00000000000000ff OK (expect 0x00000000000000ff) orc.b(0x0000000000000004) = 0x00000000000000ff OK (expect 0x00000000000000ff) orc.b(0x0000000000000008) = 0x00000000000000ff OK (expect 0x00000000000000ff) orc.b(0x0000000000000010) = 0x00000000000000ff OK (expect 0x00000000000000ff) orc.b(0x0000000000000020) = 0x00000000000000ff OK (expect 0x00000000000000ff) orc.b(0x0000000000000040) = 0x00000000000000ff OK (expect 0x00000000000000ff) orc.b(0x0000000000000080) = 0x00000000000000ff OK (expect 0x00000000000000ff) orc.b(0x0000000000000100) = 0x0000000000000000 FAIL (expect 0x000000000000ff00) orc.b(0x0000000000000200) = 0x000000000000ff00 OK (expect 0x000000000000ff00) orc.b(0x0000000000000400) = 0x000000000000ff00 OK (expect 0x000000000000ff00) orc.b(0x0000000000000800) = 0x000000000000ff00 OK (expect 0x000000000000ff00) orc.b(0x0000000000001000) = 0x000000000000ff00 OK (expect 0x000000000000ff00) orc.b(0x0000000000002000) = 0x000000000000ff00 OK (expect 0x000000000000ff00) orc.b(0x0000000000004000) = 0x000000000000ff00 OK (expect 0x000000000000ff00) orc.b(0x0000000000008000) = 0x000000000000ff00 OK (expect 0x000000000000ff00) orc.b(0x0000000000010000) = 0x0000000000000000 FAIL (expect 0x0000000000ff0000) orc.b(0x0000000000020000) = 0x0000000000ff0000 OK (expect 0x0000000000ff0000) orc.b(0x0000000001000000) = 0x0000000000000000 FAIL (expect 0x00000000ff000000) orc.b(0x0000000002000000) = 0x00000000ff000000 OK (expect 0x00000000ff000000) orc.b(0x0000000100000000) = 0x0000000000000000 FAIL (expect 0x000000ff00000000) orc.b(0x0000000200000000) = 0x000000ff00000000 OK (expect 0x000000ff00000000) orc.b(0x0000010000000000) = 0x0000000000000000 FAIL (expect 0x0000ff0000000000) orc.b(0x0000020000000000) = 0x0000ff0000000000 OK (expect 0x0000ff0000000000) orc.b(0x0001000000000000) = 0x0000000000000000 FAIL (expect 0x00ff000000000000) orc.b(0x0002000000000000) = 0x00ff000000000000 OK (expect 0x00ff000000000000) orc.b(0x0100000000000000) = 0x0000000000000000 FAIL (expect 0xff00000000000000) orc.b(0x0200000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000) orc.b(0x0400000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000) orc.b(0x0800000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000) orc.b(0x1000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000) orc.b(0x2000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000) orc.b(0x4000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000) orc.b(0x8000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000) orc.b(0xffffffffffffffff) = 0xffffffffffffffff OK (expect 0xffffffffffffffff) orc.b(0x00ff00ff00ff00ff) = 0x00ff00ff00ff00ff OK (expect 0x00ff00ff00ff00ff) orc.b(0xff00ff00ff00ff00) = 0xff00ff00ff00ff00 OK (expect 0xff00ff00ff00ff00) orc.b(0x0001000100010001) = 0x00000000000000ff FAIL (expect 0x00ff00ff00ff00ff) orc.b(0x0100010001000100) = 0x0000000000000000 FAIL (expect 0xff00ff00ff00ff00) orc.b(0x8040201008040201) = 0xffffffffffffffff OK (expect 0xffffffffffffffff) orc.b(0x0804020180402010) = 0xffffffffffffffff OK (expect 0xffffffffffffffff) orc.b(0x010055aa00401100) = 0x0000ffff00ffff00 FAIL (expect 0xff00ffff00ffff00) > > -static bool trans_gorci(DisasContext *ctx, arg_gorci *a) > +static bool trans_orc_b(DisasContext *ctx, arg_orc_b *a) > { > - REQUIRE_EXT(ctx, RVB); > - return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc); > + REQUIRE_ZBB(ctx); > + return gen_unary(ctx, a, EXT_ZERO, gen_orc_b); > } > > #define GEN_SHADD(SHAMT) \ > @@ -476,22 +487,6 @@ static bool trans_greviw(DisasContext *ctx, arg_greviw > *a) > return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_grev); > } > > -static bool trans_gorcw(DisasContext *ctx, arg_gorcw *a) > -{ > - REQUIRE_64BIT(ctx); > - REQUIRE_EXT(ctx, RVB); > - ctx->w = true; > - return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc); > -} > - > -static bool trans_gorciw(DisasContext *ctx, arg_gorciw *a) > -{ > - REQUIRE_64BIT(ctx); > - REQUIRE_EXT(ctx, RVB); > - ctx->w = true; > - return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc); > -} > - > #define GEN_SHADD_UW(SHAMT) \ > static void gen_sh##SHAMT##add_uw(TCGv ret, TCGv arg1, TCGv arg2) \ > { \ > -- > 2.31.1 > >