On Thu, Oct 7, 2021 at 8:58 AM Alistair Francis
<alistair.fran...@opensource.wdc.com> wrote:
>
> From: Philipp Tomsich <philipp.toms...@vrull.eu>
>
> The 1.0.0 version of Zbb does not contain gorc/gorci.  Instead, a
> orc.b instruction (equivalent to the orc.b pseudo-instruction built on
> gorci from pre-0.93 draft-B) is available, mainly targeting
> string-processing workloads.
>
> This commit adds the new orc.b instruction and removed gorc/gorci.
>
> Signed-off-by: Philipp Tomsich <philipp.toms...@vrull.eu>
> Reviewed-by: Richard Henderson <richard.hender...@linaro.org>
> Reviewed-by: Alistair Francis <alistair.fran...@wdc.com>
> Message-id: 20210911140016.834071-12-philipp.toms...@vrull.eu
> Signed-off-by: Alistair Francis <alistair.fran...@wdc.com>
> ---
>  target/riscv/helper.h                   |  2 --
>  target/riscv/insn32.decode              |  6 +---
>  target/riscv/bitmanip_helper.c          | 26 -----------------
>  target/riscv/insn_trans/trans_rvb.c.inc | 39 +++++++++++--------------
>  4 files changed, 18 insertions(+), 55 deletions(-)
>
> diff --git a/target/riscv/helper.h b/target/riscv/helper.h
> index 8a318a2dbc..a9bda2c8ac 100644
> --- a/target/riscv/helper.h
> +++ b/target/riscv/helper.h
> @@ -61,8 +61,6 @@ DEF_HELPER_FLAGS_1(fclass_d, TCG_CALL_NO_RWG_SE, tl, i64)
>  /* Bitmanip */
>  DEF_HELPER_FLAGS_2(grev, TCG_CALL_NO_RWG_SE, tl, tl, tl)
>  DEF_HELPER_FLAGS_2(grevw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> -DEF_HELPER_FLAGS_2(gorc, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> -DEF_HELPER_FLAGS_2(gorcw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
>  DEF_HELPER_FLAGS_2(clmul, TCG_CALL_NO_RWG_SE, tl, tl, tl)
>  DEF_HELPER_FLAGS_2(clmulr, TCG_CALL_NO_RWG_SE, tl, tl, tl)
>
> diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
> index a509cfee11..59202196dc 100644
> --- a/target/riscv/insn32.decode
> +++ b/target/riscv/insn32.decode
> @@ -681,6 +681,7 @@ max        0000101 .......... 110 ..... 0110011 @r
>  maxu       0000101 .......... 111 ..... 0110011 @r
>  min        0000101 .......... 100 ..... 0110011 @r
>  minu       0000101 .......... 101 ..... 0110011 @r
> +orc_b      001010 000111 ..... 101 ..... 0010011 @r2
>  orn        0100000 .......... 110 ..... 0110011 @r
>  rol        0110000 .......... 001 ..... 0110011 @r
>  ror        0110000 .......... 101 ..... 0110011 @r
> @@ -702,19 +703,14 @@ pack       0000100 .......... 100 ..... 0110011 @r
>  packu      0100100 .......... 100 ..... 0110011 @r
>  packh      0000100 .......... 111 ..... 0110011 @r
>  grev       0110100 .......... 101 ..... 0110011 @r
> -gorc       0010100 .......... 101 ..... 0110011 @r
> -
>  grevi      01101. ........... 101 ..... 0010011 @sh
> -gorci      00101. ........... 101 ..... 0010011 @sh
>
>  # *** RV64B Standard Extension (in addition to RV32B) ***
>  packw      0000100 .......... 100 ..... 0111011 @r
>  packuw     0100100 .......... 100 ..... 0111011 @r
>  grevw      0110100 .......... 101 ..... 0111011 @r
> -gorcw      0010100 .......... 101 ..... 0111011 @r
>
>  greviw     0110100 .......... 101 ..... 0011011 @sh5
> -gorciw     0010100 .......... 101 ..... 0011011 @sh5
>
>  # *** RV32 Zbc Standard Extension ***
>  clmul      0000101 .......... 001 ..... 0110011 @r
> diff --git a/target/riscv/bitmanip_helper.c b/target/riscv/bitmanip_helper.c
> index 73be5a81c7..bb48388fcd 100644
> --- a/target/riscv/bitmanip_helper.c
> +++ b/target/riscv/bitmanip_helper.c
> @@ -64,32 +64,6 @@ target_ulong HELPER(grevw)(target_ulong rs1, target_ulong 
> rs2)
>      return do_grev(rs1, rs2, 32);
>  }
>
> -static target_ulong do_gorc(target_ulong rs1,
> -                            target_ulong rs2,
> -                            int bits)
> -{
> -    target_ulong x = rs1;
> -    int i, shift;
> -
> -    for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
> -        if (rs2 & shift) {
> -            x |= do_swap(x, adjacent_masks[i], shift);
> -        }
> -    }
> -
> -    return x;
> -}
> -
> -target_ulong HELPER(gorc)(target_ulong rs1, target_ulong rs2)
> -{
> -    return do_gorc(rs1, rs2, TARGET_LONG_BITS);
> -}
> -
> -target_ulong HELPER(gorcw)(target_ulong rs1, target_ulong rs2)
> -{
> -    return do_gorc(rs1, rs2, 32);
> -}
> -
>  target_ulong HELPER(clmul)(target_ulong rs1, target_ulong rs2)
>  {
>      target_ulong result = 0;
> diff --git a/target/riscv/insn_trans/trans_rvb.c.inc 
> b/target/riscv/insn_trans/trans_rvb.c.inc
> index bdfb495f24..d32af5915a 100644
> --- a/target/riscv/insn_trans/trans_rvb.c.inc
> +++ b/target/riscv/insn_trans/trans_rvb.c.inc
> @@ -295,16 +295,27 @@ static bool trans_grevi(DisasContext *ctx, arg_grevi *a)
>      return gen_shift_imm_fn(ctx, a, EXT_NONE, gen_grevi);
>  }
>
> -static bool trans_gorc(DisasContext *ctx, arg_gorc *a)
> +static void gen_orc_b(TCGv ret, TCGv source1)
>  {
> -    REQUIRE_EXT(ctx, RVB);
> -    return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> +    TCGv  tmp = tcg_temp_new();
> +    TCGv  ones = tcg_constant_tl(dup_const_tl(MO_8, 0x01));
> +
> +    /* Set lsb in each byte if the byte was zero. */
> +    tcg_gen_sub_tl(tmp, source1, ones);
> +    tcg_gen_andc_tl(tmp, tmp, source1);
> +    tcg_gen_shri_tl(tmp, tmp, 7);
> +    tcg_gen_andc_tl(tmp, ones, tmp);
> +
> +    /* Replicate the lsb of each byte across the byte. */
> +    tcg_gen_muli_tl(ret, tmp, 0xff);
> +
> +    tcg_temp_free(tmp);
>  }

It seems there is a bug in the current orc.b implementation,
the following 7 hexadecimal patterns return one wrong byte (0x00
instead of 0xff):
orc.b(0x............01..) = 0x............00.. (instead of 0x............ff..)
orc.b(0x..........01....) = 0x..........00.... (instead of 0x..........ff....)
orc.b(0x........01......) = 0x........00...... (instead of 0x........ff......)
orc.b(0x......01........) = 0x......00........ (instead of 0x......ff........)
orc.b(0x....01..........) = 0x....00.......... (instead of 0x....ff..........)
orc.b(0x..01............) = 0x..00............ (instead of 0x..ff............)
orc.b(0x01..............) = 0x00.............. (instead of 0xff..............)
(see test cases below)

The issue seems to be related to the propagation of the carry.
I had a hard time fixing it. With some help, I have added a prolog
which basically computes:
(source1 | ((source1 << 1) & ~ones)) in order to avoid the carry.
I will send the patch as a follow-up in this thread as 'Patch 1A'.

But it's notably less optimized than the current code,  so feel free
to come up with better options.
Actually my initial stab at fixing it was implementing a more
straightforward but less astute 'divide and conquer' method
where bits are or'ed by pairs, then the pairs are or'ed by pair ...
using the following formula:
tmp = source1 | (source1 >> 1)
tmp = tmp | (tmp >> 2)
tmp = tmp | (tmp >> 4)
ret = tmp & 0x0101010101010101
ret = tmp * 0xff
as it's notably less optimized than the current code when converted in
tcg_gen_ primitives but de par with the fixed version.
I'm also sending in this thread as 'Patch 1B' as I feel it's slightly
easier to grasp.


Test cases run on current implementation:
orc.b(0x0000000000000000) = 0x0000000000000000   OK (expect 0x0000000000000000)
orc.b(0x0000000000000001) = 0x00000000000000ff   OK (expect 0x00000000000000ff)
orc.b(0x0000000000000002) = 0x00000000000000ff   OK (expect 0x00000000000000ff)
orc.b(0x0000000000000004) = 0x00000000000000ff   OK (expect 0x00000000000000ff)
orc.b(0x0000000000000008) = 0x00000000000000ff   OK (expect 0x00000000000000ff)
orc.b(0x0000000000000010) = 0x00000000000000ff   OK (expect 0x00000000000000ff)
orc.b(0x0000000000000020) = 0x00000000000000ff   OK (expect 0x00000000000000ff)
orc.b(0x0000000000000040) = 0x00000000000000ff   OK (expect 0x00000000000000ff)
orc.b(0x0000000000000080) = 0x00000000000000ff   OK (expect 0x00000000000000ff)
orc.b(0x0000000000000100) = 0x0000000000000000 FAIL (expect 0x000000000000ff00)
orc.b(0x0000000000000200) = 0x000000000000ff00   OK (expect 0x000000000000ff00)
orc.b(0x0000000000000400) = 0x000000000000ff00   OK (expect 0x000000000000ff00)
orc.b(0x0000000000000800) = 0x000000000000ff00   OK (expect 0x000000000000ff00)
orc.b(0x0000000000001000) = 0x000000000000ff00   OK (expect 0x000000000000ff00)
orc.b(0x0000000000002000) = 0x000000000000ff00   OK (expect 0x000000000000ff00)
orc.b(0x0000000000004000) = 0x000000000000ff00   OK (expect 0x000000000000ff00)
orc.b(0x0000000000008000) = 0x000000000000ff00   OK (expect 0x000000000000ff00)
orc.b(0x0000000000010000) = 0x0000000000000000 FAIL (expect 0x0000000000ff0000)
orc.b(0x0000000000020000) = 0x0000000000ff0000   OK (expect 0x0000000000ff0000)
orc.b(0x0000000001000000) = 0x0000000000000000 FAIL (expect 0x00000000ff000000)
orc.b(0x0000000002000000) = 0x00000000ff000000   OK (expect 0x00000000ff000000)
orc.b(0x0000000100000000) = 0x0000000000000000 FAIL (expect 0x000000ff00000000)
orc.b(0x0000000200000000) = 0x000000ff00000000   OK (expect 0x000000ff00000000)
orc.b(0x0000010000000000) = 0x0000000000000000 FAIL (expect 0x0000ff0000000000)
orc.b(0x0000020000000000) = 0x0000ff0000000000   OK (expect 0x0000ff0000000000)
orc.b(0x0001000000000000) = 0x0000000000000000 FAIL (expect 0x00ff000000000000)
orc.b(0x0002000000000000) = 0x00ff000000000000   OK (expect 0x00ff000000000000)
orc.b(0x0100000000000000) = 0x0000000000000000 FAIL (expect 0xff00000000000000)
orc.b(0x0200000000000000) = 0xff00000000000000   OK (expect 0xff00000000000000)
orc.b(0x0400000000000000) = 0xff00000000000000   OK (expect 0xff00000000000000)
orc.b(0x0800000000000000) = 0xff00000000000000   OK (expect 0xff00000000000000)
orc.b(0x1000000000000000) = 0xff00000000000000   OK (expect 0xff00000000000000)
orc.b(0x2000000000000000) = 0xff00000000000000   OK (expect 0xff00000000000000)
orc.b(0x4000000000000000) = 0xff00000000000000   OK (expect 0xff00000000000000)
orc.b(0x8000000000000000) = 0xff00000000000000   OK (expect 0xff00000000000000)
orc.b(0xffffffffffffffff) = 0xffffffffffffffff   OK (expect 0xffffffffffffffff)
orc.b(0x00ff00ff00ff00ff) = 0x00ff00ff00ff00ff   OK (expect 0x00ff00ff00ff00ff)
orc.b(0xff00ff00ff00ff00) = 0xff00ff00ff00ff00   OK (expect 0xff00ff00ff00ff00)
orc.b(0x0001000100010001) = 0x00000000000000ff FAIL (expect 0x00ff00ff00ff00ff)
orc.b(0x0100010001000100) = 0x0000000000000000 FAIL (expect 0xff00ff00ff00ff00)
orc.b(0x8040201008040201) = 0xffffffffffffffff   OK (expect 0xffffffffffffffff)
orc.b(0x0804020180402010) = 0xffffffffffffffff   OK (expect 0xffffffffffffffff)
orc.b(0x010055aa00401100) = 0x0000ffff00ffff00 FAIL (expect 0xff00ffff00ffff00)


>
> -static bool trans_gorci(DisasContext *ctx, arg_gorci *a)
> +static bool trans_orc_b(DisasContext *ctx, arg_orc_b *a)
>  {
> -    REQUIRE_EXT(ctx, RVB);
> -    return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> +    REQUIRE_ZBB(ctx);
> +    return gen_unary(ctx, a, EXT_ZERO, gen_orc_b);
>  }
>
>  #define GEN_SHADD(SHAMT)                                       \
> @@ -476,22 +487,6 @@ static bool trans_greviw(DisasContext *ctx, arg_greviw 
> *a)
>      return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_grev);
>  }
>
> -static bool trans_gorcw(DisasContext *ctx, arg_gorcw *a)
> -{
> -    REQUIRE_64BIT(ctx);
> -    REQUIRE_EXT(ctx, RVB);
> -    ctx->w = true;
> -    return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> -}
> -
> -static bool trans_gorciw(DisasContext *ctx, arg_gorciw *a)
> -{
> -    REQUIRE_64BIT(ctx);
> -    REQUIRE_EXT(ctx, RVB);
> -    ctx->w = true;
> -    return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> -}
> -
>  #define GEN_SHADD_UW(SHAMT)                                       \
>  static void gen_sh##SHAMT##add_uw(TCGv ret, TCGv arg1, TCGv arg2) \
>  {                                                                 \
> --
> 2.31.1
>
>

Reply via email to