Re: [PATCH v2 04/10] target/hexagon: introduce new helper functions

2021-02-25 Thread Richard Henderson
On 2/25/21 7:18 AM, Alessandro Di Federico wrote:
> From: Niccolò Izzo 
> 
> These helpers will be employed by the idef-parser generated code.
> 
> Signed-off-by: Alessandro Di Federico 
> Signed-off-by: Niccolò Izzo 
> ---
>  target/hexagon/genptr.c | 227 +++-
>  target/hexagon/genptr.h |  19 
>  target/hexagon/macros.h |   2 +-
>  3 files changed, 245 insertions(+), 3 deletions(-)
> 
> diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
> index 97de669f38..78cda032db 100644
> --- a/target/hexagon/genptr.c
> +++ b/target/hexagon/genptr.c
> @@ -40,7 +40,8 @@ TCGv gen_read_preg(TCGv pred, uint8_t num)
>  return pred;
>  }
>  
> -static inline void gen_log_predicated_reg_write(int rnum, TCGv val, int slot)
> +static inline void gen_log_predicated_reg_write(int rnum, TCGv val,
> +unsigned slot)

This change is unrelated to adding new helper functions.
It requires a separate patch and justification.

> +void gen_fbrev(TCGv result, TCGv src)
> +{
> +TCGv lo = tcg_temp_new();
> +TCGv tmp1 = tcg_temp_new();
> +TCGv tmp2 = tcg_temp_new();
> +
> +/* Bit reversal of low 16 bits */
> +tcg_gen_extract_tl(lo, src, 0, 16);
> +tcg_gen_andi_tl(tmp1, lo, 0xaaaa);
> +tcg_gen_shri_tl(tmp1, tmp1, 1);
> +tcg_gen_andi_tl(tmp2, lo, 0x5555);
> +tcg_gen_shli_tl(tmp2, tmp2, 1);
> +tcg_gen_or_tl(lo, tmp1, tmp2);
> +tcg_gen_andi_tl(tmp1, lo, 0xcccc);
> +tcg_gen_shri_tl(tmp1, tmp1, 2);
> +tcg_gen_andi_tl(tmp2, lo, 0x3333);
> +tcg_gen_shli_tl(tmp2, tmp2, 2);
> +tcg_gen_or_tl(lo, tmp1, tmp2);
> +tcg_gen_andi_tl(tmp1, lo, 0xf0f0);
> +tcg_gen_shri_tl(tmp1, tmp1, 4);
> +tcg_gen_andi_tl(tmp2, lo, 0x0f0f);
> +tcg_gen_shli_tl(tmp2, tmp2, 4);
> +tcg_gen_or_tl(lo, tmp1, tmp2);
> +tcg_gen_bswap16_tl(lo, lo);

So far we've kept operations like this as external helper functions, where you
can then use revbit16().  General rule of thumb for a cutoff is about 10-15
ops, and this is right on the edge.

Any particular reason you wanted this inlined?
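
For comparison, the out-of-line version stays quite small.  A rough sketch,
assuming the usual helper plumbing (the DEF_HELPER flags and file placement
here are illustrative, not taken from this patch):

  /* target/hexagon/helper.h */
  DEF_HELPER_FLAGS_1(fbrev, TCG_CALL_NO_RWG_SE, i32, i32)

  /* target/hexagon/op_helper.c -- revbit16() is from qemu/host-utils.h,
     deposit32() from qemu/bitops.h */
  uint32_t HELPER(fbrev)(uint32_t addr)
  {
      /* reverse the low 16 bits, keep the high half unchanged */
      return deposit32(addr, 0, 16, revbit16(addr));
  }

  /* target/hexagon/genptr.c */
  void gen_fbrev(TCGv result, TCGv src)
  {
      gen_helper_fbrev(result, src);
  }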

> +/* Final tweaks */
> +tcg_gen_extract_tl(result, src, 16, 16);
> +tcg_gen_or_tl(result, result, lo);

This is wrong.  You've clobbered your carefully reversed results with the high
16 bits of src.

I'm certain you wanted

   tcg_gen_deposit_tl(result, src, lo, 0, 16);

to replace the low 16 bits of the input with your computation.
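
In scalar terms the quoted code computes roughly

  result = (src >> 16) | bitrev16(src & 0xffff);

whereas what fbrev wants (bitrev16 standing in for the 16-bit reversal built
above) is

  result = (src & 0xffff0000) | bitrev16(src & 0xffff);

which is exactly what the deposit expresses.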

> +TCGv gen_set_bit(tcg_target_long i, TCGv result, TCGv src)
> +{
> +TCGv mask = tcg_const_tl(~(1 << i));
> +TCGv bit = tcg_temp_new();
> +tcg_gen_shli_tl(bit, src, i);
> +tcg_gen_and_tl(result, result, mask);
> +tcg_gen_or_tl(result, result, bit);
> +tcg_temp_free(mask);
> +tcg_temp_free(bit);

  tcg_gen_deposit_tl(result, result, src, i, 1);

> +void gen_cancel(tcg_target_ulong slot)
> +{
> +TCGv one = tcg_const_tl(1);
> +tcg_gen_deposit_tl(hex_slot_cancelled, hex_slot_cancelled, one, slot, 1);
> +tcg_temp_free(one);

  tcg_gen_ori_tl(hex_slot_cancelled, hex_slot_cancelled, 1 << slot);

> +void gen_sat_i32(TCGv dest, TCGv source, int width, bool set_overflow)
> +{
> +TCGv max_val = tcg_const_i32((1 << (width - 1)) - 1);
> +TCGv min_val = tcg_const_i32(-(1 << (width - 1)));
> +tcg_gen_movcond_i32(TCG_COND_GT, dest, source, max_val, max_val, source);
> +tcg_gen_movcond_i32(TCG_COND_LT, dest, source, min_val, min_val, dest);

  tcg_gen_smin_tl(dest, source, max_val);
  tcg_gen_smax_tl(dest, dest, min_val);

> +/* Set Overflow Bit */
> +if (set_overflow) {
> +TCGv ovf = tcg_temp_new();
> +TCGv one = tcg_const_i32(1);
> +GET_USR_FIELD(USR_OVF, ovf);
> +tcg_gen_movcond_i32(TCG_COND_GT, ovf, source, max_val, one, ovf);
> +tcg_gen_movcond_i32(TCG_COND_LT, ovf, source, min_val, one, ovf);
> +SET_USR_FIELD(USR_OVF, ovf);

This seems like a complicated way to set overflow.
How about

  tcg_gen_setcond_tl(TCG_COND_NE, ovf, source, dest);
  tcg_gen_shli_tl(ovf, ovf, USR_OVF_SHIFT);
  tcg_gen_or_tl(hex_reg[usr], hex_reg[usr], ovf);
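
In scalar terms that is roughly (usr standing in for however the USR register
is spelled here):

  /* after clamping, dest differs from source exactly when it overflowed */
  usr |= (dest != source) << USR_OVF_SHIFT;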


> +tcg_temp_free_i32(ovf);
> +tcg_temp_free_i32(one);
> +}
> +tcg_temp_free_i32(max_val);
> +tcg_temp_free_i32(min_val);
> +}
> +
> +void gen_satu_i32(TCGv dest, TCGv source, int width, bool set_overflow)
> +{
> +TCGv max_val = tcg_const_i32((1 << width) - 1);
> +tcg_gen_movcond_i32(TCG_COND_GTU, dest, source, max_val, max_val, source);
> +TCGv_i32 zero = tcg_const_i32(0);
> +tcg_gen_movcond_i32(TCG_COND_LT, dest, source, zero, zero, dest);

Is this a signed input being saturated to unsigned bounds or not?  Because one
of these two comparisons is wrong.

If it's an unsigned input being saturated to unsigned bounds, then you don't
need the second test at all, and should be using

  tcg_gen_umin_i32(dest, src, max_val);
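
For reference, the two readings differ like this in scalar form (a sketch;
sat_max is the (1 << width) - 1 bound from above):

  /* signed input, unsigned bounds: both comparisons must be signed */
  dest = source < 0 ? 0 : (source > (int32_t)sat_max ? sat_max : source);

  /* unsigned input, unsigned bounds: a single unsigned min is enough */
  dest = MIN((uint32_t)source, sat_max);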

> +void gen_sat_i64(TCGv_i64 dest, TCGv_i64 source, int width, bool set_overflow)

[PATCH v2 04/10] target/hexagon: introduce new helper functions

2021-02-25 Thread Alessandro Di Federico via
From: Niccolò Izzo 

These helpers will be employed by the idef-parser generated code.

Signed-off-by: Alessandro Di Federico 
Signed-off-by: Niccolò Izzo 
---
 target/hexagon/genptr.c | 227 +++-
 target/hexagon/genptr.h |  19 
 target/hexagon/macros.h |   2 +-
 3 files changed, 245 insertions(+), 3 deletions(-)

diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
index 97de669f38..78cda032db 100644
--- a/target/hexagon/genptr.c
+++ b/target/hexagon/genptr.c
@@ -40,7 +40,8 @@ TCGv gen_read_preg(TCGv pred, uint8_t num)
 return pred;
 }
 
-static inline void gen_log_predicated_reg_write(int rnum, TCGv val, int slot)
+static inline void gen_log_predicated_reg_write(int rnum, TCGv val,
+unsigned slot)
 {
 TCGv one = tcg_const_tl(1);
 TCGv zero = tcg_const_tl(0);
@@ -69,7 +70,8 @@ void gen_log_reg_write(int rnum, TCGv val)
 #endif
 }
 
-static void gen_log_predicated_reg_write_pair(int rnum, TCGv_i64 val, int slot)
+static void gen_log_predicated_reg_write_pair(int rnum, TCGv_i64 val,
+  unsigned slot)
 {
 TCGv val32 = tcg_temp_new();
 TCGv one = tcg_const_tl(1);
@@ -334,5 +336,226 @@ static inline void gen_store_conditional8(CPUHexagonState *env,
 tcg_gen_movi_tl(hex_llsc_addr, ~0);
 }
 
+void gen_fbrev(TCGv result, TCGv src)
+{
+TCGv lo = tcg_temp_new();
+TCGv tmp1 = tcg_temp_new();
+TCGv tmp2 = tcg_temp_new();
+
+/* Bit reversal of low 16 bits */
+tcg_gen_extract_tl(lo, src, 0, 16);
+tcg_gen_andi_tl(tmp1, lo, 0xaaaa);
+tcg_gen_shri_tl(tmp1, tmp1, 1);
+tcg_gen_andi_tl(tmp2, lo, 0x5555);
+tcg_gen_shli_tl(tmp2, tmp2, 1);
+tcg_gen_or_tl(lo, tmp1, tmp2);
+tcg_gen_andi_tl(tmp1, lo, 0xcccc);
+tcg_gen_shri_tl(tmp1, tmp1, 2);
+tcg_gen_andi_tl(tmp2, lo, 0x3333);
+tcg_gen_shli_tl(tmp2, tmp2, 2);
+tcg_gen_or_tl(lo, tmp1, tmp2);
+tcg_gen_andi_tl(tmp1, lo, 0xf0f0);
+tcg_gen_shri_tl(tmp1, tmp1, 4);
+tcg_gen_andi_tl(tmp2, lo, 0x0f0f);
+tcg_gen_shli_tl(tmp2, tmp2, 4);
+tcg_gen_or_tl(lo, tmp1, tmp2);
+tcg_gen_bswap16_tl(lo, lo);
+
+/* Final tweaks */
+tcg_gen_extract_tl(result, src, 16, 16);
+tcg_gen_or_tl(result, result, lo);
+
+tcg_temp_free(lo);
+tcg_temp_free(tmp1);
+tcg_temp_free(tmp2);
+}
+
+TCGv gen_set_bit(tcg_target_long i, TCGv result, TCGv src)
+{
+TCGv mask = tcg_const_tl(~(1 << i));
+TCGv bit = tcg_temp_new();
+tcg_gen_shli_tl(bit, src, i);
+tcg_gen_and_tl(result, result, mask);
+tcg_gen_or_tl(result, result, bit);
+tcg_temp_free(mask);
+tcg_temp_free(bit);
+
+return result;
+}
+
+void gen_cancel(tcg_target_ulong slot)
+{
+TCGv one = tcg_const_tl(1);
+tcg_gen_deposit_tl(hex_slot_cancelled, hex_slot_cancelled, one, slot, 1);
+tcg_temp_free(one);
+}
+
+void gen_store32(TCGv vaddr, TCGv src, tcg_target_long width, unsigned slot)
+{
+tcg_gen_mov_tl(hex_store_addr[slot], vaddr);
+tcg_gen_movi_tl(hex_store_width[slot], width);
+tcg_gen_mov_tl(hex_store_val32[slot], src);
+}
+
+void gen_store1(TCGv_env cpu_env, TCGv vaddr, TCGv src, DisasContext *ctx,
+unsigned slot)
+{
+gen_store32(vaddr, src, 1, slot);
+ctx->store_width[slot] = 1;
+}
+
+void gen_store2(TCGv_env cpu_env, TCGv vaddr, TCGv src, DisasContext *ctx,
+unsigned slot)
+{
+gen_store32(vaddr, src, 2, slot);
+ctx->store_width[slot] = 2;
+}
+
+void gen_store4(TCGv_env cpu_env, TCGv vaddr, TCGv src, DisasContext *ctx,
+unsigned slot)
+{
+gen_store32(vaddr, src, 4, slot);
+ctx->store_width[slot] = 4;
+}
+
+
+void gen_store8(TCGv_env cpu_env, TCGv vaddr, TCGv_i64 src, DisasContext *ctx,
+unsigned slot)
+{
+tcg_gen_mov_tl(hex_store_addr[slot], vaddr);
+tcg_gen_movi_tl(hex_store_width[slot], 8);
+tcg_gen_mov_i64(hex_store_val64[slot], src);
+ctx->store_width[slot] = 8;
+}
+
+void gen_set_usr_field(int field, TCGv val)
+{
+tcg_gen_deposit_tl(hex_gpr[HEX_REG_USR], hex_gpr[HEX_REG_USR], val,
+   reg_field_info[field].offset,
+   reg_field_info[field].width);
+}
+
+void gen_set_usr_fieldi(int field, int x)
+{
+TCGv val = tcg_const_tl(x);
+gen_set_usr_field(field, val);
+tcg_temp_free(val);
+}
+
+void gen_write_new_pc(TCGv addr)
+{
+/* If there are multiple branches in a packet, ignore the second one */
+TCGv zero = tcg_const_tl(0);
+tcg_gen_movcond_tl(TCG_COND_NE, hex_next_PC, hex_branch_taken, zero,
+   hex_next_PC, addr);
+tcg_gen_movi_tl(hex_branch_taken, 1);
+tcg_temp_free(zero);
+}
+
+void gen_sat_i32(TCGv dest, TCGv source, int width, bool set_overflow)
+{
+TCGv max_val = tcg_const_i32((1 << (width - 1)) - 1);
+TCGv min_val = tcg_const_i32(-(1 << (width - 1)));
+tcg_gen_movcond_i32(TCG_COND_GT, dest, source, max_val, max_val, source)