Re: [Qemu-devel] [PATCH v3 19/29] tcg-aarch64: Introduce tcg_fmt_Rd_uimm_s

2013-09-06 Thread Claudio Fontana
On 05.09.2013 17:41, Richard Henderson wrote:
 On 09/05/2013 06:32 AM, Claudio Fontana wrote:
  {
 -uint32_t half, base, shift, movk = 0;
 -/* construct halfwords of the immediate with MOVZ/MOVK with LSL */
 -/* using MOVZ 0x5280 | extended reg.. */
 -base = (value  0x) ? 0xd280 : 0x5280;
 -/* count trailing zeros in 16 bit steps, mapping 64 to 0. Emit the
 -   first MOVZ with the half-word immediate skipping the zeros, with a 
 shift
 -   (LSL) equal to this number. Then morph all next instructions into 
 MOVKs.
 -   Zero the processed half-word in the value, continue until empty.
 -   We build the final result 16bits at a time with up to 4 
 instructions,
 -   but do not emit instructions for 16bit zero holes. */

 Please do not remove these comments.
 In my judgement this part of the code profits from some verbose 
 clarification.
 What is happening might be obvious to you, but not to others trying to step 
 in.
 
 Fair enough.
 
 In general I'd prefer to keep movi as it was (functionally-wise) for the
 time being, replacing it with a more efficient version once we can get some
 numbers (which will be soon) with which to justify (or not) the added code
 complexity.
 
 The most important thing we're not doing at the moment is handling negative
 numbers efficiently.  E.g. we're using 4 insns to load -1.

Ok, let's address that specific point then.

 r~
 

Claudio





Re: [Qemu-devel] [PATCH v3 19/29] tcg-aarch64: Introduce tcg_fmt_Rd_uimm_s

2013-09-05 Thread Claudio Fontana
Hello Richard,

thanks for your prolific work. Few comments below for starters:

On 02.09.2013 19:54, Richard Henderson wrote:
 Cleaning up the implementation of tcg_out_movi at the same time.
 
 Signed-off-by: Richard Henderson r...@twiddle.net
 ---
  tcg/aarch64/tcg-target.c | 48 
 ++--
  1 file changed, 22 insertions(+), 26 deletions(-)
 
 diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
 index 09ccd67..59e5026 100644
 --- a/tcg/aarch64/tcg-target.c
 +++ b/tcg/aarch64/tcg-target.c
 @@ -274,6 +274,11 @@ typedef enum {
  INSN_EOR   = 0x4a00,
  INSN_EON   = 0x4a20,
  
 +/* Move wide immediate instructions */
 +INSN_MOVN  = 0x1280,
 +INSN_MOVZ  = 0x5280,
 +INSN_MOVK  = 0x7280,
 +
  /* Add/subtract immediate instructions */
  INSN_ADDI  = 0x1100,
  INSN_ADDSI = 0x3100,
 @@ -478,6 +483,12 @@ static inline void tcg_fmt_Rdnm_cond(TCGContext *s, 
 AArch64Insn insn,
| tcg_cond_to_aarch64[c]  12);
  }
  
 +static inline void tcg_fmt_Rd_uimm_s(TCGContext *s, AArch64Insn insn, bool 
 ext,
 + TCGReg rd, uint16_t half, unsigned 
 shift)
 +{
 +tcg_out32(s, insn | ext  31 | shift  17 | half  5 | rd);
 +}
 +
  static inline void tcg_out_ldst_9(TCGContext *s,
enum aarch64_ldst_op_data op_data,
enum aarch64_ldst_op_type op_type,
 @@ -522,38 +533,23 @@ static inline void tcg_out_movr_sp(TCGContext *s, bool 
 ext,
  tcg_fmt_Rdn_aimm(s, INSN_ADDI, ext, rd, rn, 0);
  }
  
 -static inline void tcg_out_movi_aux(TCGContext *s,
 -TCGReg rd, uint64_t value)
 +static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
 + tcg_target_long value)
  {
 -uint32_t half, base, shift, movk = 0;
 -/* construct halfwords of the immediate with MOVZ/MOVK with LSL */
 -/* using MOVZ 0x5280 | extended reg.. */
 -base = (value  0x) ? 0xd280 : 0x5280;
 -/* count trailing zeros in 16 bit steps, mapping 64 to 0. Emit the
 -   first MOVZ with the half-word immediate skipping the zeros, with a 
 shift
 -   (LSL) equal to this number. Then morph all next instructions into 
 MOVKs.
 -   Zero the processed half-word in the value, continue until empty.
 -   We build the final result 16bits at a time with up to 4 instructions,
 -   but do not emit instructions for 16bit zero holes. */

Please do not remove these comments.
In my judgement this part of the code profits from some verbose clarification.
What is happening might be obvious to you, but not to others trying to step in.

 +AArch64Insn insn = INSN_MOVZ;
 +
 +if (type == TCG_TYPE_I32) {
 +value = (uint32_t)value;
 +}
 +
  do {
 -shift = ctz64(value)  (63  -16);
 -half = (value  shift)  0x;
 -tcg_out32(s, base | movk | shift  17 | half  5 | rd);
 -movk = 0x2000; /* morph next MOVZs into MOVKs */
 +unsigned shift = ctz64(value)  (63  -16);
 +tcg_fmt_Rd_uimm_s(s, insn, shift = 32, rd, value  shift, shift);
  value = ~(0xUL  shift);
 +insn = INSN_MOVK;
  } while (value);
  }
  
 -static inline void tcg_out_movi(TCGContext *s, TCGType type,
 -TCGReg rd, tcg_target_long value)
 -{
 -if (type == TCG_TYPE_I64) {
 -tcg_out_movi_aux(s, rd, value);
 -} else {
 -tcg_out_movi_aux(s, rd, value  0x);
 -}
 -}
 -
  static inline void tcg_out_ldst_r(TCGContext *s,
enum aarch64_ldst_op_data op_data,
enum aarch64_ldst_op_type op_type,
 


Note that the movi change you introduce with the combination of patches 19 and 
20 is not correct, breaks all targets I tried.
I will dig in the details tomorrow commenting patch 20.

In general I'd prefer to keep movi as it was (functionally-wise) for the time 
being, replacing it with a more efficient version once we can get some numbers 
(which will be soon) with which to justify (or not) the added code complexity.

But using the INSN_* you introduced instead of inline numbers is of course fine 
for me.

Claudio

-- 
Claudio Fontana
Server OS Architect
Huawei Technologies Duesseldorf GmbH





Re: [Qemu-devel] [PATCH v3 19/29] tcg-aarch64: Introduce tcg_fmt_Rd_uimm_s

2013-09-05 Thread Richard Henderson
On 09/05/2013 06:32 AM, Claudio Fontana wrote:
  {
 -uint32_t half, base, shift, movk = 0;
 -/* construct halfwords of the immediate with MOVZ/MOVK with LSL */
 -/* using MOVZ 0x5280 | extended reg.. */
 -base = (value  0x) ? 0xd280 : 0x5280;
 -/* count trailing zeros in 16 bit steps, mapping 64 to 0. Emit the
 -   first MOVZ with the half-word immediate skipping the zeros, with a 
 shift
 -   (LSL) equal to this number. Then morph all next instructions into 
 MOVKs.
 -   Zero the processed half-word in the value, continue until empty.
 -   We build the final result 16bits at a time with up to 4 instructions,
 -   but do not emit instructions for 16bit zero holes. */
 
 Please do not remove these comments.
 In my judgement this part of the code profits from some verbose clarification.
 What is happening might be obvious to you, but not to others trying to step 
 in.

Fair enough.

 In general I'd prefer to keep movi as it was (functionally-wise) for the
 time being, replacing it with a more efficient version once we can get some
 numbers (which will be soon) with which to justify (or not) the added code
 complexity.

The most important thing we're not doing at the moment is handling negative
numbers efficiently.  E.g. we're using 4 insns to load -1.



r~



[Qemu-devel] [PATCH v3 19/29] tcg-aarch64: Introduce tcg_fmt_Rd_uimm_s

2013-09-02 Thread Richard Henderson
Cleaning up the implementation of tcg_out_movi at the same time.

Signed-off-by: Richard Henderson r...@twiddle.net
---
 tcg/aarch64/tcg-target.c | 48 ++--
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 09ccd67..59e5026 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -274,6 +274,11 @@ typedef enum {
 INSN_EOR   = 0x4a00,
 INSN_EON   = 0x4a20,
 
+    /* Move wide immediate instructions */
+    INSN_MOVN  = 0x12800000,
+    INSN_MOVZ  = 0x52800000,
+    INSN_MOVK  = 0x72800000,
+
 /* Add/subtract immediate instructions */
 INSN_ADDI  = 0x1100,
 INSN_ADDSI = 0x3100,
@@ -478,6 +483,12 @@ static inline void tcg_fmt_Rdnm_cond(TCGContext *s, AArch64Insn insn,
                       | tcg_cond_to_aarch64[c] << 12);
 }
 
+static inline void tcg_fmt_Rd_uimm_s(TCGContext *s, AArch64Insn insn, bool ext,
+                                     TCGReg rd, uint16_t half, unsigned shift)
+{
+    tcg_out32(s, insn | ext << 31 | shift << 17 | half << 5 | rd);
+}
+
 static inline void tcg_out_ldst_9(TCGContext *s,
   enum aarch64_ldst_op_data op_data,
   enum aarch64_ldst_op_type op_type,
@@ -522,38 +533,23 @@ static inline void tcg_out_movr_sp(TCGContext *s, bool 
ext,
 tcg_fmt_Rdn_aimm(s, INSN_ADDI, ext, rd, rn, 0);
 }
 
-static inline void tcg_out_movi_aux(TCGContext *s,
-                                    TCGReg rd, uint64_t value)
+static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
+                         tcg_target_long value)
 {
-    uint32_t half, base, shift, movk = 0;
-    /* construct halfwords of the immediate with MOVZ/MOVK with LSL */
-    /* using MOVZ 0x52800000 | extended reg.. */
-    base = (value > 0xffffffff) ? 0xd2800000 : 0x52800000;
-    /* count trailing zeros in 16 bit steps, mapping 64 to 0. Emit the
-       first MOVZ with the half-word immediate skipping the zeros, with a shift
-       (LSL) equal to this number. Then morph all next instructions into MOVKs.
-       Zero the processed half-word in the value, continue until empty.
-       We build the final result 16bits at a time with up to 4 instructions,
-       but do not emit instructions for 16bit zero holes. */
+    AArch64Insn insn = INSN_MOVZ;
+
+    if (type == TCG_TYPE_I32) {
+        value = (uint32_t)value;
+    }
+
     do {
-        shift = ctz64(value) & (63 & -16);
-        half = (value >> shift) & 0xffff;
-        tcg_out32(s, base | movk | shift << 17 | half << 5 | rd);
-        movk = 0x20000000; /* morph next MOVZs into MOVKs */
+        unsigned shift = ctz64(value) & (63 & -16);
+        tcg_fmt_Rd_uimm_s(s, insn, shift >= 32, rd, value >> shift, shift);
         value &= ~(0xffffUL << shift);
+        insn = INSN_MOVK;
     } while (value);
 }
 
-static inline void tcg_out_movi(TCGContext *s, TCGType type,
-TCGReg rd, tcg_target_long value)
-{
-if (type == TCG_TYPE_I64) {
-tcg_out_movi_aux(s, rd, value);
-} else {
-tcg_out_movi_aux(s, rd, value  0x);
-}
-}
-
 static inline void tcg_out_ldst_r(TCGContext *s,
   enum aarch64_ldst_op_data op_data,
   enum aarch64_ldst_op_type op_type,
-- 
1.8.3.1