Re: [Qemu-devel] [PATCH v7 08/26] tcg/i386: Add vector operations

2017-12-27 Thread Richard Henderson
On 12/27/2017 07:31 AM, Kirill Batuzov wrote:
> I think something is wrong with instruction encodings here. Looks like
>   tcg_out_mov(_ctx, TCG_TYPE_I64, TCG_REG_EBP, TCG_REG_XMM0)
> produces
>   vmovq %xmm5, %rax
> instead.

Bah.  The operands are swapped -- ebp == 5 and eax == 0.


r~



Re: [Qemu-devel] [PATCH v7 08/26] tcg/i386: Add vector operations

2017-12-27 Thread Kirill Batuzov
On Mon, 18 Dec 2017, Richard Henderson wrote:

> The x86 vector instruction set is extremely irregular.  With newer
> editions, Intel has filled in some of the blanks.  However, we don't
> get many 64-bit operations until SSE4.2, introduced in 2009.
> 
> The subsequent edition was for AVX1, introduced in 2011, which added
> three-operand addressing, and adjusts how all instructions should be
> encoded.
> 
> Given the relatively narrow 2 year window between possible to support
> and desirable to support, and to vastly simplify code maintenance,
> I am only planning to support AVX1 and later cpus.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/i386/tcg-target.h |  36 ++-
>  tcg/i386/tcg-target.inc.c | 561 
> ++
>  2 files changed, 546 insertions(+), 51 deletions(-)
> 

> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 63d27f10e7..e9a4d92598 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c

> -static inline void tcg_out_mov(TCGContext *s, TCGType type,
> -   TCGReg ret, TCGReg arg)
> +static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
> +{
> +int rexw = 0;
> +
> +if (arg == ret) {
> +return;
> +}
> +switch (type) {
> +case TCG_TYPE_I64:
> +rexw = P_REXW;
> +/* fallthru */
> +case TCG_TYPE_I32:
> +if (ret < 16) {
> +if (arg < 16) {
> +tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
> +} else {
> +tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, ret, 0, arg);
> +}
> +} else {
> +if (arg < 16) {
> +tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
> +} else {
> +tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
> +}
> +}
> +break;
> +
> +case TCG_TYPE_V64:
> +tcg_debug_assert(ret >= 16 && arg >= 16);
> +tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
> +break;
> +case TCG_TYPE_V128:
> +tcg_debug_assert(ret >= 16 && arg >= 16);
> +tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
> +break;
> +case TCG_TYPE_V256:
> +tcg_debug_assert(ret >= 16 && arg >= 16);
> +tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
> +break;
> +
> +default:
> +g_assert_not_reached();
> +}
> +}

I think something is wrong with instruction encodings here. Looks like
  tcg_out_mov(_ctx, TCG_TYPE_I64, TCG_REG_EBP, TCG_REG_XMM0)
produces
  vmovq %xmm5, %rax
instead.

Here is the dump.

IN: 
0x00400580:  4e040c41  dup  v1.4s, w2
0x00400584:  4b0203e2  neg  w2, w2
0x00400588:  3d800021  str  q1, [x1]
0x0040058c:  d65f03c0  ret  

OP after optimization and liveness analysis:
 ld_i32 tmp0,env,$0xffec  dead: 1
 movi_i32 tmp1,$0x0
 brcond_i32 tmp0,tmp1,lt,$L0  dead: 0 1

  00400580  
 dup_vec v128,e32,tmp2,x2
 st_vec v128,e8,tmp2,env,$0x8b0   dead: 0

  00400584  
 ext32u_i64 tmp4,x2   dead: 1
 neg_i64 tmp5,tmp4dead: 1
 ext32u_i64 x2,tmp5   sync: 0  dead: 0 1

<...>

OUT: [size=111]
0x6075bf40:  41 8b 6e ec  movl -0x14(%r14), %ebp
0x6075bf44:  85 edtestl%ebp, %ebp
0x6075bf46:  0f 8c 59 00 00 00jl   0x6075bfa5
0x6075bf4c:  c4 c1 7a 7e 46 50vmovq0x50(%r14), %xmm0
0x6075bf52:  c5 f9 70 c8 00   vpshufd  $0, %xmm0, %xmm1
0x6075bf57:  c4 c1 7a 7f 8e b0 08 00  vmovdqu  %xmm1, 0x8b0(%r14)
0x6075bf5f:  00
0x6075bf60:  c4 e1 f9 7e e8   vmovq%xmm5, %rax
0x6075bf65:  8b edmovl %ebp, %ebp
0x6075bf67:  48 f7 dd negq %rbp
0x6075bf6a:  8b edmovl %ebp, %ebp
0x6075bf6c:  49 89 6e 50  movq %rbp, 0x50(%r14)
<...>

%xmm5 is used uninitialized, there is no move from either %xmm0 or
0x50(%r14) to %ebp, there are two unnecessary movl %ebp, %ebp.

-- 
Kirill



[Qemu-devel] [PATCH v7 08/26] tcg/i386: Add vector operations

2017-12-18 Thread Richard Henderson
The x86 vector instruction set is extremely irregular.  With newer
editions, Intel has filled in some of the blanks.  However, we don't
get many 64-bit operations until SSE4.2, introduced in 2009.

The subsequent edition was for AVX1, introduced in 2011, which added
three-operand addressing, and adjusts how all instructions should be
encoded.

Given the relatively narrow 2 year window between possible to support
and desirable to support, and to vastly simplify code maintenance,
I am only planning to support AVX1 and later cpus.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.h |  36 ++-
 tcg/i386/tcg-target.inc.c | 561 ++
 2 files changed, 546 insertions(+), 51 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b89dababf4..f9d3fc4a93 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -30,10 +30,10 @@
 
 #ifdef __x86_64__
 # define TCG_TARGET_REG_BITS  64
-# define TCG_TARGET_NB_REGS   16
+# define TCG_TARGET_NB_REGS   32
 #else
 # define TCG_TARGET_REG_BITS  32
-# define TCG_TARGET_NB_REGS8
+# define TCG_TARGET_NB_REGS   24
 #endif
 
 typedef enum {
@@ -56,6 +56,26 @@ typedef enum {
 TCG_REG_R13,
 TCG_REG_R14,
 TCG_REG_R15,
+
+TCG_REG_XMM0,
+TCG_REG_XMM1,
+TCG_REG_XMM2,
+TCG_REG_XMM3,
+TCG_REG_XMM4,
+TCG_REG_XMM5,
+TCG_REG_XMM6,
+TCG_REG_XMM7,
+
+/* 64-bit registers; likewise always define.  */
+TCG_REG_XMM8,
+TCG_REG_XMM9,
+TCG_REG_XMM10,
+TCG_REG_XMM11,
+TCG_REG_XMM12,
+TCG_REG_XMM13,
+TCG_REG_XMM14,
+TCG_REG_XMM15,
+
 TCG_REG_RAX = TCG_REG_EAX,
 TCG_REG_RCX = TCG_REG_ECX,
 TCG_REG_RDX = TCG_REG_EDX,
@@ -77,6 +97,8 @@ typedef enum {
 
 extern bool have_bmi1;
 extern bool have_popcnt;
+extern bool have_avx1;
+extern bool have_avx2;
 
 /* optional instructions */
 #define TCG_TARGET_HAS_div2_i32 1
@@ -146,6 +168,16 @@ extern bool have_popcnt;
 #define TCG_TARGET_HAS_mulsh_i640
 #endif
 
+/* We do not support older SSE systems, only beginning with AVX1.  */
+#define TCG_TARGET_HAS_v64  have_avx1
+#define TCG_TARGET_HAS_v128 have_avx1
+#define TCG_TARGET_HAS_v256 have_avx2
+
+#define TCG_TARGET_HAS_andc_vec 1
+#define TCG_TARGET_HAS_orc_vec  0
+#define TCG_TARGET_HAS_not_vec  0
+#define TCG_TARGET_HAS_neg_vec  0
+
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
 (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
  ((ofs) == 0 && (len) == 16))
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 63d27f10e7..e9a4d92598 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -28,9 +28,14 @@
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #if TCG_TARGET_REG_BITS == 64
 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
-"%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
 #else
 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
+#endif
+"%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
+"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+#if TCG_TARGET_REG_BITS == 64
+"%xmm8", "%xmm9", "%xmm10", "%xmm11",
+"%xmm12", "%xmm13", "%xmm14", "%xmm15",
 #endif
 };
 #endif
@@ -60,6 +65,28 @@ static const int tcg_target_reg_alloc_order[] = {
 TCG_REG_ECX,
 TCG_REG_EDX,
 TCG_REG_EAX,
+#endif
+TCG_REG_XMM0,
+TCG_REG_XMM1,
+TCG_REG_XMM2,
+TCG_REG_XMM3,
+TCG_REG_XMM4,
+TCG_REG_XMM5,
+#ifndef _WIN64
+/* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
+   any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
+TCG_REG_XMM6,
+TCG_REG_XMM7,
+#if TCG_TARGET_REG_BITS == 64
+TCG_REG_XMM8,
+TCG_REG_XMM9,
+TCG_REG_XMM10,
+TCG_REG_XMM11,
+TCG_REG_XMM12,
+TCG_REG_XMM13,
+TCG_REG_XMM14,
+TCG_REG_XMM15,
+#endif
 #endif
 };
 
@@ -94,7 +121,7 @@ static const int tcg_target_call_oarg_regs[] = {
 #define TCG_CT_CONST_I32 0x400
 #define TCG_CT_CONST_WSZ 0x800
 
-/* Registers used with L constraint, which are the first argument 
+/* Registers used with L constraint, which are the first argument
registers on x86_64, and two random call clobbered registers on
i386. */
 #if TCG_TARGET_REG_BITS == 64
@@ -125,6 +152,8 @@ static bool have_cmov;
it there.  Therefore we always define the variable.  */
 bool have_bmi1;
 bool have_popcnt;
+bool have_avx1;
+bool have_avx2;
 
 #ifdef CONFIG_CPUID_H
 static bool have_movbe;
@@ -148,6 +177,8 @@ static void patch_reloc(tcg_insn_unit *code_ptr, int type,
 if (value != (int32_t)value) {
 tcg_abort();
 }
+/* FALLTHRU */
+case R_386_32:
 tcg_patch32(code_ptr, value);
 break;
 case R_386_PC8:
@@ -162,6 +193,14 @@ static void