[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-13 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

H.J. Lu  changed:

   What|Removed |Added

  Attachment #45705|0   |1
is obsolete||

--- Comment #21 from H.J. Lu  ---
Created attachment 45707
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=45707&action=edit
A new patch

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-13 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

H.J. Lu  changed:

   What|Removed |Added

  Attachment #45685|0   |1
is obsolete||

--- Comment #20 from H.J. Lu  ---
Created attachment 45705
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=45705&action=edit
An updated patch

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-13 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

--- Comment #19 from H.J. Lu  ---
sse.md has

(define_insn "mov<mode>_internal"
  [(set (match_operand:VMOVE 0 "nonimmediate_operand"
 "=v,v ,v ,m")
(match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
 " C,BC,vm,v"))]

  /* There is no evex-encoded vmov* for sizes smaller than 64-bytes
 in avx512f, so we need to use workarounds, to access sse registers
 16-31, which are evex-only. In avx512vl we don't need workarounds.  */
  if (TARGET_AVX512F && <MODE_SIZE> < 64 && !TARGET_AVX512VL
  && (EXT_REX_SSE_REG_P (operands[0])
  || EXT_REX_SSE_REG_P (operands[1])))
{
  if (memory_operand (operands[0], <MODE>mode))
{
  if (<MODE_SIZE> == 32)
return "vextract<shuffletype>64x4\t{$0x0, %g1, %0|%0, %g1,
0x0}";
  else if (<MODE_SIZE> == 16)
return "vextract<shuffletype>32x4\t{$0x0, %g1, %0|%0, %g1,
0x0}";
  else
gcc_unreachable ();
}
...

However, ix86_hard_regno_mode_ok has

 /* TODO check for QI/HI scalars.  */
  /* AVX512VL allows sse regs16+ for 128/256 bit modes.  */
  if (TARGET_AVX512VL
  && (mode == OImode
  || mode == TImode
  || VALID_AVX256_REG_MODE (mode)
  || VALID_AVX512VL_128_REG_MODE (mode)))
return true;

  /* xmm16-xmm31 are only available for AVX-512.  */
  if (EXT_REX_SSE_REGNO_P (regno))
return false;

  if (TARGET_AVX512F && <MODE_SIZE> < 64 && !TARGET_AVX512VL
  && (EXT_REX_SSE_REG_P (operands[0])
  || EXT_REX_SSE_REG_P (operands[1])))

is dead code:

[hjl@gnu-4 gcc]$ cat /tmp/z.c 
#include <x86intrin.h>

extern __m128 i;

__m128
foo1 (void)
{
  register __m128 xmm16 __asm ("xmm16") = i;
  asm volatile ("" : "+v" (xmm16));
  register __m128 xmm17 __asm ("xmm17") = xmm16;
  asm volatile ("" : "+v" (xmm17));
  return xmm17;
}
[hjl@gnu-4 gcc]$ /usr/gcc-5.4.1-x32/bin/gcc  -S -O2 -march=knl /tmp/z.c 
/tmp/z.c: In function ‘foo1’:
/tmp/z.c:8:19: error: register specified for ‘xmm16’ isn’t suitable for data
type
   register __m128 xmm16 __asm ("xmm16") = i;
   ^
/tmp/z.c:10:19: error: register specified for ‘xmm17’ isn’t suitable for data
type
   register __m128 xmm17 __asm ("xmm17") = xmm16;
   ^
[hjl@gnu-4 gcc]$

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-12 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

--- Comment #18 from H.J. Lu  ---
(In reply to Jakub Jelinek from comment #14)
> Comment on attachment 45685 [details]
> I am testing this
> 
> The movsi change doesn't look entirely right to me.  While OImode or TImode
> is not allowed in ext sse regs unless AVX512VL, that is not the case for
> SImode, so for SImode if one or both operands are ext sse regs and
> !TARGET_AVX512VL, we need to use MODE_XI and use the pattern with %g1, %g0
> in there.

No need to set MODE_XI:

if (EXT_REX_SSE_REG_P (operands[0])
  || EXT_REX_SSE_REG_P (operands[1]))
{
  if (TARGET_AVX512VL)
return "vmovdqa32\t{%1, %0|%0, %1}"; 
  else
return "vmovdqa32\t{%g1, %0|%0, %g1}";
}
  else
return "%vmovdqa\t{%1, %0|%0, %1}";

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-12 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

--- Comment #17 from H.J. Lu  ---
[hjl@gnu-4 gcc]$ cat /tmp/z.c
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-options "-O2 -march=skylake-avx512" } */

extern long long i;

long long
foo1 (void)
{
  register long long xmm16 __asm ("xmm16") = i;
  asm volatile ("" : "+v" (xmm16));
  register long long xmm17 __asm ("xmm17") = xmm16;
  asm volatile ("" : "+v" (xmm17));
  return xmm17;
}

/* { dg-final { scan-assembler-times
"vmovdqa32\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */
/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
[hjl@gnu-4 gcc]$ gcc -S -O2 -march=skylake-avx512 /tmp/z.c   -mno-avx512vl
[hjl@gnu-4 gcc]$ cat z.s
.file   "z.c"
.text
.p2align 4,,15
.globl  foo1
.type   foo1, @function
foo1:
.LFB0:
.cfi_startproc
vmovq   i(%rip), %xmm16
vmovdqa64   %xmm16, %xmm17  <<< This is an AVX512VL instruction.
vmovq   %xmm17, %rax
ret
.cfi_endproc
.LFE0:
.size   foo1, .-foo1
.ident  "GCC: (GNU) 8.2.1 20190209 (Red Hat 8.2.1-8)"
.section.note.GNU-stack,"",@progbits
[hjl@gnu-4 gcc]$

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-12 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

--- Comment #16 from H.J. Lu  ---
[hjl@gnu-4 gcc]$ cat /tmp/y.c
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */

extern float d;

void
foo1 (float x)
{
  register float xmm16 __asm ("xmm16") = x;
  asm volatile ("" : "+v" (xmm16));
  d = xmm16;
}

/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
[hjl@gnu-4 gcc]$ gcc -S -O2 -march=skylake-avx512 /tmp/y.c
-mprefer-vector-width=512
[hjl@gnu-4 gcc]$ cat y.s
.file   "y.c"
.text
.p2align 4,,15
.globl  foo1
.type   foo1, @function
foo1:
.LFB0:
.cfi_startproc
vmovaps %zmm0, %zmm16
vmovss  %xmm16, d(%rip)
ret
.cfi_endproc
.LFE0:
.size   foo1, .-foo1
.ident  "GCC: (GNU) 8.2.1 20190209 (Red Hat 8.2.1-8)"
.section.note.GNU-stack,"",@progbits
[hjl@gnu-4 gcc]$

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-12 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

--- Comment #15 from H.J. Lu  ---
[hjl@gnu-4 gcc]$ cat /tmp/x.c 
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */

extern double d;

void
foo1 (double x)
{
  register double xmm16 __asm ("xmm16") = x;
  asm volatile ("" : "+v" (xmm16));
  d = xmm16;
}

/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
[hjl@gnu-4 gcc]$  gcc -S -O2 -march=skylake-avx512 /tmp/x.c
-mprefer-vector-width=512
[hjl@gnu-4 gcc]$ cat x.s
.file   "x.c"
.text
.p2align 4,,15
.globl  foo1
.type   foo1, @function
foo1:
.LFB0:
.cfi_startproc
vmovapd %zmm0, %zmm16
vmovsd  %xmm16, d(%rip)
ret
.cfi_endproc
.LFE0:
.size   foo1, .-foo1
.ident  "GCC: (GNU) 8.2.1 20190209 (Red Hat 8.2.1-8)"
.section.note.GNU-stack,"",@progbits
[hjl@gnu-4 gcc]$

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-12 Thread jakub at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

--- Comment #14 from Jakub Jelinek  ---
Comment on attachment 45685
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=45685
I am testing this

The movsi change doesn't look entirely right to me.  While OImode or TImode is
not allowed in ext sse regs unless AVX512VL, that is not the case for SImode,
so for SImode if one or both operands are ext sse regs and !TARGET_AVX512VL, we
need to use MODE_XI and use the pattern with %g1, %g0 in there.

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-12 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

H.J. Lu  changed:

   What|Removed |Added

   Target Milestone|7.5 |9.0

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-12 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

--- Comment #13 from H.J. Lu  ---
Created attachment 45685
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=45685&action=edit
I am testing this

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-12 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

--- Comment #12 from H.J. Lu  ---
[hjl@gnu-4 tmp]$ cat x.c
/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512" } */

extern int i;

int
foo1 (void)
{
  register int xmm16 __asm ("xmm16") = i;
  asm volatile ("" : "+v" (xmm16));
  register int xmm17 __asm ("xmm17") = xmm16;
  asm volatile ("" : "+v" (xmm17));
  return xmm17;
}

int
foo2 (void)
{
  register int xmm1 __asm ("xmm1") = i;
  asm volatile ("" : "+v" (xmm1));
  register int xmm17 __asm ("xmm17") = xmm1;
  asm volatile ("" : "+v" (xmm17));
  return xmm1;
}

/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
[hjl@gnu-4 tmp]$  gcc -S -O2 -march=skylake-avx512 x.c
[hjl@gnu-4 tmp]$ cat x.s
.file   "x.c"
.text
.p2align 4,,15
.globl  foo1
.type   foo1, @function
foo1:
.LFB0:
.cfi_startproc
vmovd   i(%rip), %xmm16
vmovdqa32   %zmm16, %zmm17
vmovd   %xmm17, %eax
ret
.cfi_endproc
.LFE0:
.size   foo1, .-foo1
.p2align 4,,15
.globl  foo2
.type   foo2, @function
foo2:
.LFB1:
.cfi_startproc
vmovd   i(%rip), %xmm1
vmovdqa32   %zmm1, %zmm17
vmovd   %xmm1, %eax
ret
.cfi_endproc
.LFE1:
.size   foo2, .-foo2
.ident  "GCC: (GNU) 8.2.1 20190209 (Red Hat 8.2.1-8)"
.section.note.GNU-stack,"",@progbits
[hjl@gnu-4 tmp]$

[Bug target/89229] Unnecessary ZMM in movoi_internal_avx/movti_internal

2019-02-12 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

--- Comment #11 from H.J. Lu  ---
(In reply to Jakub Jelinek from comment #10)
> Though, is this really a regression? I mean, have we ever emitted better
> code?

It isn't a regression.