Re: [PATCH v3] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F

Hongtao Liu via Gcc-patches Tue, 20 Jun 2023 01:34:17 -0700

On Tue, Jun 20, 2023 at 3:07 PM Jan Beulich via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> There's no reason to constrain this to AVX512VL, unless instructed so by
> -mprefer-vector-width=, as the wider operation is unusable for more
> narrow operands only when the possible memory source is a non-broadcast
> one. This way even the scalar copysign<mode>3 can benefit from the
> operation being a single-insn one (leaving aside moves which the
> compiler decides to insert for unclear reasons, and leaving aside the
> fact that bcst_mem_operand() is too restrictive for broadcast to be
> embedded right into VPTERNLOG*).
>
> While there also bring *<avx512>_vternlog<mode>_all's condition in sync with
> that of the three splitters.
>
> Along with this also request value duplication in
> ix86_expand_copysign()'s call to ix86_build_signbit_mask(), eliminating
> excess space allocation in .rodata.*, filled with zeros which are never
> read.
>
> gcc/
>
>         * config/i386/i386-expand.cc (ix86_expand_copysign): Request
>         value duplication by ix86_build_signbit_mask() when AVX512F and
>         not HFmode.
>         * config/i386/sse.md (*<avx512>_vternlog<mode>_all): Convert to
>         2-alternative form. Adjust "mode" attribute. Add "enabled"
>         attribute.
>         (*<avx512>_vpternlog<mode>_1): Also permit when TARGET_AVX512F
>         && !TARGET_PREFER_AVX256.
>         (*<avx512>_vpternlog<mode>_2): Likewise.
>         (*<avx512>_vpternlog<mode>_3): Likewise.
>
> gcc/testsuite/
>         * gcc.target/i386/avx512f-copysign.c: New test.
> ---
> I haven't been able to find documentation on the dejagnu(?) regex syntax
> (?:...). With ordinary (...) failing (producing twice as many matches),
> I could only derive this from other scan-assembler patterns.
>
> I guess the underlying pattern, going along the lines of what
> <mask_codefor>one_cmpl<mode>2<mask_name> uses, can be applied elsewhere
> as well.
That should be guarded with !TARGET_PREFER_AVX256; let's handle that
in a separate patch.
>
> HFmode could use embedded broadcast too for copysign and alike, but that
> would need to be V2HF -> V8HF (for which I don't think there are any
> existing patterns).
> ---
> v3: Adjust insn conditional as well. Add testcase.
> v2: Respect -mprefer-vector-width=.
>
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -2266,7 +2266,7 @@ ix86_expand_copysign (rtx operands[])
>    else
>      dest = NULL_RTX;
>    op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
> -  mask = ix86_build_signbit_mask (vmode, 0, 0);
> +  mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
>
>    if (CONST_DOUBLE_P (operands[1]))
>      {
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -12399,22 +12399,35 @@
>     (set_attr "mode" "<sseinsnmode>")])
>
>  (define_insn "*<avx512>_vternlog<mode>_all"
> -  [(set (match_operand:V 0 "register_operand" "=v")
> +  [(set (match_operand:V 0 "register_operand" "=v,v")
>         (unspec:V
> -         [(match_operand:V 1 "register_operand" "0")
> -          (match_operand:V 2 "register_operand" "v")
> -          (match_operand:V 3 "bcst_vector_operand" "vmBr")
> +         [(match_operand:V 1 "register_operand" "0,0")
> +          (match_operand:V 2 "register_operand" "v,v")
> +          (match_operand:V 3 "bcst_vector_operand" "vBr,m")
>            (match_operand:SI 4 "const_0_to_255_operand")]
>           UNSPEC_VTERNLOG))]
> -  "TARGET_AVX512F
> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> +    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
>  /* Disallow embeded broadcast for vector HFmode since
>     it's not real AVX512FP16 instruction.  */
>    && (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) >= 4
>       || GET_CODE (operands[3]) != VEC_DUPLICATE)"
> -  "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"
> +{
> +  if (TARGET_AVX512VL)
> +    return "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}";
> +  else
> +    return "vpternlog<ternlogsuffix>\t{%4, %g3, %g2, %g0|%g0, %g2, %g3, %4}";
> +}
>    [(set_attr "type" "sselog")
>     (set_attr "prefix" "evex")
> -   (set_attr "mode" "<sseinsnmode>")])
> +   (set (attr "mode")
> +        (if_then_else (match_test "TARGET_AVX512VL")
> +                     (const_string "<sseinsnmode>")
> +                     (const_string "XI")))
> +   (set (attr "enabled")
> +       (if_then_else (eq_attr "alternative" "1")
> +                     (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> +                     (const_string "*")))])
>
>  ;; There must be lots of other combinations like
>  ;;
> @@ -12443,7 +12456,8 @@
>           (any_logic2:V
>             (match_operand:V 3 "regmem_or_bitnot_regmem_operand")
>             (match_operand:V 4 "regmem_or_bitnot_regmem_operand"))))]
> -  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> +    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
>     && ix86_pre_reload_split ()
>     && (rtx_equal_p (STRIP_UNARY (operands[1]),
>                     STRIP_UNARY (operands[4]))
> @@ -12527,7 +12541,8 @@
>               (match_operand:V 2 "regmem_or_bitnot_regmem_operand"))
>             (match_operand:V 3 "regmem_or_bitnot_regmem_operand"))
>           (match_operand:V 4 "regmem_or_bitnot_regmem_operand")))]
> -  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> +    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
>     && ix86_pre_reload_split ()
>     && (rtx_equal_p (STRIP_UNARY (operands[1]),
>                     STRIP_UNARY (operands[4]))
> @@ -12610,7 +12625,8 @@
>             (match_operand:V 1 "regmem_or_bitnot_regmem_operand")
>             (match_operand:V 2 "regmem_or_bitnot_regmem_operand"))
>           (match_operand:V 3 "regmem_or_bitnot_regmem_operand")))]
> -  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> +    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
>     && ix86_pre_reload_split ()"
>    "#"
>    "&& 1"
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c
> @@ -0,0 +1,32 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */
Please explicitly add -mprefer-vector-width=512 to dg-options. Our tester
also runs unix{-m32 \-march=cascadelake,\ -march=cascadelake}, which sets
-mprefer-vector-width=256; an explicit -mprefer-vector-width=512 in
dg-options overrides that.
Otherwise LGTM.
> +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$(?:216|228|0xd8|0xe4)," 5 } } */
> +
> +double cs_df (double x, double y)
> +{
> +  return __builtin_copysign (x, y);
> +}
> +
> +float cs_sf (float x, float y)
> +{
> +  return __builtin_copysignf (x, y);
> +}
> +
> +typedef double __attribute__ ((vector_size (16))) v2df;
> +typedef double __attribute__ ((vector_size (32))) v4df;
> +typedef double __attribute__ ((vector_size (64))) v8df;
> +
> +v2df cs_v2df (v2df x, v2df y)
> +{
> +  return __builtin_ia32_copysignpd (x, y);
> +}
> +
> +v4df cs_v4df (v4df x, v4df y)
> +{
> +  return __builtin_ia32_copysignpd256 (x, y);
> +}
> +
> +v8df cs_v8df (v8df x, v8df y)
> +{
> +  return __builtin_ia32_copysignpd512 (x, y);
> +}




-- 
BR,
Hongtao

Re: [PATCH v3] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F

Reply via email to