[Bug rtl-optimization/125876] [13/14/15/16/17 Regression] x86: register-source vmovddup spilled to the stack instead of using the register form

Sarvesh.Chandra at amd dot com via Gcc-bugs Thu, 18 Jun 2026 04:28:47 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125876


--- Comment #5 from Sarvesh Chandra <Sarvesh.Chandra at amd dot com> ---
Comment on attachment 64772
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=64772
Patch

>From c7e5a8224a83bfe0951840c62d40938ca00e56ba Mon Sep 17 00:00:00 2001
>From: Sarvesh Chandra <[email protected]>
>Date: Thu, 18 Jun 2026 16:35:46 +0530
>Subject: [PATCH] i386: Match vunpcklpd before vmovddup for V4DF/V8DF
> interleave
>
>An even-lane duplicate of a register operand matched the movddup
>pattern, whose memory-only operand forced the register to be spilled
>to the stack and reloaded.  List the unpcklpd patterns before the
>movddup patterns so a register interleave selects vunpcklpd, while a
>genuine memory load still folds into vmovddup.
>
>PR target/107057
>
>gcc/ChangeLog:
>
>* config/i386/sse.md (avx512f_unpcklpd512): Move before
>avx512f_movddup512.
>(avx_unpcklpd256): Move before avx_movddup256.
>
>gcc/testsuite/ChangeLog:
>
>* gcc.target/i386/avx512-movedup.c: New test.
>
>Co-authored-by: Ashwin Godbole <[email protected]>
>Signed-off-by: Sarvesh Chandra <[email protected]>
>
>diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>index 5d9ab8b14eb..f13232f0e7c 100644
>--- a/gcc/config/i386/sse.md
>+++ b/gcc/config/i386/sse.md
>@@ -13916,64 +13916,64 @@
>    (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex,maybe_vex")
>    (set_attr "mode" "V2DF,V2DF,V1DF,V1DF,V1DF")])
> 
>-(define_insn "avx512f_movddup512<mask_name>"
>+(define_insn "avx512f_unpcklpd512<mask_name>"
>   [(set (match_operand:V8DF 0 "register_operand" "=v")
>       (vec_select:V8DF
>         (vec_concat:V16DF
>-          (match_operand:V8DF 1 "nonimmediate_operand" "m")
>-          (match_dup 1))
>+          (match_operand:V8DF 1 "register_operand" "v")
>+          (match_operand:V8DF 2 "nonimmediate_operand" "vm"))
>         (parallel [(const_int 0) (const_int 8)
>                    (const_int 2) (const_int 10)
>                    (const_int 4) (const_int 12)
>                    (const_int 6) (const_int 14)])))]
>   "TARGET_AVX512F"
>-  "vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
>-  [(set_attr "type" "ssemov")
>+  "vunpcklpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
>+  [(set_attr "type" "sselog")
>    (set_attr "prefix" "evex")
>    (set_attr "mode" "V8DF")])
> 
>-(define_insn "avx512f_unpcklpd512<mask_name>"
>+(define_insn "avx512f_movddup512<mask_name>"
>   [(set (match_operand:V8DF 0 "register_operand" "=v")
>       (vec_select:V8DF
>         (vec_concat:V16DF
>-          (match_operand:V8DF 1 "register_operand" "v")
>-          (match_operand:V8DF 2 "nonimmediate_operand" "vm"))
>+          (match_operand:V8DF 1 "nonimmediate_operand" "m")
>+          (match_dup 1))
>         (parallel [(const_int 0) (const_int 8)
>                    (const_int 2) (const_int 10)
>                    (const_int 4) (const_int 12)
>                    (const_int 6) (const_int 14)])))]
>   "TARGET_AVX512F"
>-  "vunpcklpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
>-  [(set_attr "type" "sselog")
>+  "vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
>+  [(set_attr "type" "ssemov")
>    (set_attr "prefix" "evex")
>    (set_attr "mode" "V8DF")])
> 
> ;; Recall that the 256-bit unpck insns only shuffle within their lanes.
>-(define_insn "avx_movddup256<mask_name>"
>-  [(set (match_operand:V4DF 0 "register_operand" "=v")
>+(define_insn "avx_unpcklpd256<mask_name>"
>+  [(set (match_operand:V4DF 0 "register_operand"         "=v")
>       (vec_select:V4DF
>         (vec_concat:V8DF
>-          (match_operand:V4DF 1 "nonimmediate_operand" "m")
>-          (match_dup 1))
>+          (match_operand:V4DF 1 "register_operand" " v")
>+          (match_operand:V4DF 2 "nonimmediate_operand" "vm"))
>         (parallel [(const_int 0) (const_int 4)
>                    (const_int 2) (const_int 6)])))]
>   "TARGET_AVX && <mask_avx512vl_condition>"
>-  "vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
>-  [(set_attr "type" "ssemov")
>+  "vunpcklpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
>+  [(set_attr "type" "sselog")
>    (set_attr "prefix" "<mask_prefix>")
>    (set_attr "mode" "V4DF")])
> 
>-(define_insn "avx_unpcklpd256<mask_name>"
>-  [(set (match_operand:V4DF 0 "register_operand"         "=v")
>+(define_insn "avx_movddup256<mask_name>"
>+  [(set (match_operand:V4DF 0 "register_operand" "=v")
>       (vec_select:V4DF
>         (vec_concat:V8DF
>-          (match_operand:V4DF 1 "register_operand" " v")
>-          (match_operand:V4DF 2 "nonimmediate_operand" "vm"))
>+          (match_operand:V4DF 1 "nonimmediate_operand" "m")
>+          (match_dup 1))
>         (parallel [(const_int 0) (const_int 4)
>                    (const_int 2) (const_int 6)])))]
>   "TARGET_AVX && <mask_avx512vl_condition>"
>-  "vunpcklpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
>-  [(set_attr "type" "sselog")
>+  "vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
>+  [(set_attr "type" "ssemov")
>    (set_attr "prefix" "<mask_prefix>")
>    (set_attr "mode" "V4DF")])
> 
>diff --git a/gcc/testsuite/gcc.target/i386/avx512-movedup.c b/gcc/testsuite/gcc.target/i386/avx512-movedup.c
>new file mode 100644
>index 00000000000..20e78dfcb16
>--- /dev/null
>+++ b/gcc/testsuite/gcc.target/i386/avx512-movedup.c
>@@ -0,0 +1,16 @@
>+/* { dg-do compile } */
>+/* { dg-options "-O2 -mavx512f" } */
>+/* { dg-final { scan-assembler "(?:vmovddup|vunpcklpd)\[ \\t\]+%zmm\[0-9\]+, %zmm\[0-9\]+" } } */
>+/* { dg-final { scan-assembler "(?:vmovddup|vunpcklpd)\[ \\t\]+%ymm\[0-9\]+, %ymm\[0-9\]+" } } */
>+/* { dg-final { scan-assembler "vmovddup\[ \\t\]+\\(" } } */
>+/* { dg-final { scan-assembler-not "vmovddup\[^\\n\]*\\(%\[er\]sp\\)" } } */
>+
>+#include <immintrin.h>
>+
>+__m512d dup512 (__m512d x) { return _mm512_movedup_pd (x); }
>+__m256d dup256 (__m256d x) { return _mm256_movedup_pd (x); }
>+__m512d interleave512 (__m512d x) { return _mm512_unpacklo_pd (x, x); }
>+__m256d interleave256 (__m256d x) { return _mm256_unpacklo_pd (x, x); }
>+__m512d load512 (double *p) { return _mm512_movedup_pd (_mm512_loadu_pd (p)); }
>+
>+
>-- 
>2.34.1
>

[Bug rtl-optimization/125876] [13/14/15/16/17 Regression] x86: register-source vmovddup spilled to the stack instead of using the register form

Reply via email to