For V4SImode, a vec_construct with load indices {0, 1, 0, 1} now uses
vldrepl.d, and a vec_construct with load indices {0, 1, 0, 0} uses
vldrepl.d plus vshuf4i.w, reducing the number of scalar loads and
vinsgr2vr instructions.
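For illustration (a sketch, not part of the patch; the typedef, the function
names and the pointer parameter are invented for the example, and the expected
code generation assumes the new patterns below match):

    typedef int v4i32 __attribute__ ((vector_size (16)));

    /* Indices {0, 1, 0, 1}: both 64-bit halves hold the same pair, so a
       single vldrepl.d loads x[0]/x[1] and replicates them.  */
    v4i32
    f0 (int *x)
    {
      return (v4i32) { x[0], x[1], x[0], x[1] };
    }

    /* Indices {0, 1, 0, 0}: vldrepl.d loads the pair into both halves,
       then vshuf4i.w reorders the lanes, instead of building the vector
       element by element with scalar loads and vinsgr2vr.w.  */
    v4i32
    f1 (int *x)
    {
      return (v4i32) { x[0], x[1], x[0], x[0] };
    }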

gcc/ChangeLog:

        * config/loongarch/lsx.md (lsx_vshuf4i_mem_w_0): New
        define_insn_and_split.
        (lsx_vldrepl_merge_w_0): New define_insn.

gcc/testsuite/ChangeLog:

        * gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c:
        Adjust the v4i32 scan pattern for vldrepl.d and add the
        vec_construct_v4i32_1 testcase.
---
 gcc/config/loongarch/lsx.md                   | 62 +++++++++++++++++++
 .../vector/lsx/lsx-vec-construct-opt.c        | 21 ++++++-
 2 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index cd87757827d..0dea66b572e 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -1631,6 +1631,39 @@ (define_insn "lsx_vshuf4i_<lsxfmt_f>"
   [(set_attr "type" "simd_shf")
    (set_attr "mode" "<MODE>")])
 
+(define_insn_and_split "lsx_vshuf4i_mem_w_0"
+  [(set (match_operand:V4SI 0 "register_operand" "=f")
+       (vec_merge:V4SI
+         (vec_duplicate:V4SI
+           (mem:SI (match_operand:DI 1 "register_operand" "r")))
+         (vec_duplicate:V4SI
+           (mem:SI (plus:DI (match_dup 1) (const_int 4))))
+         (match_operand 2 "const_uimm4_operand" "")))]
+  "ISA_HAS_LSX"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  operands[0] = gen_rtx_REG (V2DImode, REGNO (operands[0]));
+  emit_insn (gen_lsx_vldrepl_d_insn_0 (operands[0], operands[1]));
+
+  operands[0] = gen_rtx_REG (V4SImode, REGNO (operands[0]));
+  rtx sel[4];
+  int op2 = INTVAL (operands[2]);
+  int mask = 1;
+
+  /* Convert the vec_merge immediate into vshuf4i.w selection indices.  */
+  for (int i = 0; i < 4; ++i)
+    {
+      sel[i] =  (op2 & mask) ? const0_rtx : const1_rtx;
+      mask = mask << 1;
+    }
+
+  rtx shuf4i_mask = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, sel));
+  emit_insn (gen_lsx_vshuf4i_w (operands[0], operands[0], shuf4i_mask));
+  DONE;
+})
+
 (define_insn "lsx_vsrar_<lsxfmt>"
   [(set (match_operand:ILSX 0 "register_operand" "=f")
        (unspec:ILSX [(match_operand:ILSX 1 "register_operand" "f")
@@ -2550,6 +2583,35 @@ (define_insn "lsx_vldrepl_<lsxfmt_f>_insn_0"
    (set_attr "mode" "<MODE>")
    (set_attr "length" "4")])
 
+;; This template loads two consecutive SImode data items from memory and
+;; places the identical 64-bit pair in both the lower and the upper half
+;; of a 128-bit register.  Operand 2 is the vec_merge selector for the two
+;; items and determines whether the pair ends up in the lower or the upper
+;; 64 bits.  When operand 3 is 0, the lower 64 bits are copied to the
+;; upper 64 bits; when operand 3 is 1, the upper 64 bits are copied to the
+;; lower 64 bits.
+
+(define_insn "lsx_vldrepl_merge_w_0"
+  [(set (match_operand:V4SI 0 "register_operand" "=f")
+       (unspec:V4SI
+         [(vec_merge:V4SI
+           (vec_duplicate:V4SI
+             (mem:SI (match_operand:DI 1 "register_operand" "r")))
+           (vec_duplicate:V4SI
+             (mem:SI (plus:DI (match_dup 1) (const_int 4))))
+           (match_operand 2 "const_uimm4_operand" ""))
+         (match_operand 3 "const_0_or_1_operand" "")]
+         UNSPEC_LSX_VREPLVEI_MIRROR))]
+  "ISA_HAS_LSX
+   && (INTVAL (operands[3]) ? (INTVAL (operands[2]) & 0xc) == 0x4
+                           : (INTVAL (operands[2]) & 0x3) == 0x1)"
+{
+  return "vldrepl.d\t%w0,%1,0";
+}
+  [(set_attr "type" "simd_load")
+   (set_attr "mode" "V4SI")
+   (set_attr "length" "4")])
+
 ;; Offset store by sel
 (define_expand "lsx_vstelm_<lsxfmt_f>"
   [(match_operand:LSX 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
index 92da1c8af9c..a35cda62f12 100644
--- a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
@@ -20,9 +20,9 @@ vec_construct_v2i64 ()
   return res;
 }
 
-/* Only load the lowest 2 elements and directly copy them to high half-part,
-   reducing more vinsgr2vr.w.  */
-/* { dg-final { scan-assembler-times "v4i32:.*\tvreplvei\\.d.*v4i32" 1 } } */
+/* Load the lowest 2 elements with a single vldrepl.d, which also copies
+   them to the high half.  */
+/* { dg-final { scan-assembler-times "v4i32:.*\tvldrepl\\.d.*v4i32" 1 } } */
 v4i32
 vec_construct_v4i32 ()
 {
@@ -32,6 +32,21 @@ vec_construct_v4i32 ()
   return res;
 }
 
+/* Load 2 elements at once with vldrepl.d and reorder them with vshuf4i.w,
+   avoiding the use of vinsgr2vr.  */
+/* { dg-final { scan-assembler-times "v4i32_1:.*\tvldrepl\\.d.*v4i32_1" 1 } }
+ */
+/* { dg-final { scan-assembler-times "v4i32_1:.*\tvshuf4i\\.w.*v4i32_1" 1 } }
+ */
+v4i32
+vec_construct_v4i32_1 ()
+{
+  v4i32 res =
+  { x_si[0], x_si[1], x_si[0], x_si[0] }
+  ;
+  return res;
+}
+
 /* Only load the lowest 4 elements and directly copy them to high half-part,
    reducing more vinsgr2vr.h.  */
 /* { dg-final { scan-assembler-times "v8i16:.*\tvreplvei\\.d.*v8i16" 1 } } */
-- 
2.20.1
