Hi All,

The fix for PR18127 reworked the uxtl-to-zip optimization.  In doing so it
undid the changes in aarch64_simd_vec_unpack<su>_lo_, which now no longer
mirrors aarch64_simd_vec_unpack<su>_hi_.  Things still work because the RTL
generated by aarch64_simd_vec_unpack<su>_lo_ overlaps with the general
zero-extend RTL, and since that pattern is listed before the lo one, recog
picks it instead.

This patch makes aarch64_simd_vec_unpack<su>_lo_ mirror
aarch64_simd_vec_unpack<su>_hi_ again, both for consistency and so that we are
not relying on pattern order.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md
        (aarch64_simd_vec_unpack<su>_lo_<mode>): Add same split as
        aarch64_simd_vec_unpack<su>_hi_.
        * config/aarch64/aarch64.cc (aarch64_gen_shareable_zero): Update
        comment.

---
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 01b084d8ccb53fc0f1c7b0dd8f23546c331f020a..d4026cbf0b66995104e8e40ca294faaf8d5ba847 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1904,7 +1904,7 @@ (define_insn "*aarch64_<srn_op>topbits_shuffle<mode>_be"
 
 ;; Widening operations.
 
-(define_insn "aarch64_simd_vec_unpack<su>_lo_<mode>"
+(define_insn_and_split "aarch64_simd_vec_unpack<su>_lo_<mode>"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
         (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
                               (match_operand:VQW 1 "register_operand" "w")
@@ -1912,6 +1912,19 @@ (define_insn "aarch64_simd_vec_unpack<su>_lo_<mode>"
                            )))]
   "TARGET_SIMD"
   "<su>xtl\t%0.<Vwtype>, %1.<Vhalftype>"
+  "&& <CODE> == ZERO_EXTEND
+   && aarch64_split_simd_shift_p (insn)"
+  [(const_int 0)]
+  {
+    /* On many cores, it is cheaper to implement UXTL using a ZIP1 with zero,
+       provided that the cost of the zero can be amortized over several
+       operations.  We'll later recombine the zero and zip if there are
+       not sufficient uses of the zero to make the split worthwhile.  */
+    rtx res = simplify_gen_subreg (<MODE>mode, operands[0], <VWIDE>mode, 0);
+    rtx zero = aarch64_gen_shareable_zero (<MODE>mode);
+    emit_insn (gen_aarch64_zip1<mode> (res, operands[1], zero));
+    DONE;
+  }
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index ae7e21d90b2aeec51b7626471ccf7f036fa9b3db..6f49d1482042efabedbe723aa59ecf129b84f4ad 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23159,7 +23159,8 @@ aarch64_gen_shareable_zero (machine_mode mode)
    to split without that restriction and instead recombine shared zeros
    if they turn out not to be worthwhile.  This would allow splits in
    single-block functions and would also cope more naturally with
-   rematerialization.  */
+   rematerialization.  The downside of not doing this is that we lose the
+   optimizations for vector epilogues as well.  */
 
 bool
 aarch64_split_simd_shift_p (rtx_insn *insn)



