https://gcc.gnu.org/g:12b4b56fd10e64e9703ac22af63b02c108ee402c
commit r16-7651-g12b4b56fd10e64e9703ac22af63b02c108ee402c
Author: Sandra Loosemore <[email protected]>
Date:   Mon Feb 23 16:10:01 2026 +0000

    amdgcn: prefer scalar ALU instructions over vector

    This patch fixes a performance regression that crept in with commit
    r16-2237-g0eee2dd2865 ("Don't clobber VCC if we don't need to").
    Removing the clobber from the vector-register ALU insn pattern
    alternatives seems to make LRA treat those alternatives as less costly
    than the scalar-register alternatives, which do need the clobber, so it
    picks the vector forms instead and emits extra move instructions.

    In this patch I've adjusted all the ALU patterns (not just the ones
    modified by the above-mentioned commit) where the scalar-register
    alternatives have a clobber and the vector-register alternatives don't,
    slightly disparaging the latter with the "?" constraint modifier.

    I also fixed a related issue: the comment on addptrdi3 says the
    scalar-register alternative is preferred, but it was listed after the
    vector alternative, so other things being equal the vector expansion
    would still win.  I've not changed the constraints, but simply flipped
    the order of the alternatives.

    Thanks to Tobias Burnus for identifying the commit that caused the
    performance regression, Andrew Stubbs for making the connection between
    clobbers and LRA costs, and Arsen Arsenović for setting up the
    benchmark scripting I used to compare results.

    gcc/ChangeLog

    	* config/gcn/gcn.md (addsi3): Disparage vector alternative.
    	(addptrdi3): Reverse order of vector and scalar alternatives
    	to prefer the latter.
    	(subsi3): Disparage vector alternatives.
    	(mulsi3): Likewise.
    	(muldi3): Likewise.
    	(bitunop <expander>si2): Likewise.
    	(vec_and_scalar_com <expander>si3): Likewise.
    	(vec_and_scalar_nocom <expander>si3): Likewise.
    	(one_cmpldi2): Likewise.
    	(vec_and_scalar64_com <expander>di3): Likewise.
    	(vec_and_scalar64_nocom <expander>di3): Likewise.

Diff:
---
 gcc/config/gcn/gcn.md | 52 +++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 5957b29f7487..c49bd6cf8bd2 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -1127,7 +1127,7 @@
 ;; {{{ ALU special cases: Plus
 (define_insn "addsi3"
-  [(set (match_operand:SI 0 "register_operand"          "= Sg, Sg, Sg,   v")
+  [(set (match_operand:SI 0 "register_operand"          "= Sg, Sg, Sg,  ?v")
 	(plus:SI (match_operand:SI 1 "gcn_alu_operand"   "%SgA,  0,SgA,   v")
 		 (match_operand:SI 2 "gcn_alu_operand"   " SgA,SgJ,  B,vBSv")))
    (clobber (match_scratch:BI 3                          "= cs, cs, cs,  X"))]
   ""
@@ -1285,23 +1285,14 @@
 ; The SGPR alternative is preferred as it is typically used with mov_sgprbase.
(define_insn "addptrdi3" - [(set (match_operand:DI 0 "register_operand" "= v, Sg") + [(set (match_operand:DI 0 "register_operand" "= Sg, v") (unspec:DI [ - (plus:DI (match_operand:DI 1 "register_operand" "^v0,Sg0") - (match_operand:DI 2 "nonmemory_operand" "vDA,SgDB"))] + (plus:DI (match_operand:DI 1 "register_operand" " Sg0,^v0") + (match_operand:DI 2 "nonmemory_operand" "SgDB,vDA"))] UNSPEC_ADDPTR))] "" { if (which_alternative == 0) - { - rtx new_operands[4] = { operands[0], operands[1], operands[2], - gen_rtx_REG (DImode, CC_SAVE_REG) }; - - output_asm_insn ("v_add_co_u32\t%L0, %3, %L2, %L1", new_operands); - output_asm_insn ("{v_addc_co_u32|v_add_co_ci_u32}\t%H0, %3, %H2, %H1, %3", - new_operands); - } - else { rtx new_operands[4] = { operands[0], operands[1], operands[2], gen_rtx_REG (BImode, CC_SAVE_REG) }; @@ -1311,17 +1302,26 @@ output_asm_insn ("s_addc_u32\t%H0, %H1, %H2", new_operands); output_asm_insn ("s_cmpk_lg_u32\t%3, 0", new_operands); } + else + { + rtx new_operands[4] = { operands[0], operands[1], operands[2], + gen_rtx_REG (DImode, CC_SAVE_REG) }; + + output_asm_insn ("v_add_co_u32\t%L0, %3, %L2, %L1", new_operands); + output_asm_insn ("{v_addc_co_u32|v_add_co_ci_u32}\t%H0, %3, %H2, %H1, %3", + new_operands); + } return ""; } - [(set_attr "type" "vmult,mult") - (set_attr "length" "16,24")]) + [(set_attr "type" "mult,vmult") + (set_attr "length" "24,16")]) ;; }}} ;; {{{ ALU special cases: Minus (define_insn "subsi3" - [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, v, v") + [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, ?v, ?v") (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA, v,vBSv") (match_operand:SI 2 "gcn_alu_operand" "SgA, B, vBSv, v"))) (clobber (match_scratch:BI 3 "=cs, cs, X, X"))] @@ -1427,7 +1427,7 @@ ; The "s_mulk_i32" variant sets SCC to indicate overflow (which we don't care ; about here, but we need to indicate the clobbering). 
(define_insn "mulsi3" - [(set (match_operand:SI 0 "register_operand" "= Sg,Sg, Sg, v") + [(set (match_operand:SI 0 "register_operand" "= Sg,Sg, Sg, ?v") (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA, v") (match_operand:SI 2 "gcn_alu_operand" " SgA, J, B,vASv"))) (clobber (match_scratch:BI 3 "=X,cs, X, X"))] @@ -1556,9 +1556,9 @@ }) (define_insn_and_split "muldi3" - [(set (match_operand:DI 0 "register_operand" "=&Sg,&Sg, &v,&v") - (mult:DI (match_operand:DI 1 "register_operand" "%Sg, Sg, v, v") - (match_operand:DI 2 "nonmemory_operand" "Sg, i,vSv, A"))) + [(set (match_operand:DI 0 "register_operand" "=&Sg,&Sg,?&v,?&v") + (mult:DI (match_operand:DI 1 "register_operand" "%Sg, Sg, v, v") + (match_operand:DI 2 "nonmemory_operand" "Sg, i,vSv, A"))) (clobber (match_scratch:SI 3 "=&Sg,&Sg,&v,&v")) (clobber (match_scratch:BI 4 "=cs, cs, X, X"))] "" @@ -1613,7 +1613,7 @@ (define_code_attr popcount_extra_op [(not "") (popcount ", 0")]) (define_insn "<expander>si2" - [(set (match_operand:SI 0 "register_operand" "=Sg, v") + [(set (match_operand:SI 0 "register_operand" "=Sg, ?v") (bitunop:SI (match_operand:SI 1 "gcn_alu_operand" "SgB,vSvB"))) (clobber (match_scratch:BI 2 "=cs, X"))] @@ -1691,7 +1691,7 @@ (define_code_iterator vec_and_scalar_nocom [ashift lshiftrt ashiftrt]) (define_insn "<expander>si3" - [(set (match_operand:SI 0 "gcn_valu_dst_operand" "= Sg, v,RD") + [(set (match_operand:SI 0 "gcn_valu_dst_operand" "= Sg, ?v,?RD") (vec_and_scalar_com:SI (match_operand:SI 1 "gcn_valu_src0_operand" "%SgA,vSvB, 0") (match_operand:SI 2 "gcn_alu_operand" " SgB, v, v"))) @@ -1705,7 +1705,7 @@ (set_attr "length" "8")]) (define_insn "<expander>si3" - [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, v") + [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, ?v") (vec_and_scalar_nocom:SI (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA, v") (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB,vSvB"))) @@ -1731,7 +1731,7 @@ ;; {{{ ALU: generic 64-bit (define_insn_and_split "one_cmpldi2" - [(set (match_operand:DI 0 "register_operand" "=Sg, v") + [(set (match_operand:DI 0 "register_operand" "=Sg, ?v") (not:DI (match_operand:DI 1 "gcn_alu_operand" "SgA,vSvDB"))) (clobber (match_scratch:BI 2 "=cs, X"))] "" @@ -1753,7 +1753,7 @@ (define_code_iterator vec_and_scalar64_com [and ior xor]) (define_insn_and_split "<expander>di3" - [(set (match_operand:DI 0 "register_operand" "= Sg, v") + [(set (match_operand:DI 0 "register_operand" "= Sg, ?v") (vec_and_scalar64_com:DI (match_operand:DI 1 "gcn_alu_operand" "%SgA,vSvDB") (match_operand:DI 2 "gcn_alu_operand" " SgC, v"))) @@ -1781,7 +1781,7 @@ (set_attr "length" "8")]) (define_insn "<expander>di3" - [(set (match_operand:DI 0 "register_operand" "=Sg, Sg, v") + [(set (match_operand:DI 0 "register_operand" "=Sg, Sg, ?v") (vec_and_scalar_nocom:DI (match_operand:DI 1 "gcn_alu_operand" "SgC,SgA, v") (match_operand:SI 2 "gcn_alu_operand" "SgA,SgC,vSvC")))
